In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import scipy
import sys
import os
import glob
%matplotlib inline

### Tips for reducing h5ad file size

- [x] use `compression="gzip"` in the `write_h5ad` function
- [x] matrix data stored as float32 rather than float64
- [x] any metadata columns (obs/var) that are 64bit can probably become 32bit
- [x] any string columns often are much smaller if made categorical
- [x] remove any extra ‘layers’ that aren’t desired in the final version
- [x] ensure `X` and `raw.X` are both `scipy.sparse.csr_matrix`

In [2]:
# Map each object name to its path without the extension
# (e.g. 'DCs' -> 'objects/DCs'). Objects are discovered from the
# *.h5Seurat files found under objects/.
file_list = {
    os.path.basename(stem): stem
    for stem in (
        os.path.splitext(seurat_path)[0]
        for seurat_path in glob.glob("objects/*.h5Seurat")
    )
}

In [3]:
# Show the object-name -> path-stem mapping discovered from objects/*.h5Seurat
file_list

{'Ovarian.cancer.cell': 'objects/Ovarian.cancer.cell',
 'T.super': 'objects/T.super',
 'Myeloid.super': 'objects/Myeloid.super',
 'CD8.T': 'objects/CD8.T',
 'DCs': 'objects/DCs',
 'Macrophages': 'objects/Macrophages'}

In [4]:
# Study description metadata: a CSV export of a Google Sheet, fetched over the
# network. Its `object` and `study_description` columns are used below to
# title each output file.
desc_url = "https://docs.google.com/spreadsheets/d/15RWlyM8EfB7CgFGc22nIw0yBErkdHz0EFf0DYBMae2M/export?gid=712477828&format=csv"
desc = pd.read_csv(desc_url)

In [5]:
# Inspect the study-description table (columns `object` and `study_description` are read later)
desc

Unnamed: 0,object,study_description
0,Cohort,MSK SPECTRUM - All cells
1,Ovarian.cancer.cell,MSK SPECTRUM - Malignant and non-malignant epi...
2,T.super,"MSK SPECTRUM - CD4+ T cells, CD8+ T cells, ILC..."
3,Myeloid.super,MSK SPECTRUM - Macrophages and dendritic cells
4,CD8.T,MSK SPECTRUM - CD8+ T cells
5,DCs,MSK SPECTRUM - Dendritic cells
6,Macrophages,MSK SPECTRUM - Macrophages


In [6]:
def convert_float32(df):
    """Downcast every float64 column of ``df`` to float32.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same object is returned for convenience. Columns of any
    other dtype are left untouched.
    """
    # Names of the columns that currently hold 64-bit floats
    wide_float_cols = df.select_dtypes(include='float64').columns.tolist()

    # Replace those columns with their 32-bit counterparts
    df[wide_float_cols] = df[wide_float_cols].astype('float32')
    return df

def convert_categoricals(df):
    """Turn every object-dtype column of ``df`` into a categorical column.

    The columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same object is returned for convenience. Columns of any
    other dtype are left untouched.
    """
    # Names of the columns stored with the generic ``object`` dtype
    object_cols = df.select_dtypes(include="object").columns.tolist()

    # Replace those columns with categorical versions
    df[object_cols] = df[object_cols].astype("category")
    return df

In [7]:
# Create function to load and save
def format_cellxgene(h5ad_path, ct, overwrite = False, save = True, return_adata = False):
    """Format one AnnData object and write it as ``./objects/{ct}_cellxgene.h5ad``.

    Steps applied to the object read from ``h5ad_path``:

    * ``var["feature_is_filtered"]`` is turned into a boolean (``== 1``).
    * The current ``var`` index is kept as ``var["gene_name"]`` and ``var`` is
      re-indexed by its ``gene.ids`` column.
    * float64 metadata columns in ``obs``/``var`` become float32 and
      object-dtype columns become categorical.
    * ``obs["is_primary_data"]``, ``uns["schema_version"]`` and ``uns["title"]``
      are set; the title comes from the notebook-level ``desc`` table (row
      where ``object == ct``, column ``study_description``).
    * ``X`` and, when present, ``raw.X`` are stored as float32 CSR matrices.

    Parameters
    ----------
    h5ad_path : str
        Path of the input ``.h5ad`` file.
    ct : str
        Object name; used for the output file name and to look up the title
        in ``desc``.
    overwrite : bool, default False
        When False and the output file already exists, nothing is read or
        written and the function returns None.
    save : bool, default True
        Write the formatted object with gzip compression.
    return_adata : bool, default False
        Return the formatted object instead of None.

    Returns
    -------
    AnnData or None
        The formatted object when ``return_adata`` is true and the object was
        processed; None otherwise (including when the object was skipped).

    Raises
    ------
    IndexError
        If ``desc`` has no row whose ``object`` equals ``ct``.
    """
    print(f"Celltype: {ct} -- {h5ad_path}")

    final_out = f"./objects/{ct}_cellxgene.h5ad"

    # Guard clause: never redo an existing output unless explicitly asked to
    if os.path.exists(final_out) and not overwrite:
        print(f"Skipping {ct}. Final output exists: {final_out}")
        return None

    print(f"Compressing and formating {ct} anndata")

    # Resolve the title before the (potentially slow) read so that a missing
    # description fails fast and with a message that names the object.
    titles = desc.loc[desc["object"] == ct, "study_description"]
    if titles.empty:
        raise IndexError(f"No study_description found in `desc` for object {ct!r}")

    adata = sc.read_h5ad(h5ad_path)

    # Feature (var) metadata
    adata.var["feature_is_filtered"] = adata.var.feature_is_filtered == 1
    adata.var["gene_name"] = adata.var.index
    adata.var.set_index("gene.ids", inplace=True)

    # Shrink metadata: 64-bit floats -> 32-bit, object columns -> categorical
    adata.obs = convert_float32(adata.obs)
    adata.var = convert_float32(adata.var)
    adata.obs = convert_categoricals(adata.obs)
    adata.var = convert_categoricals(adata.var)

    # Dataset-level annotations
    adata.obs["is_primary_data"] = False
    adata.uns["schema_version"] = "3.0.0"
    adata.uns["title"] = titles.iloc[0]

    # Float32 CSR matrices
    adata.X = scipy.sparse.csr_matrix(adata.X, dtype = np.float32)
    # Inputs without a .raw slot are supported: there is nothing to convert
    if adata.raw is not None:
        raw = adata.raw.to_adata()
        raw.X = scipy.sparse.csr_matrix(raw.X, dtype = np.float32)
        adata.raw = raw

    if save:
        print(f"Saving to {final_out}")
        adata.write_h5ad(filename=final_out, compression="gzip")
    if return_adata:
        return adata
    return None

In [9]:
# Loop and save
# format_cellxgene returns None unless return_adata=True, so its result is not
# bound to a name here (the objects are only written to disk).
for ct, path_stem in file_list.items():
    format_cellxgene(h5ad_path=f"{path_stem}.h5ad", ct = ct, overwrite = False, save = True)

Celltype: Ovarian.cancer.cell -- objects/Ovarian.cancer.cell.h5ad
Skipping Ovarian.cancer.cell. Final output exists: ./objects/Ovarian.cancer.cell_cellxgene.h5ad
Celltype: T.super -- objects/T.super.h5ad
Compressing and formating T.super anndata


  return anndata.AnnData(


Saving to ./objects/T.super_cellxgene.h5ad
Celltype: Myeloid.super -- objects/Myeloid.super.h5ad
Compressing and formating Myeloid.super anndata


  return anndata.AnnData(


Saving to ./objects/Myeloid.super_cellxgene.h5ad
Celltype: CD8.T -- objects/CD8.T.h5ad
Compressing and formating CD8.T anndata


  return anndata.AnnData(


Saving to ./objects/CD8.T_cellxgene.h5ad
Celltype: DCs -- objects/DCs.h5ad
Compressing and formating DCs anndata


  return anndata.AnnData(


Saving to ./objects/DCs_cellxgene.h5ad
Celltype: Macrophages -- objects/Macrophages.h5ad
Compressing and formating Macrophages anndata


  return anndata.AnnData(


Saving to ./objects/Macrophages_cellxgene.h5ad
