In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import scipy
import sys
import os
import glob
%matplotlib inline

### Tips for reducing h5ad file size

- [x] use `compression="gzip"` in the `write_h5ad` function
- [x] matrix data stored as float32 rather than float64
- [x] any metadata columns (obs/var) that are 64-bit can probably become 32-bit
- [x] any string columns often are much smaller if made categorical
- [x] remove any extra ‘layers’ that aren’t desired in the final version
- [x] ensure `X` and `raw.X` are both `scipy.sparse.csr_matrix`

In [53]:
# Collect the base name (extension stripped) of every *.h5Seurat file
# found in the current working directory.
file_list = [
    os.path.splitext(seurat_file)[0]
    for seurat_file in glob.glob("*.h5Seurat")
]

In [74]:
# Create function to load and save
def save_cellxgene(h5ad_path, ct):
    """Load one AnnData file, shrink it, and write a gzip-compressed copy.

    The function stamps ``uns["schema_version"]`` and ``uns["title"]``,
    converts a fixed set of ``obs`` columns to categorical, stores ``X``
    (and ``raw.X`` when a raw slot exists) as float32 CSR matrices, and
    writes the result to ``./{ct}_cellxgene.h5ad``.

    Parameters
    ----------
    h5ad_path : str
        Path of the input ``.h5ad`` file to read.
    ct : str
        Cell-type label; used in the progress message and to name the
        output file.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any of the columns listed in ``to_cat`` is absent from
        ``adata.obs``.
    """
    print(f"Celltype: {ct} -- {h5ad_path}")
    # Read the path the caller supplied; previously the argument was ignored
    # and the file name was rebuilt from ``ct``.
    adata = sc.read_h5ad(h5ad_path)
    adata.uns["schema_version"] = "3.0.0"
    adata.uns["title"] = "MSK SPECTRUM – Ovarian cancer mutational processes drive site-specific immune phenotypes"
    # Convert relevant columns to categorical
    to_cat = ["doublet", "author_sample_id", "Phase", "cell_id", "is_primary_data"]
    adata.obs[to_cat] = adata.obs[to_cat].apply(pd.Categorical)
    # Float32 matrices
    adata.X = scipy.sparse.csr_matrix(adata.X, dtype=np.float32)
    # Files without a ``.raw`` slot would otherwise fail on ``None.to_adata()``.
    if adata.raw is not None:
        raw = adata.raw.to_adata()
        raw.X = scipy.sparse.csr_matrix(raw.X, dtype=np.float32)
        adata.raw = raw
    # Write out
    adata.write_h5ad(filename=f"./{ct}_cellxgene.h5ad", compression="gzip")

In [None]:
# Convert every discovered cell type, one input file per label.
for celltype in file_list:
    save_cellxgene(ct=celltype, h5ad_path=f"{celltype}.h5ad")

Celltype: ovarian_cancer -- ovarian_cancer.h5ad


  return anndata.AnnData(


Celltype: T_cell -- T_cell.h5ad


  return anndata.AnnData(
