In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import scipy
import sys
%matplotlib inline

### Tips for reducing h5ad file size

- [x] use `compression="gzip"` in the `write_h5ad` function
- [x] store matrix data as float32 rather than float64
- [x] any metadata columns (obs/var) that are 64-bit can probably become 32-bit
- [x] string columns are often much smaller if made categorical
- [x] remove any extra `layers` that aren't desired in the final version
- [x] ensure `X` and `raw.X` are both `scipy.sparse.csr_matrix`

https://cellxgene.cziscience.com/docs/032__Contribute%20and%20Publish%20Data

In [None]:
# Read the AnnData file that supplies obs and gene metadata, then show its summary.
meta = sc.read_h5ad(filename="./integrated_seurat.h5ad")
meta

In [None]:
# Read the tab-separated per-cell metadata table.
metadata = pd.read_csv(
    "/work/shah/vazquezi/data/transfers/zatzmanm/release/cellxgene/20221021_conversion/scrna_cell_metadata.tsv",
    sep="\t",
)

In [None]:
# Get columns with mixed types
def check_mixed(df):
    """Print the name of every column of ``df`` that holds mixed Python types.

    A column is reported when its cells do not all share one Python type
    (for example a column mixing ``str`` and ``float`` values). Only ``df``
    itself is inspected; no other table is consulted.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    None
        Offending column names are printed, one per line.
    """
    for col, values in df.items():
        # Cast to object first so every cell is examined as a plain Python
        # object whatever the column dtype is, then count the distinct types.
        cell_types = values.astype(object).map(type)
        if cell_types.nunique() > 1:
            print(col)

In [None]:
# Print the metadata columns whose cells do not all share one Python type.
check_mixed(metadata)

In [None]:
# Fix these: force both ontology-term columns to plain strings.
# NOTE(review): astype(str) also turns missing values into the literal text 'nan' —
# confirm neither column contains NaN.
for ontology_column in ("cell_type_ontology_term_id", "tissue_ontology_term_id"):
    metadata[ontology_column] = metadata[ontology_column].astype(str)

In [None]:
# Re-run the mixed-type check after the string conversion above.
check_mixed(metadata)

In [None]:
# Index the table by cell_id while keeping cell_id available as a regular column.
metadata = metadata.set_index("cell_id", drop=False)

In [None]:
# Read the AnnData object that is annotated and written out below, then show its summary.
adata = sc.read_h5ad(filename="./01_raw_plus_normalized.h5ad")
adata

In [None]:
# adata.obs is replaced by `metadata` further down, so both must list the same
# cells in the same order. Fail loudly here instead of only displaying the result;
# Index.equals also copes with a length mismatch without raising.
obs_matches_metadata = adata.obs.index.equals(metadata.index)
assert obs_matches_metadata, "adata.obs and metadata do not list the same cells in the same order"
obs_matches_metadata

In [None]:
# adata.obsm is taken from `meta` further down, so both objects must list the same
# cells in the same order. Fail loudly here instead of only displaying the result;
# Index.equals also copes with a length mismatch without raising.
obs_matches_meta = adata.obs.index.equals(meta.obs.index)
assert obs_matches_meta, "adata.obs and meta.obs do not list the same cells in the same order"
obs_matches_meta

In [None]:
# Swap in the cleaned per-cell table and take the obsm/uns slots from `meta`.
# NOTE(review): presumably only valid when the index comparisons in the preceding
# cells came back True — confirm before running this cell.
adata.obs = metadata
adata.obsm = meta.obsm
adata.uns = meta.uns

In [None]:
# Dataset-level fields stored in adata.uns.
dataset_fields = {
    "schema_version": "3.0.0",
    "title": "MSK SPECTRUM – Ovarian cancer mutational processes drive site-specific immune phenotypes",
}
for field_name, field_value in dataset_fields.items():
    adata.uns[field_name] = field_value

# adata.uns["batch_condition"] = ["donor_id", "procedure"] # OPTIONAL

In [None]:
# Visual check: draw the "X_umap50" embedding coloured by author_cell_type.
sc.settings.set_figure_params(facecolor="white", dpi=80)
sc.pl.embedding(
    adata=adata,
    basis="X_umap50",
    color="author_cell_type",
)

In [None]:
# Get matching gene indices for the hvgs
# Expose the gene identifiers of `meta` under the column name used for the lookup below.
meta.var["gene_ids"] = meta.var["gene.ids"]

adata_var = adata.var.copy()

# The second reset_index adds an "index" column, which is read back below as the
# value looked up for each gene id.
# NOTE(review): this presumably relies on adata.var having a *named* index — with an
# unnamed index the first reset_index would already claim the name "index". Confirm.
var_with_position = adata_var.reset_index().reset_index()
position_by_gene_id = var_with_position.set_index("gene_ids")["index"]
hvg_idx = position_by_gene_id.loc[meta.var["gene_ids"]].values

In [None]:
# Label filtered features: every feature starts as filtered, then the rows listed
# in hvg_idx are marked as not filtered.
# Real booleans are used because the CELLxGENE schema defines feature_is_filtered
# as a bool column.
adata.var["feature_is_filtered"] = True
# Look the column up by name instead of assuming it sits at position 2.
adata.var.iloc[hvg_idx, adata.var.columns.get_loc("feature_is_filtered")] = False

In [None]:
# Flag every cell as primary data. A real boolean is stored because the CELLxGENE
# schema defines is_primary_data as a bool column, not a string.
adata.obs["is_primary_data"] = True

In [None]:
# Space saving measures
# Show the dtype of every obs column; the following cells change some of them.
adata.obs.dtypes

In [None]:
# Convert relevant columns to categorical
to_cat = ["doublet", "author_sample_id", "Phase", "cell_id", "is_primary_data"]

# Columns that already hold real booleans are left untouched: a categorical would
# not make them smaller, and the CELLxGENE schema expects is_primary_data to stay bool.
to_cat = [name for name in to_cat if not pd.api.types.is_bool_dtype(adata.obs[name])]

adata.obs[to_cat] = adata.obs[to_cat].astype("category")

In [None]:
# Downcast the listed float64 obs columns to float32.
to_float32 = [
    "percent.mt",
    "percent.rb",
    "S.Score",
    "G2M.Score",
    "CC.Diff",
    "doublet_score",
]

adata.obs[to_float32] = adata.obs[to_float32].astype(np.float32)

In [None]:
# Run the mixed-type check on the final obs table after the dtype conversions.
check_mixed(adata.obs)

In [None]:
# List the obs column names that will be written out.
adata.obs.columns

In [None]:
# Preview the first rows of the per-cell table.
adata.obs.head()

In [None]:
# Preview the first rows of the per-gene table, including feature_is_filtered.
adata.var.head()

In [None]:
# Write the annotated object to disk with gzip compression.
adata.write_h5ad(
    filename="/juno/work/shah/users/zatzmanm/repos/spectrum_scrna/SPECTRUM_scRNA_cohort.h5ad",
    compression="gzip",
)

In [None]:
# Test load: reopen the written file in backed read-only mode and show its summary.
spectrum = sc.read_h5ad(
    filename="/juno/work/shah/users/zatzmanm/repos/spectrum_scrna/SPECTRUM_scRNA_cohort.h5ad",
    backed="r",
)
spectrum

In [None]:
# Preview the per-cell table as read back from the written file.
spectrum.obs.head()

In [None]:
# Preview the per-gene table as read back from the written file.
spectrum.var.head()

In [None]:
# Save metadata: export the reloaded per-cell table as a gzip-compressed CSV.
# The row index is not written (index=False).
spectrum.obs.to_csv(
    "/juno/work/shah/users/zatzmanm/repos/spectrum_scrna/spectrum_scrna_metadata.csv.gz",
    index=False,
)