In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
from pathlib import Path

In [2]:
# Run identifier; it is interpolated into the output directory path (DIR2SAVE).
SAMPLE_NAME = "concat_withWu2022"

In [3]:
# Shared project root: both the output directory and the scRNA-seq input file
# live beneath it, so it is spelled out only once.
VISIUM_DIR = Path(
    "/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11"
)

# Output directory for this run's prepared inputs (created if it is missing).
DIR2SAVE = VISIUM_DIR / f"nichenet/{SAMPLE_NAME}/prepareInput"
DIR2SAVE.mkdir(parents=True, exist_ok=True)

# Annotated scRNA-seq object to load (expected to hold raw counts, going by the
# file name; the notebook checks the maximum value right after loading).
SCRNA_FILE2LOAD = (
    VISIUM_DIR
    / "cell2location/prepareInputRNA"
    / "Multiome_Che_Wu_CRC_LM_annotations_raw_no_cycling_TME.h5ad"
)
# TODO: ideally use the new annotations, with the environment used in cellphonedb
adata = sc.read_h5ad(SCRNA_FILE2LOAD)

In [4]:
# Show the AnnData summary: matrix dimensions and the obs/var column names.
adata

AnnData object with n_obs × n_vars = 112034 × 39559
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed', 'Annotation_source', 'technology'
    var: 'n_cells-Nasrine', 'n_cells'

In [5]:
# Sanity check that X still holds raw counts: a large maximum (rather than a
# small log-scale value) suggests the matrix has not been normalised.
# NOTE(review): this is only a heuristic; it does not verify the values are integers.
np.max(adata.X)

40676.0

In [6]:
# Label of the cell-type selection being exported; it is embedded in every
# output file name. Here all cell types are kept (no subsetting is done in this
# part of the notebook).
microenv2pick = "all_celltypes"

### Light pre-processing: keep only genes expressed in at least 5 cells (and cells with at least 200 detected genes)

In [7]:
# Light QC: each scanpy filter below modifies `adata` in place; the shape is
# reported after every step so the effect of each threshold is visible.
qc_filters = (
    (sc.pp.filter_genes, {"min_cells": 5}, "After gene filtering"),
    (sc.pp.filter_cells, {"min_genes": 200}, "After cell filtering"),
)

print("Before filtering: ", adata.shape)
for run_filter, threshold, report in qc_filters:
    run_filter(adata, **threshold)
    print(report, adata.shape)

# Filtering must not touch the values, so the maximum should be unchanged.
print("Checking raw max value", np.max(adata.X))

# Add QC metrics to adata.obs / adata.var in place; they may be useful later on
# the Seurat side.
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

Before filtering:  (112034, 39559)
After gene filtering (112034, 34090)
After cell filtering (112034, 34090)
Checking raw max value 40676.0


### Save the count matrix as a sparse matrix (`.mtx`) from Python:

In [8]:
# Show the output directory the files below are written to.
DIR2SAVE

PosixPath('/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/nichenet/concat_withWu2022/prepareInput')

In [9]:
# Export the filtered counts and their annotations in three forms: the full
# AnnData object (.h5ad), the count matrix (.mtx) and the obs/var tables (.csv).
# Presumably the .mtx/.csv files feed the downstream NicheNet step - confirm.
from scipy.io import mmwrite  # only mmwrite is needed; the unused mmread import was dropped

# 1) Full AnnData object. It is written before the tidy-up below, so on a fresh
#    run the .h5ad still carries the complete .var table.
adata.write(DIR2SAVE.joinpath(f"counts_microenv{microenv2pick}.h5ad"))

# 2) Count matrix in Matrix Market format, in the orientation of adata.X
#    (rows = observations, columns = variables).
#    NOTE(review): R-side readers often expect genes x cells - transpose on
#    load if that is the case.
mmwrite(DIR2SAVE.joinpath(f"counts_microenv{microenv2pick}.mtx"), adata.X)

# Rename the indices of .var and .obs to something more sensible (gene_name and
# barcode) so the index column of each CSV gets a meaningful header.
adata.var.index.set_names(names="gene_name", inplace=True)
adata.obs.index.set_names(names="barcode", inplace=True)

# Drop the per-gene cell-count columns from the exported gene table.
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because the columns were already removed in place.
adata.var.drop(columns=['n_cells-Nasrine', 'n_cells'], inplace=True, errors="ignore")

# 3) Cell annotations, indexed by barcode.
adata.obs.to_csv(DIR2SAVE.joinpath(f"obs_microenv{microenv2pick}.csv"), header=True, index=True)

# 4) Gene annotations, indexed by gene name.
adata.var.to_csv(DIR2SAVE.joinpath(f"var_microenv{microenv2pick}.csv"), header=True, index=True)

In [10]:
# Echo the selection label that was embedded in the exported file names.
microenv2pick

'all_celltypes'