In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import cell2location
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path

Global seed set to 0


In [2]:
SAMPLE_NAME = "concat_withWu2022"

In [3]:
DIR2SAVE = Path(
    f"/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cellphonedb3/{SAMPLE_NAME}/"
)
DIR2SAVE.mkdir(parents=True, exist_ok=True)

In [4]:
DIR2LOAD = Path(
    f"/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cell2loc_spatialde2/{SAMPLE_NAME}"
)  # to change
DIR2LOAD.mkdir(parents=True, exist_ok=True)

### Use no cycling scRNA-seq dataset 

In [5]:
adata = sc.read_h5ad(
    "/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cell2location/prepareInputRNA/Multiome_Che_Wu_CRC_LM_annotations_raw_no_cycling_TME.h5ad"
)

In [6]:
adata

AnnData object with n_obs × n_vars = 112034 × 39559
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed', 'Annotation_source', 'technology'
    var: 'n_cells-Nasrine', 'n_cells'

In [7]:
# check that the data is raw
adata.X[0:5][0:5].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [8]:
np.max(adata.X)

40676.0

### Select only cell types that we are interested in, aka, the ones that are in the environments no?

In [9]:
microenvs = pd.read_csv(DIR2SAVE.joinpath("microenviroments_cell2loc_spatialde2.tsv"), sep="\t", index_col=None)

In [10]:
set(microenvs["cell_type"])

{'B',
 'C1QC Mac',
 'CD4 Tfh',
 'CD4 Th',
 'CD4 Th HSP',
 'CD4 Th17',
 'CD4 Tn',
 'CD8 Tem',
 'CD8 Tem GZMB',
 'CD8 Tem HSP',
 'CD8 Tex',
 'Colonocyte',
 'ECM CAF',
 'FCN1 Mono',
 'Goblet',
 'HSP Mono',
 'Hypoxia',
 'IL1B Mac',
 'Intermediate',
 'Lymphatic endo',
 'MAIT',
 'Mast',
 'Myofibroblast',
 'NK1',
 'NK2',
 'NKT',
 'NLRP3 Mac',
 'Neutrophil',
 'Pericyte',
 'Plasma',
 'SPP1 Mac',
 'Stem',
 'Stem (NOTUM high)',
 'TA1',
 'TA2',
 'Tip-like endo',
 'Treg',
 'UPR',
 'cDC1',
 'cDC2',
 'gdT',
 'ipEMT',
 'migDC',
 'pDC',
 'pEMT'}

In [11]:
celltypes2consider = list(set(microenvs["cell_type"]))
celltypes2consider

['B',
 'Treg',
 'cDC1',
 'pEMT',
 'Stem (NOTUM high)',
 'MAIT',
 'FCN1 Mono',
 'ECM CAF',
 'UPR',
 'pDC',
 'gdT',
 'HSP Mono',
 'NLRP3 Mac',
 'Lymphatic endo',
 'Hypoxia',
 'CD8 Tem GZMB',
 'Stem',
 'CD4 Tn',
 'NK1',
 'TA1',
 'Pericyte',
 'ipEMT',
 'Colonocyte',
 'Myofibroblast',
 'NKT',
 'Tip-like endo',
 'IL1B Mac',
 'CD8 Tem',
 'CD8 Tem HSP',
 'C1QC Mac',
 'SPP1 Mac',
 'Mast',
 'cDC2',
 'CD4 Th17',
 'CD8 Tex',
 'NK2',
 'Goblet',
 'CD4 Tfh',
 'CD4 Th',
 'CD4 Th HSP',
 'Neutrophil',
 'migDC',
 'TA2',
 'Intermediate',
 'Plasma']

In [12]:
print(f"{len(celltypes2consider)} cell types to consider")

45 cell types to consider


In [13]:
adata.shape

(112034, 39559)

In [14]:
# select from adata only these cell types
adata = adata[adata.obs.Annotation_scVI_detailed.isin(celltypes2consider)].copy()

In [15]:
assert set(adata.obs.Annotation_scVI_detailed) == set(celltypes2consider)

In [16]:
adata.shape

(109862, 39559)

### filter to only keep genes that are at least expressed in 5 cells no? (light pre-processing)

In [17]:
sc.pp.filter_genes(adata, min_cells=5)

In [18]:
adata.shape

(109862, 33787)

In [19]:
sc.pp.filter_cells(adata, min_genes=200)

In [20]:
adata.shape

(109862, 33787)

### Save gene expression data

generate counts that are normalised (per cell) but are not log transformed data

In [21]:
adata.X.shape

(109862, 33787)

In [22]:
adata_cdb = anndata.AnnData(
    X=adata.X,
    var=adata.var,
    obs=adata.obs[["cell_source", "Annotation_scVI", "Annotation_scVI_detailed"]],
    asview=False,
)
print(adata_cdb.shape)

(109862, 33787)


In [23]:
sc.pp.normalize_total(adata_cdb, target_sum=1e4, inplace=True)
adata_cdb.X.shape

(109862, 33787)

In [24]:
adata_cdb.obs["Annotation_scVI_detailed"].value_counts()

CD4 Th               10325
CD8 Tem               8683
Treg                  7655
CD8 Tex               7531
CD4 Th HSP            7246
CD8 Tem HSP           6178
CD4 Tn                4778
SPP1 Mac              4619
MAIT                  4463
TA1                   4116
CD8 Tem GZMB          3269
NK1                   3178
CD4 Tfh               3010
Neutrophil            2892
Hypoxia               2481
NK2                   2321
B                     2116
NLRP3 Mac             2075
C1QC Mac              2006
CD4 Th17              1937
Stem (NOTUM high)     1933
Colonocyte            1913
gdT                   1761
Plasma                1610
cDC2                  1543
Stem                  1400
NKT                   1380
Intermediate          1319
FCN1 Mono              999
pEMT                   998
UPR                    759
IL1B Mac               550
ipEMT                  469
ECM CAF                441
HSP Mono               387
Goblet                 323
Tip-like endo          273
p

In [25]:
np.max(adata_cdb.X)

8511.747

In [26]:
DIR2SAVE

PosixPath('/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cellphonedb3/concat_withWu2022')

In [27]:
adata_cdb.write(DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_counts_normalised.h5ad"))

In [28]:
adata_cdb.var

Unnamed: 0,n_cells-Nasrine,n_cells
A1BG,15552.0,13294
A1BG-AS1,1504.0,1315
A1CF,536.0,5241
A2M,7274.0,4858
A2M-AS1,2075.0,1841
...,...,...
ZYG11B,3529.0,6974
ZYX,19380.0,18347
ZZEF1,6826.0,11567
ZZZ3,1537.0,1254


In [29]:
adata_cdb.obs

Unnamed: 0,cell_source,Annotation_scVI,Annotation_scVI_detailed
CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,BCI-Nuclei,Myeloid,SPP1 Mac
CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,BCI-Nuclei,Myeloid,SPP1 Mac
CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Lymphatic endo
CRC01_LM#AGGTTGCGTGAGCAAG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Tip-like endo
CRC01_LM#CACCTCAGTGGTTATG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Tip-like endo
...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1
CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,BCI-Nuclei,Epithelial,Hypoxia
CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1
CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1


In [30]:
### need to save as sparse matrix because andata version clash for old cellphonedb

In [31]:
# save expression matrix normalised
import scipy

scipy.sparse.save_npz(DIR2SAVE.joinpath("counts_normalised.npz"), adata_cdb.X)

# save obs index
adata_cdb.obs.to_csv(DIR2SAVE.joinpath("obs.csv"), header=True)

# save var names
adata_cdb.var.to_csv(DIR2SAVE.joinpath("var.csv"), header=True)

In [32]:
adata_cdb.obs

Unnamed: 0,cell_source,Annotation_scVI,Annotation_scVI_detailed
CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,BCI-Nuclei,Myeloid,SPP1 Mac
CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,BCI-Nuclei,Myeloid,SPP1 Mac
CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Lymphatic endo
CRC01_LM#AGGTTGCGTGAGCAAG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Tip-like endo
CRC01_LM#CACCTCAGTGGTTATG-1-BCI-Nuclei,BCI-Nuclei,Endothelial,Tip-like endo
...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1
CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,BCI-Nuclei,Epithelial,Hypoxia
CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1
CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,BCI-Nuclei,Epithelial,TA1


### Save metadata
metafile `.tsv` file. 2-column file (cell barcode, cell type assigned)

In [34]:
df_meta = pd.DataFrame(
    {
        "Cell": list(adata_cdb.obs.index),
        "cell_type": adata_cdb.obs["Annotation_scVI_detailed"].values.tolist(),
    }
)

In [35]:
df_meta

Unnamed: 0,Cell,cell_type
0,CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,SPP1 Mac
1,CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,SPP1 Mac
2,CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,Lymphatic endo
3,CRC01_LM#AGGTTGCGTGAGCAAG-1-BCI-Nuclei,Tip-like endo
4,CRC01_LM#CACCTCAGTGGTTATG-1-BCI-Nuclei,Tip-like endo
...,...,...
109857,CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,TA1
109858,CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,Hypoxia
109859,CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,TA1
109860,CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,TA1


In [36]:
df_meta.set_index("Cell", inplace=True)
df_meta.head(3)

Unnamed: 0_level_0,cell_type
Cell,Unnamed: 1_level_1
CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,SPP1 Mac
CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,SPP1 Mac
CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,Lymphatic endo


In [37]:
len(df_meta["cell_type"].value_counts())

45

In [38]:
df_meta.to_csv(DIR2SAVE.joinpath("meta.tsv"), sep="\t")

In [39]:
adata_cdb.X.sum(axis=1)

matrix([[10000.001],
        [10000.001],
        [10000.001],
        ...,
        [10000.001],
        [10000.   ],
        [10000.001]], dtype=float32)