In [1]:
import scanpy as sc
import pandas as pd
from pathlib import Path
import anndata as ad
import numpy as np
import os

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

DPI = 300
FONTSIZE = 20  # 42

sc.settings.set_figure_params(
    scanpy=True, dpi=100, transparent=True, vector_friendly=True, dpi_save=DPI
)
from matplotlib import rcParams

rcParams["pdf.fonttype"] = 42

In [2]:
DIR2LOAD = Path(
    "/data/BCI-CRC/nasrine/data/CRC/Metastatic_CRC_LM_dataset/final_object/"
)

In [3]:
DIR2SAVE = Path(
    "/data/BCI-CRC/nasrine/data/CRC/spatial/CRC_LM_VISIUM/CRC_LM_VISIUM_04_08_09_11/cell2location/prepareInputRNA/"
)
DIR2SAVE.mkdir(parents=True, exist_ok=True)

### Load raw data with TME annotations

In [4]:
adata_raw_tme = sc.read_h5ad(
    DIR2LOAD.joinpath(
        "Multiome_Che_Wu_CRC_LM_integrated_scvi_hvg_final_annotations_raw.h5ad"
    )
)
adata_raw_tme.shape

(98312, 37102)

In [5]:
adata_raw_tme.obs.Annotation_scVI.value_counts()

T-NK-ILC       75489
Myeloid        17232
B               2116
Plasma          1610
Stromal          786
Endothelial      557
Hepatocyte       437
Mast              85
Name: Annotation_scVI, dtype: int64

In [6]:
adata_raw_tme.obs.Annotation_scVI_detailed.value_counts()

CD4 Th             10325
CD8 Tem             8683
Treg                7655
CD8 Tex             7531
CD4 Th HSP          7246
CD8 Tem HSP         6178
CD4 Tn              4778
SPP1 Mac            4619
MAIT                4463
CD8 Tem GZMB        3269
NK1                 3178
CD4 Tfh             3010
Neutrophil          2892
NK2                 2321
B                   2116
NLRP3 Mac           2075
C1QC Mac            2006
CD4 Th17            1937
T cycling           1774
gdT                 1761
Plasma              1610
cDC2                1543
NKT                 1380
FCN1 Mono            999
Kupffer              600
PLTP LYVE1 Mac       574
IL1B Mac             550
Myeloid cycling      513
ECM CAF              441
HSP Mono             387
Hepatocyte           338
Tip-like endo        273
pDC                  239
cDC1                 163
SMC                  143
SEC                  120
Myofibroblast        101
Cholangiocyte         99
Stalk-like endo       91
Mast                  85


In [7]:
# check that it is raw
print(adata_raw_tme.X[0:5, 0:5].todense())
print(np.max(adata_raw_tme.X))

[[0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.99976325 2.         0.        ]
 [0.         0.         0.         1.         0.        ]]
40676.0


### Load sam annotation for epithelial cells

In [8]:
# load sam annotation for epithelial cells
adata_epi = sc.read_h5ad(
    "/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/Epithelial_scvi_annotations.h5ad"
)
adata_epi.shape

(16009, 2000)

In [9]:
adata_epi.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,RNA.weight,ATAC.weight,wsnn_res.0.5,S_score,G2M_score,phase,cell_cycle_diff,_scvi_batch,_scvi_labels,Cell_subtype
CRC01_LM#AACATAGCACTATGGC-1,SeuratProject,1957,1239,CRC01_LM,CRC01,Treated,LM,7.485,2784,9.197752,...,0.471572,0.528428,16,-0.168722,-0.097826,G1,-0.070896,0,0,REC
CRC01_LM#AACCTCCTCCAGCACA-1,SeuratProject,8024,4208,CRC01_LM,CRC01,Treated,LM,9.830,3301,2.243270,...,0.748984,0.251016,16,0.101411,-0.073933,S,0.175343,0,0,TA1
CRC01_LM#AACGCCCAGCTGGAAA-1,SeuratProject,15306,6289,CRC01_LM,CRC01,Treated,LM,6.133,19026,3.397361,...,0.399255,0.600745,16,-0.164477,-0.055053,G1,-0.109424,0,0,TA1
CRC01_LM#AACGGTAAGCCTGGTA-1,SeuratProject,1696,1246,CRC01_LM,CRC01,Treated,LM,4.892,8107,2.299528,...,0.904033,0.095967,2,-0.228819,-0.031942,G1,-0.196877,0,0,TA1
CRC01_LM#AACTAGTGTACTGAAT-1,SeuratProject,2389,1614,CRC01_LM,CRC01,Treated,LM,6.059,9649,1.925492,...,0.629178,0.370822,1,-0.239844,-0.069891,G1,-0.169953,0,0,REC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1,SeuratProject,1777,1425,CRC15_LM,CRC15,Treated,LM,5.043,58441,1.800788,...,0.603569,0.396431,2,-0.057693,0.234292,G2M,-0.291984,0,0,TA1
CRC15_LM#TTTCCGGGTTCTTTAG-1,SeuratProject,816,594,CRC15_LM,CRC15,Treated,LM,6.816,6892,7.107843,...,0.665964,0.334036,2,-0.012289,0.189379,G2M,-0.201668,0,0,Hypoxia
CRC15_LM#TTTGGCTGTTAACGAT-1,SeuratProject,5431,3043,CRC15_LM,CRC15,Treated,LM,9.613,4860,5.155588,...,0.693385,0.306615,2,0.107016,0.539908,G2M,-0.432892,0,0,TA1
CRC15_LM#TTTGTCCCAGGAATCG-1,SeuratProject,4131,2402,CRC15_LM,CRC15,Treated,LM,6.215,5786,4.671992,...,0.857876,0.142124,11,-0.074574,-0.077883,G1,0.003309,0,0,TA1


In [10]:
# column for the annotations
adata_epi.obs.Cell_subtype.value_counts()

TA1                4116
Hypoxia            2481
Stem NOTUM         1933
Colonocyte         1913
Stem               1400
Intermediate       1319
REC                 998
UPR                 759
iREC                469
Goblet              323
TA2                 162
Enteroendocrine      71
Tuft                 65
Name: Cell_subtype, dtype: int64

#### get raw counts for epithelial (from multiome)

In [11]:
# read whole multiome data
adata_multiome_raw = sc.read_h5ad(
    "/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/CRCLM_decon_scvi_RAW.h5ad"
)

In [12]:
adata_multiome_raw.shape

(23119, 36485)

In [13]:
adata_multiome_raw.obs.Cell_type.value_counts()  # there is epithelial!

Epithelial     17774
Myeloid         1936
T/NK/ILC        1604
Stromal          675
Endothelial      494
Hepatocyte       474
B                162
Name: Cell_type, dtype: int64

In [14]:
# isolate only annotated epithelial cells from Sam's annotations
adata_epi_raw = adata_multiome_raw[
    adata_multiome_raw.obs.index.isin(adata_epi.obs.index)
].copy()

In [15]:
adata_epi_raw.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,Cell_type,nCount_Peaks,nFeature_Peaks,nCount_ATAC,nFeature_ATAC,nCount_integratedRNADecon,nFeature_integratedRNADecon,RNA.weight,ATAC.weight,wsnn_res.0.5
CRC01_LM#AACATAGCACTATGGC-1,SeuratProject,1957,1239,CRC01_LM,CRC01,NAC,LM,7.485,2784,9.197752,...,Epithelial,1040,1023,724,376,1675.916173,1213,0.471572,0.528428,16
CRC01_LM#AACCTCCTCCAGCACA-1,SeuratProject,8024,4208,CRC01_LM,CRC01,NAC,LM,9.830,3301,2.243270,...,Epithelial,1654,1620,1000,531,7597.421457,4131,0.748984,0.251016,16
CRC01_LM#AACGCCCAGCTGGAAA-1,SeuratProject,15306,6289,CRC01_LM,CRC01,NAC,LM,6.133,19026,3.397361,...,Epithelial,4962,4657,3476,1712,13876.588890,6189,0.399255,0.600745,16
CRC01_LM#AACGGTAAGCCTGGTA-1,SeuratProject,1696,1246,CRC01_LM,CRC01,NAC,LM,4.892,8107,2.299528,...,Epithelial,1324,1299,954,486,1334.581036,1172,0.904033,0.095967,2
CRC01_LM#AACTAGTGTACTGAAT-1,SeuratProject,2389,1614,CRC01_LM,CRC01,NAC,LM,6.059,9649,1.925492,...,Epithelial,2205,2136,1418,745,2094.881992,1554,0.629178,0.370822,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1,SeuratProject,1777,1425,CRC15_LM,CRC15,NAC,LM,5.043,58441,1.800788,...,Epithelial,7415,6766,6275,3034,1429.192266,1392,0.603569,0.396431,2
CRC15_LM#TTTCCGGGTTCTTTAG-1,SeuratProject,816,594,CRC15_LM,CRC15,NAC,LM,6.816,6892,7.107843,...,Epithelial,1205,1174,1025,530,725.042091,569,0.665964,0.334036,2
CRC15_LM#TTTGGCTGTTAACGAT-1,SeuratProject,5431,3043,CRC15_LM,CRC15,NAC,LM,9.613,4860,5.155588,...,Epithelial,1852,1794,1889,965,4793.664042,2954,0.693385,0.306615,2
CRC15_LM#TTTGTCCCAGGAATCG-1,SeuratProject,4131,2402,CRC15_LM,CRC15,NAC,LM,6.215,5786,4.671992,...,Epithelial,1173,1144,954,513,3072.652178,2316,0.857876,0.142124,11


In [16]:
# append annotations to raw epithelial counts anndata
adata_epi_raw.obs = adata_epi_raw.obs.merge(
    right=adata_epi.obs[["Cell_subtype"]], how="left", left_index=True, right_index=True
)

In [17]:
adata_epi_raw.shape

(16009, 36485)

In [18]:
# check that it is raw
print(adata_epi_raw.X[0:5, 0:5].todense())
print(np.max(adata_epi_raw.X))

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
4195.273


In [19]:
adata_epi_raw.obs.Cell_subtype.value_counts()

TA1                4116
Hypoxia            2481
Stem NOTUM         1933
Colonocyte         1913
Stem               1400
Intermediate       1319
REC                 998
UPR                 759
iREC                469
Goblet              323
TA2                 162
Enteroendocrine      71
Tuft                 65
Name: Cell_subtype, dtype: int64

In [20]:
# rename sam column to epi_annot
adata_epi_raw.obs.rename(
    columns={"Cell_subtype": "Annotation_scVI_detailed"}, inplace=True
)

In [21]:
# rename sam column to epi_annot
adata_epi_raw.obs.rename(columns={"Cell_type": "Annotation_scVI"}, inplace=True)

In [22]:
# add cell_source column
adata_epi_raw.obs["cell_source"] = "BCI-Nuclei"

In [23]:
adata_epi_raw.obs.head(5)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,nFeature_Peaks,nCount_ATAC,nFeature_ATAC,nCount_integratedRNADecon,nFeature_integratedRNADecon,RNA.weight,ATAC.weight,wsnn_res.0.5,Annotation_scVI_detailed,cell_source
CRC01_LM#AACATAGCACTATGGC-1,SeuratProject,1957,1239,CRC01_LM,CRC01,NAC,LM,7.485,2784,9.197752,...,1023,724,376,1675.916173,1213,0.471572,0.528428,16,REC,BCI-Nuclei
CRC01_LM#AACCTCCTCCAGCACA-1,SeuratProject,8024,4208,CRC01_LM,CRC01,NAC,LM,9.83,3301,2.24327,...,1620,1000,531,7597.421457,4131,0.748984,0.251016,16,TA1,BCI-Nuclei
CRC01_LM#AACGCCCAGCTGGAAA-1,SeuratProject,15306,6289,CRC01_LM,CRC01,NAC,LM,6.133,19026,3.397361,...,4657,3476,1712,13876.58889,6189,0.399255,0.600745,16,TA1,BCI-Nuclei
CRC01_LM#AACGGTAAGCCTGGTA-1,SeuratProject,1696,1246,CRC01_LM,CRC01,NAC,LM,4.892,8107,2.299528,...,1299,954,486,1334.581036,1172,0.904033,0.095967,2,TA1,BCI-Nuclei
CRC01_LM#AACTAGTGTACTGAAT-1,SeuratProject,2389,1614,CRC01_LM,CRC01,NAC,LM,6.059,9649,1.925492,...,2136,1418,745,2094.881992,1554,0.629178,0.370822,1,REC,BCI-Nuclei


#### need to add -BCI-Nuclei at the end of the barcodes so that it matches in our concat object

In [24]:
adata_epi_raw.obs.index = adata_epi_raw.obs.index + "-BCI-Nuclei"

In [25]:
adata_epi_raw.obs.head(5)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,nFeature_Peaks,nCount_ATAC,nFeature_ATAC,nCount_integratedRNADecon,nFeature_integratedRNADecon,RNA.weight,ATAC.weight,wsnn_res.0.5,Annotation_scVI_detailed,cell_source
CRC01_LM#AACATAGCACTATGGC-1-BCI-Nuclei,SeuratProject,1957,1239,CRC01_LM,CRC01,NAC,LM,7.485,2784,9.197752,...,1023,724,376,1675.916173,1213,0.471572,0.528428,16,REC,BCI-Nuclei
CRC01_LM#AACCTCCTCCAGCACA-1-BCI-Nuclei,SeuratProject,8024,4208,CRC01_LM,CRC01,NAC,LM,9.83,3301,2.24327,...,1620,1000,531,7597.421457,4131,0.748984,0.251016,16,TA1,BCI-Nuclei
CRC01_LM#AACGCCCAGCTGGAAA-1-BCI-Nuclei,SeuratProject,15306,6289,CRC01_LM,CRC01,NAC,LM,6.133,19026,3.397361,...,4657,3476,1712,13876.58889,6189,0.399255,0.600745,16,TA1,BCI-Nuclei
CRC01_LM#AACGGTAAGCCTGGTA-1-BCI-Nuclei,SeuratProject,1696,1246,CRC01_LM,CRC01,NAC,LM,4.892,8107,2.299528,...,1299,954,486,1334.581036,1172,0.904033,0.095967,2,TA1,BCI-Nuclei
CRC01_LM#AACTAGTGTACTGAAT-1-BCI-Nuclei,SeuratProject,2389,1614,CRC01_LM,CRC01,NAC,LM,6.059,9649,1.925492,...,2136,1418,745,2094.881992,1554,0.629178,0.370822,1,REC,BCI-Nuclei


### Need to merge raw TME annotations with raw epithelial annotations

In [26]:
def format_adata(adata):
    """Select only relevant info for merging datasets, assumes raw is in .X"""
    return ad.AnnData(
        X=adata.X.copy(),  # take raw counts
        obs=adata.obs[
            [
                "Patient",
                "Sample",
                "Tissue",
                "Therapy",
                "cell_source",
                "Annotation_scVI",
                "Annotation_scVI_detailed",
            ]
        ],
        var=adata.var  # [
        # [
        #   "n_cells_by_counts",
        #  "total_counts",
        #  "mean_counts",
        # "pct_dropout_by_counts",
        # ]
        # ],
    )

In [27]:
adata_tme_format = format_adata(adata_raw_tme)
adata_tme_format

AnnData object with n_obs × n_vars = 98312 × 37102
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed'
    var: 'n_cells'

In [28]:
adata_epi_format = format_adata(adata_epi_raw)
adata_epi_format

AnnData object with n_obs × n_vars = 16009 × 36485
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed'

In [29]:
np.max(adata_tme_format.X)  # check data is raw

40676.0

In [30]:
np.max(adata_epi_format.X)  # check data is raw

4195.273

### Merge TME and epi datasets into single adata 

In [31]:
adata_liver = adata_tme_format.concatenate(
    adata_epi_format,
    batch_key="Annotation_source",
    batch_categories=["Nasrine", "Sam"],
    join="outer",
    index_unique=None,  # Provide `None` to keep existing indices.
)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [32]:
adata_liver.shape

(114321, 42739)

In [33]:
assert adata_liver.shape[0] == adata_tme_format.shape[0] + adata_epi_format.shape[0]

In [34]:
adata_liver

AnnData object with n_obs × n_vars = 114321 × 42739
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed', 'Annotation_source'
    var: 'n_cells-Nasrine'

In [35]:
# set dtype of .obs variables to category
adata_liver.obs["Tissue"] = adata_liver.obs["Tissue"].astype("category")
adata_liver.obs["Therapy"] = adata_liver.obs["Therapy"].astype("category")
adata_liver.obs["cell_source"] = adata_liver.obs["cell_source"].astype("category")
adata_liver.obs["Annotation_scVI"] = adata_liver.obs["Annotation_scVI"].astype(
    "category"
)
adata_liver.obs["Annotation_scVI_detailed"] = adata_liver.obs[
    "Annotation_scVI_detailed"
].astype("category")
adata_liver.obs["Patient"] = adata_liver.obs["Patient"].astype("category")
adata_liver.obs["Sample"] = adata_liver.obs["Sample"].astype("category")
adata_liver.obs["Annotation_source"] = adata_liver.obs["Annotation_source"].astype(
    "category"
)

In [36]:
adata_liver.obs

Unnamed: 0,Patient,Sample,Tissue,Therapy,cell_source,Annotation_scVI,Annotation_scVI_detailed,Annotation_source
CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Myeloid,SPP1 Mac,Nasrine
CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Myeloid,SPP1 Mac,Nasrine
CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Lymphatic endo,Nasrine
CRC01_LM#AGGTTGCGTGAGCAAG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Tip-like endo,Nasrine
CRC01_LM#CACCTCAGTGGTTATG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Tip-like endo,Nasrine
...,...,...,...,...,...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam
CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,Hypoxia,Sam
CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam
CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam


In [37]:
adata_liver.obs.Annotation_scVI.value_counts()

T-NK-ILC       75489
Myeloid        17232
Epithelial     16009
B               2116
Plasma          1610
Stromal          786
Endothelial      557
Hepatocyte       437
Mast              85
Name: Annotation_scVI, dtype: int64

In [38]:
adata_liver.obs.Annotation_scVI_detailed.value_counts()

CD4 Th             10325
CD8 Tem             8683
Treg                7655
CD8 Tex             7531
CD4 Th HSP          7246
CD8 Tem HSP         6178
CD4 Tn              4778
SPP1 Mac            4619
MAIT                4463
TA1                 4116
CD8 Tem GZMB        3269
NK1                 3178
CD4 Tfh             3010
Neutrophil          2892
Hypoxia             2481
NK2                 2321
B                   2116
NLRP3 Mac           2075
C1QC Mac            2006
CD4 Th17            1937
Stem NOTUM          1933
Colonocyte          1913
T cycling           1774
gdT                 1761
Plasma              1610
cDC2                1543
Stem                1400
NKT                 1380
Intermediate        1319
FCN1 Mono            999
REC                  998
UPR                  759
Kupffer              600
PLTP LYVE1 Mac       574
IL1B Mac             550
Myeloid cycling      513
iREC                 469
ECM CAF              441
HSP Mono             387
Hepatocyte           338


In [39]:
adata_liver[adata_liver.obs.Annotation_scVI_detailed.isna()]

View of AnnData object with n_obs × n_vars = 0 × 42739
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed', 'Annotation_source'
    var: 'n_cells-Nasrine'

In [40]:
adata_liver[adata_liver.obs.Annotation_scVI.isna()]

View of AnnData object with n_obs × n_vars = 0 × 42739
    obs: 'Patient', 'Sample', 'Tissue', 'Therapy', 'cell_source', 'Annotation_scVI', 'Annotation_scVI_detailed', 'Annotation_source'
    var: 'n_cells-Nasrine'

### create "technology" column for single cell vs single nuclei

In [41]:
adata_liver.obs["technology"] = np.where(
    adata_liver.obs.cell_source.str.contains("Nuclei"), "Nuclei", "Cells"
)

In [42]:
adata_liver.obs

Unnamed: 0,Patient,Sample,Tissue,Therapy,cell_source,Annotation_scVI,Annotation_scVI_detailed,Annotation_source,technology
CRC01_LM#AAACATGCATCAGCAC-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Myeloid,SPP1 Mac,Nasrine,Nuclei
CRC01_LM#AACATAGCAGGATAAC-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Myeloid,SPP1 Mac,Nasrine,Nuclei
CRC01_LM#AGAACAAGTGGTTATG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Lymphatic endo,Nasrine,Nuclei
CRC01_LM#AGGTTGCGTGAGCAAG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Tip-like endo,Nasrine,Nuclei
CRC01_LM#CACCTCAGTGGTTATG-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Endothelial,Tip-like endo,Nasrine,Nuclei
...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,Hypoxia,Sam,Nuclei
CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei


In [43]:
adata_liver.obs.technology.value_counts()

Cells     93286
Nuclei    21035
Name: technology, dtype: int64

### CAREFUL: NEED RAW COUNTS (INTEGER) for reference data in cell2location because assumed to follow a Gamma Poisson distribution

You need to get raw/untransformed/unnormalised counts. It is generally a good practice to keep that data in the course of analysis. Some normalisation workflows can be undone, for example:

`normalised_data = data / total_per_cell * 10000
lognormalised_data = log(normalised_data + 1)`

can be undone as

`normalised_data = exp(lognormalised_data) - 1
data = normalised_data / 10000 * total_per_cell
data = data.astype(int) # make integer`

In [44]:
adata_liver.X

<114321x42739 sparse matrix of type '<class 'numpy.float32'>'
	with 153254687 stored elements in Compressed Sparse Row format>

In [45]:
adata_liver.X.astype(int)[0:5, 0:5].todense()

matrix([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 2, 0],
        [0, 0, 0, 1, 0]])

In [46]:
round(adata_liver.X)[0:5, 0:5].todense()

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 1., 2., 0.],
        [0., 0., 0., 1., 0.]], dtype=float32)

In [47]:
adata_liver.write(DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_annotations_raw_decontx.h5ad"))

In [52]:
adata_liver[adata_liver.obs.cell_source=='BCI-Nuclei'].X[0:10, 0:10].todense() # see it is not count

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.99976325, 2.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.99991417],
        [0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.    

In [47]:
# get count data, round to nearest integer
adata_liver.X = round(adata_liver.X)

In [48]:
adata_liver.X[0:5, 0:5].todense()

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 1., 2., 0.],
        [0., 0., 0., 1., 0.]], dtype=float32)

### filter genes

In [49]:
adata_liver.shape

(114321, 42739)

In [50]:
sc.pp.filter_genes(adata_liver, min_cells=1)

In [51]:
adata_liver.shape

(114321, 39559)

In [52]:
adata_liver.obs[adata_liver.obs["Annotation_scVI"] == "Epithelial"]

Unnamed: 0,Patient,Sample,Tissue,Therapy,cell_source,Annotation_scVI,Annotation_scVI_detailed,Annotation_source,technology
CRC01_LM#AACATAGCACTATGGC-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Epithelial,pEMT,Sam,Nuclei
CRC01_LM#AACCTCCTCCAGCACA-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC01_LM#AACGCCCAGCTGGAAA-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC01_LM#AACGGTAAGCCTGGTA-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC01_LM#AACTAGTGTACTGAAT-1-BCI-Nuclei,CRC01,CRC01_LM,LM,NAC,BCI-Nuclei,Epithelial,pEMT,Sam,Nuclei
...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTATGGAGTTTCCGC-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC15_LM#TTTCCGGGTTCTTTAG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,Hypoxia,Sam,Nuclei
CRC15_LM#TTTGGCTGTTAACGAT-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei
CRC15_LM#TTTGTCCCAGGAATCG-1-BCI-Nuclei,CRC15,CRC15_LM,LM,NAC,BCI-Nuclei,Epithelial,TA1,Sam,Nuclei


### save to file 

In [53]:
adata_liver.write(DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_annotations_raw.h5ad"))

In [4]:
adata_liver = sc.read_h5ad(
    DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_annotations_raw.h5ad")
)

In [5]:
adata_liver.obs.cell_source.value_counts()

Wu-Cells      49302
Che-Cells     43984
BCI-Nuclei    21035
Name: cell_source, dtype: int64

In [8]:
adata_liver[adata_liver.obs.cell_source.isin(["Wu-Cells", "Che-Cells"])].obs.Patient.unique()

['COL07', 'COL12', 'COL15', 'COL16', 'COL17', ..., 'Wu2021_P15', 'Wu2021_P17', 'Wu2021_P18', 'Wu2021_P19', 'Wu2021_P20']
Length: 26
Categories (26, object): ['COL07', 'COL12', 'COL15', 'COL16', ..., 'Wu2021_P6', 'Wu2021_P7', 'Wu2021_P8', 'Wu2021_P9']

In [6]:
49302+43984

93286

In [55]:
adata_liver.obs[["Therapy", "Annotation_scVI_detailed"]].groupby(
    ["Therapy", "Annotation_scVI_detailed"]
).size()

Therapy  Annotation_scVI_detailed
NAC      B                           1248
         C1QC Mac                    1287
         C3 iCAF                       39
         CD4 Tfh                     1076
         CD4 Th                      7013
                                     ... 
naive    gdT                          251
         ipEMT                        381
         migDC                         45
         pDC                          133
         pEMT                         848
Length: 114, dtype: int64

In [56]:
temp = (
    adata_liver.obs[["Therapy", "Annotation_scVI_detailed"]]
    .groupby(["Therapy", "Annotation_scVI_detailed"])
    .size()
    .reset_index()
)
temp

Unnamed: 0,Therapy,Annotation_scVI_detailed,0
0,NAC,B,1248
1,NAC,C1QC Mac,1287
2,NAC,C3 iCAF,39
3,NAC,CD4 Tfh,1076
4,NAC,CD4 Th,7013
...,...,...,...
109,naive,gdT,251
110,naive,ipEMT,381
111,naive,migDC,45
112,naive,pDC,133


In [57]:
temp[temp["Therapy"] == "naive"]

Unnamed: 0,Therapy,Annotation_scVI_detailed,0
57,naive,B,868
58,naive,C1QC Mac,719
59,naive,C3 iCAF,32
60,naive,CD4 Tfh,1934
61,naive,CD4 Th,3312
62,naive,CD4 Th HSP,2883
63,naive,CD4 Th17,491
64,naive,CD4 Tn,2664
65,naive,CD8 Tem,2580
66,naive,CD8 Tem GZMB,1687


### save also annotations for naive 

In [58]:
adata_raw_naive = adata_liver[adata_liver.obs.Therapy == "naive"].copy()

In [59]:
adata_raw_naive.write(
    DIR2SAVE.joinpath("Multiome_Che_Wu_CRC_LM_annotations_raw_naive.h5ad")
)