In [1]:
import pyreadr
import scanpy as sc
import numpy as np
import os
import pandas as pd
import anndata as ad
from pathlib import Path

* Load datasets 

In [2]:
# Input locations (absolute paths on the shared storage).
multiome_raw_path = "/data/BCI-CRC/SO/data/CRC_multiome/scanpy/CRCLM_finalAnalysis/CRCLM_decon_scvi_RAW.h5ad"
multiome_annot_path = "/data/BCI-CRC/nasrine/data/CRC/multiome/CRC_LM_01_15/final_object/CRC_LM_decon_scvi_final_annotations.h5ad"
che2021_mtx_dir = "/data/BCI-CRC/SO/data/public/Che2021_CRC-LM_scRNAseq/"
wu2022_counts_path = "/data/BCI-CRC/nasrine/data/CRC/Wu_Yingcheng_2021_cancer_discovery/Wu2021_counts.h5ad"

# In-house multiome: the "*_RAW" object and the final annotated object.
adata_multiome_raw = sc.read_h5ad(multiome_raw_path)
# NOTE(review): adata_multiome_annot is not referenced again in the visible
# notebook -- confirm whether this (potentially large) load is still needed.
adata_multiome_annot = sc.read_h5ad(multiome_annot_path)

# Public datasets: Che2021 (10x mtx directory) and Wu2022 (h5ad).
adata_che2021 = sc.read_10x_mtx(che2021_mtx_dir)
adata_wu2022 = sc.read_h5ad(wu2022_counts_path)

# Make gene names unique for every object used downstream.
adata_multiome_raw.var_names_make_unique()
adata_che2021.var_names_make_unique()
adata_wu2022.var_names_make_unique()

#### For multiome
* select only annotated cells from final object
* get raw counts for them
* append annotations

In [3]:
# Show the summary of the multiome AnnData object (dimensions and obs/obsm keys).
adata_multiome_raw

AnnData object with n_obs × n_vars = 23119 × 36485
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5', 'X_scvi_batch', 'X_scvi_labels', 'leiden', 'Cell_type', 'nCount_Peaks', 'nFeature_Peaks', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_integratedRNADecon', 'nFeature_integratedRNADecon', 'RNA.weight', 'ATAC.weight', 'wsnn_res.0.5'
    obsm: 'X_ATAC', 'X_UMAP', 'X_WNN'

In [4]:
# Switch the multiome label "T/NK/ILC" to the "T-NK-ILC" spelling; every other
# label is left as it is.
# NOTE(review): the Che2021 Cell_type labels shown later in this notebook still
# use "T/NK/ILC" -- confirm whether they should follow the same convention.
multiome_cell_type = adata_multiome_raw.obs["Cell_type"]
is_tnkilc = multiome_cell_type == "T/NK/ILC"
adata_multiome_raw.obs["Cell_type"] = np.where(is_tnkilc, "T-NK-ILC", multiome_cell_type)

In [5]:
# Display the Cell_type column to check the relabelling above.
adata_multiome_raw.obs["Cell_type"]

CRC01_LM#AAACATGCATCAGCAC-1       Myeloid
CRC01_LM#AACATAGCACTATGGC-1    Epithelial
CRC01_LM#AACATAGCAGGATAAC-1       Myeloid
CRC01_LM#AACCTCCTCCAGCACA-1    Epithelial
CRC01_LM#AACGCCCAGCTGGAAA-1    Epithelial
                                  ...    
CRC15_LM#TTTGTCCCAGGAATCG-1    Epithelial
CRC15_LM#TTTGTCTAGGTCCACA-1      T-NK-ILC
CRC15_LM#TTTGTGAAGCATGAAG-1      T-NK-ILC
CRC15_LM#TTTGTGAAGGACTTAC-1    Epithelial
CRC15_LM#TTTGTGAAGGAGCATA-1      T-NK-ILC
Name: Cell_type, Length: 23119, dtype: object

In [6]:
# Distinct tissue labels present in the multiome data.
set(adata_multiome_raw.obs["Tissue"])

{'LM'}

In [7]:
# The multiome data has no subtype annotation for now, so every cell gets the
# same placeholder label in Cell_subtype.
adata_multiome_raw.obs["Cell_subtype"] = "No annotation"

In [8]:
# Display the obs table; the new Cell_subtype column appears as the last column.
adata_multiome_raw.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,nCount_Peaks,nFeature_Peaks,nCount_ATAC,nFeature_ATAC,nCount_integratedRNADecon,nFeature_integratedRNADecon,RNA.weight,ATAC.weight,wsnn_res.0.5,Cell_subtype
CRC01_LM#AAACATGCATCAGCAC-1,SeuratProject,981,727,CRC01_LM,CRC01,NAC,LM,9.078,1890,3.771662,...,454,450,305,162,926.971718,704,0.296429,0.703571,3,No annotation
CRC01_LM#AACATAGCACTATGGC-1,SeuratProject,1957,1239,CRC01_LM,CRC01,NAC,LM,7.485,2784,9.197752,...,1040,1023,724,376,1675.916173,1213,0.471572,0.528428,16,No annotation
CRC01_LM#AACATAGCAGGATAAC-1,SeuratProject,461,365,CRC01_LM,CRC01,NAC,LM,6.041,1923,1.301518,...,442,441,251,133,432.982580,344,0.512933,0.487067,3,No annotation
CRC01_LM#AACCTCCTCCAGCACA-1,SeuratProject,8024,4208,CRC01_LM,CRC01,NAC,LM,9.830,3301,2.243270,...,1654,1620,1000,531,7597.421457,4131,0.748984,0.251016,16,No annotation
CRC01_LM#AACGCCCAGCTGGAAA-1,SeuratProject,15306,6289,CRC01_LM,CRC01,NAC,LM,6.133,19026,3.397361,...,4962,4657,3476,1712,13876.588890,6189,0.399255,0.600745,16,No annotation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTGTCCCAGGAATCG-1,SeuratProject,4131,2402,CRC15_LM,CRC15,NAC,LM,6.215,5786,4.671992,...,1173,1144,954,513,3072.652178,2316,0.857876,0.142124,11,No annotation
CRC15_LM#TTTGTCTAGGTCCACA-1,SeuratProject,1110,806,CRC15_LM,CRC15,NAC,LM,10.594,1538,4.414414,...,355,352,356,192,974.865972,766,0.204801,0.795199,15,No annotation
CRC15_LM#TTTGTGAAGCATGAAG-1,SeuratProject,1142,810,CRC15_LM,CRC15,NAC,LM,5.428,9636,8.406305,...,1322,1280,1155,589,981.736507,776,0.503726,0.496274,6,No annotation
CRC15_LM#TTTGTGAAGGACTTAC-1,SeuratProject,3550,2256,CRC15_LM,CRC15,NAC,LM,4.999,7830,0.309859,...,1677,1645,1269,680,3377.198380,2188,0.520234,0.479766,5,No annotation


In [9]:
# Number of multiome cells per Cell_type label.
adata_multiome_raw.obs["Cell_type"].value_counts()

Epithelial     17774
Myeloid         1936
T-NK-ILC        1604
Stromal          675
Endothelial      494
Hepatocyte       474
B                162
Name: Cell_type, dtype: int64

In [10]:
# Preview the multiome subset without epithelial cells. The copy made here is
# only displayed; it is not assigned to a variable.
adata_multiome_raw[adata_multiome_raw.obs["Cell_type"] != "Epithelial"].copy()

AnnData object with n_obs × n_vars = 5345 × 36485
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5', 'X_scvi_batch', 'X_scvi_labels', 'leiden', 'Cell_type', 'nCount_Peaks', 'nFeature_Peaks', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_integratedRNADecon', 'nFeature_integratedRNADecon', 'RNA.weight', 'ATAC.weight', 'wsnn_res.0.5', 'Cell_subtype'
    obsm: 'X_ATAC', 'X_UMAP', 'X_WNN'

In [11]:
# Cell_type counts for the non-epithelial multiome cells.
keep_non_epithelial = adata_multiome_raw.obs["Cell_type"] != "Epithelial"
adata_multiome_raw[keep_non_epithelial].obs["Cell_type"].value_counts()

Myeloid        1936
T-NK-ILC       1604
Stromal         675
Endothelial     494
Hepatocyte      474
B               162
Name: Cell_type, dtype: int64

In [12]:
# Epithelial cells are excluded from the multiome data because Sam is
# annotating them; everything else is copied into adata_multiome_raw_liver.
multiome_not_epithelial = adata_multiome_raw.obs["Cell_type"] != "Epithelial"
adata_multiome_raw_liver = adata_multiome_raw[multiome_not_epithelial].copy()

In [13]:
# Cell_type counts of the retained multiome cells.
adata_multiome_raw_liver.obs["Cell_type"].value_counts()

Myeloid        1936
T-NK-ILC       1604
Stromal         675
Endothelial     494
Hepatocyte      474
B               162
Name: Cell_type, dtype: int64

#### create metadata for che2021 dataset 

In [14]:
# Derive per-cell metadata from the Che2021 barcodes: after splitting on "_",
# field 1 is stored as Patient and field 2 as Tissue
# (e.g. "AAACCTGAGAAACCTA_COL07_CRC" -> "COL07", "CRC").
barcode_fields = adata_che2021.obs.index.str.split("_")
adata_che2021.obs["Patient"] = [fields[1] for fields in barcode_fields]
adata_che2021.obs["Tissue"] = [fields[2] for fields in barcode_fields]

# COL15, COL17 and COL18 are labelled NAC (neoadjuvant chemotherapy,
# preoperative chemotherapy); every other patient is labelled naive.
nac_patients = ["COL15", "COL17", "COL18"]
received_nac = adata_che2021.obs["Patient"].isin(nac_patients)
adata_che2021.obs["Therapy"] = np.where(received_nac, "NAC", "naive")

In [15]:
# Attach the annotations from the previous Che2021 analysis to adata_che2021.obs:
#   * Cell_type    <- general annotation table
#   * Cell_subtype <- T/NK/ILC, cDC1/migDC, myeloid and stromal annotation tables,
#                     falling back to Cell_type where no subtype is listed
CHE2021_ANNOT_DIR = Path("/data/BCI-CRC/nasrine/data/CRC/Che2021_CRC-LM_scRNAseq")


def _read_che2021_annotation(filename):
    """Read one tab-separated annotation table from ``CHE2021_ANNOT_DIR``.

    Parameters
    ----------
    filename : str
        Name of the annotation file inside ``CHE2021_ANNOT_DIR``.

    Returns
    -------
    pandas.DataFrame
        The table with its first column as index and its first row as header.
    """
    return pd.read_csv(CHE2021_ANNOT_DIR / filename, header=0, index_col=0, sep="\t")


def _join_annotation(obs, annot):
    """Left-join ``annot`` onto ``obs`` by index and return the joined table.

    Columns of ``obs`` that ``annot`` also provides are dropped first, so that
    re-running the cell refreshes them instead of producing ``_x``/``_y``
    suffixed duplicates.

    Raises
    ------
    pandas.errors.MergeError
        If ``annot`` lists the same index label more than once (such a join
        would duplicate rows of ``obs``).
    """
    stale_columns = obs.columns.intersection(annot.columns)
    return obs.drop(columns=stale_columns).merge(
        right=annot,
        how="left",
        left_index=True,
        right_index=True,
        validate="many_to_one",
    )


# general annotation -> Cell_type
gen_annot = _read_che2021_annotation(
    "Che2021_CRC_LM_clustering_annotations.txt"
).rename(columns={"Annotation": "Cell_type"})
adata_che2021.obs = _join_annotation(adata_che2021.obs, gen_annot)

# per-compartment annotations -> Cell_subtype
tcell_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_TNKILC_clustering_annotations.txt"
)
cDC1_migDC_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_cDC1_migDC_annotations.txt"
)
myeloid_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_myeloid_clustering_annotations.txt"
)
stromal_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_stromal_clustering_annotations.txt"
)

complete_annot_che2021 = pd.concat(
    [
        tcell_annot_che2021,
        cDC1_migDC_annot_che2021,
        myeloid_annot_che2021,
        stromal_annot_che2021,
    ],
    axis=0,
).rename(columns={"Annotation": "Cell_subtype"})
adata_che2021.obs = _join_annotation(adata_che2021.obs, complete_annot_che2021)

# Cells without a subtype inherit their Cell_type; cells missing from both
# tables stay NaN in both columns.
subtype_missing = adata_che2021.obs["Cell_subtype"].isna()
adata_che2021.obs["Cell_subtype"] = np.where(
    subtype_missing,
    adata_che2021.obs["Cell_type"],
    adata_che2021.obs["Cell_subtype"],
)

In [16]:
# Display the Che2021 obs table with the Cell_type / Cell_subtype columns added above.
adata_che2021.obs

Unnamed: 0,Patient,Tissue,Therapy,Cell_type,Cell_subtype
AAACCTGAGAAACCTA_COL07_CRC,COL07,CRC,naive,Epithelial,Epithelial
AAACCTGAGACAATAC_COL07_CRC,COL07,CRC,naive,Myeloid,cDC2
AAACCTGAGACGCAAC_COL07_CRC,COL07,CRC,naive,B,B
AAACCTGAGCAGATCG_COL07_CRC,COL07,CRC,naive,T/NK/ILC,Treg
AAACCTGAGCTATGCT_COL07_CRC,COL07,CRC,naive,,
...,...,...,...,...,...
TTTGTCAGTTGGGACA_COL18_PBMC,COL18,PBMC,NAC,,
TTTGTCATCATGTGGT_COL18_PBMC,COL18,PBMC,NAC,,
TTTGTCATCCGCATCT_COL18_PBMC,COL18,PBMC,NAC,,
TTTGTCATCGGAATCT_COL18_PBMC,COL18,PBMC,NAC,,


### Isolate metastatic CRC LM samples only

* Multiome: nothing to do, it already contains only LM samples

In [17]:
# Distinct tissue labels in the retained multiome cells.
set(adata_multiome_raw_liver.obs["Tissue"])

{'LM'}

* Che2021

In [18]:
# Distinct tissue labels in Che2021.
set(adata_che2021.obs["Tissue"])

{'CRC', 'LM', 'PBMC'}

In [19]:
# Keep only the Che2021 cells whose Tissue is "LM", then print the resulting
# shape and the remaining tissue labels as a check.
che2021_is_lm = adata_che2021.obs["Tissue"] == "LM"
adata_che2021_liver = adata_che2021[che2021_is_lm].copy()
print(adata_che2021_liver.shape)
print(set(adata_che2021_liver.obs["Tissue"]))

(57596, 33694)
{'LM'}


In [20]:
# Display the obs table of the Che2021 LM subset.
adata_che2021_liver.obs

Unnamed: 0,Patient,Tissue,Therapy,Cell_type,Cell_subtype
AAACCTGAGACTAGGC_COL07_LM,COL07,LM,naive,Epithelial,Epithelial
AAACCTGAGAGTACAT_COL07_LM,COL07,LM,naive,cDC1/migDC,cDC1
AAACCTGAGAGTGACC_COL07_LM,COL07,LM,naive,pDC,pDC
AAACCTGAGGATCGCA_COL07_LM,COL07,LM,naive,Epithelial,Epithelial
AAACCTGAGGCTAGAC_COL07_LM,COL07,LM,naive,T/NK/ILC,MKI67 T
...,...,...,...,...,...
TTTGTCATCGGTCTAA_COL18_LM,COL18,LM,NAC,B,B
TTTGTCATCTCGCATC_COL18_LM,COL18,LM,NAC,Myeloid,Myeloid/T doublet
TTTGTCATCTCGGACG_COL18_LM,COL18,LM,NAC,T/NK/ILC,NK2
TTTGTCATCTGTGCAA_COL18_LM,COL18,LM,NAC,T/NK/ILC,Naive CD4 T


In [21]:
# Number of Che2021 LM cells per Cell_type label (value_counts leaves NaN out by default).
adata_che2021_liver.obs["Cell_type"].value_counts()

T/NK/ILC       39186
Myeloid         8285
Epithelial      3377
Plasma           826
B                788
Stromal          432
pDC              216
cDC1/migDC       178
Mast              91
Endothelial       89
Name: Cell_type, dtype: int64

In [22]:
# Epithelial cells are removed from Che2021 because Sam is not analysing them.
# NOTE(review): a missing (NaN) Cell_type does not compare equal to
# "Epithelial", so cells without any annotation are kept by this filter --
# confirm that retaining unannotated cells is intended.
che2021_is_epithelial = adata_che2021_liver.obs["Cell_type"] == "Epithelial"
adata_che2021_liver = adata_che2021_liver[~che2021_is_epithelial].copy()

* Wu2022

In [23]:
# Distinct tissue labels in Wu2022.
set(adata_wu2022.obs["Tissue"])

{'CRC', 'Colon_P', 'LM', 'LN', 'Liver_P', 'PBMC'}

In [24]:
# Keep only the Wu2022 cells whose Tissue is "LM", then print the resulting
# shape and the remaining tissue labels as a check.
wu2022_is_lm = adata_wu2022.obs["Tissue"] == "LM"
adata_wu2022_liver = adata_wu2022[wu2022_is_lm].copy()
print(adata_wu2022_liver.shape)
print(set(adata_wu2022_liver.obs["Tissue"]))

(55245, 20610)
{'LM'}


In [26]:
# Number of Wu2022 LM cells per main_cell_type label.
adata_wu2022_liver.obs["main_cell_type"].value_counts()

CD8        17168
CD4        13873
Myeloid     6523
MAIT        4845
NK          4626
NEU         3319
Treg        2779
B           1346
Plasma       766
Name: main_cell_type, dtype: int64

* check that they are raw 

In [27]:
# Largest value in the multiome expression matrix.
# NOTE(review): a large maximum suggests the matrix is not log-normalised, but
# it does not by itself show that the values are integer counts.
adata_multiome_raw_liver.X.max()

11459.0

In [28]:
# Largest value in the Che2021 expression matrix (same check as for the multiome).
adata_che2021_liver.X.max()

23832.0

In [29]:
# Largest value in the Wu2022 expression matrix (same check as for the multiome).
adata_wu2022_liver.X.max()

40676.0

### Add the columns of interest and rename existing ones so that all datasets are consistent in the workflow
* Sample
* Patient
* Tissue
* Therapy
* Cell_type
* Cell_subtype

In [30]:
# Display the multiome obs table to see which of the target columns already exist.
adata_multiome_raw_liver.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,Sample,Patient,Therapy,Tissue,TSSEnrichment,nFrags,percent.mt,...,nCount_Peaks,nFeature_Peaks,nCount_ATAC,nFeature_ATAC,nCount_integratedRNADecon,nFeature_integratedRNADecon,RNA.weight,ATAC.weight,wsnn_res.0.5,Cell_subtype
CRC01_LM#AAACATGCATCAGCAC-1,SeuratProject,981,727,CRC01_LM,CRC01,NAC,LM,9.078,1890,3.771662,...,454,450,305,162,926.971718,704,0.296429,0.703571,3,No annotation
CRC01_LM#AACATAGCAGGATAAC-1,SeuratProject,461,365,CRC01_LM,CRC01,NAC,LM,6.041,1923,1.301518,...,442,441,251,133,432.982580,344,0.512933,0.487067,3,No annotation
CRC01_LM#AACTCACAGAATGACG-1,SeuratProject,1710,1363,CRC01_LM,CRC01,NAC,LM,15.662,1683,2.456140,...,460,456,387,195,1552.642975,1301,0.382379,0.617621,3,No annotation
CRC01_LM#AGAACAAGTGGTTATG-1,SeuratProject,1220,983,CRC01_LM,CRC01,NAC,LM,5.246,10832,2.377049,...,1760,1713,1102,573,1170.950610,957,0.465358,0.534642,18,No annotation
CRC01_LM#AGGTTGCGTGAGCAAG-1,SeuratProject,10885,5086,CRC01_LM,CRC01,NAC,LM,7.196,8739,2.351860,...,2909,2801,1966,1019,10480.888601,5010,0.977986,0.022014,14,No annotation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC15_LM#TTTCCGGGTAGGTTGC-1,SeuratProject,982,709,CRC15_LM,CRC15,NAC,LM,8.245,1543,3.767821,...,277,273,291,154,911.653883,680,0.168251,0.831749,15,No annotation
CRC15_LM#TTTGCGGAGTGAACAG-1,SeuratProject,1033,747,CRC15_LM,CRC15,NAC,LM,11.347,2734,4.549855,...,964,930,1125,560,943.988647,714,0.821833,0.178167,6,No annotation
CRC15_LM#TTTGTCTAGGTCCACA-1,SeuratProject,1110,806,CRC15_LM,CRC15,NAC,LM,10.594,1538,4.414414,...,355,352,356,192,974.865972,766,0.204801,0.795199,15,No annotation
CRC15_LM#TTTGTGAAGCATGAAG-1,SeuratProject,1142,810,CRC15_LM,CRC15,NAC,LM,5.428,9636,8.406305,...,1322,1280,1155,589,981.736507,776,0.503726,0.496274,6,No annotation


In [31]:
# Display the Che2021 obs table to see which of the target columns already exist.
adata_che2021_liver.obs

Unnamed: 0,Patient,Tissue,Therapy,Cell_type,Cell_subtype
AAACCTGAGAGTACAT_COL07_LM,COL07,LM,naive,cDC1/migDC,cDC1
AAACCTGAGAGTGACC_COL07_LM,COL07,LM,naive,pDC,pDC
AAACCTGAGGCTAGAC_COL07_LM,COL07,LM,naive,T/NK/ILC,MKI67 T
AAACCTGAGTGCCAGA_COL07_LM,COL07,LM,naive,Myeloid,TAM SPP1
AAACCTGAGTGGTAGC_COL07_LM,COL07,LM,naive,T/NK/ILC,Naive CD4 T
...,...,...,...,...,...
TTTGTCATCGGTCTAA_COL18_LM,COL18,LM,NAC,B,B
TTTGTCATCTCGCATC_COL18_LM,COL18,LM,NAC,Myeloid,Myeloid/T doublet
TTTGTCATCTCGGACG_COL18_LM,COL18,LM,NAC,T/NK/ILC,NK2
TTTGTCATCTGTGCAA_COL18_LM,COL18,LM,NAC,T/NK/ILC,Naive CD4 T


In [32]:
# Display the Wu2022 obs table to see which of the target columns already exist.
adata_wu2022_liver.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,patient,tissue,patient_tissue,tissueunique,RNA_snn_res.0.5,seurat_clusters,RNA_snn_res.0.1,doublet,RNA_snn_res.1,main_cell_type,sub_cell_type,chemo,Therapy,Patient,Tissue
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
P11_Liver_T_TACCTGCTCAGCAATC,P11_Liver_T,2265,974,P11,Liver_T,P11_Liver_T,Liver_T,1,1,0,Singlet,3,CD4,CCR6+ Th17,pretreated,naive,Wu2021_P11,LM
P12_Liver_T_CATTGCCCATTCTCTA,P12_Liver_T,2919,1071,P12,Liver_T,P12_Liver_T,Liver_T,1,1,0,Singlet,3,CD4,CCR6+ Th17,pretreated,naive,Wu2021_P12,LM
P12_Liver_T_TCGAACAGTAACACGG,P12_Liver_T,2103,903,P12,Liver_T,P12_Liver_T,Liver_T,1,1,0,Singlet,3,CD4,CCL5+ CD4+ T cells,pretreated,naive,Wu2021_P12,LM
P12_Liver_T_GTCAGCGAGGAGATAG,P12_Liver_T,1580,765,P12,Liver_T,P12_Liver_T,Liver_T,1,1,0,Singlet,3,CD8,GZMK+ CD8+ T cells,pretreated,naive,Wu2021_P12,LM
P10_Liver_T2_GTTAGTGGTCGCATCG,P10_Liver_T2,3892,1368,P10,Liver_T2,P10_Liver_T2,Liver_T,2,2,0,Singlet,4,CD4,CCL5+ CD4+ T cells,pretreated,naive,Wu2021_P10,LM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P8_Liver_T_GAGAAATAGGCCTGAA,P8_Liver_T,17023,2624,P8,Liver_T,P8_Liver_T,Liver_T,5,5,3,Singlet,24,Plasma,IGHG+ Plasma B cells,pretreated,naive,Wu2021_P8,LM
P15_Liver_T_TTATTGCTCGCTAATG-1,P15_Liver_T,3453,903,P15,Liver_T,P15_Liver_T,Liver_T,5,5,3,Singlet,24,Plasma,IGHA+ Plasma B cells,treated-PD/SD,NAC,Wu2021_P15,LM
P17_Liver_T_AAATGGACATGGAAGC-1,P17_Liver_T,7002,1680,P17,Liver_T,P17_Liver_T,Liver_T,5,5,3,Singlet,24,Plasma,IGHG+ Plasma B cells,pretreated,naive,Wu2021_P17,LM
P18_Liver_T_TAACCAGCACTGCGAC-1,P18_Liver_T,2594,776,P18,Liver_T,P18_Liver_T,Liver_T,5,5,3,Singlet,24,Plasma,IGHA+ Plasma B cells,treated-PD/SD,NAC,Wu2021_P18,LM


In [33]:
# Wu2022: rename the existing columns to the shared column names.
wu2022_column_names = {
    "main_cell_type": "Cell_type",
    "sub_cell_type": "Cell_subtype",
    "orig.ident": "Sample",  # orig.ident is reused as the Sample column
}
adata_wu2022_liver.obs.rename(columns=wu2022_column_names, inplace=True)

# Che2021: build the Sample column as "<Patient>_<Tissue>".
che2021_patient = adata_che2021_liver.obs["Patient"].astype("str")
che2021_tissue = adata_che2021_liver.obs["Tissue"].astype("str")
adata_che2021_liver.obs["Sample"] = che2021_patient + "_" + che2021_tissue

In [34]:
# Show the multiome AnnData summary to check its obs columns.
adata_multiome_raw_liver

AnnData object with n_obs × n_vars = 5345 × 36485
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'Sample', 'Patient', 'Therapy', 'Tissue', 'TSSEnrichment', 'nFrags', 'percent.mt', 'percent.ribo', 'RNA_snn_res.0.5', 'seurat_clusters', 'integrated_snn_res.0.5', 'Clusters_all_cells_preDecon', 'Cell_type_preDecon', 'ident', 'decontX_contamination', 'decontX_clusters', 'integratedRNADecon_snn_res.0.5', 'X_scvi_batch', 'X_scvi_labels', 'leiden', 'Cell_type', 'nCount_Peaks', 'nFeature_Peaks', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_integratedRNADecon', 'nFeature_integratedRNADecon', 'RNA.weight', 'ATAC.weight', 'wsnn_res.0.5', 'Cell_subtype'
    obsm: 'X_ATAC', 'X_UMAP', 'X_WNN'

In [35]:
# Show the Wu2022 AnnData summary to check the renamed obs columns.
adata_wu2022_liver

AnnData object with n_obs × n_vars = 55245 × 20610
    obs: 'Sample', 'nCount_RNA', 'nFeature_RNA', 'patient', 'tissue', 'patient_tissue', 'tissueunique', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'doublet', 'RNA_snn_res.1', 'Cell_type', 'Cell_subtype', 'chemo', 'Therapy', 'Patient', 'Tissue'

In [36]:
# Show the Che2021 AnnData summary to check the new Sample column.
adata_che2021_liver

AnnData object with n_obs × n_vars = 54219 × 33694
    obs: 'Patient', 'Tissue', 'Therapy', 'Cell_type', 'Cell_subtype', 'Sample'
    var: 'gene_ids', 'feature_types'

In [37]:
# Final Cell_type counts for the multiome subset.
adata_multiome_raw_liver.obs["Cell_type"].value_counts()

Myeloid        1936
T-NK-ILC       1604
Stromal         675
Endothelial     494
Hepatocyte      474
B               162
Name: Cell_type, dtype: int64

In [38]:
# Final Cell_type counts for the Wu2022 subset.
adata_wu2022_liver.obs["Cell_type"].value_counts()

CD8        17168
CD4        13873
Myeloid     6523
MAIT        4845
NK          4626
NEU         3319
Treg        2779
B           1346
Plasma       766
Name: Cell_type, dtype: int64

In [39]:
# Final Cell_type counts for the Che2021 subset (value_counts leaves NaN out by default).
adata_che2021_liver.obs["Cell_type"].value_counts()

T/NK/ILC       39186
Myeloid         8285
Plasma           826
B                788
Stromal          432
pDC              216
cDC1/migDC       178
Mast              91
Endothelial       89
Name: Cell_type, dtype: int64

### save data to file

In [42]:
# Output directory for the three subsets; it is created (with any missing
# parent directories) when absent, and left alone when it already exists.
DIR2SAVE = Path("/data/BCI-CRC/nasrine/data/CRC/Metastatic_CRC_LM_dataset/raw/")
DIR2SAVE.mkdir(parents=True, exist_ok=True)

In [43]:
# Write each subset to its own .h5ad file inside DIR2SAVE.
multiome_h5ad = DIR2SAVE / "CRC_LM_decon_raw.h5ad"
che2021_h5ad = DIR2SAVE / "Che2021_CRC_LM_raw.h5ad"
wu2022_h5ad = DIR2SAVE / "Wu2022_CRC_LM_raw.h5ad"

adata_multiome_raw_liver.write(multiome_h5ad)  # our multiome
adata_che2021_liver.write(che2021_h5ad)  # che2021
adata_wu2022_liver.write(wu2022_h5ad)  # wu2022