In [1]:
import pyreadr
import scanpy as sc
import numpy as np
import os
import pandas as pd
import anndata as ad
from pathlib import Path

* Load datasets 

In [2]:
# Load the count matrices of every cohort used in this notebook.
def _read_h5ad_unique(h5ad_path):
    """Read an .h5ad file and make its variable names unique."""
    loaded = sc.read_h5ad(h5ad_path)
    loaded.var_names_make_unique()
    return loaded


# Lee2020: SMC cohort
adata_smc = _read_h5ad_unique(
    "/data/BCI-CRC/SO/data/Lee2020_CRC_scRNAseq/scanpy_all2/SMC.h5ad"
)

# Lee2020: KUL cohort
adata_kul = _read_h5ad_unique(
    "/data/BCI-CRC/SO/data/Lee2020_CRC_scRNAseq/scanpy_all2/KUL.h5ad"
)

# Pelka2021
adata_pelka = _read_h5ad_unique(
    "/data/BCI-CRC/SO/data/Pelka2021_CRC_scRNAseq/Pelka_counts.h5ad"
)

# Che2021 is stored as a 10x matrix directory rather than an .h5ad file
adata_che2021 = sc.read_10x_mtx("/data/BCI-CRC/SO/data/Che2021_CRC-LM_scRNAseq/")
adata_che2021.var_names_make_unique()

# Wu2022
adata_wu2022 = _read_h5ad_unique(
    "/data/BCI-CRC/nasrine/data/CRC/Wu_Yingcheng_2021_cancer_discovery/Wu2021_counts.h5ad"
)

#### Create metadata for the Che2021 dataset

In [3]:
# Split every cell name on "_" once; field 1 is stored as the patient and
# field 2 as the tissue.
barcode_fields = list(adata_che2021.obs.index.str.split("_"))
adata_che2021.obs["Patient"] = [fields[1] for fields in barcode_fields]
adata_che2021.obs["Tissue"] = [fields[2] for fields in barcode_fields]

# Patients labelled "NAC" (neoadjuvant chemotherapy, preoperative chemotherapy);
# every other patient is labelled "naive".
nac_patients = ["COL15", "COL17", "COL18"]
adata_che2021.obs["Therapy"] = np.where(
    adata_che2021.obs["Patient"].isin(nac_patients),
    "NAC",
    "naive",
)

In [4]:
# Load the annotations produced by a previous analysis and attach them to
# adata_che2021.obs as "Cell_type" (broad) and "Cell_subtype" (fine).
CHE2021_ANNOT_DIR = Path("/data/BCI-CRC/nasrine/data/CRC/Che2021_CRC-LM_scRNAseq")


def _read_che2021_annotation(filename):
    """Read one tab-separated annotation table from ``CHE2021_ANNOT_DIR``.

    The first row is used as the header and the first column as the index;
    the index is what the tables are joined on against ``adata_che2021.obs``.

    Parameters
    ----------
    filename : str
        Name of the annotation file inside ``CHE2021_ANNOT_DIR``.

    Returns
    -------
    pandas.DataFrame
        The table exactly as read (columns are not renamed here).
    """
    return pd.read_csv(CHE2021_ANNOT_DIR / filename, header=0, index_col=0, sep="\t")


# Broad annotation: its "Annotation" column becomes "Cell_type"
gen_annot = _read_che2021_annotation(
    "Che2021_CRC_LM_clustering_annotations.txt"
).rename(columns={"Annotation": "Cell_type"})

# Left join on the index keeps every cell of adata_che2021.
# validate="many_to_one" makes pandas raise a MergeError if the annotation
# table contains a duplicated index value; without it a duplicate would
# silently multiply rows and the result would no longer match the AnnData.
adata_che2021.obs = adata_che2021.obs.merge(
    right=gen_annot,
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
)

# Fine-grained annotations, one table per compartment
tcell_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_TNKILC_clustering_annotations.txt"
)
cDC1_migDC_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_cDC1_migDC_annotations.txt"
)
myeloid_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_myeloid_clustering_annotations.txt"
)
stromal_annot_che2021 = _read_che2021_annotation(
    "Che2021_CRC_LM_stromal_clustering_annotations.txt"
)

# Stack the compartment tables row-wise; "Annotation" becomes "Cell_subtype"
complete_annot_che2021 = pd.concat(
    [
        tcell_annot_che2021,
        cDC1_migDC_annot_che2021,
        myeloid_annot_che2021,
        stromal_annot_che2021,
    ],
    axis=0,
).rename(columns={"Annotation": "Cell_subtype"})

# Same guarded left join: a cell listed in more than one compartment table
# now fails here with an explicit error instead of duplicating rows.
adata_che2021.obs = adata_che2021.obs.merge(
    right=complete_annot_che2021,
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
)

# Cells without a fine-grained label fall back to their broad "Cell_type"
adata_che2021.obs["Cell_subtype"] = np.where(
    adata_che2021.obs["Cell_subtype"].isna(),
    adata_che2021.obs["Cell_type"],
    adata_che2021.obs["Cell_subtype"],
)

### Isolate primary CRC samples only

* SMC

In [5]:
# Keep only the SMC cells whose Class is "Tumor", as an independent copy
smc_is_tumor = adata_smc.obs["Class"] == "Tumor"
adata_smc_crc = adata_smc[smc_is_tumor].copy()
print(adata_smc_crc.shape)
print(set(adata_smc_crc.obs["Class"]))

(47285, 33694)
{'Tumor'}


* KUL

In [6]:
# Keep only the KUL cells whose Class is "Tumor", as an independent copy
kul_is_tumor = adata_kul.obs["Class"] == "Tumor"
adata_kul_crc = adata_kul[kul_is_tumor].copy()
print(adata_kul_crc.shape)
print(set(adata_kul_crc.obs["Class"]))

(8254, 33694)
{'Tumor'}


* Pelka

In [7]:
# Keep every Pelka cell whose HistologicTypeSimple is not "Normal colon",
# as an independent copy. The printed set lists the histologies that remain.
pelka_not_normal = adata_pelka.obs["HistologicTypeSimple"] != "Normal colon"
adata_pelka_crc = adata_pelka[pelka_not_normal].copy()
print(adata_pelka_crc.shape)
print(set(adata_pelka_crc.obs.HistologicTypeSimple))

(257251, 43113)
{'Adenocarcinoma;Mucinous;Neuroendocrine', 'Adenocarcinoma;Medullary (with solid growth pattern)', 'Adenocarcinoma', 'Adenocarcinoma;Mucinous', 'Medullary'}


* Che2021

In [8]:
# Keep only the Che2021 cells whose Tissue is "CRC", as an independent copy
che2021_is_crc = adata_che2021.obs["Tissue"] == "CRC"
adata_che2021_crc = adata_che2021[che2021_is_crc].copy()
print(adata_che2021_crc.shape)
print(set(adata_che2021_crc.obs["Tissue"]))

(55735, 33694)
{'CRC'}


* Wu2022

In [9]:
# Keep only the Wu2022 cells whose Tissue is "CRC", as an independent copy
wu2022_is_crc = adata_wu2022.obs["Tissue"] == "CRC"
adata_wu2022_crc = adata_wu2022[wu2022_is_crc].copy()
print(adata_wu2022_crc.shape)
print(set(adata_wu2022_crc.obs["Tissue"]))

(29856, 20610)
{'CRC'}


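* Check that the expression matrices contain raw counts (a large maximum value indicates the data have not been normalised or log-transformed)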

In [10]:
# Largest value in the SMC tumour matrix
adata_smc_crc.X.max()

26489.0

In [11]:
# Largest value in the KUL tumour matrix
adata_kul_crc.X.max()

21283.0

In [12]:
# Largest value in the Pelka matrix (pelka has raw data)
adata_pelka_crc.X.max()

59020.0

In [13]:
# Largest value in the Che2021 CRC matrix
adata_che2021_crc.X.max()

22428.0

In [14]:
# Largest value in the Wu2022 CRC matrix
adata_wu2022_crc.X.max()

43586.0

### Add the columns of interest and rename existing ones so that the metadata is consistent across all datasets in the workflow
* Sample
* Patient
* Tissue
* Therapy
* Cell_type
* Cell_subtype

In [15]:
# Display the per-cell metadata table (.obs) of the SMC tumour subset
adata_smc_crc.obs

Unnamed: 0_level_0,Patient,Class,Sample,Cell_type,Cell_subtype,Dataset
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SMC01-T_AAACCTGCATACGCCG,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,SMC
SMC01-T_AAACCTGGTCGCATAT,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,SMC
SMC01-T_AAACCTGTCCCTTGCA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,SMC
SMC01-T_AAACGGGAGGGAAACA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,SMC
SMC01-T_AAACGGGGTATAGGTA,SMC01,Tumor,SMC01-T,Epithelial cells,CMS2,SMC
...,...,...,...,...,...,...
SMC25-T_TTTGGTTCAACGCACC,SMC25,Tumor,SMC25-T,B cells,IgG+ Plasma,SMC
SMC25-T_TTTGTCAAGCGCTCCA,SMC25,Tumor,SMC25-T,B cells,CD19+CD20+ B,SMC
SMC06-T_TCTTCGGCAAACAACA,SMC06,Tumor,SMC06-T,Mast cells,Mast cells,SMC
SMC07-T_TGAGAGGGTTTAGGAA,SMC07,Tumor,SMC07-T,Mast cells,Mast cells,SMC


In [16]:
# Display the per-cell metadata table (.obs) of the KUL tumour subset
adata_kul_crc.obs

Unnamed: 0_level_0,Patient,Class,Sample,Cell_type,Cell_subtype,Dataset
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KUL01-T_AAACCTGGTCTTTCAT,KUL01,Tumor,KUL01-T,Epithelial cells,CMS1,KUL
KUL01-T_AAACGGGTCGGTTAAC,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3,KUL
KUL01-T_AAAGATGGTATAGGGC,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3,KUL
KUL01-T_AAAGATGGTGGCCCTA,KUL01,Tumor,KUL01-T,Epithelial cells,CMS1,KUL
KUL01-T_AAAGCAAGTAAACACA,KUL01,Tumor,KUL01-T,Epithelial cells,CMS3,KUL
...,...,...,...,...,...,...
KUL28-T_GGATTACAGAAACCAT,KUL28,Tumor,KUL28-T,Mast cells,Mast cells,KUL
KUL28-T_TCATTTGGTTGTCTTT,KUL28,Tumor,KUL28-T,Mast cells,Mast cells,KUL
KUL28-T_TCCACACAGTCATGCT,KUL28,Tumor,KUL28-T,Mast cells,Mast cells,KUL
KUL30-T_CATATTCGTCTTCAAG,KUL30,Tumor,KUL30-T,Mast cells,Mast cells,KUL


In [17]:
# Show the AnnData summary (dimensions and field names) of the Che2021 CRC subset
adata_che2021_crc

AnnData object with n_obs × n_vars = 55735 × 33694
    obs: 'Patient', 'Tissue', 'Therapy', 'Cell_type', 'Cell_subtype'
    var: 'gene_ids', 'feature_types'

In [18]:
# Show the AnnData summary (dimensions and field names) of the Wu2022 CRC subset
adata_wu2022_crc

AnnData object with n_obs × n_vars = 29856 × 20610
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'patient', 'tissue', 'patient_tissue', 'tissueunique', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'doublet', 'RNA_snn_res.1', 'main_cell_type', 'sub_cell_type', 'chemo', 'Therapy', 'Patient', 'Tissue'

In [19]:
# Show the AnnData summary (dimensions and field names) of the Pelka subset
adata_pelka_crc

AnnData object with n_obs × n_vars = 257251 × 43113
    obs: 'SPECIMEN_TYPE', 'SOURCE_HOSPITAL', 'TISSUE_PROCESSING_TEAM', 'PROCESSING_TYPE', 'SINGLECELL_TYPE', 'PatientTypeID', 'HistologicTypeSimple', 'MMR_IHC', 'MMRStatus', 'MLH1Status', 'MMRMLH1Tumor', 'TissueSite_detailed', 'TissueSiteSimple', 'HistologicGrade_detailed', 'HistologicGradeSimple', 'TumorStage', 'NodeStatus_detailed', 'NodeStatusSimple', 'MetastasisStatus', 'TumorSize', 'SizeQuantile', 'PID', 'Sex', 'Age', 'Ethnicity', 'Race', 'batchID', 'clTopLevel', 'clMidwayPr', 'cl295v11SubShort', 'cl295v11SubFull'
    var: 'gene_ids', 'feature_types', 'genome'

In [20]:
# NOTE(review): this cell repeats the preceding display of adata_pelka_crc
adata_pelka_crc

AnnData object with n_obs × n_vars = 257251 × 43113
    obs: 'SPECIMEN_TYPE', 'SOURCE_HOSPITAL', 'TISSUE_PROCESSING_TEAM', 'PROCESSING_TYPE', 'SINGLECELL_TYPE', 'PatientTypeID', 'HistologicTypeSimple', 'MMR_IHC', 'MMRStatus', 'MLH1Status', 'MMRMLH1Tumor', 'TissueSite_detailed', 'TissueSiteSimple', 'HistologicGrade_detailed', 'HistologicGradeSimple', 'TumorStage', 'NodeStatus_detailed', 'NodeStatusSimple', 'MetastasisStatus', 'TumorSize', 'SizeQuantile', 'PID', 'Sex', 'Age', 'Ethnicity', 'Race', 'batchID', 'clTopLevel', 'clMidwayPr', 'cl295v11SubShort', 'cl295v11SubFull'
    var: 'gene_ids', 'feature_types', 'genome'

In [21]:
# add Tissue, Therapy columns for SMC, KUL
adata_smc_crc.obs["Tissue"] = "CRC"
adata_smc_crc.obs["Therapy"] = "naive"
adata_smc_crc.obs.rename(columns={"Dataset": "cell_source"}, inplace=True)

adata_kul_crc.obs["Tissue"] = "CRC"
adata_kul_crc.obs["Therapy"] = "naive"
adata_kul_crc.obs.rename(columns={"Dataset": "cell_source"}, inplace=True)

# change column names for wu2022
adata_wu2022_crc.obs.rename(columns={"main_cell_type": "Cell_type"}, inplace=True)
adata_wu2022_crc.obs.rename(columns={"sub_cell_type": "Cell_subtype"}, inplace=True)
# create sample column
adata_wu2022_crc.obs.rename(columns={"orig.ident": "Sample"}, inplace=True)

# change column names for pelka and Tissue, Therapy columns
adata_pelka_crc.obs.rename(columns={"clTopLevel": "Cell_type"}, inplace=True)
adata_pelka_crc.obs.rename(columns={"cl295v11SubFull": "Cell_subtype"}, inplace=True)
adata_pelka_crc.obs["Tissue"] = "CRC"
adata_pelka_crc.obs["Therapy"] = "naive"
# create sample column
adata_pelka_crc.obs.rename(columns={"batchID": "Sample"}, inplace=True)
# rename to patient
adata_pelka_crc.obs.rename(columns={"PatientTypeID": "Patient"}, inplace=True)


# create sample column for che2021
adata_che2021_crc.obs["Sample"] = (
    adata_che2021_crc.obs["Patient"].astype("str")
    + "_"
    + adata_che2021_crc.obs["Tissue"].astype("str")
)

In [22]:
# Show the SMC object's summary to check its .obs columns
adata_smc_crc

AnnData object with n_obs × n_vars = 47285 × 33694
    obs: 'Patient', 'Class', 'Sample', 'Cell_type', 'Cell_subtype', 'cell_source', 'Tissue', 'Therapy'

In [23]:
# Show the KUL object's summary to check its .obs columns
adata_kul_crc

AnnData object with n_obs × n_vars = 8254 × 33694
    obs: 'Patient', 'Class', 'Sample', 'Cell_type', 'Cell_subtype', 'cell_source', 'Tissue', 'Therapy'

In [24]:
# Show the Wu2022 object's summary to check its .obs columns
adata_wu2022_crc

AnnData object with n_obs × n_vars = 29856 × 20610
    obs: 'Sample', 'nCount_RNA', 'nFeature_RNA', 'patient', 'tissue', 'patient_tissue', 'tissueunique', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'doublet', 'RNA_snn_res.1', 'Cell_type', 'Cell_subtype', 'chemo', 'Therapy', 'Patient', 'Tissue'

In [25]:
# Show the Pelka object's summary to check its .obs columns
adata_pelka_crc

AnnData object with n_obs × n_vars = 257251 × 43113
    obs: 'SPECIMEN_TYPE', 'SOURCE_HOSPITAL', 'TISSUE_PROCESSING_TEAM', 'PROCESSING_TYPE', 'SINGLECELL_TYPE', 'Patient', 'HistologicTypeSimple', 'MMR_IHC', 'MMRStatus', 'MLH1Status', 'MMRMLH1Tumor', 'TissueSite_detailed', 'TissueSiteSimple', 'HistologicGrade_detailed', 'HistologicGradeSimple', 'TumorStage', 'NodeStatus_detailed', 'NodeStatusSimple', 'MetastasisStatus', 'TumorSize', 'SizeQuantile', 'PID', 'Sex', 'Age', 'Ethnicity', 'Race', 'Sample', 'Cell_type', 'clMidwayPr', 'cl295v11SubShort', 'Cell_subtype', 'Tissue', 'Therapy'
    var: 'gene_ids', 'feature_types', 'genome'

In [26]:
# Show the Che2021 object's summary to check its .obs columns
adata_che2021_crc

AnnData object with n_obs × n_vars = 55735 × 33694
    obs: 'Patient', 'Tissue', 'Therapy', 'Cell_type', 'Cell_subtype', 'Sample'
    var: 'gene_ids', 'feature_types'

### save data to file

In [27]:
# Directory that receives the per-dataset .h5ad files written in the next cell
DIR2SAVE = Path("/data/BCI-CRC/nasrine/data/CRC/Primary_CRC_dataset/raw/")

In [28]:
# Create the output directory if needed, so the writes below cannot fail
# on a missing folder after all the upstream work has been done.
DIR2SAVE.mkdir(parents=True, exist_ok=True)

# Output file name -> AnnData object to write (one file per dataset)
crc_outputs = {
    "Lee2020_SMC_CRC_raw.h5ad": adata_smc_crc,  # smc
    "Lee2020_KUL_CRC_raw.h5ad": adata_kul_crc,  # kul
    "Pelka2021_CRC_raw.h5ad": adata_pelka_crc,  # pelka
    "Che2021_CRC_raw.h5ad": adata_che2021_crc,  # che2021
    "Wu2022_CRC_raw.h5ad": adata_wu2022_crc,  # wu2022
}

for output_name, adata_to_save in crc_outputs.items():
    adata_to_save.write(DIR2SAVE.joinpath(output_name))