In [1]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../')
from utils import *

# scperturb package
sys.path.insert(1, '../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../figures/')

In [2]:
TEMPDIR = Path('/fast/scratch/users/peidlis_c/perturbation_resource_paper/')

In [4]:
sorted([file.name for file in (TEMPDIR / 'QinTape2023').glob('*')])

['CellChat-Follow-up_all-cells.csv.zip',
 'Count Matrices & CellRanger Reports',
 'Count Matrices & CellRanger Reports.zip',
 'DA_INTepi_fib.rds',
 'DA_INTepi_geno.rds',
 'INTepi.rds',
 'Signal-Perturbation_all-cells.csv.zip',
 'WENR-Permutation_all-cells.csv.zip',
 'WNT-EGF-Competition_all-cells.csv.zip',
 '__MACOSX',
 'aug21_WT.rds',
 'jan21_crctme_A.rds',
 'jan21_crctme_AK.rds',
 'jan21_crctme_AKP.rds',
 'jan21_crctme_WT.rds']

# scRNA-seq

In [83]:
files = sorted([file for file in (TEMPDIR / 'QinTape2023/Count Matrices & CellRanger Reports').glob('*') if 'DS_Store' not in file.name])

In [109]:
adatas = {}
for file in tqdm(files):
    adata = sc.read_10x_mtx(file)
    sample_name = file.name
    sample_number, sample_description = sample_name.split('_', 1)
    
    reg = re.match('[AKP]+|(WT)', sample_description)
    adata.obs['perturbation'] = reg.group() if reg else None
    adata.obs['perturbation_type'] = 'genotype'
    
    adata.obs['perturbation_2'] = 'Mac-Fib' if 'Mac-Fib' in sample_description else 'Mac' if 'Mac' in sample_description else 'Fib' if 'Fib' in sample_description else None
    adata.obs['perturbation_type_2'] = 'coculture'
    
    adata.obs['sample_number']=sample_number
    adatas[sample_name] = adata
    if int(sample_number) >= 5:
        # stop processing, additional controls with other conditions.
        # we focus on the core datasets (1-19)
        break

 17%|█▋        | 4/24 [01:41<08:28, 25.41s/it]


In [123]:
adata = sc.concat(adatas, label='batch')

In [124]:
adata.obs['disease'] = "colorectal cancer"
adata.obs['cancer'] = True
adata.obs['tissue_type']="cell_line"
adata.obs["cell_line"] = "CRC organoid"
adata.obs["celltype"] = 'colon epithelial cell'
adata.obs['organism'] = 'mouse'
adata.obs['nperts'] = (~adata.obs.perturbation.isna()*1 + ~adata.obs.perturbation_2.isna()*1)
annotate_qc(adata, species='mouse')
adata.obs.index.name = 'cell_barcode'

In [125]:
adata.obs

Unnamed: 0_level_0,perturbation,perturbation_type,perturbation_2,perturbation_type_2,sample_number,batch,disease,cancer,tissue_type,cell_line,celltype,organism,nperts,ncounts,ngenes,percent_mito,percent_ribo
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACGAAGTACTCAAC-1,WT,genotype,,coculture,01,01_WT_1,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,63470.0,5616,0.975264,7.704427
AAACGCTGTCAACCAT-1,WT,genotype,,coculture,01,01_WT_1,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,53907.0,5292,0.931233,2.090638
AAACGCTTCCTTGACC-1,WT,genotype,,coculture,01,01_WT_1,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,22710.0,3636,0.400705,1.083223
AAAGAACCACTGCATA-1,WT,genotype,,coculture,01,01_WT_1,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,14462.0,3439,1.023372,20.495090
AAAGGATAGGCATGCA-1,WT,genotype,,coculture,01,01_WT_1,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,29846.0,3851,1.239697,4.784561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAGTCAGCC-1,A,genotype,,coculture,05,05_A,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,3577.0,1730,5.591278,16.354486
TTTGGTTCATGACTAC-1,A,genotype,,coculture,05,05_A,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,782.0,367,28.260870,12.915601
TTTGGTTTCACGATCA-1,A,genotype,,coculture,05,05_A,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,16557.0,3232,0.869723,8.304645
TTTGTTGCAATTGCAC-1,A,genotype,,coculture,05,05_A,colorectal cancer,True,cell_line,CRC organoid,colon epithelial cell,human,1,569.0,391,0.351494,23.022846


# Cytof (TODO)

In [52]:
# cytof
tab = pd.read_csv(TEMPDIR / 'QinTape2023/Signal-Perturbation_all-cells.csv')
tab.index = [f'{ci}_{b}' for ci, b in zip(tab.Cell_Index, tab.condition)]

In [178]:
# build adata
features = [c for c in tab.columns[8:62] if 'Barcode' not in c]
adata = sc.AnnData(tab[features], 
                   obs=tab[['batch', 'barcode','genotype', 'media', 'replicate', 'culture', 'condition', 'ligand', 'inhibitor', 'A', 'K']])
adata.var_names = features
adata.var['channel'] = [x.split('_', 1)[0] for x in adata.var_names]
adata.var_names = [x.split('_', 1)[1] for x in adata.var_names]  # remove channel from name
adata.obsm['cytof_details'] = tab[['Cell_Index', 'Time', 'Event_length', 'Center', 'Width', 'Residual', 'Offset', 'Amplitude', 'bc_separation_dist', 'mahalanobis_dist']]
adata.obsm['barcodes'] = tab[[c for c in tab.columns[8:62] if 'Barcode' in c]]

# harmonize metadata
adata.obs = adata.obs.rename({'genotype': 'perturbation_3', 
                  'ligand': 'perturbation_2', 
                  'inhibitor': 'perturbation'}, axis=1).drop(['media', 'culture', 'condition', 'A', 'K'], axis=1)
adata.obs = adata.obs[["perturbation", "perturbation_2", "perturbation_3", "batch", "barcode", "replicate"]]
adata.obs['perturbation_type'] = 'drug'
adata.obs['perturbation_type_2'] = 'ligand'
adata.obs['perturbation_type_3'] = 'genotype'
adata.obs['perturbation'][adata.obs.perturbation=='Ctrl'] = 'control'
adata.obs['perturbation_2'][adata.obs.perturbation_2=='Ctrl'] = 'control'
adata.obs['perturbation_3'][adata.obs.perturbation_3=='WT'] = 'control'
adata.obs['nperts'] = (adata.obs.perturbation != 'control')*1 + (adata.obs.perturbation_2 != 'control')*1 + (adata.obs.perturbation_3 != 'control')*1

In [185]:
tab = pd.read_csv(TEMPDIR / 'QinTape2023/WENR-Permutation_all-cells.csv')
tab.index = [f'{ci}_{b}' for ci, b in zip(tab.Cell_Index, tab.condition)]

In [197]:
# build adata
features = [c for c in tab.columns[3:61] if 'Barcode' not in c]
adata = sc.AnnData(tab[features], 
                   obs=tab[['batch', 'replicate', 'condition', 'culture', 'format','media','genotype']])
adata.var_names = features
adata.var['channel'] = [x.split('_', 1)[0] for x in adata.var_names]
adata.var_names = [x.split('_', 1)[1] for x in adata.var_names]  # remove channel from name
adata.obsm['cytof_details'] = tab[['Cell_Index', 'Time', 'Event_length', 'Center', 'Width', 'Residual', 'Offset', 'bc_separation_dist', 'mahalanobis_dist']]
adata.obsm['barcodes'] = tab[[c for c in tab.columns[3:61] if 'Barcode' in c]]

In [198]:
adata.obs

Unnamed: 0,batch,replicate,condition,culture,format,media,genotype
1_K-Fib_E_Org_2,B1,2,K-Fib_E_Org_2,K-Fib_E,co-culture,E,K
2_K-Fib_E_Org_2,B1,2,K-Fib_E_Org_2,K-Fib_E,co-culture,E,K
3_K-Fib_E_Org_2,B1,2,K-Fib_E_Org_2,K-Fib_E,co-culture,E,K
4_K-Fib_E_Org_2,B1,2,K-Fib_E_Org_2,K-Fib_E,co-culture,E,K
5_K-Fib_E_Org_2,B1,2,K-Fib_E_Org_2,K-Fib_E,co-culture,E,K
...,...,...,...,...,...,...,...
13639_KP-Fib_N_Org_1,B1,1,KP-Fib_N_Org_1,KP-Fib_N,co-culture,N,KP
13640_KP-Fib_N_Org_1,B1,1,KP-Fib_N_Org_1,KP-Fib_N,co-culture,N,KP
13641_KP-Fib_N_Org_1,B1,1,KP-Fib_N_Org_1,KP-Fib_N,co-culture,N,KP
13642_KP-Fib_N_Org_1,B1,1,KP-Fib_N_Org_1,KP-Fib_N,co-culture,N,KP


In [196]:
adata.obs['format'].unique()

array(['co-culture', 'monoculture'], dtype=object)