In [1]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler goatools gseapy scperturb chembl_webresource_client biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyopenssl 23.0.0 requires cryptography<40,>=38.0.0, but you have cryptography 41.0.5 which is incompatible.[0m[31m
[0m

In [2]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../../')
from utils import *

# scperturb package
sys.path.insert(1, '../../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../../figures/')

In [3]:
TEMPDIR = Path('/scratch/peidli/scPerturb/')
DATADIR = Path('/home/peidli/data/scPerturb/')

In [4]:
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [30]:
files

['GSE213511_inVivo_OP3_ckit_14d_1.h5',
 'GSE213511_DM_CITEseq-1_NA_NM_1.h5',
 'GSE213511_DM_Test1_NM_6d_1.h5',
 'GSE213511_LSK_OP1_NM_9d_1.h5',
 'GSE213511_inVivo_OP4_lin-_14d_1.h5',
 'GSE213511_DM_OP3_NM_6d_1.h5',
 'GSE213511_inVivo_OP2_ckit_14d_1.h5',
 'GSE213511_LSK_OP2_NM_7d_1.h5',
 'GSE213511_DM_OP2_NM_6d_2.h5',
 'GSE213511_DM_CITEseq-2_NA_NM_1.h5',
 'GSE213511_DM_OP1_NM_6d_1.h5',
 'GSE213511_DM_OP2_NM_6d_3.h5',
 'GSE213511_LSK_OP3_NM_9d_1.h5',
 'GSE213511_inVivo_NTC_lin-andckit_14d_1.h5',
 'GSE213511_DM_OP1_NM_6d_2.h5',
 'GSE213511_LSK_OP1_NM_7d_1.h5',
 'GSE213511_inVivo_OP3_lin-_14d_1.h5',
 'GSE213511_LSK_OP2_NM_9d_1.h5',
 'GSE213511_DM_OP5_NM_6d_1.h5',
 'GSE213511_inVivo_OP1_lin-_28d_1.h5',
 'GSE213511_DM_OP2_NM_6d_1.h5',
 'GSE213511_inVivo_OP4_ckit_14d_1.h5',
 'GSE213511_LSK_OP4_NM_7d_1.h5',
 'GSE213511_DM_OP0_NM_6d_1.h5',
 'GSE213511_LSK_OP3_NM_7d_1.h5',
 'GSE213511_DM_Test2_NM_6d_1.h5',
 'GSE213511_inVivo_OP1_lin-_28d_2.h5',
 'GSE213511_LSK_OP4_NM_9d_1.h5',
 'GSE213511_inViv

In [24]:
# maps experiments to file identifiers
sample_dict = {
    'leukemia': 'DM',
    'invivo': 'inVivo',
    'exvivo': 'LSK'
}
files = [x.name for x in (TEMPDIR / 'LaraAstiasoHuntly2023').glob('*.h5')]

def merge_data(key):
    identifier = sample_dict[key]
    files_ = [x for x in files if f'_{identifier}_' in x]
    annot = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / f'GSE213511_CellAnnotation_{key}.tsv.gz', sep='\t', index_col=0)
    adatas = {}
    for file in tqdm(files_):
        tempdata = sc.read_10x_h5(TEMPDIR / 'LaraAstiasoHuntly2023' / file)
        tempdata.var_names_make_unique()
        adatas[file.replace('GSE213511_', '').replace('.h5', '')] = tempdata
    adata = sc.concat(adatas, label='Sample', index_unique='-')
    # make indiced unique
    adata.obs.index = [x.replace('-1-', '-') for x in adata.obs.index]
    annot.index = [f'{x.split("-")[0]}-{sample}' for x, sample in zip(annot.index, annot.Sample)]
    # merge annotation
    if key=='invivo':
        annot = annot[[x in adata.obs.index for x in annot.index]]  # samples ['inVivo_OP2_Lin-_28d_1', 'inVivo_OP3_Lin-_28d_1'] are missing in the data
    adata.obs = pd.concat([adata.obs, annot], axis=1)
    return adata

def harmonize_data(adata, key):
    # harmonize
    adata.var.index.name = 'gene_symbol'
    adata.obs.index.name = 'cell_barcode'
    adata.obs['organism'] = 'Mus musculus'
    adata.obs['disease'] = 'leukemia' if key=='leukemia' else 'healthy'
    adata.obs['cancer'] = key=='leukemia'
    adata.obs['perturbation_type'] = 'CRISPR-cas9'
    adata.obs['tissue_type'] = 'primary'
    adata.obs['tissue'] = 'bone marrow transplant'
    
    adata.obsm['X_umap'] = adata.obs[['UMAP1', 'UMAP2']].values
    adata.obs = adata.obs.loc[:, ~adata.obs.columns.duplicated(keep='first')]  # remove duplicated "Sample" column
    adata.obs['perturbation'] = np.array([None if pd.isna(x) else 'control' if x[:3]=='NTC' else x.split('_')[0] for x in adata.obs.Guide])
    adata.obs.rename({
        'Sample': 'sample',
        'Phase': 'cellcycle_phase',
        'Clusters': 'celltype',
        'Mixscape': 'Mixscape_classification',
        'mixscape': 'Mixscape_classification',
        'Guide': 'guide_id',
        'Timepoint': 'time'
    }, axis=1, inplace=True)
    # reorder
    order = ['perturbation', 'guide_id', 'sample', 'cellcycle_phase', 'Mixscape_classification',
       'celltype', 'organism', 'disease', 'cancer',
       'perturbation_type', 'tissue_type', 'tissue']
    if 'time' in adata.obs.columns: order = ['time'] + order 
    adata.obs = adata.obs[order]
    return adata

In [25]:
adata = merge_data('leukemia')
bdata = harmonize_data(adata, 'leukemia')

100%|██████████| 13/13 [00:30<00:00,  2.37s/it]


In [26]:
bdata.obs

Unnamed: 0_level_0,perturbation,guide_id,sample,cellcycle_phase,Mixscape_classification,celltype,organism,disease,cancer,perturbation_type,tissue_type,tissue
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACCCAAGCAGCCCT-DM_CITEseq-1_NA_NM_1,,,DM_CITEseq-1_NA_NM_1,,,,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGCGCCTTG-DM_CITEseq-1_NA_NM_1,,,DM_CITEseq-1_NA_NM_1,,,,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGGAGAGTA-DM_CITEseq-1_NA_NM_1,,,DM_CITEseq-1_NA_NM_1,G2M,,LSC,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGGATTTAG-DM_CITEseq-1_NA_NM_1,,,DM_CITEseq-1_NA_NM_1,,,,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGGGCCAAT-DM_CITEseq-1_NA_NM_1,,,DM_CITEseq-1_NA_NM_1,G1,,LSC,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTATTTCGG-DM_OP4_NM_6d_1,Kdm6a,Kdm6a_AS_45253,DM_OP4_NM_6d_1,G1,NP,LSC,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGGTCTTCATT-DM_OP4_NM_6d_1,Stag2,Stag2_B_13072,DM_OP4_NM_6d_1,S,KO,LSC,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGGTTTCAGAC-DM_OP4_NM_6d_1,Hmgxb4,Hmgxb4_R2.BR_36203,DM_OP4_NM_6d_1,G2M,NP,LSC,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGTCGCCGATG-DM_OP4_NM_6d_1,,,DM_OP4_NM_6d_1,,,,Mus musculus,leukemia,True,CRISPR-cas9,primary,bone marrow transplant


In [27]:
adata = merge_data('invivo')
cdata = harmonize_data(adata, 'invivo')

100%|██████████| 12/12 [00:20<00:00,  1.67s/it]


In [28]:
cdata.obs

Unnamed: 0_level_0,time,perturbation,guide_id,sample,cellcycle_phase,Mixscape_classification,celltype,organism,disease,cancer,perturbation_type,tissue_type,tissue
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCCAAGCTAAACA-inVivo_OP3_ckit_14d_1,,,,inVivo_OP3_ckit_14d_1,,,,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGCTCTATG-inVivo_OP3_ckit_14d_1,,,,inVivo_OP3_ckit_14d_1,,,,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
AAACCCAAGGCACTAG-inVivo_OP3_ckit_14d_1,,,,inVivo_OP3_ckit_14d_1,,,,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
AAACCCACACAGTGAG-inVivo_OP3_ckit_14d_1,,,,inVivo_OP3_ckit_14d_1,,,,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
AAACCCACACTCACTC-inVivo_OP3_ckit_14d_1,14d,Prmt5,Prmt5_B_24454,inVivo_OP3_ckit_14d_1,G2M,NP,Gran. P,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCAAAGAAC-inVivo_OP1_lin-_14d_1,14d,,,inVivo_OP1_lin-_14d_1,G1,,MEP (G1),Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGTCACCTTGC-inVivo_OP1_lin-_14d_1,14d,Kmt2d,Kmt2d_BR_72116,inVivo_OP1_lin-_14d_1,G2M,KO,MEP,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGTCATTTGTC-inVivo_OP1_lin-_14d_1,,,,inVivo_OP1_lin-_14d_1,,,,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant
TTTGTTGTCGCGTAGC-inVivo_OP1_lin-_14d_1,14d,Smarcd1,Smarcd1_AS_15166,inVivo_OP1_lin-_14d_1,G2M,KO,MEP,Mus musculus,healthy,False,CRISPR-cas9,primary,bone marrow transplant


In [29]:
adata = merge_data('exvivo')
ddata = harmonize_data(adata, 'exvivo')

100%|██████████| 9/9 [00:30<00:00,  3.36s/it]


In [47]:
ddata.obs

Unnamed: 0,Sample,Phase,Guide,mixscape,Timepoint,Sample.1,UMAP1,UMAP2,Clusters
AAACCCAAGACCTCCG-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Rcor1_AS_21752,KO,9d,LSK_OP1_NM_9d_1,2.876267,0.598122,GMP
AAACCCAAGCTGGTGA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,,,9d,LSK_OP1_NM_9d_1,2.258468,-2.574814,GMP
AAACCCAAGCTTCGTA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,S,Rbbp4_BR_14486,KO,9d,LSK_OP1_NM_9d_1,5.954404,2.318502,GMP (late)
AAACCCAAGTTTAGGA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Mbd2_AS_41068,NP,9d,LSK_OP1_NM_9d_1,3.115234,1.101170,GMP
AAACCCACAGGTTCGC-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Mbd3_AS_41065,NP,9d,LSK_OP1_NM_9d_1,3.339583,0.542644,GMP
...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGGTCTTA-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,S,,,7d,LSK_OP0_NM_7d_1,5.890830,0.523673,GMP (late)
TTTGTTGTCAGTCATG-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G1,,,7d,LSK_OP0_NM_7d_1,1.998967,-1.380838,GMP
TTTGTTGTCCCGTTGT-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G1,Smarcd1_AS_15166,NP,7d,LSK_OP0_NM_7d_1,-5.249389,1.900537,HSC
TTTGTTGTCGGAAGGT-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G2M,Smarcd2_BR_45401,KO,7d,LSK_OP0_NM_7d_1,-2.424002,-0.357827,EBMP
