In [1]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler goatools gseapy scperturb chembl_webresource_client biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

In [2]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../../')
from utils import *

# scperturb package
sys.path.insert(1, '../../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../../figures/')

In [3]:
TEMPDIR = Path('/scratch/peidli/scPerturb/')
DATADIR = Path('/home/peidli/data/scPerturb/')

In [4]:
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [37]:
# maps experiments to file identifiers
sample_dict = {
    'leukemia': 'DM',
    'invivo': 'inVivo',
    'exvivo': 'LSK'
}
files = [x.name for x in (TEMPDIR / 'LaraAstiasoHuntly2023').glob('*.h5')]

def merge_data(key):
    identifier = sample_dict[key]
    files_ = [x for x in files if f'_{identifier}_' in x]
    annot = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / f'GSE213511_CellAnnotation_{key}.tsv.gz', sep='\t', index_col=0)
    adatas = {}
    for file in tqdm(files_):
        tempdata = sc.read_10x_h5(TEMPDIR / 'LaraAstiasoHuntly2023' / file)
        tempdata.var_names_make_unique()
        adatas[file.replace('GSE213511_', '').replace('.h5', '')] = tempdata
    adata = sc.concat(adatas, label='Sample', index_unique='-')
    # make indiced unique
    adata.obs.index = [x.replace('-1-', '-') for x in adata.obs.index]
    annot.index = [f'{x.split("-")[0]}-{sample}' for x, sample in zip(annot.index, annot.Sample)]
    # merge annotation
    adata.obs = pd.concat([adata.obs, annot], axis=1)
    return adata

def harmonize_data(adata, key):
    # harmonize
    adata.var.index.name = 'gene_symbol'
    adata.obs.index.name = 'cell_barcode'
    adata.obs['organism'] = 'Mus musculus'
    adata.obs['disease'] = 'leukemia' if key=='leukemia' else 'healthy'
    adata.obs['cancer'] = key=='leukemia'
    adata.obs['perturbation_type'] = 'CRISPR-cas9'
    adata.obs['tissue_type'] = 'primary'
    adata.obs['tissue'] = 'bone marrow transplant'
    
    adata.obsm['X_umap'] = adata.obs[['UMAP1', 'UMAP2']].values
    adata.obs = adata.obs.loc[:, ~adata.obs.columns.duplicated(keep='first')]  # remove duplicated "Sample" column
    adata.obs['perturbation'] = np.array([None if pd.isna(x) else 'control' if x[:3]=='NTC' else x.split('_')[0] for x in adata.obs.Guide])
    adata.obs.rename({
        'Sample': 'sample',
        'Phase': 'cellcycle_phase',
        'Clusters': 'celltype',
        'Mixscape': 'Mixscape_classification',
        'mixscape': 'Mixscape_classification',
        'Guide': 'guide_id'
    }, axis=1, inplace=True)
    # reorder
    adata = adata.obs[['perturbation', 'guide_id', 'sample', 'cellcycle_phase', 'Mixscape_classification',
       'celltype', 'organism', 'disease', 'cancer',
       'perturbation_type', 'tissue_type', 'tissue']]
    return adata

In [34]:
adata = merge_data('leukemia')

100%|██████████| 13/13 [00:31<00:00,  2.45s/it]


In [38]:
bdata = harmonize_data(adata, 'leukemia')

In [40]:
bdata.obs.columns

Index(['sample', 'cellcycle_phase', 'Mixscape_classification', 'guide_id',
       'UMAP1', 'UMAP2', 'celltype', 'organism', 'disease', 'cancer',
       'perturbation_type', 'tissue_type', 'tissue', 'perturbation'],
      dtype='object')

In [48]:
# SUMTINGWONG here
cdata = merge_data('invivo')

100%|██████████| 12/12 [00:31<00:00,  2.60s/it]


ValueError: Length of passed value for obs_names is 144689, but this AnnData has shape: (135836, 32287)

In [49]:
key = 'invivo'
identifier = sample_dict[key]
files_ = [x for x in files if f'_{identifier}_' in x]
annot = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / f'GSE213511_CellAnnotation_{key}.tsv.gz', sep='\t', index_col=0)
adatas = {}
for file in tqdm(files_):
    tempdata = sc.read_10x_h5(TEMPDIR / 'LaraAstiasoHuntly2023' / file)
    tempdata.var_names_make_unique()
    adatas[file.replace('GSE213511_', '').replace('.h5', '')] = tempdata
adata = sc.concat(adatas, label='Sample', index_unique='-')
# make indiced unique
adata.obs.index = [x.replace('-1-', '-') for x in adata.obs.index]
annot.index = [f'{x.split("-")[0]}-{sample}' for x, sample in zip(annot.index, annot.Sample)]
# merge annotation
adata.obs = pd.concat([adata.obs, annot], axis=1)

100%|██████████| 12/12 [00:23<00:00,  1.92s/it]


ValueError: Length of passed value for obs_names is 144689, but this AnnData has shape: (135836, 32287)

In [54]:
s = [x in adata.obs.index for x in annot.index]

In [55]:
np.sum(s), len(annot)

(77753, 86606)

In [62]:
list(annot[~np.array(s)].Sample.unique())

['inVivo_OP2_Lin-_28d_1', 'inVivo_OP3_Lin-_28d_1']

In [61]:
# this is missing two samples????
list(adata.obs.Sample.unique())

['inVivo_OP3_ckit_14d_1',
 'inVivo_OP4_lin-_14d_1',
 'inVivo_OP2_ckit_14d_1',
 'inVivo_NTC_lin-andckit_14d_1',
 'inVivo_OP3_lin-_14d_1',
 'inVivo_OP1_lin-_28d_1',
 'inVivo_OP4_ckit_14d_1',
 'inVivo_OP1_lin-_28d_2',
 'inVivo_OP1_ckit_14d_1',
 'inVivo_OP1_lin-_14d_2',
 'inVivo_OP2_lin-_14d_1',
 'inVivo_OP1_lin-_14d_1']

In [45]:
edata = merge_data('exvivo')

100%|██████████| 9/9 [00:32<00:00,  3.57s/it]


In [47]:
edata.obs

Unnamed: 0,Sample,Phase,Guide,mixscape,Timepoint,Sample.1,UMAP1,UMAP2,Clusters
AAACCCAAGACCTCCG-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Rcor1_AS_21752,KO,9d,LSK_OP1_NM_9d_1,2.876267,0.598122,GMP
AAACCCAAGCTGGTGA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,,,9d,LSK_OP1_NM_9d_1,2.258468,-2.574814,GMP
AAACCCAAGCTTCGTA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,S,Rbbp4_BR_14486,KO,9d,LSK_OP1_NM_9d_1,5.954404,2.318502,GMP (late)
AAACCCAAGTTTAGGA-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Mbd2_AS_41068,NP,9d,LSK_OP1_NM_9d_1,3.115234,1.101170,GMP
AAACCCACAGGTTCGC-LSK_OP1_NM_9d_1,LSK_OP1_NM_9d_1,G1,Mbd3_AS_41065,NP,9d,LSK_OP1_NM_9d_1,3.339583,0.542644,GMP
...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGGTCTTA-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,S,,,7d,LSK_OP0_NM_7d_1,5.890830,0.523673,GMP (late)
TTTGTTGTCAGTCATG-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G1,,,7d,LSK_OP0_NM_7d_1,1.998967,-1.380838,GMP
TTTGTTGTCCCGTTGT-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G1,Smarcd1_AS_15166,NP,7d,LSK_OP0_NM_7d_1,-5.249389,1.900537,HSC
TTTGTTGTCGGAAGGT-LSK_OP0_NM_7d_1,LSK_OP0_NM_7d_1,G2M,Smarcd2_BR_45401,KO,7d,LSK_OP0_NM_7d_1,-2.424002,-0.357827,EBMP
