In [1]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler goatools gseapy scperturb chembl_webresource_client biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

In [2]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../../')
from utils import *

# scperturb package
sys.path.insert(1, '../../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../../figures/')

In [3]:
TEMPDIR = Path('/scratch/peidli/scPerturb/')
DATADIR = Path('/home/peidli/data/scPerturb/')

In [4]:
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [10]:
files = [x.name for x in (TEMPDIR / 'LaraAstiasoHuntly2023').glob('*.h5')]

In [33]:
adatas = {}
for file in tqdm(LSK_files):
    tempdata = sc.read_10x_h5(TEMPDIR / 'LaraAstiasoHuntly2023' / file)
    tempdata.var_names_make_unique()
    adatas[file.replace('GSE213511_', '').replace('.h5', '')] = tempdata

100%|██████████| 9/9 [00:27<00:00,  3.07s/it]


In [34]:
annot_DM = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / 'GSE213511_CellAnnotation_leukemia.tsv.gz', sep='\t', index_col=0)
annot_inVivo = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / 'GSE213511_CellAnnotation_invivo.tsv.gz', sep='\t', index_col=0)
annot_LSK = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / 'GSE213511_CellAnnotation_exvivo.tsv.gz', sep='\t', index_col=0)

In [43]:
adata = sc.concat(adatas, label='Sample', index_unique='-')
# make indiced unique
adata.obs.index = [x.replace('-1-', '-') for x in adata.obs.index]
annot_LSK.index = [f'{x.split("-")[0]}-{sample}' for x, sample in zip(annot_LSK.index, annot_LSK.Sample)]
# merge annotation
adata.obs = pd.concat([adata.obs, annot_LSK], axis=1)

In [57]:
# harmonize
adata.var.index.name = 'gene_symbol'

In [59]:
sample_dict = {
    'DM': 'leukemia',
    'inVivo': 'invivo',
    'LSK': 'exvivo'
}

In [65]:
# key, value from dict
def merge_data(key, value):
    files_ = [x for x in files if f'_{key}_' in x]
    annot = pd.read_csv(TEMPDIR / 'LaraAstiasoHuntly2023' / f'GSE213511_CellAnnotation_{value}.tsv.gz', sep='\t', index_col=0)
    adatas = {}
    for file in tqdm(files_):
        tempdata = sc.read_10x_h5(TEMPDIR / 'LaraAstiasoHuntly2023' / file)
        tempdata.var_names_make_unique()
        adatas[file.replace('GSE213511_', '').replace('.h5', '')] = tempdata
    adata = sc.concat(adatas, label='Sample', index_unique='-')
    # make indiced unique
    adata.obs.index = [x.replace('-1-', '-') for x in adata.obs.index]
    annot.index = [f'{x.split("-")[0]}-{sample}' for x, sample in zip(annot.index, annot.Sample)]
    # merge annotation
    adata.obs = pd.concat([adata.obs, annot], axis=1)
    return adata

def harmonize_data(adata, value):
    # harmonize
    adata.var.index.name = 'gene_symbol'
    adata.obs.index.name = 'cell_barcode'
    adata.obs['organism'] = 'Mus musculus'
    adata.obs['disease'] = 'leukemia' if value=='leukemia' else 'healthy'
    adata.obs['cancer'] = value=='leukemia'
    adata.obs['perturbation_type'] = 'CRISPR-cas9'
    adata.obs['tissue_type'] = 'complicated'  # TODO!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # ...
    return adata

In [66]:
adata = merge_data('DM', 'leukemia')

100%|██████████| 13/13 [00:32<00:00,  2.52s/it]


In [67]:
adata.obs

Unnamed: 0,Sample,Phase,Sample.1,Mixscape,Guide,UMAP1,UMAP2,Clusters
AAACCCAAGCAGCCCT-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGCGCCTTG-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGGAGAGTA-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,G2M,DM_CITEseq-1_NA_NM_1,,,2.824239,0.346523,LSC
AAACCCAAGGATTTAG-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGGGCCAAT-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,G1,DM_CITEseq-1_NA_NM_1,,,3.430067,1.632668,LSC
...,...,...,...,...,...,...,...,...
TTTGTTGGTATTTCGG-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,G1,DM_OP4_NM_6d_1,NP,Kdm6a_AS_45253,-0.047214,-3.652060,LSC
TTTGTTGGTCTTCATT-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,S,DM_OP4_NM_6d_1,KO,Stag2_B_13072,-2.482602,-4.425437,LSC
TTTGTTGGTTTCAGAC-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,G2M,DM_OP4_NM_6d_1,NP,Hmgxb4_R2.BR_36203,-4.458491,-0.274728,LSC
TTTGTTGTCGCCGATG-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,,,,,,,


In [68]:
adata.obs.rename({'Phase': ''}, axis=1)

Unnamed: 0,Sample,Phase,Sample.1,Mixscape,Guide,UMAP1,UMAP2,Clusters
AAACCCAAGCAGCCCT-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGCGCCTTG-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGGAGAGTA-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,G2M,DM_CITEseq-1_NA_NM_1,,,2.824239,0.346523,LSC
AAACCCAAGGATTTAG-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,,,,,,,
AAACCCAAGGGCCAAT-DM_CITEseq-1_NA_NM_1,DM_CITEseq-1_NA_NM_1,G1,DM_CITEseq-1_NA_NM_1,,,3.430067,1.632668,LSC
...,...,...,...,...,...,...,...,...
TTTGTTGGTATTTCGG-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,G1,DM_OP4_NM_6d_1,NP,Kdm6a_AS_45253,-0.047214,-3.652060,LSC
TTTGTTGGTCTTCATT-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,S,DM_OP4_NM_6d_1,KO,Stag2_B_13072,-2.482602,-4.425437,LSC
TTTGTTGGTTTCAGAC-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,G2M,DM_OP4_NM_6d_1,NP,Hmgxb4_R2.BR_36203,-4.458491,-0.274728,LSC
TTTGTTGTCGCCGATG-DM_OP4_NM_6d_1,DM_OP4_NM_6d_1,,,,,,,
