In [1]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../')
from utils import *

# scperturb package
sys.path.insert(1, '../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../figures/')

In [2]:
TEMPDIR = Path('/fast/scratch/users/peidlis_c/perturbation_resource_paper/')

In [3]:
sorted([file.name for file in (TEMPDIR / 'YaoCleary2023').glob('*')])

['GSM6858447_KO_conventional.h5ad',
 'GSM6858447_KO_conventional.rds',
 'GSM6858447_KO_conventional_perturbations.txt',
 'GSM6858448_KO_cell_pooled.h5ad',
 'GSM6858448_KO_cell_pooled.rds',
 'GSM6858448_KO_cell_pooled_perturbations.txt',
 'GSM6858449_KD_conventional.h5ad',
 'GSM6858449_KD_conventional.rds',
 'GSM6858449_KD_conventional_perturbations.txt',
 'GSM6858450_KD_guide_pooled.h5ad',
 'GSM6858450_KD_guide_pooled.rds',
 'GSM6858450_KD_guide_pooled_perturbations.txt',
 'downloaded.flag']

In [4]:
def process_adata(key):
    # build adata
    adata = sc.read(TEMPDIR / f'YaoCleary2023/{key}.h5ad')
    adata.obs = adata.obs.rename({'Total_RNA_count': 'ncounts', 'Total_unique_genes': 'ngenes', 'Biological_replicate': 'replicate',
                      'Percent_mitochondrial_reads': 'percent_mito', 'Guides': 'full_guides', 
                      'Guides_collapsed_by_gene': 'guides', 'Total_number_of_guides': 'nguides'}, axis=1)
    adata.var = adata.var.set_index('features').drop('_index', axis=1)
    tab = pd.read_csv(TEMPDIR / f'YaoCleary2023/{key}_perturbations.txt', sep='\t').T
    stab = tab.astype(pd.SparseDtype("int", 0))  # make sparse
    adata.obsm['barcodes'] = stab

    # harmonize metadata
    adata.obs['perturbation'] = [x.replace('safe-targeting', 'control').replace('non-targeting', 'control') for x in adata.obs.guides]
    adata.obs['perturbation'] = [x.replace('--', '_') for x in adata.obs['perturbation']]
    adata.obs.perturbation = [k.replace('control_', '').replace('_control', '') for k in adata.obs.perturbation]  # collapse controls
    adata.obs['perturbation_type'] = 'CRISPR-cas9' if '_KO_' in key else 'CRISPRi'
    if 'replicate' in adata.obs.columns:
        cols = ['perturbation', 'replicate', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type']
    else:
        cols = ['perturbation', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type']
    adata.obs = adata.obs[cols]
    adata.obs['disease'] = "leukemia"
    adata.obs['cancer'] = True
    adata.obs['tissue_type']="cell_line"
    adata.obs["cell_line"] = "THP-1"
    adata.obs["celltype"] = 'monocytes'
    adata.obs['organism'] = 'human'
    adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation]
    annotate_qc(adata, species='human')
    adata.obs.index.name = 'cell_barcode'
    return adata

In [100]:
adata = process_adata(key='GSM6858447_KO_conventional')

In [102]:
adata = process_adata(key='GSM6858449_KD_conventional')

KeyboardInterrupt: 

In [5]:
adata = process_adata(key='GSM6858448_KO_cell_pooled')

In [None]:
adata = process_adata(key='GSM6858450_KD_guide_pooled')

In [9]:
assert_annotations(adata)

In [10]:
adatas = {}
for key in ['GSM6858447_KO_conventional', 'GSM6858449_KD_conventional', 'GSM6858448_KO_cell_pooled', 'GSM6858450_KD_guide_pooled']:
    print(key)
    adata = process_adata(key='GSM6858449_KD_conventional')
    assert_annotations(adata)
    adatas[key] = adata

GSM6858447_KO_conventional
GSM6858449_KD_conventional
GSM6858448_KO_cell_pooled
GSM6858450_KD_guide_pooled


In [11]:
adata = sc.concat(adatas, label='dataset')

In [12]:
adata

AnnData object with n_obs × n_vars = 265132 × 18017
    obs: 'perturbation', 'replicate', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type', 'disease', 'cancer', 'tissue_type', 'cell_line', 'celltype', 'organism', 'nperts', 'percent_ribo', 'dataset'
    obsm: 'barcodes'