In [2]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

In [3]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../')
from utils import *

# scperturb package
sys.path.insert(1, '../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../figures/')

In [4]:
TEMPDIR = Path('/scratch/peidli/scPerturb/')

# Check

In [5]:
adata = sc.read_h5ad(TEMPDIR / 'YaoCleary2023.h5ad')

In [6]:
adata

AnnData object with n_obs × n_vars = 210131 × 14275
    obs: 'perturbation', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type', 'disease', 'cancer', 'tissue_type', 'cell_line', 'celltype', 'organism', 'nperts', 'percent_ribo', 'dataset'

In [7]:
adata.obs.perturbation.value_counts()

control           16243
NFIC                395
XKR6                378
TNFRSF14            369
TYK2                360
                  ...  
HLA-E_RAD54L          1
HLA-E_RPS26           1
HLA-E_S100A8          1
HLA-E_SBNO2           1
LZTS1_TMEM176A        1
Name: perturbation, Length: 38789, dtype: int64

# Processing

In [24]:
sorted([file.name for file in (TEMPDIR / 'YaoCleary2023').glob('*')])

['GSM6858447_KO_all_FRPerturb_effect_sizes.csv',
 'GSM6858447_KO_conventional.h5ad',
 'GSM6858447_KO_conventional_temp.h5ad',
 'GSM6858448_KO_cell_pooled.h5ad',
 'GSM6858448_KO_cell_pooled_temp.h5ad',
 'GSM6858449_KD_all_FRPerturb_effect_sizes.csv',
 'GSM6858449_KD_conventional.h5ad',
 'GSM6858449_KD_conventional_temp.h5ad',
 'GSM6858450_KD_guide_pooled.h5ad',
 'GSM6858450_KD_guide_pooled_temp.h5ad']

In [32]:
def process_adata(key):
    # build adata
    adata = sc.read(TEMPDIR / f'YaoCleary2023/{key}_temp.h5ad')
    adata.obs = adata.obs.rename({'Total_RNA_count': 'ncounts', 'Total_unique_genes': 'ngenes', 'Biological_replicate': 'replicate',
                      'Percent_mitochondrial_reads': 'percent_mito', 'Guides': 'full_guides', 
                      'Guides_collapsed_by_gene': 'guides', 'Total_number_of_guides': 'nguides'}, axis=1)
    adata.var.index.name = 'gene_symbol'
    tab = pd.read_csv(TEMPDIR / f'YaoCleary2023/{key}_perturbations.txt', sep='\t').T
    stab = tab.astype(pd.SparseDtype("int", 0))  # make sparse
    adata.obsm['barcodes'] = stab

    # harmonize metadata
    adata.obs['perturbation'] = [x.replace('safe-targeting', 'control').replace('non-targeting', 'control') for x in adata.obs.guides]
    adata.obs['perturbation'] = [x.replace('--', '_') for x in adata.obs['perturbation']]
    adata.obs.perturbation = [k.replace('control_', '').replace('_control', '') for k in adata.obs.perturbation]  # collapse controls
    adata.obs['perturbation_type'] = 'CRISPR-cas9' if '_KO_' in key else 'CRISPRi'
    if 'replicate' in adata.obs.columns:
        cols = ['perturbation', 'replicate', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type']
    else:
        cols = ['perturbation', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type']
    adata.obs = adata.obs[cols]
    adata.obs['disease'] = "leukemia"
    adata.obs['cancer'] = True
    adata.obs['tissue_type']="cell_line"
    adata.obs["cell_line"] = "THP-1"
    adata.obs["celltype"] = 'monocytes'
    adata.obs['organism'] = 'human'
    adata.obs['nperts'] = [p.count('_')+1-p.count('control') if type(p)==str else 0 for p in adata.obs.perturbation]
    annotate_qc(adata, species='human')
    adata.obs.index.name = 'cell_barcode'
    return adata

In [33]:
adata = process_adata(key='GSM6858447_KO_conventional')

FileNotFoundError: [Errno 2] No such file or directory: '/scratch/peidli/scPerturb/YaoCleary2023/GSM6858447_KO_conventional_perturbations.txt'

In [27]:
key='GSM6858447_KO_conventional'
adata = sc.read(TEMPDIR / f'YaoCleary2023/{key}_temp.h5ad')

In [31]:
adata.var

AL627309.1
AL669831.5
FAM87B
LINC00115
FAM41C
...
AC011043.1
AL592183.1
AC007325.4
AL354822.1
AC240274.1


In [102]:
adata = process_adata(key='GSM6858449_KD_conventional')

KeyboardInterrupt: 

In [5]:
adata = process_adata(key='GSM6858448_KO_cell_pooled')

In [None]:
adata = process_adata(key='GSM6858450_KD_guide_pooled')

In [9]:
assert_annotations(adata)

In [10]:
adatas = {}
for key in ['GSM6858447_KO_conventional', 'GSM6858449_KD_conventional', 'GSM6858448_KO_cell_pooled', 'GSM6858450_KD_guide_pooled']:
    print(key)
    adata = process_adata(key='GSM6858449_KD_conventional')
    assert_annotations(adata)
    adatas[key] = adata

GSM6858447_KO_conventional
GSM6858449_KD_conventional
GSM6858448_KO_cell_pooled
GSM6858450_KD_guide_pooled


In [11]:
adata = sc.concat(adatas, label='dataset')

In [12]:
adata

AnnData object with n_obs × n_vars = 265132 × 18017
    obs: 'perturbation', 'replicate', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type', 'disease', 'cancer', 'tissue_type', 'cell_line', 'celltype', 'organism', 'nperts', 'percent_ribo', 'dataset'
    obsm: 'barcodes'

In [34]:
adata = sc.read(TEMPDIR / 'YaoCleary2023.h5ad')

In [35]:
adata

AnnData object with n_obs × n_vars = 210131 × 14275
    obs: 'perturbation', 'ncounts', 'ngenes', 'nguides', '10X_channel', 'percent_mito', 'guides', 'full_guides', 'S_score', 'G2M_score', 'Cell_cycle_phase', 'perturbation_type', 'disease', 'cancer', 'tissue_type', 'cell_line', 'celltype', 'organism', 'nperts', 'percent_ribo', 'dataset'

In [37]:
adata.obs

Unnamed: 0_level_0,perturbation,ncounts,ngenes,nguides,10X_channel,percent_mito,guides,full_guides,S_score,G2M_score,...,perturbation_type,disease,cancer,tissue_type,cell_line,celltype,organism,nperts,percent_ribo,dataset
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGAGTGACC-1,TNFRSF11A_STK4-AS1,2413,1278,2,Normal_1,5.719022,TNFRSF11A--STK4-AS1,TNFRSF11A_TTTGTGGCACTGGATCAATG_CRISPRko--STK4-...,-0.072268,-0.068220,...,CRISPR-cas9,leukemia,True,cell_line,THP-1,monocytes,human,2,11.023622,GSM6858447_KO_conventional
AAACCCACACAGGATG-1,GSDMC_LRP1B,9174,2666,2,Normal_1,5.646392,GSDMC--LRP1B,GSDMC_GGAGCATCCATGGTCCACAG_CRISPRko--LRP1B_TGC...,-0.091449,-0.078500,...,CRISPR-cas9,leukemia,True,cell_line,THP-1,monocytes,human,2,13.876172,GSM6858447_KO_conventional
AAACCCACACCTCTAC-1,NFATC2_TAB3,3779,1864,2,Normal_1,6.086266,NFATC2--TAB3,NFATC2_GCGGAGGCATTCGTGCGCCG_CRISPRko--TAB3_AAA...,0.341332,0.082301,...,CRISPR-cas9,leukemia,True,cell_line,THP-1,monocytes,human,2,4.842551,GSM6858447_KO_conventional
AAACCCACATTGCCGG-1,TXNDC17,9882,2892,1,Normal_1,4.604331,TXNDC17,TXNDC17_CTGAACCAGTCGTACGAGAG_CRISPRko,0.034257,-0.086085,...,CRISPR-cas9,leukemia,True,cell_line,THP-1,monocytes,human,1,16.413681,GSM6858447_KO_conventional
AAACCCAGTCATCACA-1,TNFSF11,10399,2514,1,Normal_1,3.413790,TNFSF11,TNFSF11_CGTGGCTCGGAGGAGATGGG_CRISPRko,-0.012602,-0.122855,...,CRISPR-cas9,leukemia,True,cell_line,THP-1,monocytes,human,1,9.539379,GSM6858447_KO_conventional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCACCAGCCA-1_2,PIP4K2A_UNG_OTUD3,903,565,3,High_MOI_2,3.433001,PIP4K2A--UNG--OTUD3,PIP4K2A_GGCGGGCGCAGGATACGGGC_CRISPRi--UNG_GCGC...,0.081307,-0.004186,...,CRISPRi,leukemia,True,cell_line,THP-1,monocytes,human,3,7.973422,GSM6858450_KD_guide_pooled
TTTGGTTCACCATTCC-1_2,MAN1C1_NLRP3_C5_TRAM1_ZNF79,44541,6082,5,High_MOI_2,4.708022,MAN1C1--NLRP3--C5--TRAM1--ZNF79,MAN1C1_GGTCCCTCCCAATATCAAAG_CRISPRi--NLRP3_CAG...,-0.099051,-0.053815,...,CRISPRi,leukemia,True,cell_line,THP-1,monocytes,human,5,10.217552,GSM6858450_KD_guide_pooled
TTTGGTTGTCCTCATC-1_2,HLA-DMB_CLIC1,3815,1513,2,High_MOI_2,3.958060,HLA-DMB--CLIC1,HLA-DMB_CCCCCCAAATGAGTGATGTG_CRISPRi--CLIC1_GC...,-0.068920,-0.052420,...,CRISPRi,leukemia,True,cell_line,THP-1,monocytes,human,2,1.939712,GSM6858450_KD_guide_pooled
TTTGGTTGTGGCCCAT-1_2,AP1AR_MRPL51_C2,5169,1531,3,High_MOI_2,2.418263,AP1AR--MRPL51--C2,AP1AR_CTGAGGCGAGAAGGGCCATG_CRISPRi--MRPL51_CCA...,0.001523,-0.052025,...,CRISPRi,leukemia,True,cell_line,THP-1,monocytes,human,3,11.955891,GSM6858450_KD_guide_pooled
