In [1]:
import subprocess
import os
import sys

import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

# Jupyter stuff
from tqdm.notebook import tqdm
# `display` is imported from IPython.display: importing it from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import clear_output, display, HTML
# Inject CSS so that elements with class "container" use 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))
%matplotlib inline

# Custom functions
sys.path.insert(1, '../..')
from utils import *

# paths
# Any working directory that does not contain '/fast/work/users/' counts as "at home".
at_home = '/fast/work/users/' not in os.getcwd()
if at_home:
    data_path = '/extra/stefan/data/perturbation_resource_paper/'
    signatures_path = '/home/peidli/utils/scrnaseq_signature_collection/'
    utils_path = '/extra/stefan/utils/scrnaseq_utils/'
else:
    data_path = '/fast/work/users/peidlis_c/data/perturbation_resource_paper/'
    signatures_path = '/fast/work/users/peidlis_c/utils/scrnaseq_signature_collection/'
    utils_path = '/fast/work/users/peidlis_c/utils/single_cell_rna_seq/scrnaseq_utils/'

# Stefan's utils
sys.path.insert(1, utils_path)
from scrnaseq_util_functions import *

In [2]:
# NOTE(review): presumably a scratch directory for intermediate results — TODO confirm.
# It is not referenced in the visible part of this notebook.
SDIR = '/fast/scratch/users/peidlis_c/perturbation_resource_paper/'

In [10]:
# Build an index {dataset name -> file path} of every .h5ad file below data_path.
h5_files = {}
for path, subdirs, files in os.walk(data_path):
    for name in files:
        # Match the real file extension only; a substring test would also
        # index files such as 'X.h5ad.bak' under the dataset name 'X'.
        if name.endswith('.h5ad'):
            h5_files[name[:-len('.h5ad')]] = os.path.join(path, name)

# Datasets left out of the index. pop(..., None) keeps this cell runnable
# when one of them is not present under data_path (del would raise KeyError).
excluded_datasets = (
    'PapalexiSatija2021_eccite_arrayed_protein',
    'PapalexiSatija2021_eccite_protein',
    'FrangiehIzar2021_protein',
    'XieHon2017',
)
for excluded_name in excluded_datasets:
    h5_files.pop(excluded_name, None)

datasets = list(h5_files.keys())
print(len(datasets))

34


In [11]:
# Display the dataset-name -> .h5ad-path mapping.
h5_files

{'TianKampmann2021_CRISPRa': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/TianKampmann2021_CRISPRa/TianKampmann2021_CRISPRa.h5ad',
 'AissaBenevolenskaya2021': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/AissaBenevolenskaya2021/AissaBenevolenskaya2021.h5ad',
 'TianKampmann2019_day7neuron': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/TianKampmann2019_day7neuron/TianKampmann2019_day7neuron.h5ad',
 'SrivatsanTrapnell2020_sciplex2': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/SrivatsanTrapnell2020_sciplex2/SrivatsanTrapnell2020_sciplex2.h5ad',
 'FrangiehIzar2021_RNA': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/FrangiehIzar2021_RNA/FrangiehIzar2021_RNA.h5ad',
 'ZhaoSims2021': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/ZhaoSims2021/ZhaoSims2021.h5ad',
 'DixitRegev2016': '/fast/work/users/peidlis_c/data/perturbation_resource_paper/DixitRegev2016/DixitRegev2016.h5ad',
 'SrivatsanTrapnell2020_s

## GasperiniShendure2019_highMOI

In [27]:
# Load the GasperiniShendure2019_highMOI dataset from its indexed .h5ad path.
adata=sc.read(h5_files['GasperiniShendure2019_highMOI'])

In [28]:
# Display the AnnData summary (dimensions and annotation columns).
adata

AnnData object with n_obs × n_vars = 47650 × 32738
    obs: 'sample', 'total_umis', 'Size_Factor', 'gene', 'all_gene', 'barcode', 'read_count', 'umi_count', 'proportion', 'guide_count', 'sample_directory', 'ko_barcode_file', 'sample_name', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [29]:
# Display the 'perturbation' column of adata.obs.
adata.obs.perturbation

AAACCTGAGCGAGAAA    CERS2_chr10:23105418-23105441_chr11:65708668-6...
AAACCTGAGCTGTCTA    C16orf91_chr10:72420587-72420610_chr10:7406807...
AAACCTGAGTTATCGC    ADIPOR1_APEX1_chr1:182308166-182308189_chr1:20...
AAACCTGAGTTCCACA    chr1:224392421-224392444_chr1:39251330-3925135...
AAACCTGCAAATACAG    chr1:154601602-154601625_chr12:131713939-13171...
                                          ...                        
TTTGTCAGTGTTTGTG    ATP5F1_chr1:114922540-114922563_chr1:202074787...
TTTGTCATCACAAACC    chr1:32420277-32420300_chr17:27192091-27192114...
TTTGTCATCAGAGGTG    BEX4_chr10:112617343-112617366_chr10:120850440...
TTTGTCATCGCGATCG    CENPK_chr1:182308166-182308189_chr1:202074787-...
TTTGTCATCTGGCGTG    chr12:120754676-120754699_chr17:46599317-46599...
Name: perturbation, Length: 47650, dtype: category
Categories (39088, object): ['ACTB_ACTG1_APEX1_CAPZA2_chr10:112617343-11261..., 'ACTB_ACTG1_ARL5A_BANF1_chr10:22517874-2251789..., 'ACTB_ACTG1_ARL5A_BANF1_chr10:22517874-2251789..

In [30]:
# Collect every entry of the 'perturbation' column whose label contains 'control'
# (one entry per matching row, so repeated labels appear repeatedly).
[
    label
    for label in adata.obs.perturbation
    if 'control' in label
]

['chr10:74050665-74050688_chr11:75944834-75944857_chr1:201520936-201520959_chr1:43988240-43988263_pos_control_HS2_Klann_mosaic_TMSB4X_UCHL5',
 'BCAP29_chr17:38689446-38689469_chr18:55708851-55708874_chr19:40951731-40951754_chr22:27028440-27028463_chr6:125628316-125628339_chr6:17715167-17715190_NDUFB7_pos_control_Klannchr1_HS3_PPP1R11_PSMG4_random_7_TOP1',
 'CENPK_chr1:201987385-201987408_chr13:29106285-29106308_chr1:53168523-53168546_chr1:9470840-9470863_chr2:28589203-28589226_chr5:10317967-10317990_chr7:106647453-106647476_chr7:23513484-23513507_chr7:99404651-99404674_CHRAC1_chrX:65095355-65095378_ELAVL1_HNRNPF_pos_control_HBE1_tss_Klann_mosaic_RNF181_SLC25A33',
 'CCT5_chr10:22306289-22306312_chr11:2888386-2888409_chr11:73490224-73490247_chr1:26946583-26946606_chr1:27831005-27831028_chr18:55500879-55500902_chr19:40922926-40922949_chr20:22909290-22909313_chr2:28582004-28582027_chr2:43154568-43154591_chr3:9903850-9903873_chr4:154354191-154354214_chr4:6893738-6893761_chr5:10320939-103209

## GehringPachter2019

In [31]:
# Load the GehringPachter2019 dataset from its indexed .h5ad path.
adata=sc.read(h5_files['GehringPachter2019'])

In [32]:
# Display the AnnData summary (dimensions and annotation columns).
adata

AnnData object with n_obs × n_vars = 20382 × 13256
    obs: 'batch', 'disease', 'cancer', 'tissue_type', 'celltype', 'perturbation', 'perturbation_2', 'perturbation_3', 'perturbation_4', 'dose_unit', 'dose_unit_2', 'dose_unit_3', 'dose_unit_4', 'organism', 'perturbation_type', 'dose_value', 'dose_value_2', 'dose_value_3', 'dose_value_4', 'nperts', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [33]:
# Display the 'perturbation' column of adata.obs.
adata.obs.perturbation

AAACCTGCACACATGT    BMP4
AAACCTGCACGTCAGC    BMP4
AAACCTGCATTGGTAC    BMP4
AAACCTGGTCGCATAT    BMP4
AAACCTGGTGCAGGTA    BMP4
                    ... 
TTTGGTTAGGAACTGC    BMP4
TTTGGTTGTCCAGTAT    BMP4
TTTGTCAAGCCACTAT    BMP4
TTTGTCAGTCCAAGTT    BMP4
TTTGTCATCGGTGTCG    BMP4
Name: perturbation, Length: 20382, dtype: category
Categories (1, object): ['BMP4']

In [42]:
# Distinct values found in the 'dose_value_4' column of adata.obs.
pd.unique(adata.obs.dose_value_4)

array([0])

In [36]:
# Join the four perturbation columns row by row into one 'a+b+c+d' label and
# list the distinct combinations. The labels are wrapped in an object ndarray
# because passing a plain list to pd.unique is deprecated in pandas >= 2.1.
pd.unique(np.array(
    ['+'.join(parts) for parts in zip(
        adata.obs.perturbation,
        adata.obs.perturbation_2,
        adata.obs.perturbation_3,
        adata.obs.perturbation_4,
    )],
    dtype=object,
))

array(['BMP4+EGF and bFGF+1:5 Scriptaid:decitabine+retinoic acid'],
      dtype=object)

## SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen

In [52]:
# Load the SchraivogelSteinmetz2020 TAP-screen (chromosome 8) dataset from its indexed .h5ad path.
adata=sc.read(h5_files['SchraivogelSteinmetz2020_TAP_SCREEN__chromosome_8_screen'])

In [53]:
# Display the AnnData summary (dimensions and annotation columns).
adata

AnnData object with n_obs × n_vars = 112260 × 4191
    obs: 'replicate', 'tissue_type', 'cell_line', 'cancer', 'disease', 'celltype', 'organism', 'perturbation', 'perturbation_type', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'
    var: 'ncounts', 'ncells'

In [54]:
# Display the 'perturbation' column of adata.obs.
adata.obs.perturbation

cell_barcode
TGATTGACAAACCTGAGAGCTATA-sample_14         RIPK2_+_90770127.23-P1P2
TGATTGACAAACCTGAGGTGACCA-sample_14                        multiplet
TGATTGACAAACCTGAGGTGCTTT-sample_14                        multiplet
TGATTGACAAACCTGAGTCGAGTG-sample_14        DSCC1_-_120868119.23-P1P2
TGATTGACAAACCTGCAACTTGAC-sample_14              OXR1_+_107670106.23
                                                  ...              
TCCTGAGCCCGTACTAGGTGCAAC-sample_2     chr8:103387756-103388165_14_+
TCCTGAGCCCGTACTAGTTCGATC-sample_2     chr8:102345144-102345514_25_-
TGATTGACCCGTACTAGCTGCGAA-sample_2         DSCC1_+_120868042.23-P1P2
TGATTGACCCGTACTAGGCCCTCA-sample_2               non-targeting_00008
TGATTGACCCGTACTAGTGCGATG-sample_2                             MYC-A
Name: perturbation, Length: 112260, dtype: category
Categories (4115, object): ['CCNE2_+_95907328.23-P1P2', 'CCNE2_+_95907382.23-P1P2', 'CCNE2_+_95907406.23-P1P2', 'CCNE2_-_95907017.23-P1P2', ..., 'non-targeting_00026', 'non-targeting_0002

In [55]:
# Keep a copy of the current adata.X in the 'counts' layer before later cells modify X.
# NOTE(review): assumes X still holds raw counts at this point — TODO confirm.
adata.layers['counts'] = adata.X.copy()

In [56]:
# basic qc and pp
# Drop cells with fewer than 1000 total counts.
sc.pp.filter_cells(adata, min_counts=1000)
# Per-cell count normalisation.
# NOTE(review): normalize_per_cell is deprecated in recent scanpy releases in
# favour of sc.pp.normalize_total — confirm against the installed version.
sc.pp.normalize_per_cell(adata)
# Keep only genes detected in at least 50 cells.
sc.pp.filter_genes(adata, min_cells=50)
# Log-transform the normalised values (log(1 + x)).
sc.pp.log1p(adata)

In [58]:
# high class imbalance
# equal_subsampling is a project helper brought in by one of the star imports.
# NOTE(review): presumably it samples an equal number of cells per 'perturbation'
# group and drops groups with fewer than N_min cells — TODO confirm, and check
# whether the sampling is seeded (no seed is set in the visible cells).
adata = equal_subsampling(adata, 'perturbation', N_min=50)
sc.pp.filter_genes(adata, min_cells=3)  # sanity cleaning

In [59]:
# Display the AnnData summary after filtering and subsampling.
adata

AnnData object with n_obs × n_vars = 2200 × 1492
    obs: 'replicate', 'tissue_type', 'cell_line', 'cancer', 'disease', 'celltype', 'organism', 'perturbation', 'perturbation_type', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'n_counts'
    var: 'ncounts', 'ncells', 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [61]:
# select HVGs
n_var_max = 2000  # max total features to select
# subset=False only flags the highly variable genes; no genes are removed.
# NOTE(review): the AnnData summary printed just before this cell shows 1492 genes,
# fewer than n_var_max, so this selection likely flags every gene — confirm.
sc.pp.highly_variable_genes(adata, n_top_genes=n_var_max, subset=False)
# PCA restricted to the genes flagged as highly variable.
# NOTE(review): use_highly_variable is deprecated in newer scanpy releases
# (mask_var replaces it) — confirm against the installed version.
sc.pp.pca(adata, use_highly_variable=True)
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)