In [1]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyopenssl 23.0.0 requires cryptography<40,>=38.0.0, but you have cryptography 41.0.7 which is incompatible.[0m[31m
[0m

In [2]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
sys.path.insert(1, '../')
from utils import *

# scperturb package
sys.path.insert(1, '../package/src/')
from scperturb import *

from pathlib import Path
figure_path = Path('../figures/')

  from IPython.core.display import display, HTML


In [3]:
TEMPDIR = Path('/scratch/peidli/scPerturb/')
path = TEMPDIR / 'DixitRegev2016'

# Processing

In [4]:
from scipy.sparse import csr_matrix
from scipy.io import mmread

In [5]:
name = 'GSM2396858_k562_tfs_7'

time = 7 if name.endswith('7') else 3 if name.endswith('3') else None
pert_key = 'grna' if time!=None else 'grna_strict'
moi = 'normal' if time!=None else 'high'

X = csr_matrix(mmread(TEMPDIR / 'DixitRegev2016' / f'{name}.mtx.txt'))
adata = sc.AnnData(X.T)

var = pd.read_csv(TEMPDIR / 'DixitRegev2016' / f'{name}_genenames.csv', index_col=0)
splitted = np.array([x.split('_', 1) for x in var.values[:,0]])
adata.var_names = splitted[:,1]
adata.var['gene_id'] = splitted[:,0]
adata.var_names_make_unique()

obs = pd.read_csv(TEMPDIR / 'DixitRegev2016' / f'{name}_cellnames.csv', index_col=0)
splitted = np.array([x.split('_', 2) for x in obs.values[:,0]])
adata.obs_names = obs.values[:,0]
adata.obs['identifier_0'] = splitted[:,1]
adata.obs['identifier_1'] = splitted[:,2]

# annotation:
files = [TEMPDIR / 'DixitRegev2016' / f'{name}_cbc_gbc_dict.csv', TEMPDIR / 'DixitRegev2016' / f'{name}_cbc_gbc_dict_strict.csv', TEMPDIR / 'DixitRegev2016' / f'{name}_cbc_gbc_dict_lenient.csv']
keys = ['grna', 'grna_strict', 'grna_lenient']
for file, key in zip(files, keys):
    if os.path.isfile(file):
        cbc_gbc_dict = pd.read_csv(file, index_col=0, header=None)
        adata.obs[key]=None
        for grna in list(cbc_gbc_dict.index):
            for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
                if barcode in adata.obs_names:
                    val = adata.obs.loc[barcode][key]
                    adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna

adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs[pert_key]]
adata.obs = adata.obs.rename({pert_key: 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
adata.obs['moi'] = moi
adata.obs['time'] = time*24  if time!=None else 'None'
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adata.obs['library'] = name

In [6]:
adata

AnnData object with n_obs × n_vars = 33013 × 23111
    obs: 'perturbation', 'target', 'moi', 'time', 'cell_line', 'celltype', 'perturbation_type', 'cancer', 'disease', 'library'
    var: 'gene_id'

In [7]:
adata.obs

Unnamed: 0,perturbation,target,moi,time,cell_line,celltype,perturbation_type,cancer,disease,library
AAACATACGCTACA_p7d_A1,p_sgELK1_1,ELK1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
AAACATACTCCTGC_p7d_A1,p_sgELF1_2,ELF1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
AAACATTGACGTTG_p7d_A1,p_sgELF1_5,ELF1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
AAACATTGATTCGG_p7d_A1,p_sgELF1_2,ELF1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
AAACATTGGAGCAG_p7d_A1,p_sgELK1_7,ELK1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
...,...,...,...,...,...,...,...,...,...,...
TTTGACTGTGCGTA_p7d_C2,p_sgETS1_3,ETS1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
TTTGCATGAGAGGC_p7d_C2,p_INTERGENIC216151,INTERGENIC216151,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
TTTGCATGAGCTAC_p7d_C2,p_INTERGENIC1216445,INTERGENIC1216445,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7
TTTGCATGCCGCTT_p7d_C2,p_sgCREB1_4,CREB1,normal,168,K562,lymphoblasts,CRISPR,True,myelogenous leukemia,GSM2396858_k562_tfs_7


array(['p_sgELK1_1', 'p_sgELF1_2', 'p_sgELF1_5', 'p_sgELK1_7',
       'p_sgIRF1_2', 'p_sgETS1_5', None, 'p_INTERGENIC216151',
       'p_sgEGR1_4', 'p_INTERGENIC393453', 'p_sgYY1_3', 'p_sgETS1_3',
       'p_sgELK1_6', 'p_sgGABPA_1', 'p_sgE2F4_7', 'p_sgNR2C2_5',
       'p_sgCREB1_2', 'p_INTERGENIC1216445', 'p_sgNR2C2_2', 'p_sgEGR1_3',
       'p_sgCREB1_5', 'p_INTERGENIC1144056', 'p_sgELF1_1', 'p_sgIRF1_3',
       'p_sgEGR1_2', 'p_sgYY1_10', 'p_sgGABPA_9', 'p_sgCREB1_4',
       'p_sgNR2C2_3', 'p_sgE2F4_6', 'p_sgELF1_4'], dtype=object)

In [16]:
# GSM2396859_K562_TFs__13_days
name = 'GSM2396859_k562_tfs_13'

X = csr_matrix(mmread(path / f'{name}.mtx.txt'))
adata = sc.AnnData(X.T)

var = pd.read_csv(path / f'{name}_genenames.csv', index_col=0)
splitted = np.array([x.split('_', 1) for x in var.values[:,0]])
adata.var_names = splitted[:,1]
adata.var['gene_id'] = splitted[:,0]
adata.var_names_make_unique()

obs = pd.read_csv(path / f'{name}_cellnames.csv', index_col=0)
splitted = np.array([x.split('_', 2) for x in obs.values[:,0]])
adata.obs_names = obs.values[:,0]
adata.obs['identifier_0'] = splitted[:,1]
adata.obs['identifier_1'] = splitted[:,2]

# annotation:
files = [path / f'{name}_cbc_gbc_dict.csv', path / f'{name}_cbc_gbc_dict_strict.csv', path / f'{name}_cbc_gbc_dict_lenient.csv']
keys = ['grna', 'grna_strict', 'grna_lenient']
for file, key in zip(files, keys):
    if os.path.isfile(file):
        cbc_gbc_dict = pd.read_csv(file, index_col=0, header=None)
        adata.obs[key]=None
        for grna in list(cbc_gbc_dict.index):
            for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
                if barcode in adata.obs_names:
                    val = adata.obs.loc[barcode][key]
                    adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna
adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna]
adata.obs = adata.obs.rename({'grna': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
adata.obs['moi'] = 'normal'
adata.obs['time'] = 13*24  # 13 days
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adatas['K562_TFs__13_days'] = adata

In [17]:
# GSM2396860_K562_TFs__High_MOI
name = 'GSM2396860_k562_tfs_highmoi'

X = csr_matrix(mmread(path / f'{name}.mtx.txt'))
adata = sc.AnnData(X.T)

var = pd.read_csv(path / f'{name}_genenames.csv', index_col=0)
splitted = np.array([x.split('_', 1) for x in var.values[:,0]])
adata.var_names = splitted[:,1]
adata.var['gene_id'] = splitted[:,0]
adata.var_names_make_unique()

obs = pd.read_csv(path / f'{name}_cellnames.csv', index_col=0)
splitted = np.array([x.split('_', 2) for x in obs.values[:,0]])
adata.obs_names = obs.values[:,0]
adata.obs['identifier_0'] = splitted[:,1]
adata.obs['identifier_1'] = splitted[:,2]

# annotation:
files = [path / f'{name}_cbc_gbc_dict.csv', path / f'{name}_cbc_gbc_dict_strict.csv', path / f'{name}_cbc_gbc_dict_lenient.csv']
keys = ['grna', 'grna_strict', 'grna_lenient']
for file, key in zip(files, keys):
    if os.path.isfile(file):
        cbc_gbc_dict = pd.read_csv(file, index_col=0, header=None)
        adata.obs[key]=None
        for grna in list(cbc_gbc_dict.index):
            for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
                if barcode in adata.obs_names:
                    val = adata.obs.loc[barcode][key]
                    adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna
adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna_strict]
adata.obs = adata.obs.rename({'grna_strict': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
adata.obs['moi'] = 'high'
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adatas['K562_TFs__High_MOI'] = adata

# Rewrite existing

In [14]:
adata = sc.read_h5ad(TEMPDIR / 'DixitRegev2016.h5ad')

Hello,
In the data of your GSM2396860_K562_TFs__High_MOI data of DixitRegev2016.h5ad, you give the cell barocde which is not included in cbc_gbc_dict a tag "control" in the adata.obs['perturbation'], but the true control groups is cells knocked by 'INTERGENIC216151','INTERGENIC1144056','INTERGENIC393453' and 'INTERGENIC1216445'. This will influence the downsteam work if the 'control' tag is given to these cells without sgRNA knocking.
May the 'None' tag is better than 'control'? This is such an excellent works.


In [19]:
adata = sc.concat(adatas, index_unique='-', label='library', join='outer')

In [21]:
adata.obs['tissue_type']='cell_line'
adata.obs['organism'] = 'human'

obs = adata.obs.copy()
#obs['grna_lenient']=obs['grna_lenient'].str.replace(' + ',';', regex=False)
#obs['guide_id']= obs['perturbation']


In [22]:
obs.perturbation = obs.perturbation.astype(str)

In [26]:
obs.perturbation.unique()

array(['ELK1', 'ELF1', 'IRF1', 'ETS1', 'None', 'control', 'EGR1', 'YY1',
       'GABPA', 'E2F4', 'NR2C2', 'CREB1'], dtype=object)

In [24]:
obs.perturbation = [x.replace('_', '-').replace('p-sg', '').replace('p-', '').replace(' + ', '_') for x in obs.perturbation]  # formatting
obs.perturbation = ['control' if ('INTERGENIC' in x) and ('_' not in x) else x for x in obs.perturbation]  # annotate control
obs.perturbation = ['_'.join(np.unique([y.split('-')[0] if 'INTERGENIC' not in y else 'INTERGENIC' for y in x.split('_')])) for x in obs.perturbation]  # collapse guides
obs.perturbation = [x.replace('_INTERGENIC', '').replace('INTERGENIC_', '') for x in obs.perturbation]  # remove intergenic from combis
obs.perturbation[obs.perturbation == 'INTERGENIC'] = 'control'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  obs.perturbation[obs.perturbation == 'INTERGENIC'] = 'control'


In [32]:
obs['grna_lenient']=obs['grna_lenient'].str.replace('_','-', regex=False)
obs['perturbation']=obs['perturbation'].str.replace('_','-', regex=False)
obs['perturbation']=obs['perturbation'].str.replace(' + ','_', regex=False)
adata.obs = obs.copy()

In [40]:
adata.obs.value_counts('perturbation')

perturbation
control                                       22653
None                                           4864
p-sgELF1-2                                     4047
p-sgGABPA-1                                    3356
p-sgELF1-5                                     3301
                                              ...  
p-sgELK1-7_p-sgELF1-1_p-sgELF1-2                  1
p-sgELK1-7_p-sgELF1-1_p-sgELF1-4                  1
p-sgCREB1-5_p-sgYY1-3_p-INTERGENIC393453          1
p-sgCREB1-5_p-sgYY1-3_p-INTERGENIC1216445         1
p-sgEGR1-4_p-sgNR2C2-3_p-INTERGENIC1216445        1
Length: 1726, dtype: int64

In [39]:
adata.obs['nperts'] = 1 + adata.obs['perturbation'].str.count('_') \
                    - adata.obs['perturbation'].str.count('control') \
                    - adata.obs['perturbation'].str.count('INTERGENIC') \
                    - adata.obs['perturbation'].str.count('None')

# Check

In [15]:
adata = sc.read_h5ad(TEMPDIR / 'DixitRegev2016.h5ad')

In [16]:
adata

AnnData object with n_obs × n_vars = 104179 × 20575
    obs: 'perturbation', 'target', 'moi', 'cell_line', 'celltype', 'perturbation_type', 'cancer', 'disease', 'library', 'tissue_type', 'organism', 'nperts', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo'
    var: 'ncounts', 'ncells'

In [17]:
adata.obs.perturbation.value_counts()

None    104179
Name: perturbation, dtype: int64

In [18]:
adata.obs.perturbation

cell_barcode
AAACATACGCTACA_p7d_A1-K562_TFs__7_days        None
AAACATACTCCTGC_p7d_A1-K562_TFs__7_days        None
AAACATTGACGTTG_p7d_A1-K562_TFs__7_days        None
AAACATTGATTCGG_p7d_A1-K562_TFs__7_days        None
AAACATTGGAGCAG_p7d_A1-K562_TFs__7_days        None
                                              ... 
TTTCTACTGATGAA_ph14d_F7-K562_TFs__High_MOI    None
TTTGACTGACGGGA_ph14d_F7-K562_TFs__High_MOI    None
TTTGACTGAGCATC_ph14d_F7-K562_TFs__High_MOI    None
TTTGACTGCTACCC_ph14d_F7-K562_TFs__High_MOI    None
TTTGCATGACCCTC_ph14d_F7-K562_TFs__High_MOI    None
Name: perturbation, Length: 104179, dtype: category
Categories (1, object): ['None']