In [1]:
import pertpy as pt
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import anndata as ad
import pandas as pd 

  from .autonotebook import tqdm as notebook_tqdm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
def load_sample(matrix_dir, sample, patient):
    """Read one 10x matrix directory and label its cells.

    Parameters
    ----------
    matrix_dir : str
        Path of the directory handed to ``sc.read_10x_mtx``.
    sample : str
        Value written to ``obs['sample']`` for every cell.
    patient : str
        Value written to ``obs['patient']`` for every cell.

    Returns
    -------
    The object returned by ``sc.read_10x_mtx`` with the barcode index trimmed
    and the two label columns added.
    """
    sample_adata = sc.read_10x_mtx(matrix_dir)
    # Keep only the part of each barcode before the first '-'
    # (presumably this strips the 10x '-1' suffix -- TODO confirm).
    sample_adata.obs.index = [barcode.split('-')[0] for barcode in sample_adata.obs.index]
    sample_adata.obs['sample'] = sample
    sample_adata.obs['patient'] = patient
    return sample_adata


# Two donors (D1, D2), each with a non-stimulated (N) and a stimulated (S) sample.
adata_d1n = load_sample(
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375483_D1N_matrix',
    'D1_nostim',
    'D1',
)
adata_d2n = load_sample(
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375485_D2N_matrix',
    'D2_nostim',
    'D2',
)
adata_d1s = load_sample(
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375484_D1S_matrix',
    'D1_stim',
    'D1',
)
adata_d2s = load_sample(
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375486_D2S_matrix',
    'D2_stim',
    'D2',
)

In [3]:
# Per-sample cell-barcode -> sgRNA tables, listed in the order D1N, D2N, D1S, D2S.
guide_csvs = (
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375487_D1N_CellBC_sgRNA.csv',
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375489_D2N_CellBC_sgRNA.csv',
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375488_D1S_CellBC_sgRNA.csv',
    '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375490_D2S_CellBC_sgRNA.csv',
)

# The first CSV column becomes the row index of each table.
obsmat_d1n, obsmat_d2n, obsmat_d1s, obsmat_d2s = (
    pd.read_csv(csv_path, index_col=0) for csv_path in guide_csvs
)



In [4]:
def integrate_obsmat(adata, obsmat, copy=True):
    """Attach guide assignments from ``obsmat`` to ``adata.obs``.

    Two columns are (re)created on ``adata.obs``: ``guide_id`` (default
    ``'NA'``) and ``guide_counts`` (default ``0``). For every row label of
    ``adata.obs`` that also appears in ``obsmat.index``, the defaults are
    replaced by that row's ``'gRNA.ID'`` and ``'UMI.count'`` values.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` DataFrame (and ``copy()`` when
        ``copy=True``); an AnnData in this notebook.
    obsmat : pandas.DataFrame
        Table indexed by cell barcode with columns ``'gRNA.ID'`` and
        ``'UMI.count'``.
    copy : bool, default True
        If True, work on ``adata.copy()`` so the caller's object is left
        untouched; if False, ``adata`` is modified in place.

    Returns
    -------
    The annotated object (the copy when ``copy=True``, else ``adata`` itself).

    Raises
    ------
    ValueError
        If a barcode present in ``adata.obs`` occurs more than once in
        ``obsmat``, because a single guide cannot be chosen for that cell.
    """
    if copy:
        adata = adata.copy()  # leave the caller's object unmodified

    adata.obs['guide_id'] = 'NA'
    adata.obs['guide_counts'] = 0

    # Only obsmat rows whose barcode exists in adata are relevant.
    relevant = obsmat.loc[obsmat.index.isin(adata.obs.index)]
    if relevant.index.has_duplicates:
        repeated = relevant.index[relevant.index.duplicated()].unique().tolist()
        raise ValueError(
            f"obsmat lists more than one guide for barcode(s): {repeated[:5]}"
        )

    # Masked assignment replaces a per-barcode loop of scalar .loc writes.
    in_obsmat = adata.obs.index.isin(relevant.index)
    if in_obsmat.any():
        barcodes = adata.obs.index[in_obsmat]
        # .to_numpy() makes the assignment positional; `barcodes` already
        # fixes the row order to match the mask.
        adata.obs.loc[in_obsmat, 'guide_id'] = relevant.loc[barcodes, 'gRNA.ID'].to_numpy()
        adata.obs.loc[in_obsmat, 'guide_counts'] = relevant.loc[barcodes, 'UMI.count'].to_numpy()
    return adata

In [5]:
# Annotate each sample with its own guide table. copy=False modifies the
# loaded objects in place instead of duplicating them.
adata_d1n, adata_d2n, adata_d1s, adata_d2s = (
    integrate_obsmat(sample_adata, guide_table, copy=False)
    for sample_adata, guide_table in (
        (adata_d1n, obsmat_d1n),
        (adata_d2n, obsmat_d2n),
        (adata_d1s, obsmat_d1s),
        (adata_d2s, obsmat_d2s),
    )
)

In [6]:
# Stack the four samples along the cell axis; join='outer' keeps the union of
# their variables.
per_sample = [adata_d1n, adata_d2n, adata_d1s, adata_d2s]
adata = ad.concat(per_sample, join='outer')

  utils.warn_names_duplicates("obs")


In [7]:
# The concatenated object carries duplicate obs names (anndata warns about
# this on concat), so make the cell names unique before going further.
adata.obs_names_make_unique()

In [8]:
# Sanity check: number of cells contributed by each sample.
adata.obs.groupby('sample').size()

sample
D1_nostim    11105
D1_stim      15829
D2_nostim    11486
D2_stim      13816
dtype: int64

In [9]:
# Preview only (nothing is stored): how each guide ID breaks apart on '.'.
adata.obs['guide_id'].str.split('.')

AAACCTGAGACACTAA                 [NA]
AAACCTGAGAGACTTA                 [NA]
AAACCTGAGCATCATC                 [NA]
AAACCTGAGCGATTCT    [ES, sg26, PDCD1]
AAACCTGAGGGCTTCC                 [NA]
                          ...        
TTTGTCATCCTCAACC                 [NA]
TTTGTCATCTCGCATC                 [NA]
TTTGTCATCTTAGAGC    [ES, sg34, TCEB2]
TTTGTCATCTTATCTG    [ES, sg35, TCEB2]
TTTGTCATCTTGTCAT                 [NA]
Name: guide_id, Length: 52236, dtype: object

In [10]:
# Guide IDs are dot-separated (e.g. 'ES.sg26.PDCD1'): split on the period and
# store the third element (presumably the targeted gene) in a new 'target' column.
# IDs with fewer than three parts, such as the 'NA' placeholder, yield NaN.

adata.obs['target'] = adata.obs['guide_id'].str.split('.').str[2]

In [11]:
# 'perturbation' mirrors 'target', except that cells without a target (NaN)
# and cells carrying a 'NonTarget' guide are both labelled "control".
adata.obs['perturbation'] = (
    adata.obs['target']
    .fillna('control')
    .replace('NonTarget', 'control')
)

In [12]:
# The stimulation condition is the part of the sample name after the
# underscore ('D1_stim' -> 'stim'); "nostim" samples are relabelled "control".
stim_condition = adata.obs['sample'].str.split('_').str[1]
adata.obs['perturbation_2'] = stim_condition.replace('nostim', 'control')

In [13]:
# Number of cells per perturbation label ("control" included).
adata.obs['perturbation'].value_counts()

perturbation
control     30683
DGKA         2296
PDCD1        1484
TMEM222      1426
BTLA         1412
HAVCR2       1355
CBLB         1327
CD5          1080
C10orf54     1058
MEF2D        1026
DGKZ         1020
LCP2          981
TCEB2         929
RASA2         905
CD3D          856
LAG3          840
SOCS1         835
TNFRSF9       777
CDKN1B        749
ARID1A        625
STAT6         572
Name: count, dtype: int64

In [14]:
# Dataset-wide annotations: every cell receives the same value in each column.
constant_annotations = {
    'disease': "healthy",
    'cancer': False,
    'tissue_type': "primary",
    'organism': "human",
    'perturbation_type': "CRISPR",
    'perturbation_type_2': "TCR stimulation",
    'nperts': 1,
    'celltype': "T cells",
}
for obs_column, constant in constant_annotations.items():
    adata.obs[obs_column] = constant

In [15]:
# Flag gene groups by name prefix; the flags are passed as qc_vars to the QC
# metric calculation.
mito_prefix = 'MT-'
ribo_prefixes = ('RPS', 'RPL')
adata.var['mt'] = adata.var_names.str.startswith(mito_prefix)  # mitochondrial genes
adata.var['ribo'] = adata.var_names.str.startswith(ribo_prefixes)  # ribosomal genes

In [16]:
# Compute QC metrics without writing them to adata (inplace=False); the
# result is indexed below as qc[0] (per-cell table) and qc[1] (per-gene table).
qc =  sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False) 

In [17]:

# Copy selected QC metrics onto adata under this notebook's column names.
cell_qc = qc[0]
gene_qc = qc[1]

obs_from_qc = {
    'ncounts': 'total_counts',
    'ngenes': 'n_genes_by_counts',
    'percent_mito': 'pct_counts_mt',
    'percent_ribo': 'pct_counts_ribo',
}
for obs_column, qc_column in obs_from_qc.items():
    adata.obs[obs_column] = cell_qc[qc_column]

var_from_qc = {
    'ncounts': 'total_counts',
    'ncells': 'n_cells_by_counts',
}
for var_column, qc_column in var_from_qc.items():
    adata.var[var_column] = gene_qc[qc_column]


In [18]:
# The 'mt'/'ribo' flags were only needed to compute the QC percentages, so
# remove them from the per-gene table. errors='ignore' keeps this cell
# re-runnable: a second execution would otherwise raise KeyError because the
# columns are already gone.
adata.var.drop(columns=['mt', 'ribo'], inplace=True, errors='ignore')

In [19]:
# Inspect the per-gene table (pandas truncates the display).
adata.var

Unnamed: 0,ncounts,ncells
RP11-34P13.3,0.0,0
FAM138A,0.0,0
OR4F5,0.0,0
RP11-34P13.7,73.0,73
RP11-34P13.8,14.0,14
...,...,...
AC233755.2,0.0,0
AC233755.1,4.0,4
AC240274.1,296.0,287
AC213203.1,0.0,0


In [20]:
# Inspect the per-cell table with all annotation columns added above.
adata.obs

Unnamed: 0,sample,patient,guide_id,guide_counts,target,perturbation,perturbation_2,disease,cancer,tissue_type,organism,perturbation_type,perturbation_type_2,nperts,celltype,ncounts,ngenes,percent_mito,percent_ribo
AAACCTGAGACACTAA,D1_nostim,D1,,0,,control,control,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,4367.0,1716,1.854820,26.402565
AAACCTGAGAGACTTA,D1_nostim,D1,,0,,control,control,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,5846.0,1998,3.250086,30.294218
AAACCTGAGCATCATC,D1_nostim,D1,,0,,control,control,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,3377.0,1438,2.931596,28.042641
AAACCTGAGCGATTCT,D1_nostim,D1,ES.sg26.PDCD1,5,PDCD1,PDCD1,control,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,5710.0,1993,2.416813,33.047287
AAACCTGAGGGCTTCC,D1_nostim,D1,,0,,control,control,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,3077.0,1266,0.747481,36.529087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCATCCTCAACC,D2_stim,D2,,0,,control,stim,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,7121.0,2333,2.892852,25.530121
TTTGTCATCTCGCATC,D2_stim,D2,,0,,control,stim,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,4100.0,1535,1.975610,34.439026
TTTGTCATCTTAGAGC,D2_stim,D2,ES.sg34.TCEB2,2,TCEB2,TCEB2,stim,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,4293.0,1556,2.073142,35.616119
TTTGTCATCTTATCTG,D2_stim,D2,ES.sg35.TCEB2,6,TCEB2,TCEB2,stim,healthy,False,primary,human,CRISPR,TCR stimulation,1,T cells,7735.0,2532,2.301228,27.666452


In [21]:
# Save the fully annotated dataset as a single .h5ad file.
output_path = '/Users/tessagreen/Documents/datasets/ShifrutMarson2018/ShifrutMarson2018.h5ad'
adata.write_h5ad(output_path)