In [1]:
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from tqdm import tqdm
from ceres_infer.data import stats_Crispr

In [2]:
# Path to the pickled dm_data object, and the directory that receives this notebook's outputs.
dm_data_pkl_file = '../out/20.0817 proc_data/gene_effect/dm_data.pkl'
outdir = '../out/20.0817 proc_data_baseline/gene_effect/' # output directory

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

In [4]:
#load data
# NOTE(security): unpickling can execute arbitrary code; only load files produced by a trusted run.
# The context manager closes the file handle as soon as the object has been read.
with open(dm_data_pkl_file, 'rb') as pkl_fh:
    dm_data = pickle.load(pkl_fh)

# Plots

In [12]:
df_crispr_stats = stats_Crispr(dm_data)

#----------------------
#plot stats
# Each figure is described by one spec row and drawn in a loop, so the
# figure / label / save / close boilerplate is written once per plot type.

# (column, x-axis label, output file name) for the distribution plots
dist_specs = [
    ('avg', 'CERES [mean]', 'dist_ceres_mean.pdf'),
    ('std', 'CERES [SD]', 'dist_ceres_sd.pdf'),
]
# NOTE: sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot replace it);
# it is kept here so the saved figures do not change.
for col, xlabel, fname in dist_specs:
    fig = plt.figure()
    ax = sns.distplot(df_crispr_stats[col])
    ax.set(xlabel=xlabel, ylabel='Freq')
    fig.savefig("%s/%s" % (outdir, fname))
    plt.close(fig)  # close explicitly so figures do not accumulate in memory

# (x column, y column, x-axis label, y-axis label, output file name) for the scatter plots
scatter_specs = [
    ('diff', 'std', 'CERES range', 'CERES sd', 'scatter_range.sd.png'),
    ('avg', 'std', 'CERES mean', 'CERES sd', 'scatter_mean_sd.png'),
    ('avg', 'diff', 'CERES mean', 'CERES range', 'scatter_mean_range.png'),
]
for x_col, y_col, xlabel, ylabel, fname in scatter_specs:
    fig = plt.figure()
    ax = sns.scatterplot(x=x_col, y=y_col, data=df_crispr_stats, s=90)
    ax.set(xlabel=xlabel, ylabel=ylabel)
    fig.savefig("%s/%s" % (outdir, fname))
    plt.close(fig)

  from pandas import Panel


# Gene classification (essentiality)

**Derive the gene classifications based on the gene dependency (probability values)**

In [41]:
# get gene dependency classifications (selective essential, common essentials, common non-essential)
# gene dependency is the 'probability that knocking out the gene has a real depletion effect using gene_effect'
df_genedep = pd.read_csv('%s/%s' % (dm_data.dir_datasets, dm_data.fname_gene_dependency), header=0, index_col=0)
# Keep only the text before the last whitespace character of each column name
# (the match is greedy); a name containing no whitespace becomes NaN.
# Presumably this turns 'SYMBOL (ID)' style headers into 'SYMBOL' -- TODO confirm against the file.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
df_genedep.columns = df_genedep.columns.str.extract(r'^(.*)\s').squeeze().values

def classifyDep(x, threshold=0.5):
    """Classify one set of dependency values against a cutoff.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Values to classify together (one column of the dependency table
        when used with ``DataFrame.apply(..., axis=0)``).
    threshold : float, default 0.5
        Cutoff separating the two "common" classes.

    Returns
    -------
    str
        ``'common_essential'`` if every value is strictly above ``threshold``,
        ``'common_nonessential'`` if every value is strictly below it,
        ``'selective_essential'`` otherwise.

    Notes
    -----
    Values equal to ``threshold`` and NaN values satisfy neither strict
    comparison, so any input containing them is ``'selective_essential'``.
    """
    # Vectorized reductions instead of the builtin all(), which walks the
    # values one by one in Python.
    if (x > threshold).all():
        return 'common_essential'
    if (x < threshold).all():
        return 'common_nonessential'
    return 'selective_essential'

# Label every column of df_genedep, then write the labels (index + value, no header row) to CSV.
dep_class = df_genedep.apply(classifyDep, axis=0)
dep_class.to_csv("%s/gene_essential_classification.csv" % outdir, index=True, header=False)

In [42]:
# Tally how many entries of dep_class carry each class label.
dep_class.value_counts()

selective_essential    10587
common_nonessential     7322
common_essential         424
dtype: int64

In [43]:
dep_class.value_counts() / dep_class.shape[0] # fraction of all entries per class (0-1 scale, not percent)

selective_essential    0.577483
common_nonessential    0.399389
common_essential       0.023128
dtype: float64

**Derive the gene classifications based on the gene effects**

In [13]:
def classifyDep2(x):
    """Classify one set of gene-effect values by fixed cutoffs at -0.5 and 0.5.

    Returns ``'common_essential (<-0.5)'`` when every value is strictly
    below -0.5, ``'common_nonessential (near 0s)'`` when every value lies
    strictly between -0.5 and 0.5, and ``'selective_essential (others)'``
    in all remaining cases (including inputs with NaN or boundary values,
    which fail the strict comparisons).
    """
    if (x < -0.5).all():
        return 'common_essential (<-0.5)'

    near_zero = (x > -0.5) & (x < 0.5)
    if near_zero.all():
        return 'common_nonessential (near 0s)'

    return 'selective_essential (others)'

# Label every column of the CRISPR gene-effect table using the fixed-cutoff rule.
dep_class2 = dm_data.df_crispr.apply(classifyDep2, axis=0)

In [14]:
# Tally how many entries of dep_class2 carry each class label.
dep_class2.value_counts()

selective_essential (others)     15706
common_nonessential (near 0s)     2090
common_essential (<-0.5)           537
dtype: int64

In [15]:
dep_class2.value_counts() / dep_class2.shape[0] # fraction of all entries per class (0-1 scale, not percent)

selective_essential (others)     0.856706
common_nonessential (near 0s)    0.114002
common_essential (<-0.5)         0.029291
dtype: float64

**Looking at the range (max − min) of gene-effect values per gene**

In [5]:
# Per-column range (max - min) of the CRISPR gene-effect table.
# DataFrame.max/min are vectorized and skip NaN, unlike the builtin max()/min(),
# whose result depends on where a NaN sits in the column.
range_vals = dm_data.df_crispr.max(axis=0) - dm_data.df_crispr.min(axis=0)

In [12]:
# Boolean flag per entry of range_vals: True where the range is below 1.
# `df` is a boolean Series here (not a DataFrame); the tally shows flagged vs. not.
df = range_vals.lt(1)
df.value_counts()

False    11966
True      6367
dtype: int64