In [1]:
### This notebook visualizes the candidates selected for the experiment
### and picks the candidates that overlap with each other
import os
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot styling: white-grid seaborn theme with fonts scaled by 1.2.
sns.set_theme(
    style="whitegrid",
    font_scale=1.2,
)

In [2]:
# Folder holding the DESeq2 result tables read by this notebook.
path = '../data/output/DESeq'
# Folder that receives the CSV summaries written by this notebook.
output_dir = './processed'
# Create the output folder up front so the later to_csv calls do not fail
# on a fresh checkout where it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# need to define some universal functions
# Process dataframe so that there is a category column
def process_df(df):
    """Return a copy of ``df`` with a ``categ`` column derived from its index.

    Each index label is split on ``'_'`` and the first piece is stored in
    ``categ``. The returned frame's index is named ``'index'``, whatever the
    input index was called, so an input whose index already carries a name no
    longer raises ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and columns plus ``categ`` (an existing
        ``categ`` column is overwritten).
    """
    # rename_axis returns a new frame, so the caller's object stays untouched.
    out = df.rename_axis('index')
    # Assign positionally from the labels; this also works with duplicate labels.
    out['categ'] = [label.split('_')[0] for label in out.index]
    return out

In [4]:
def thresholds(df, categ_list, threshold):
    """Return the index labels of rows in the given categories with ``padj`` below a cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``categ`` column and a ``padj`` column.
    categ_list : str or iterable of str
        Category or categories to keep; a single string is treated as a
        one-element list.
    threshold : float
        Rows are kept when ``padj < threshold`` (strict). Rows with a missing
        ``padj`` never pass.

    Returns
    -------
    pandas.Index
        Index labels of the selected rows, in their original order.
    """
    # A bare string would otherwise be passed to isin() as-is, which pandas rejects.
    if isinstance(categ_list, str):
        categ_list = [categ_list]
    mask = df['categ'].isin(list(categ_list)) & (df['padj'] < threshold)
    return df.index[mask.to_numpy()]

In [5]:
# BE and library are interpolated into the DESeq2 result file names;
# thrd is the cutoff that padj values are compared against.
BE, library = 'CBE', 'nSpG'
thrd = 0.05

In [55]:
### Load the D0-vs-DN1 DESeq2 tables for K562 and Jurkat and tag each row with its category.
### The variables keep their *_ABE names, but the files read are chosen by BE and library.
K562_ABE, Jurkat_ABE = (
    process_df(
        pd.read_csv(
            join(path, f'DESeq2_{cell_line}_{BE}_{library}_D0_DN1.csv'),
            index_col=0,
        )
    )
    for cell_line in ('K562', 'Jurkat')
)

In [56]:
# Put the K562 and Jurkat log2FoldChange/padj columns side by side (suffixed per
# cell line) and keep only rows that have all four values.
data = pd.concat(
    [
        K562_ABE[['log2FoldChange', 'padj']].add_suffix('_K562'),
        Jurkat_ABE[['log2FoldChange', 'padj']].add_suffix('_Jurkat'),
    ],
    axis=1,
).dropna()
data = process_df(data)
# A row is significant only when padj is below thrd in both cell lines.
# Vectorized comparison instead of a row-wise apply: same result, much faster,
# and it always yields a boolean column, even when no rows survive dropna().
data['significant'] = (data['padj_K562'] < thrd) & (data['padj_Jurkat'] < thrd)

In [57]:
# Save the merged per-cell-line table; the file name encodes the library and BE in use.
data.to_csv(
    join(output_dir, f'{library}_{BE}_cell_line_padj.csv')
)

In [None]:
### Build the summary dataframe for the barplot
### Columns: category, hit number, total sgRNA, hit rate, editing

In [72]:
# Per-category hit statistics for each (BE, library) pair.
# A hit is a row whose padj is below thrd in both K562 and Jurkat.
thrd = 0.05
list_res = []
# To summarise the CBE screens instead, loop over
# [('CBE', 'CBE'), ('CBE', 'nSpG')].
for (BE, library) in [('ABE', 'ABE'), ('ABE', 'nSpG')]:

    # Load the D0-vs-DN1 DESeq2 tables for both cell lines and tag each row
    # with its category.
    K562_ABE = process_df(
        pd.read_csv(join(path, f'DESeq2_K562_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )
    Jurkat_ABE = process_df(
        pd.read_csv(join(path, f'DESeq2_Jurkat_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )

    # Side-by-side K562/Jurkat table restricted to rows with all four values.
    data = pd.concat(
        [
            K562_ABE[['log2FoldChange', 'padj']].add_suffix('_K562'),
            Jurkat_ABE[['log2FoldChange', 'padj']].add_suffix('_Jurkat'),
        ],
        axis=1,
    ).dropna()
    data = process_df(data)
    # Vectorized replacement for the former row-wise apply.
    data['significant'] = (data['padj_K562'] < thrd) & (data['padj_Jurkat'] < thrd)

    # One summary record per category, in order of first appearance.
    list_categ = data['categ'].unique()
    for categ in list_categ:
        data_ = data[data['categ'] == categ]
        tot_n = len(data_)
        # Summing the boolean column counts the significant rows.
        hit_n = int(data_['significant'].sum())
        hit_perc = hit_n / tot_n * 100
        list_res.append({
            'categ': categ,
            'tot_n': tot_n,
            'sig_n': hit_n,
            'hit_perc': hit_perc,
            'BE': BE,
            'library': library,
        })

In [73]:
# Write the per-category hit summary collected in list_res to CSV.
# NOTE: the file name is hard-coded to ABE regardless of the BE values in list_res.
hit_summary = pd.DataFrame(list_res)
hit_summary.to_csv(join(output_dir, 'nSpG_ABE_hitn.csv'))

In [91]:
# Overlap of significant rows between the ABE library and the nSpG library
# (both read with BE = 'ABE'), counted per category.
thrd = 0.05
list_res = []
BE = 'ABE'

# Load, merge and flag each library with the same steps instead of two
# copy-pasted blocks. `library` ends as 'nSpG', as it did before.
per_library = {}
for library in ('ABE', 'nSpG'):
    k562 = process_df(
        pd.read_csv(join(path, f'DESeq2_K562_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )
    jurkat = process_df(
        pd.read_csv(join(path, f'DESeq2_Jurkat_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )
    # Side-by-side K562/Jurkat table restricted to rows with all four values.
    merged = process_df(
        pd.concat(
            [
                k562[['log2FoldChange', 'padj']].add_suffix('_K562'),
                jurkat[['log2FoldChange', 'padj']].add_suffix('_Jurkat'),
            ],
            axis=1,
        ).dropna()
    )
    # Significant = padj below thrd in both cell lines (vectorized, no row-wise apply).
    merged['significant'] = (merged['padj_K562'] < thrd) & (merged['padj_Jurkat'] < thrd)
    per_library[library] = (k562, jurkat, merged)

# Keep the original variable names available to any other cell that uses them.
K562_ABE, Jurkat_ABE, data_ABE = per_library['ABE']
K562_nSpG, Jurkat_nSpG, data_nSpG = per_library['nSpG']

list_ovp = []
# Categories are taken from the ABE-library table only; a category present
# only in the nSpG table gets no record.
list_categ = data_ABE['categ'].unique()
for categ in list_categ:
    hits_ABE = data_ABE.index[(data_ABE['categ'] == categ) & data_ABE['significant']]
    hits_nSpG = data_nSpG.index[(data_nSpG['categ'] == categ) & data_nSpG['significant']]
    # Index labels that are significant in both libraries.
    ovp = hits_ABE.intersection(hits_nSpG)
    list_ovp.append({
        'categ': categ,
        'hits_ABE': len(hits_ABE),
        'hits_nSpG': len(hits_nSpG),
        'ovp': len(ovp),
        'BE': BE,
    })

In [92]:
# Write the per-category overlap counts collected in list_ovp to CSV.
overlap_table = pd.DataFrame(list_ovp)
overlap_table.to_csv(join(output_dir, 'ABE_nSpG_ovp.csv'))

In [88]:
# Overlap of significant rows between the CBE library and the nSpG library
# (both read with BE = 'CBE'), counted per category.
# NOTE: the *_ABE variable names and the 'hits_ABE' record key are kept from
# the ABE version of this cell; here they hold the CBE-library data.
thrd = 0.05
list_res = []
BE = 'CBE'

# Load, merge and flag each library with the same steps instead of two
# copy-pasted blocks. `library` ends as 'nSpG', as it did before.
per_library = {}
for library in ('CBE', 'nSpG'):
    k562 = process_df(
        pd.read_csv(join(path, f'DESeq2_K562_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )
    jurkat = process_df(
        pd.read_csv(join(path, f'DESeq2_Jurkat_{BE}_{library}_D0_DN1.csv'), index_col=0)
    )
    # Side-by-side K562/Jurkat table restricted to rows with all four values.
    merged = process_df(
        pd.concat(
            [
                k562[['log2FoldChange', 'padj']].add_suffix('_K562'),
                jurkat[['log2FoldChange', 'padj']].add_suffix('_Jurkat'),
            ],
            axis=1,
        ).dropna()
    )
    # Significant = padj below thrd in both cell lines (vectorized, no row-wise apply).
    merged['significant'] = (merged['padj_K562'] < thrd) & (merged['padj_Jurkat'] < thrd)
    per_library[library] = (k562, jurkat, merged)

# Keep the original variable names available to any other cell that uses them.
K562_ABE, Jurkat_ABE, data_ABE = per_library['CBE']
K562_nSpG, Jurkat_nSpG, data_nSpG = per_library['nSpG']

list_ovp = []
# Categories are taken from the CBE-library table only; a category present
# only in the nSpG table gets no record.
list_categ = data_ABE['categ'].unique()
for categ in list_categ:
    hits_ABE = data_ABE.index[(data_ABE['categ'] == categ) & data_ABE['significant']]
    hits_nSpG = data_nSpG.index[(data_nSpG['categ'] == categ) & data_nSpG['significant']]
    # Index labels that are significant in both libraries.
    ovp = hits_ABE.intersection(hits_nSpG)
    list_ovp.append({
        'categ': categ,
        'hits_ABE': len(hits_ABE),
        'hits_nSpG': len(hits_nSpG),
        'ovp': len(ovp),
        'BE': BE,
    })

In [90]:
# Write the per-category overlap counts collected in list_ovp to CSV.
overlap_table = pd.DataFrame(list_ovp)
overlap_table.to_csv(join(output_dir, 'CBE_nSpG_ovp.csv'))