In [1]:
### This notebook picks candidates that overlap with each other
import pandas as pd
from os.path import join
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle

In [2]:
# INPUT DESEQ2 FILE PATH
path  = '../data/output/DESeq'

In [3]:
# need to define some universal functions
# Process dataframe so that there is a category column
def process_df(df):
    # Ensure the index is reset to make the operation clear
    df = df.reset_index()
    # Create the 'categ' column based on the split index for both DataFrames
    df['categ'] = df['index'].apply(lambda x: x.split('_')[0])
    # If needed, set the index back to its original state
    df = df.set_index('index')
    
    return(df)

# Select candidates that are common in 2 dataframes
def sel_candidate(df1,df2, thrd = 0.05 , category = None):
    df1 = process_df(df1)
    df2 = process_df(df2)
    list_candidate_df1 = df1[(df1['categ'] == category) & (df1['padj']<thrd) ].index
    list_candidate_df2 = df2[(df2['categ'] == category) & (df2['padj']<thrd)].index
    
    candidate = set(list_candidate_df1).intersection(list_candidate_df2)
    
    return(candidate)

***ABE candidates***

In [7]:
### First we pick ABE candidates between D0 and DN1
K562_ABE = pd.read_csv(join(path, f'DESeq2_K562_ABE_ABE_D0_DN1.csv'), index_col = 0)
Jurkat_ABE = pd.read_csv(join(path, f'DESeq2_Jurkat_ABE_ABE_D0_DN1.csv'), index_col = 0)
### First we pick ABE candidates between D0 and DN1
K562_ABE_nSpG = pd.read_csv(join(path, f'DESeq2_K562_ABE_nSpG_D0_DN1.csv'), index_col = 0)
Jurkat_ABE_nSpG = pd.read_csv(join(path, f'DESeq2_Jurkat_ABE_nSpG_D0_DN1.csv'), index_col = 0)

In [8]:
candidate_ABE = sel_candidate(K562_ABE, Jurkat_ABE, thrd = 0.05, category = 'SYN')
candidate_nSpG = sel_candidate(K562_ABE_nSpG, Jurkat_ABE_nSpG, thrd = 0.05, category = 'SYN')
ABE_syn_candidates = [i for i in candidate_ABE if i not in candidate_nSpG]
print('nSpG filtered candidates')
print([i for i in candidate_ABE if i in candidate_nSpG])
with open('./processed/ABE_SYN_Candidates.pkl', 'wb') as f:
    pickle.dump(ABE_syn_candidates, f)

nSpG filtered candidates
['SYN_SNRNP200_268']


In [9]:
candidate_ABE = sel_candidate(K562_ABE, Jurkat_ABE, thrd = 0.05, category = 'MIS')
candidate_nSpG = sel_candidate(K562_ABE_nSpG, Jurkat_ABE_nSpG, thrd = 0.05, category = 'MIS')
ABE_mis_candidates = [i for i in candidate_ABE if i not in candidate_nSpG]
print('nSpG filtered candidates')
print([i for i in candidate_ABE if i in candidate_nSpG])
with open('./processed/ABE_MIS_Candidates.pkl', 'wb') as f:
    pickle.dump(ABE_mis_candidates, f)

nSpG filtered candidates
[]


In [10]:
print('Synonymous candidate sgRNAs are:')
print(ABE_syn_candidates)
print('Missense candidate sgRNAs are')
print(ABE_mis_candidates)

Synonymous candidate sgRNAs are:
['SYN_DYNC1H1_581', 'SYN_RUVBL1_1802', 'SYN_SFPQ_64', 'SYN_HSPA9_1786']
Missense candidate sgRNAs are
['MIS_RPAP1_10984', 'MIS_COPA_5845', 'MIS_RPAP1_11195', 'MIS_POLR2B_12064', 'MIS_CDC16_7200', 'MIS_POLR2C_11387', 'MIS_ECD_14808', 'MIS_POLR1A_2208', 'MIS_ECD_14812']


In [15]:
df_ = process_df(K562_ABE)

In [18]:
df_[df_['categ'] == 'SYN']

Unnamed: 0_level_0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,categ
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SYN_CLP1_0,3275.487112,-0.300452,0.341464,-0.879895,0.378916,0.841389,SYN
SYN_CLP1_1,6823.595969,-0.450568,0.274938,-1.638799,0.101255,0.496942,SYN
SYN_CLP1_2,13892.210872,-0.113292,0.194526,-0.582403,0.560296,0.915210,SYN
SYN_CLP1_3,8133.475930,0.230098,0.235759,0.975989,0.329070,0.802923,SYN
SYN_CLP1_4,5350.439265,0.149614,0.271966,0.550119,0.582237,0.921029,SYN
...,...,...,...,...,...,...,...
SYN_CDT1_2129,3381.286854,0.221085,0.331836,0.666248,0.505252,0.900189,SYN
SYN_CDT1_2130,1918.945857,0.306685,0.437265,0.701371,0.483072,0.890459,SYN
SYN_CDT1_2131,6329.109356,0.322645,0.261142,1.235514,0.216639,0.706660,SYN
SYN_CDT1_2132,2989.503908,0.445927,0.364048,1.224914,0.220608,0.709242,SYN


***CBE candidates***

In [8]:
### First we pick ABE candidates between D0 and DN1
K562_CBE = pd.read_csv(join(path, 'DESeq2_K562_CBE_CBE_D0_DN1.csv'), index_col = 0)
Jurkat_CBE = pd.read_csv(join(path, 'DESeq2_Jurkat_CBE_CBE_D0_DN1.csv'), index_col = 0)
K562_CBE_D14 = pd.read_csv(join(path, 'DESeq2_K562_CBE_CBE_D14_DN1.csv'), index_col = 0)
K562_CBE_nSpG = pd.read_csv(join(path, 'DESeq2_K562_CBE_nSpG_D0_DN1.csv'), index_col = 0)
Jurkat_CBE_nSpG = pd.read_csv(join(path, 'DESeq2_Jurkat_CBE_nSpG_D0_DN1.csv'), index_col = 0)
K562_CBE_nSpG_D14 = pd.read_csv(join(path, 'DESeq2_K562_CBE_nSpG_D14_DN1.csv'), index_col = 0)

In [9]:
candidate_CBE = sel_candidate(K562_CBE, Jurkat_CBE, thrd = 0.05, category = 'SYN')
candidate_nSpG = sel_candidate(K562_CBE_nSpG, Jurkat_CBE_nSpG, thrd = 0.05, category = 'SYN')
CBE_syn_candidates = [i for i in candidate_CBE if i not in candidate_nSpG]
# with open('./processed/ABE_SYN_Candidates.pkl', 'wb') as f:
#     pickle.dump(ABE_syn_candidates, f)
print('There are no CBE candidates for synonymous in the first time point:')
print(CBE_syn_candidates)

There are no CBE candidates for synonymous in the first time point:
[]


In [10]:
candidate_CBE = sel_candidate(K562_CBE, K562_CBE_D14, thrd = 0.05, category = 'SYN')
candidate_nSpG = sel_candidate(K562_CBE_nSpG, K562_CBE_nSpG_D14, thrd = 0.05, category = 'SYN')
CBE_syn_candidates = [i for i in candidate_CBE if i not in candidate_nSpG]
print('nSpG filtered candidates')
print([i for i in candidate_CBE if i in candidate_nSpG])
with open('./processed/CBE_SYN_Candidates.pkl', 'wb') as f:
    pickle.dump(CBE_syn_candidates, f)

nSpG filtered candidates
[]


In [11]:
candidate_CBE = sel_candidate(K562_CBE, Jurkat_CBE, thrd = 0.05, category = 'MIS')
candidate_nSpG = sel_candidate(K562_CBE_nSpG, Jurkat_CBE_nSpG, thrd = 0.05, category = 'MIS')
CBE_mis_candidates = [i for i in candidate_CBE if i not in candidate_nSpG]
with open('./processed/CBE_MIS_Candidates.pkl', 'wb') as f:
    pickle.dump(CBE_mis_candidates, f)
print('nSpG filtered candidates')
print([i for i in candidate_CBE if i in candidate_nSpG])

nSpG filtered candidates
[]


In [12]:
print('Synonymous candidate sgRNAs are:')
print(CBE_syn_candidates)
print('Missense candidate sgRNAs are')
print(CBE_mis_candidates)

Synonymous candidate sgRNAs are:
['SYN_DYNC1H1_1108', 'SYN_PSMB5_1536', 'SYN_VCP_467', 'SYN_SF3B3_2907']
Missense candidate sgRNAs are
['MIS_POLR1A_1341', 'MIS_POLR1A_1428', 'MIS_TUT1_8428', 'MIS_RUVBL1_7221']


***Potential same threshold missense candidates***

In [42]:
candidate_CBE = sel_candidate(K562_CBE, K562_CBE_D14, thrd = 0.05, category = 'MIS')
candidate_nSpG = sel_candidate(K562_CBE_nSpG, K562_CBE_nSpG_D14, thrd = 0.05, category = 'MIS')
CBE_mis_candidates = [i for i in candidate_CBE if i not in candidate_nSpG]
CBE_mis_candidates

['MIS_POLR1A_1341',
 'MIS_POLR1A_1428',
 'MIS_RNGTT_5821',
 'MIS_TUT1_8428',
 'MIS_RUVBL1_7221']