# Notebook for generating master tables

- Combines the MAGeCK log-fold-change (LFC) data, per-sample Z-score normalization, and editing information into a single dataframe.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
import warnings
import os
import functools
import upsetplot

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Use Helvetica for all figure text.
plt.rcParams['font.family'] = 'Helvetica'

In [2]:
# Load the guide-library annotation table.
MBES = pd.read_csv('MBESv2_CORRECTED.csv')

# --- Unified gene label per guide ---------------------------------------
#   safe-targeting control -> 'ST'
#   non-targeting control  -> 'NT'
#   targeting guide        -> 'gene_name_m', falling back to 'Symbol' when missing
CONTROL_LABELS = {'safe-targeting control': 'ST', 'non-targeting control': 'NT'}
TARGETING_CLASS = 'targeting guide'

# Fail loudly on any classification the labelling below does not cover,
# instead of producing a column of the wrong length.
unknown_class = ~MBES['classification'].isin(list(CONTROL_LABELS) + [TARGETING_CLASS])
if unknown_class.any():
    bad_values = sorted(MBES.loc[unknown_class, 'classification'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'classification': {bad_values}")

is_targeting = MBES['classification'] == TARGETING_CLASS
targeting_gene = MBES['gene_name_m'].fillna(MBES['Symbol'])
# Controls take their fixed label; targeting guides take the gene name.
MBES['gene_name_m_corrected'] = (
    MBES['classification'].map(CONTROL_LABELS).where(~is_targeting, targeting_gene)
)

# --- Legacy flag: True when 'mutation_idx' mentions 'MBESv1_legacy' -----
# na=False makes missing values evaluate to False.
MBES['legacy'] = (
    MBES['mutation_idx']
    .astype(str)
    .str.contains('MBESv1_legacy', regex=False, na=False)
    .astype(bool)
)

# --- Per-editor subsets -------------------------------------------------
ABE = MBES[MBES['Editor'] == 'ABE']
CBE = MBES[MBES['Editor'] == 'CBE']

# IDs of the non-targeting control guides within each editor subset.
nt_ABE = list(ABE.loc[ABE['classification'] == 'non-targeting control', 'gRNA_id'])
nt_CBE = list(CBE.loc[CBE['classification'] == 'non-targeting control', 'gRNA_id'])


In [3]:
# sgRNA summary files per editor. Both lists follow the same order as `names`.
ABE_mageck = [
    'ABE_d5_input.sgrna_summary.txt',
    'ABE_d15_input.sgrna_summary.txt',
    'ABE_bonemarrow_input.sgrna_summary.txt',
    'ABE_spleen_input.sgrna_summary.txt',
    'ABE_meninges_input.sgrna_summary.txt',
]

CBE_mageck = [
    'CBE_d5_input.sgrna_summary.txt',
    'CBE_d15_input.sgrna_summary.txt',
    'CBE_bonemarrow_input.sgrna_summary.txt',
    'CBE_spleen_input.sgrna_summary.txt',
    'CBE_meninges_input.sgrna_summary.txt',
]

# Sample labels used as dictionary keys, paired positionally with the files above.
names = ['In Vitro D5', 'In Vitro D15', 'Bone', 'Spleen', 'Meninges']


def _read_sgrna_summary(filename):
    """Read one tab-separated summary file from mageck/sgrna_summaries/."""
    return pd.read_csv(f'mageck/sgrna_summaries/{filename}', sep='\t')


abe_df_holder = [_read_sgrna_summary(filename) for filename in ABE_mageck]
cbe_df_holder = [_read_sgrna_summary(filename) for filename in CBE_mageck]

# Sample label -> summary dataframe, one dictionary per editor.
ABE_mageck_dict = {label: frame for label, frame in zip(names, abe_df_holder)}
CBE_mageck_dict = {label: frame for label, frame in zip(names, cbe_df_holder)}

In [4]:
# ABE: merge the per-sample sgRNA summaries into one wide table (one row per
# guide, one set of columns per sample) and add a per-sample Z-score of the LFC.
samples = ['In Vitro D15','Bone', 'Spleen', 'Meninges',]

# No guide is excluded at this stage: everything is kept and any filtering is
# left to later steps. The list stays so that exclusions can be added here.
excluded_guides = []
df_holder = []

for samp in samples:
    # Original summary column -> sample-suffixed column in the merged table.
    column_map = {
        'sgrna': 'gRNA_id',
        'LFC': f'LFC_{samp}',
        'treat_mean': f'treat_mean_{samp}',
        'control_mean': f'control_mean_{samp}',
        'treatment_count': f'treat_count_{samp}',
        'control_count': f'control_count_{samp}',
        'FDR': f'FDR_{samp}',
    }
    mageck_df = ABE_mageck_dict[samp].rename(columns=column_map)
    df_holder.append(mageck_df[list(column_map.values())])

# Outer merge on the guide ID, so a guide missing from one sample is kept
# with NaN in that sample's columns.
df_merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['gRNA_id'], how='outer'),
    df_holder,
)

# Drop excluded guides (currently none). The .copy() gives an independent
# frame so the column assignments below never write to a slice.
exc_g = list(np.unique(excluded_guides))
df_merged = df_merged[~df_merged['gRNA_id'].isin(exc_g)].copy()

# Z-score of the LFC within each sample (population std, ddof=0). The
# NaN-aware mean/std keep one missing LFC from turning the whole column into
# NaN; guides with a missing LFC simply get a NaN Z-score.
for k in samples:
    lfcs = df_merged[f'LFC_{k}'].to_numpy(dtype=float)
    avg = np.nanmean(lfcs)
    std = np.nanstd(lfcs)
    df_merged[f'Z_{k}'] = (lfcs - avg) / std

df_merged_ABE = df_merged

In [5]:
# CBE: merge the per-sample sgRNA summaries into one wide table (one row per
# guide, one set of columns per sample) and add a per-sample Z-score of the LFC.
# The result is left in `df_merged`.
samples = ['In Vitro D15','Bone', 'Spleen', 'Meninges',]

# No guide is excluded at this stage: everything is kept and any filtering is
# left to later steps. The list stays so that exclusions can be added here.
excluded_guides = []
df_holder = []

for samp in samples:
    # Original summary column -> sample-suffixed column in the merged table.
    column_map = {
        'sgrna': 'gRNA_id',
        'LFC': f'LFC_{samp}',
        'treat_mean': f'treat_mean_{samp}',
        'control_mean': f'control_mean_{samp}',
        'treatment_count': f'treat_count_{samp}',
        'control_count': f'control_count_{samp}',
        'FDR': f'FDR_{samp}',
    }
    mageck_df = CBE_mageck_dict[samp].rename(columns=column_map)
    df_holder.append(mageck_df[list(column_map.values())])

# Outer merge on the guide ID, so a guide missing from one sample is kept
# with NaN in that sample's columns.
df_merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['gRNA_id'], how='outer'),
    df_holder,
)

# Drop excluded guides (currently none). The .copy() gives an independent
# frame so the column assignments below never write to a slice.
exc_g = list(np.unique(excluded_guides))
df_merged = df_merged[~df_merged['gRNA_id'].isin(exc_g)].copy()

# Z-score of the LFC within each sample (population std, ddof=0). The
# NaN-aware mean/std keep one missing LFC from turning the whole column into
# NaN; guides with a missing LFC simply get a NaN Z-score.
for k in samples:
    lfcs = df_merged[f'LFC_{k}'].to_numpy(dtype=float)
    avg = np.nanmean(lfcs)
    std = np.nanstd(lfcs)
    df_merged[f'Z_{k}'] = (lfcs - avg) / std

In [6]:
# The day-5 editing measurements are used as a proxy for editing in all other samples.
d5_abe = pd.read_csv('ABE_editing/MLE/d5_ABE_MLE.csv')
# The CBE file is the updated version that also includes the legacy guides.
d5_cbe = pd.read_csv('CBE_editing_UPDATED/MLE/d5_CBE_MLE.csv')

# Columns taken from each editing table, and the names they get in the master table.
editing_columns = [
    'Guide_ID',
    'Reads_aligned_all_amplicons',
    'corr_perc',
    'target_base_edit_perc',
    'byproduct_INDEL_perc',
    'byproduct_sub_perc',
]
editing_renames = {'Guide_ID': 'gRNA_id', 'Reads_aligned_all_amplicons': 'sensor_reads'}

# ABE: keep only editing rows whose guide appears in the merged ABE table.
abe_editing = d5_abe[editing_columns].rename(columns=editing_renames)
abe_in_merged = abe_editing['gRNA_id'].isin(df_merged_ABE['gRNA_id'])
abe_editing = abe_editing[abe_in_merged]

# CBE: at this point `df_merged` holds the merged CBE table.
cbe_editing = d5_cbe[editing_columns].rename(columns=editing_renames)
cbe_in_merged = cbe_editing['gRNA_id'].isin(df_merged['gRNA_id'])
cbe_editing = cbe_editing[cbe_in_merged]

# Default (inner) merge: only guides present in both tables are kept.
ABE_master = df_merged_ABE.merge(abe_editing, on='gRNA_id')
CBE_master = df_merged.merge(cbe_editing, on='gRNA_id')

In [7]:
# Stack the ABE and CBE master tables, then attach the library annotations
# with a default (inner) merge on the guide ID.
# NOTE(review): the join key is gRNA_id only; if an ID can occur in both editor
# tables, every matching MBES row pairs with both — confirm IDs are unique across editors.
combined_master = pd.concat([ABE_master, CBE_master])
combined_master_final = MBES.merge(combined_master, on='gRNA_id')

In [8]:
# Keep guides whose 'In Vitro D15' control mean is at least 50. Rows with a
# missing control mean fail the comparison and are dropped as well.
has_min_control_mean = combined_master_final['control_mean_In Vitro D15'] >= 50
combined_master_final_control_50 = combined_master_final.loc[has_min_control_mean]

In [9]:
# Write the unfiltered and the control-mean-filtered master tables to CSV,
# without the row index.
for out_path, table in (
    ('master_table_unfiltered.csv', combined_master_final),
    ('master_table_min50_control_mean_invitro.csv', combined_master_final_control_50),
):
    table.to_csv(out_path, index=False)