In [1]:
### This notebook takes the Hanna et al. and Martin et al. data to present the hit rates in their papers
import os
from os.path import join

import numpy as np
import pandas as pd

import MetaAnalysis_utils as mu

# Directory the input CSV files are read from (relative to the working directory).
# NOTE(review): 'MetaAnlysis' looks like a misspelling of 'MetaAnalysis'; it is
# kept as-is because it has to match the directory name on disk -- confirm.
data_path = '../../data/MetaAnlysis'
# Directory the summary CSV files are written to by the cells below.
output_path = './processed'
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before any later cell tries to write into it.
os.makedirs(output_path, exist_ok=True)


In [2]:
### Unified category name -> every spelling of that category that should be treated as the same
dict_rename = {
    'Missense': ['missense', 'MIS', 'Missense'],
    'Synonymous': ['synonymous', 'SYN', 'Synonymous'],
    'No-Edits': ['empty-window', 'ETY', 'No-Edits', 'No edits'],
    'Non-Targeting': ['Non-targeting', 'Non-Targeting'],
    'Nonsense': ['Nonsense', 'nonsense'],
    'Splice site': ['Splice site', 'splice-donor', 'splice-acceptor'],
    'Non-coding': ['Intron', 'non-coding', 'UTR'],
}
# Inverted view of the mapping: each spelling points back to its unified name,
# which is the shape Series.replace needs for relabelling.
reverse_dict = dict(
    (alias, unified)
    for unified, aliases in dict_rename.items()
    for alias in aliases
)

### 1. Hanna et al.

In [3]:
# Read the BE3.9 table, the BE4max table and the annotation table, in that order.
# NOTE(review): only hannah_be39 is used by the cells visible in this notebook;
# hannah_be4 is loaded but not processed here -- confirm whether that is intended.
hannah_be39, hannah_be4, hannah_annot = (
    pd.read_csv(join(data_path, file_name))
    for file_name in (
        'Ess_Hanna_2021_BE3.9.csv',
        'Ess_Hanna_2021_BE4max.csv',
        'Ess_Hanna_2021_annotation.csv',
    )
)

# Genes whose rows are kept when counting hits per category
pan_lethal = [
    'EEF2', 'HNRNPU', 'KPNB1', 'PELP1', 'POLR1C',
    'PSMA6', 'RPS20', 'SF3B1', 'SNRPD1', 'TFRC',
]
# Genes handed to mu.calc_z_score as the control set
control_genes = ['ICAM1', 'FAS', 'CD81', 'CD33']

In [4]:
# Nested result store: dict_res[cell_line][category] = {'sig_n': ..., 'tot_n': ...}
dict_res = {}
for cell_line in ['A375', 'OVCAR8', 'HAP1', 'HA1E', 'MELJUSO']:
    dict_res[cell_line] = {}
    print(f'PROCESSING CELL LINE {cell_line}......')

    # Build the table for this cell line and label every row with a category;
    # negative-control rows get their own label so they can be excluded below.
    sub_df = mu.make_subdf(hannah_be39, hannah_annot, cell_line)
    sub_df['Category'] = sub_df['Mutation category'].apply(mu.sgRNA_categ)
    is_neg_ctrl = sub_df['Gene symbol'] == 'NEGATIVE CONTROL'
    sub_df.loc[is_neg_ctrl, 'Category'] = 'NEGATIVE CONTROL'

    # Log2 fold change first, then Z-scores computed with control_genes
    scored_df = mu.calc_z_score(mu.calc_L2FC(sub_df, cell_line), cell_line, control_genes)

    # The category list is taken from the full scored table (negative controls
    # excluded), while the counting itself only looks at the pan-lethal genes.
    categ_list = [c for c in scored_df['Category'].unique() if c != 'NEGATIVE CONTROL']
    lethal_df = scored_df[scored_df['Gene symbol'].isin(pan_lethal)]

    cell_res = dict_res[cell_line]
    for categ_ in categ_list:
        # Rows count as hits relative to a Z-score cut-off of -2
        # (presumably sig_perc returns (n significant, n total) -- verify in mu).
        n_sig, n_tot = mu.sig_perc(
            lethal_df, 'Z_score', categ_, cut_off=-2, category_col='Category'
        )
        cell_res[categ_] = {'sig_n': n_sig, 'tot_n': n_tot}

PROCESSING CELL LINE A375......
PROCESSING CELL LINE OVCAR8......
PROCESSING CELL LINE HAP1......
PROCESSING CELL LINE HA1E......
PROCESSING CELL LINE MELJUSO......


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[f'{cell_line}_L2FC'] = np.mean(np.array([l2fcA, l2fcB]), axis=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Z_score'] = (df[f'{cell_line}_L2FC'] - mean_control) / std_control
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df[f'{cell_line}_L2FC'] = np.mean(np.array([l2fc

In [6]:
# Organize the result dictionary into one long dataframe:
# one row per (cell line, raw category) with its sig_n / tot_n counts.
# The per-cell-line frames are collected in a list and concatenated once,
# instead of re-copying a growing frame on every iteration.
frames = []
for cell_lines, categ_counts in dict_res.items():
    df_ = pd.DataFrame.from_dict(categ_counts, orient='index')
    df_['cell_line'] = cell_lines
    frames.append(df_.reset_index(names=['category']))
df_res = pd.concat(frames)

# Relabel raw categories with the unified names; labels that are not in
# reverse_dict are left unchanged by replace().
df_res['mutation_category'] = df_res['category'].replace(reverse_dict)
# Pool the counts of raw categories that share a unified name. Only the two
# count columns are summed, so the string 'category' column is never
# aggregated (a plain .sum() would concatenate its labels).
df_res = df_res.groupby(['mutation_category', 'cell_line'], as_index=False)[['sig_n', 'tot_n']].sum()
# Hit rate = significant guides / total guides (NaN when tot_n is 0)
df_res['perc'] = df_res['sig_n'] / df_res['tot_n']
# Save dataframe
df_res.to_csv(join(output_path, 'Hannah.csv'))

### 2. Martin et al

In [143]:
# Genes whose rows are kept when counting hits in the Martin et al. screens
ess_genes = [
    'ATR',
    'BARD1',
    'BRCA1',
    'BRCA2',
    'RAD51C',
    'RAD51D',
    'XRCC3',
]
# Per-cell-line cut-off passed to mu.sig_perc for the 'LFC_UNT' column
dict_cutoff = {
    'MCF10A': -0.46581,
    'MCF7': -0.61511,
    'HAP1': -0.9068,
}

In [144]:
# Nested result store: dict_res[cell line][category] = {'sig_n': ..., 'tot_n': ...}
dict_res = {}
for cells in ['MCF10A', 'HAP1', 'MCF7']:
    dict_res[cells] = {}
    df = pd.read_csv(join(data_path, f'Raquel_{cells}_score.csv'))

    # Every non-missing value of 'Function' in the full table is a category
    categ_list = [func for func in df['Function'].unique() if not pd.isna(func)]

    # Only counting hits in essential genes
    df_ess = df[df['Gene'].isin(ess_genes)]
    cell_res = dict_res[cells]
    lfc_cutoff = dict_cutoff[cells]
    for categ_ in categ_list:
        # presumably sig_perc returns (n significant, n total) -- verify in mu
        n_sig, n_tot = mu.sig_perc(
            df_ess, 'LFC_UNT', categ_, cut_off=lfc_cutoff, category_col='Function'
        )
        cell_res[categ_] = {'sig_n': n_sig, 'tot_n': n_tot}

In [145]:
# Organize the result dictionary into one long dataframe:
# one row per (cell line, raw category) with its sig_n / tot_n counts.
# The per-cell-line frames are collected in a list and concatenated once,
# instead of re-copying a growing frame on every iteration.
frames = []
for cell_lines, categ_counts in dict_res.items():
    df_ = pd.DataFrame.from_dict(categ_counts, orient='index')
    df_['cell_line'] = cell_lines
    frames.append(df_.reset_index(names=['category']))
df_res = pd.concat(frames)

# Relabel raw categories with the unified names; labels that are not in
# reverse_dict are left unchanged by replace().
df_res['mutation_category'] = df_res['category'].replace(reverse_dict)
# Pool the counts of raw categories that share a unified name. Only the two
# count columns are summed, so the string 'category' column is never
# aggregated (a plain .sum() would concatenate its labels).
df_res = df_res.groupby(['mutation_category', 'cell_line'], as_index=False)[['sig_n', 'tot_n']].sum()
# Hit rate = significant guides / total guides (NaN when tot_n is 0)
df_res['perc'] = df_res['sig_n'] / df_res['tot_n']
# Save dataframe
df_res.to_csv(join(output_path, 'Martin_BE3.csv'))

In [137]:
# Show the hit-rate summary table currently held in df_res.
# NOTE(review): this cell's execution count (137) is lower than that of the
# cells above it (143-145), so the saved output may predate their last run --
# re-check after Restart Kernel -> Run All.
df_res

Unnamed: 0,mutation_category,cell_line,sig_n,tot_n,perc
0,Missense,HAP1,84,1385,0.06065
1,Missense,MCF10A,129,1399,0.092209
2,Missense,MCF7,104,1401,0.074233
3,No-Edits,HAP1,15,598,0.025084
4,No-Edits,MCF10A,34,617,0.055105
5,No-Edits,MCF7,72,619,0.116317
6,Non-coding,HAP1,7,96,0.072917
7,Non-coding,MCF10A,6,96,0.0625
8,Non-coding,MCF7,5,96,0.052083
9,Nonsense,HAP1,32,176,0.181818
