In [1]:
"""This notebook is the central workstation for figuring out how best to preprocess the data;
Author: Abdullah Kuziez
Dependencies: making_cellbox_files.py, graphing_fxns.py, filtering_functions.py,

General philosophy: the first section of the codebase structures the data into dictionaries of data_by_cell_line, 
once this is acheived, the filtering and plotting functions should be able to be applied to any dataset with the same structure

Dictionary structured as {cell_line: dataframe_for_cell_line,....}

Dataframe for cell line structured:
Rows=experiments(averaged together across replicates
Columns=protein expression levels+phenotypes+metadata(identified by the regex prefix meta_))

Supporting dataframes/dictionaries of control values and coefficients of variation are also made to facilitate transformation and filtering
The code checks for completeness of experiments and proteins, and removes outliers and low coefficient of variation proteins"""


%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.cluster.hierarchy import linkage, leaves_list, fcluster
from collections import defaultdict
from scipy.stats import spearmanr, pearsonr
import requests
import json
import sys
import os
import pickle



#//////////////////////////////////////////Part 1: loading in data //////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////

with open(r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\symbol_to_uniprot.json', 'r') as f:
    symbol_to_uniprot = json.load(f)

print(f"Loaded {len(symbol_to_uniprot)} symbol-to-uniprot mappings")

data=pd.read_csv(
r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\PTV1_protein_matrix_test.cleaned.tsv',
index_col=0,sep='\t')

viability_raw=pd.read_excel(
r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\cell_viability_sampleinfo_for_chris.xlsx', 
sheet_name=0)

screen_info = pd.read_excel(
    r"C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\PTV1_sample_info_test.xlsx", 
    sheet_name=0)  # First sheet is single perturbation info


#//////////////////////////////////////////Part 2: structuring data //////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////

#making data_by_cell_line_raw; control_data_by_cell_line; control_data_by_cell_line_coeffvar:
data_T=data.transpose()
data_T['Sample_ID']=data_T.index
data_and_screen_info=pd.merge(data_T,screen_info,on='Sample_ID',how='left')
data_and_screen_info_and_viability = pd.merge(data_and_screen_info, viability_raw, on='Sample_ID', how='left')

#averaging over replicates:(pandas mean ignores NaN's)
#preserving the control rows since they drop out for some reason:
control_rows=data_and_screen_info_and_viability[data_and_screen_info_and_viability['pert_id']=='no']
r_conts = control_rows.copy()
data_and_screen_info_and_viability_grouped = (
    data_and_screen_info_and_viability
    .groupby(['pert_id', 'Cell', 'pert_time_x'])
    .mean(numeric_only=True)
    .reset_index()
)
r_conts.loc[r_conts['pert_id'] == 'no', 'pert_id'] = 'control'
r_conts_grouped_mean=r_conts.groupby(['pert_id', 'cell_line']).mean(numeric_only=True).reset_index()
#dropping unneeded columns:
data_dropped=data_and_screen_info_and_viability_grouped.drop(columns=['BioRep_y','pert_time_y'])

#getting coeff var for control data for filtering:
intermediate=r_conts.groupby(['cell_line']).std(numeric_only=True)
inter_idx=intermediate.index
r_conts_grouped_std=intermediate.reset_index()
# Calculate coefficient of variation, but if mean exists and std is NaN, set coeff_var to 3
mean_df = r_conts_grouped_mean.select_dtypes(include=[float, int])
std_df = r_conts_grouped_std.select_dtypes(include=[float, int])
coeff_var = std_df / mean_df

# Find where mean is not NaN and std is NaN, set coeff_var to 3 in those places
mask = mean_df.notna() & std_df.isna()
coeff_var[mask] = -.25
coeff_var['cell_line']=inter_idx

#dropping the irrelevant timepoints, (6 and 48)
data_dropped=data_dropped[data_dropped['pert_time_x']==6]

#//////////metadata codeblock
# Move the first three columns to the end of data_dropped
first_three_cols = data_dropped.columns[:3]
other_cols = data_dropped.columns[3:]
data_dropped = data_dropped.loc[:, list(other_cols) + list(first_three_cols)]
# Rename the last 10 columns of data_dropped by adding 'meta_' as a prefix
cols = list(data_dropped.columns)
last_9 = cols[-8:]
new_last_9 = ['meta_' + col for col in last_9]
rename_dict = dict(zip(last_9, new_last_9))
data_dropped = data_dropped.rename(columns=rename_dict)
#////////////////////////

#generating the dictionaries by cell line
data_by_cell_line_raw={}
control_data_by_cell_line={}
control_data_by_cell_line_coeffvar={}
cell_lines=pd.unique(data_and_screen_info_and_viability_grouped['Cell'])
for cell in cell_lines:
    data_by_cell_line_raw[cell]=data_dropped[data_dropped['meta_Cell']==cell]
    control_data_by_cell_line[cell]=r_conts_grouped_mean[r_conts_grouped_mean['cell_line']==cell]
    control_data_by_cell_line_coeffvar[cell]=coeff_var[coeff_var['cell_line']==cell]
    control_data_by_cell_line_coeffvar[cell].drop(columns=['cell_line'],inplace=True)

#//////////////////////////////////////////Part 3: renaming from pert-ID to drug name to group trials //////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#making drug_pert_id_targets_dict
drugs_and_targets = pd.read_csv(
    r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\ptv1_unique_drug_names.csv'
)
drugs_and_targets.rename(columns={'original_drug_names': 'pert_name'}, inplace=True)
drugs_and_targets.dropna(inplace=True,subset=['pert_name'])

# Read and process drugs_and_pert_ids
drugs_and_pert_ids = pd.read_csv(
    r'C:\Users\abdul\OneDrive - University of Cambridge\Desktop\MDRA\cellbox_torch\Abdullah_kuziez\Experiments\raw_data\TNBC_set\PTV1_sample_info_test.csv'
)

drugs_and_pert_ids = (
    drugs_and_pert_ids
    .drop_duplicates(subset=['pert_id'])
    .drop_duplicates(subset=['pert_name'])
    .loc[:, ['pert_id', 'pert_name']]
    .assign(pert_id=lambda df: df['pert_id'].str.replace('#', ''))
    .dropna()
    .assign(pert_id=lambda df: df['pert_id'].astype(int))
    .sort_values('pert_id')
    .reset_index(drop=True)
)

# Merge and drop unnecessary columns
drugs_pert_ids_targets = (
    pd.merge(drugs_and_pert_ids, drugs_and_targets, on='pert_name', how='left')
    .drop(columns=['corrected_drug_name', 'drugbank_targets_manual_check'], errors='ignore')
)

# Build dictionary mapping pert_id and pert_name to list of targets
drug_pert_id_targets_dict = {}
for _, row in drugs_pert_ids_targets.iterrows():
    targets = []
    if pd.notnull(row.get('target_uniprot_ids')):
        targets = [t.strip() for t in str(row['target_uniprot_ids']).split(',') if t.strip()]
    drug_pert_id_targets_dict[row['pert_id']] = targets
    drug_pert_id_targets_dict[row['pert_name']] = targets


#//////////////////////////////////////////Part 4: making targeted_prots_raw and non_targeted_prots_raw: //////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#making targeted_prots_raw and non_targeted_prots_raw:
graph_flag=False
#Main method-essential filtering:
# graph_flag = False  #set to true to plot in the filter lines
cell_lines=['HS578T','HCC70','BT549','MDA-MB-453','MCF7','DU4475']

#basic completeness filters #TODO graphs are broken but that's fine
targeted_prots_raw={}
non_targeted_prots_raw={}
all_targeted_prots=[item for sublist in drug_pert_id_targets_dict.values() for item in sublist]

for cell in cell_lines:
    meta_cols=data_by_cell_line_raw[cell].columns[data_by_cell_line_raw[cell].columns.str.contains('meta_')]
    intersection=list(set(all_targeted_prots).intersection(set(data_by_cell_line_raw[cell].columns)))


    targeted_prots_raw[cell] = data_by_cell_line_raw[cell][list(intersection) + list(meta_cols)]
    
    non_targeted_prots_raw[cell]=data_by_cell_line_raw[cell].drop(columns=intersection)


#//////////////////////////////////////////Part 5: writing data to files///////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////////////////////////

# Set a directory for intermediate files
intermediate_dir = Path("intermediate_files_TNBC/6hr")
intermediate_dir.mkdir(exist_ok=True)

# Save data_by_cell_line_raw, control_data_by_cell_line, control_data_by_cell_line_coeffvar, targeted_prots_raw, non_targeted_prots_raw
with open(intermediate_dir / "data_by_cell_line_raw_6hr.pkl", "wb") as f:
    pickle.dump(data_by_cell_line_raw, f)
with open(intermediate_dir / "control_data_by_cell_line_6hr.pkl", "wb") as f:
    pickle.dump(control_data_by_cell_line, f)
with open(intermediate_dir / "control_data_by_cell_line_coeffvar_6hr.pkl", "wb") as f:
    pickle.dump(control_data_by_cell_line_coeffvar, f)
with open(intermediate_dir / "targeted_prots_raw_6hr.pkl", "wb") as f:
    pickle.dump(targeted_prots_raw, f)
with open(intermediate_dir / "non_targeted_prots_raw_6hr.pkl", "wb") as f:
    pickle.dump(non_targeted_prots_raw, f)
with open(intermediate_dir / "cell_lines_6hr.pkl", "wb") as f:
    pickle.dump(cell_lines, f)
with open(intermediate_dir / "drug_pert_id_targets_dict_6hr.pkl", "wb") as f:
    pickle.dump(drug_pert_id_targets_dict, f)

# Save symbol_to_uniprot as JSON
with open(intermediate_dir / "symbol_to_uniprot.json", "w") as f:
    json.dump(symbol_to_uniprot, f)


Loaded 20311 symbol-to-uniprot mappings


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data_by_cell_line_coeffvar[cell].drop(columns=['cell_line'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data_by_cell_line_coeffvar[cell].drop(columns=['cell_line'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control_data_by_cell_line_coeffvar[cell].drop(columns=['cell_line'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org