In [None]:
import joblib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import matplotlib.colors as colors
from matplotlib_venn import venn2
import statistics 
import random

%matplotlib inline

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.set_option("mode.chained_assignment", None)

In [None]:
# '__file__' is passed as a plain string, so abspath() resolves it against the
# current working directory; HERE is therefore the working directory itself.
HERE = os.path.dirname(os.path.abspath('__file__'))
ROOT = os.path.abspath(os.path.join(HERE, os.pardir))
DATA = os.path.join(ROOT, 'Data')

# Input files, all located under <ROOT>/Data
SSGSEA_BRCA_KEGG = os.path.join(DATA, "kegg_brca.tsv")
GENE_SETS_KEGG = os.path.join(DATA, "kegg_geneset_final.gmt")
TRAINED_MODEL = os.path.join(DATA, "trained_models", "brca_trained_model.joblib")
BRCA_LABELS = os.path.join(DATA, "phenotype_classes_brca.cls")
HGNC_ID_MAP_TO_GENE_NAME = os.path.join(DATA, "hgnc_id_to_symbol.csv")
APPROVED_DRUG_BREAST = os.path.join(DATA, 'validation', "approved_drug_breast.csv")

In [None]:
## Label preparation

# Read the class file, discard its first data row, name the row with index 1
# 'label' and flip the table so that 'label' becomes a column
brca_labels = pd.read_csv(BRCA_LABELS, sep="\t")
brca_labels = brca_labels.drop(brca_labels.index[0])
brca_labels = brca_labels.rename(index={1: 'label'}).transpose()

# Split the space-separated label string into a list of labels; if the
# 'label' column holds several entries, only the last one is kept
temp_lable = []
for lable in brca_labels.label:
    temp_lable = lable.split(' ')

In [None]:
## Patient_Pathway dataframe preparation

# Load the score table and swap its rows and columns
raw_data = pd.read_csv(SSGSEA_BRCA_KEGG, sep="\t", index_col=0).T

# Add the parsed labels as a "label" column at position 311
# NOTE(review): 311 is hard-coded, presumably the number of pathway columns; confirm
raw_data.insert(loc=311, column="label", value=temp_lable, allow_duplicates=True)

# Encode the textual labels as integers
num_labels = {"Normal": 0, "Tumor": 1}
raw_data["label"] = [num_labels[item] for item in raw_data["label"]]

# Replace the index with a default integer range
raw_data = raw_data.reset_index(drop=True)

In [None]:
# Plain Python list of the numeric labels, one entry per sample
lable_list = list(raw_data.label.values)

In [None]:
def parse_gmt_file(gmt_path: str, min_size=3, max_size=3000):
    """Parse a GMT gene-set file into a ``{name: [genes]}`` dictionary.

    Each non-blank line is split on tabs: the first field is the gene-set
    name, the second field is skipped and every remaining field is a gene.
    When a name occurs more than once, the last occurrence wins.

    :param gmt_path: path of the GMT file to read
    :param min_size: accepted for interface compatibility; no size filtering
        is applied
    :param max_size: accepted for interface compatibility; no size filtering
        is applied
    :return: dictionary mapping each gene-set name to its list of genes
    """
    genesets_dict = {}
    with open(gmt_path) as f:
        for line in f:
            fields = line.strip().split("\t")
            # A blank (or whitespace-only) line would otherwise register a
            # gene set named '' with no genes
            if not fields[0]:
                continue
            genesets_dict[fields[0]] = fields[2:]
    return genesets_dict

In [None]:
# Gene-set name -> list of member genes, parsed from the KEGG GMT file
pathway_genes_dict = parse_gmt_file(gmt_path=GENE_SETS_KEGG)

In [None]:
## Read the drug-to-gene network and keep only the rows whose source_database is drugbank

drugbank_to_genes_ID = pd.read_csv(
    'https://raw.githubusercontent.com/drug2ways/results/master/networks/data/custom_network.tsv',
    sep="\t",
)

# .copy() makes the filtered table independent of the full network, so the
# column assignments performed on it are ordinary writes and not writes to a slice
drugbank_to_genes_ID_keep_drugbank = drugbank_to_genes_ID.loc[
    drugbank_to_genes_ID['source_database'] == "drugbank"
].copy()

# Keep only the second ':'-separated field of every source identifier.
# One whole-column assignment replaces the former per-row chained assignment,
# which is slow and does not write back when pandas Copy-on-Write is active.
drugbank_to_genes_ID_keep_drugbank["source"] = drugbank_to_genes_ID_keep_drugbank["source"].map(
    lambda identifier: identifier.split(':')[1]
)

In [None]:
## Read the HGNC_ID to gene_name mapping file downloaded from the HGNC website

# The file is read as tab-separated text despite its .csv extension
HGNC_ID_map_to_gene_name = pd.read_csv(
    HGNC_ID_MAP_TO_GENE_NAME,
    sep="\t",
)

In [None]:
## Drug dataset preparation

# Clinical-trial drug/condition counts, restricted to condition D001943
# (presumably the MeSH identifier for breast neoplasms; confirm)
clinical_trials_drugs = pd.read_csv(
    "https://raw.githubusercontent.com/drug2ways/results/master/validation/data/DrugBank-MeSH-slim-counts.tsv",
    sep="\t",
)
clinical_trials_drugs = clinical_trials_drugs[clinical_trials_drugs['condition'] == "D001943"]

# Drugs from the approved-drug validation file
approved_brca_drugs = pd.read_csv(APPROVED_DRUG_BREAST, sep="\t")

In [None]:
## Statistics

# Identifier arrays that are compared against each other
network_drug_ids = drugbank_to_genes_ID_keep_drugbank["source"].values
clinical_trial_drug_ids = clinical_trials_drugs["drugbank_id"].values
approved_drug_ids = approved_brca_drugs["Approved_drug"].values

# Network drugs (one entry per network row, duplicates included) that also
# appear in the clinical-trial table and in the approved-drug table
temp_intersect_drugbank_clinical_trials_brca = [
    drug for drug in network_drug_ids if drug in clinical_trial_drug_ids
]
temp_intersect_drugbank_approved_brca = [
    drug for drug in network_drug_ids if drug in approved_drug_ids
]

# Unique drugs of each overlap
intersect_drugbank_clinical_trials_brca = set(temp_intersect_drugbank_clinical_trials_brca)
intersect_drugbank_approved_brca = set(temp_intersect_drugbank_approved_brca)

In [None]:
## Load the trained classifier

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load model files from a trusted source.
# The context manager closes the file handle, which was previously left open.
with open(TRAINED_MODEL, "rb") as model_file:
    trained_model = joblib.load(model_file)

In [None]:
## Replace gene_id with gene_name in the drugbank gene target table

# HGNC ID -> approved symbol lookup. Rows without an HGNC ID are ignored and,
# for an ID listed more than once, the first row is used.
hgnc_rows = (
    HGNC_ID_map_to_gene_name
    .dropna(subset=["HGNC ID"])
    .drop_duplicates(subset="HGNC ID", keep="first")
)
hgnc_id_to_symbol = dict(zip(hgnc_rows["HGNC ID"], hgnc_rows["Approved symbol"]))

# A single pass over the target column replaces the former nested scan, which
# ran a whole-column replace for every matching pair. Targets without a
# mapping are left unchanged.
drugbank_to_genes_ID_keep_drugbank['target'] = drugbank_to_genes_ID_keep_drugbank['target'].map(
    lambda gene_id: hgnc_id_to_symbol.get(gene_id, gene_id)
)

In [None]:
## Drug dataframe preparation for calculating the score of a pathway including all of its involved genes

# Drop the source_database column, as every remaining row comes from drugbank.
# The column is named with the `columns=` keyword because the positional axis
# argument of DataFrame.drop was removed in pandas 2.0.
drugbank = drugbank_to_genes_ID_keep_drugbank.drop(columns='source_database')

# Group the targeted genes by drug
drugbank_groupby_drug = drugbank.groupby('source')

# Unique drugs, used later to build the dataframe of drugs, their targeted
# pathways and the targeted genes involved in each pathway
unique_drug = drugbank["source"].unique()

In [None]:
## Preparing dataframe containing each drug, the pathways it targets, the targeted genes
## involved in each pathway and the corresponding affecting score

# Set version of every gene list, built once for fast membership tests
pathway_gene_sets = {
    pathway: set(genes) for pathway, genes in pathway_genes_dict.items()
}

# Rows are collected in a list and turned into a dataframe once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call
pathway_to_score_records = []

for drug_id in unique_drug:

    # Rows of the drugbank table that belong to this drug, without the drug column
    temp_drug_gene_relation_df = drugbank_groupby_drug.get_group(drug_id).drop(columns="source")

    # The two remaining columns become a {gene: score} dictionary
    # (for a gene listed more than once, the last row wins)
    temp_gene_score_dict = dict(temp_drug_gene_relation_df.values.tolist())

    for pathway, pathway_genes in pathway_gene_sets.items():
        for gene, score in temp_gene_score_dict.items():

            # One output row per targeted gene that belongs to the pathway
            if gene in pathway_genes:
                pathway_to_score_records.append({
                    'drug_ID': drug_id,
                    'pathway': pathway,
                    'affection_rate': score,
                    'gene_name': gene,
                })

pathway_to_score = pd.DataFrame(
    pathway_to_score_records,
    columns=['drug_ID', 'pathway', 'affection_rate', 'gene_name'],
)

In [None]:
## Preparing dataframe with one score per drug per pathway (considering all of its involved genes)

# Grouping of the previous step's dataframe by drug and pathway
# (kept under its original name for any cell that refers to it)
pathway_to_score_groupby = pathway_to_score.groupby(['drug_ID','pathway'])

# Make sure the scores are numeric before aggregating
numeric_pathway_to_score = pathway_to_score.assign(
    affection_rate=pd.to_numeric(pathway_to_score['affection_rate'])
)

# sort=False keeps the (drug, pathway) pairs in order of first appearance,
# i.e. the order the former row-by-row construction produced
grouped_affection_rate = numeric_pathway_to_score.groupby(
    ['drug_ID', 'pathway'], sort=False
)['affection_rate']

# Mean score = sum of the gene scores divided by the number of rows in the
# group. A single grouped computation replaces one get_group() plus one
# DataFrame.append (removed in pandas 2.0) per row followed by de-duplication.
pathway_scores = (
    (grouped_affection_rate.sum() / grouped_affection_rate.size())
    .rename('Finall_affected_score')
    .reset_index()
    .rename(columns={'pathway': 'Pathway'})
)

In [None]:
## Split the samples based on the desired label

def splite_samples(raw_data, desired_label):
    """Split the samples into those carrying ``desired_label`` and all others.

    :param raw_data: dataframe with one row per sample and a ``'label'`` column
    :param desired_label: label value selecting the first group
    :return: tuple ``(desired_label_sample, undesired_label_sample)``; both
        are independent copies that keep the row order and index of
        ``raw_data``
    """
    has_desired_label = raw_data['label'] == desired_label

    # Samples carrying the desired label
    desired_label_sample = raw_data.loc[has_desired_label].copy()

    # All remaining samples. Selecting them by mask keeps samples whose rows
    # happen to be identical; the former concat + drop_duplicates(keep=False)
    # removed every such duplicated sample.
    undesired_label_sample = raw_data.loc[~has_desired_label].copy()

    return desired_label_sample, undesired_label_sample

In [None]:
# Samples with label 1 versus all other samples, without the label column
desired_label_sample, undesired_label_sample = splite_samples(raw_data, 1)

desired_label_sample = desired_label_sample.drop(columns='label')
undesired_label_sample = undesired_label_sample.drop(columns='label')

# Mean of every remaining column within each group
patients_mean_pathway = {
    pathway: desired_label_sample[pathway].mean()
    for pathway in desired_label_sample
}
healthy_mean_pathway = {
    pathway: undesired_label_sample[pathway].mean()
    for pathway in undesired_label_sample
}

patients_mean_pathway_df = pd.DataFrame(patients_mean_pathway.items(), columns=['pathway', 'mean_patient'])
healthy_mean_pathway_df = pd.DataFrame(healthy_mean_pathway.items(), columns=['pathway', 'mean_healthy'])

# One row per pathway holding both group means
mean_patient_healthy_pathway = healthy_mean_pathway_df.merge(patients_mean_pathway_df, on=["pathway"])

In [None]:
# Absolute gap between the two group means of every pathway, largest gap first
mean_patient_healthy_pathway["diff_mean"] = (
    mean_patient_healthy_pathway["mean_healthy"] - mean_patient_healthy_pathway["mean_patient"]
).abs()
mean_patient_healthy_pathway = mean_patient_healthy_pathway.sort_values(
    by='diff_mean',
    ascending=False,
)

In [None]:
## Modify the pathway score of each patient with regards to each drug available in drugbank

def path_score_modification(drug_name, raw_data, desired_label, mean_patient_healthy_pathway):
    """Rescale the pathway scores of the desired-label samples for one drug.

    The drug's per-pathway scores are looked up in the module-level
    ``pathway_scores`` table. For every affected pathway that is also a column
    of the sample table, the whole column is replaced by
    ``sign * factor * abs(score)``:

    * ``sign`` is ``+1`` for a positive drug score and ``-1`` for a negative
      one; a drug score of exactly 0 leaves the column untouched.
    * ``factor`` depends on where the pathway's ``diff_mean`` lies within the
      ``diff_mean`` column of ``mean_patient_healthy_pathway``: 20 above the
      0.75 quantile, 5 between the 0.5 and 0.75 quantiles, 10 otherwise.
      NOTE(review): the middle band gets a smaller factor (5) than the lowest
      band (10); confirm that this is intended.

    :param drug_name: drug identifier, must be present in ``pathway_scores['drug_ID']``
    :param raw_data: dataframe with one row per sample and a ``'label'`` column
    :param desired_label: label of the samples whose scores are modified
    :param mean_patient_healthy_pathway: dataframe with ``'pathway'`` and
        ``'diff_mean'`` columns
    :return: copy of the desired-label samples with the rescaled pathway columns
    :raises KeyError: if the drug is not in ``pathway_scores`` or an affected
        pathway is missing from ``mean_patient_healthy_pathway``
    """
    desired_path_score_changed_sample, _undesired = splite_samples(raw_data, desired_label)
    # Private copy: the column assignments below must not touch raw_data
    desired_path_score_changed_sample = desired_path_score_changed_sample.copy()

    # Rows of the per-drug, per-pathway score table that belong to this drug
    pathway_drug_including_all_gene_score = pathway_scores.groupby('drug_ID').get_group(drug_name)

    # Dictionary of pathways affected by the drug to their respective scores
    affected_pathway_to_score = {
        pathway: score
        for _, pathway, score in pathway_drug_including_all_gene_score.values
    }

    # Loop-invariant values, computed once instead of once per sample and pathway
    diff_mean = mean_patient_healthy_pathway['diff_mean']
    upper_quartile = np.quantile(diff_mean, 0.75)
    median = np.quantile(diff_mean, 0.5)
    diff_mean_by_pathway = (
        mean_patient_healthy_pathway
        .drop_duplicates(subset='pathway', keep='first')
        .set_index('pathway')['diff_mean']
    )

    for pathway, affection_score in affected_pathway_to_score.items():

        # A zero score changes nothing; pathways without a column are skipped
        if affection_score == 0 or pathway not in desired_path_score_changed_sample.columns:
            continue

        pathway_diff = diff_mean_by_pathway.loc[pathway]

        if pathway_diff > upper_quartile:
            factor = 20
        elif pathway_diff >= median:
            factor = 5
        else:
            factor = 10

        sign = 1 if affection_score > 0 else -1

        # Every sample of the column gets the same treatment, so the whole
        # column is rescaled at once
        desired_path_score_changed_sample[pathway] = (
            sign * factor * desired_path_score_changed_sample[pathway].abs()
        )

    return desired_path_score_changed_sample

In [None]:
def auc_per_drug(drug_data_set, model, data, desired_lable, raw_data_set_lable, mean_patient_healthy_pathway):
    """Compute, per drug, the fraction of desired-label samples whose predicted label changes.

    For every drug, the pathway scores of the samples labelled
    ``desired_lable`` are modified with ``path_score_modification`` and passed
    to ``model.predict``. Despite the function name, the reported value is not
    an AUC: it is the share of those samples whose prediction differs from
    ``desired_lable``.

    :param drug_data_set: dataframe with a ``'drug_ID'`` column listing the
        drugs to evaluate; ``path_score_modification`` looks the drugs up in
        the module-level ``pathway_scores``, so this should be that table
    :param model: fitted classifier exposing ``predict``
    :param data: dataframe with one row per sample and a ``'label'`` column
    :param desired_lable: label of the samples that are modified and counted
    :param raw_data_set_lable: unused, kept for interface compatibility
    :param mean_patient_healthy_pathway: dataframe forwarded to
        ``path_score_modification``
    :return: dataframe with the columns ``'drug'`` and ``'label_changed_ratio'``
    """
    # Rows are collected in a list; DataFrame.append was removed in pandas 2.0
    label_changed_records = []

    for drug in tqdm(drug_data_set["drug_ID"].unique()):
        temp_data_set = path_score_modification(drug, data, desired_lable, mean_patient_healthy_pathway)

        # First 311 columns are passed to the model
        # NOTE(review): 311 is hard-coded, presumably the number of pathway features; confirm
        prepared_data_set_for_prediction = temp_data_set.iloc[:, :311]
        prediction = np.asarray(model.predict(prepared_data_set_for_prediction))

        # Every predicted row is a desired-label sample, so each prediction is
        # compared with desired_lable directly. Zipping the predictions with
        # the labels of all samples would pair them with the wrong rows
        # whenever the desired-label samples do not come first.
        sample_count = len(prediction)
        changed_count = int((prediction != desired_lable).sum())
        ratio = changed_count / sample_count if sample_count else 0.0

        label_changed_records.append({'drug': drug, 'label_changed_ratio': ratio})

    return pd.DataFrame(label_changed_records, columns=['drug', 'label_changed_ratio'])

In [None]:
# Label-change ratio of every drug for the samples with label 1
pathway_drug_score_brca_df = auc_per_drug(
    drug_data_set=pathway_scores,
    model=trained_model,
    data=raw_data,
    desired_lable=1,
    raw_data_set_lable=raw_data.label,
    mean_patient_healthy_pathway=mean_patient_healthy_pathway,
)