In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import LeaveOneOut
from IPython.display import display

import os
from data_preprocessing import FilteringCurves, ShowResponseCurves
from fitting_curves import FittingColumn, ShowResponseCurvesWithFitting, compute_r2_score

In [2]:
# List the files currently present in the "results" directory.
os.listdir("results")

['merged_drug_profiles_sigmoid4_23.csv',
 'filtered_drug_profiles_13.csv',
 'filtered_drug_profiles_12.csv',
 'filtered_drug_profiles_23.csv',
 'filtered_drug_profiles.csv',
 'merged_drug_profiles_sigmoid4_123.csv',
 'filtered_drug_profiles_123.csv']

In [160]:
# Fitted drug profiles for the two filtering scenarios ([1,2,3] and [2,3]).
df123 = pd.read_csv("results/merged_drug_profiles_sigmoid4_123.csv")
df23 = pd.read_csv("results/merged_drug_profiles_sigmoid4_23.csv")

# Drug annotation table; column names with spaces are replaced by
# underscore-style names so they line up with the DRUG_ID column of the profiles.
drug_features = pd.read_csv("data/Drug_Features.csv").rename(
    columns={
        "Drug ID": "DRUG_ID",
        "Drug Name": "Drug_Name",
        "Target Pathway": "Target_Pathway",
    }
)

In [161]:
# Unique drug IDs in each filtered dataset and in the annotation table.
drugs_123 = df123["DRUG_ID"].unique()
drugs_23 = df23["DRUG_ID"].unique()
drugs = drug_features["DRUG_ID"].unique()

print("Number of all available drugs:", len(drugs))
print("Number of drugs in filtered dataset with filtering scenario [2,3]:", len(drugs_23))
print("Number of drugs in filtered dataset with filtering scenario [1,2,3]:", len(drugs_123))
# Drugs that occur in both filtered datasets.
print("Number of drugs in both filterd datasets:", len(set(drugs_123).intersection(drugs_23)))

Number of all available drugs: 265
Number of drugs in filtered dataset with filtering scenario [2,3]: 145
Number of drugs in filtered dataset with filtering scenario [1,2,3]: 145
Number of drugs in both filterd datasets: 145


In [162]:
# Preview the first rows of drug_features.
drug_features.head()

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
3,6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


### Get drug features from PubChem (via PubChemPy)

In [163]:
import pubchempy as pcp
import re

In [164]:
# how it works
n=0
drug_id = drug_features["DRUG_ID"][n]
drug_name = drug_features["Drug_Name"][n]
pcp_id = pcp.get_compounds(drug_name, 'name')
print(drug_name, ": PubChem_ID:", re.search(r'\((.*?)\)', str(pcp_id)).group(1))

ids = pcp.get_compounds("Obatoclax Mesylate", 'name')
print("\nObatoclax Mesylate:", re.findall(r'\((.*?)\)', str(ids)))

Erlotinib : PubChem_ID: 176870

Obatoclax Mesylate: ['16681698', '46930996', '16727411', '70700268', '71816154', '56843157', '122362431']


In [204]:
%%time

for drug_id in drug_features["DRUG_ID"].unique():
    drug_index = drug_features[drug_features["DRUG_ID"]==drug_id].index
    drug_name = drug_features.loc[drug_index, "Drug_Name"].values[0]
    deriv = pcp.get_compounds(drug_name, 'name')
    drug_features.loc[drug_index, "deriv_found"] = len(deriv)
    try:
        drug_features.loc[drug_index, "PubChem_ID"]= re.findall(r'\((.*?)\)', str(deriv))
    except:
        if len(deriv)>1:
            drug_features.loc[drug_index, "PubChem_ID"]= str([np.int(x) for x in re.findall(r'\((.*?)\)', str(deriv))]).strip("[").strip("]")
        else:
            drug_features.loc[drug_index, "PubChem_ID"]= 0

CPU times: user 7.2 s, sys: 716 ms, total: 7.91 s
Wall time: 3min 16s


In [205]:
# How many drugs have 0, 1, 2, ... values in deriv_found (all drugs).
drug_features["deriv_found"].value_counts()

1.0     215
0.0      38
2.0       3
16.0      2
3.0       1
19.0      1
8.0       1
5.0       1
7.0       1
6.0       1
13.0      1
Name: deriv_found, dtype: int64

In [206]:
# Check the PubChem ID lookup for the drugs in the filtered dataset only:
# same deriv_found distribution, restricted to the DRUG_IDs in drugs_123.
drug_features.set_index("DRUG_ID").loc[drugs_123, "deriv_found"].value_counts()

1.0     120
0.0      16
2.0       2
19.0      1
3.0       1
5.0       1
6.0       1
13.0      1
7.0       1
8.0       1
Name: deriv_found, dtype: int64

**Conclusion:** For 16 of the 145 drugs in the filtered dataset, no PubChem compound was found directly by name;
    for 9 drugs more than one match was found (7 of them with three or more matches).

In [207]:
# Rows where deriv_found is greater than 1, i.e. the name lookup was ambiguous.
drug_features[drug_features["deriv_found"]>1]

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID
5,11,Paclitaxel,"BMS-181339-01, Taxol, Onxol, Paxene, Praxel, A...",Microtubule stabiliser,Mitosis,13.0,"36314, 441276, 4666, 23636632, 44155032, 73759..."
32,89,Parthenolide,-,HDAC1,Chromatin histone acetylation,16.0,"7251185, 6473881, 5420804, 5420805, 4692, 1080..."
45,140,Vinorelbine,"vinorelbine tartrate, Navelbine, Exelbine",Microtubule destabiliser,Mitosis,6.0,"5311497, 60780, 13200033, 5672, 44424639, 7175..."
72,182,Obatoclax Mesylate,"GX15-070MS, Obatoclax, GX15-070","BCL2, BCL-XL, BCL-W, MCL1",Apoptosis regulation,7.0,"16681698, 46930996, 16727411, 70700268, 718161..."
76,190,Bleomycin,-,dsDNA break induction,DNA replication,5.0,"5360373, 84068, 5460769, 456190, 72467"
81,197,Bryostatin 1,Bryostatin,PKC,"Other, kinases",16.0,"5280757, 433902, 22524140, 44277532, 5458769, ..."
86,203,BMS-345541,"BMS345541, IKK Inhibitor 3","IKK1, IKK2","Other, kinases",2.0,"9813758, 9926054"
153,309,Y-39983,-,ROCK,Cytoskeleton,2.0,"11507964, 9810884"
169,1004,Vinblastine,Velban,Microtubule destabiliser,Mitosis,8.0,"241903, 13342, 3823887, 6710780, 44417235, 111..."
181,1016,Temsirolimus,"CCI-779, Torisel",MTOR,PI3K/MTOR signaling,19.0,"6918289, 18293306, 23724530, 148191, 53486199,..."


### Manual corrections

In [230]:
# Spelling error in the source table, checked against
# https://www.cancerrxgene.org/compounds
# data: "Lestauritinib", correct name: "Lestaurtinib" (one letter differs)
error_names_dict = {"Lestauritinib": "Lestaurtinib"}
error_name = "Lestauritinib"
correct_drug_name = error_names_dict[error_name]

# Overwrite the misspelled name in every row that carries it.
error_drug_index = drug_features.index[drug_features["Drug_Name"] == error_name]
drug_features.loc[error_drug_index, "Drug_Name"] = correct_drug_name

In [231]:
# Alternative names found by hand for drugs whose listed name is not enough.
new_synonyms = {
    "Y-39983": {
        "Synonyms": "Y-33075",
        "reference": [
            "https://www.medchemexpress.com/Y-33075.html",
            "https://www.nature.com/articles/s41467-019-13781-3",
        ],
    },
}

# Hand-curated PubChem IDs, keyed by Drug_Name.
# Each entry holds the chosen "pubchem_id" and the "reference" it was taken from
# (an empty string when no source was recorded).
#
# Every key must appear exactly once: Python keeps only the LAST value of a
# repeated key in a dict literal, so a duplicate silently discards the other entry.
manual_corrections = {
    "Lestaurtinib": {
        "pubchem_id": 126565,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "WZ-1-84": {
        "pubchem_id": 49821040,
        "reference": "http://lincs.hms.harvard.edu/db/datasets/20119/smallmolecules",
    },
    "GW441756": {
        "pubchem_id": 9943465,
        "reference": "",
        "note": "no result in drugbank",
    },
    "Parthenolide": {
        "pubchem_id": 6473881,
        "reference": "https://www.drugbank.ca/drugs/DB13063",
    },
    # NOTE(review): this ID is not among the compounds PubChem returned for the
    # name lookup; confirm that it is a Compound ID and not a Substance ID.
    "Obatoclax Mesylate": {
        "pubchem_id": 347828476,
        "reference": "https://www.drugbank.ca/drugs/DB12191",
    },
    # NOTE(review): spelled differently from the "Bleomycin" entry further down;
    # presumably matches no Drug_Name and is therefore never applied - confirm.
    "Bleomycine": {
        "pubchem_id": 72467,
        "reference": "https://www.drugbank.ca/drugs/DB00290",
    },
    "Y-39983": {
        "pubchem_id": 20601328,
        "reference": "https://www.medchemexpress.com/Y-33075.html",
    },
    # This key used to be listed twice. The value kept here is the one that was
    # effective at runtime (the later entry). The discarded earlier entry was
    # pubchem_id 20822503 with reference https://pharmacodb.ca/drugs/392.
    "JW-7-52-1": {
        "pubchem_id": 49836027,
        "reference": "",
    },
    "VNLG/124": {
        "pubchem_id": 24894414,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "PDK1 inhibitor 7": {
        "pubchem_id": 56965967,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "KIN001-260": {
        "pubchem_id": 10451420,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "SB52334": {
        "pubchem_id": 9967941,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "KIN001-270": {
        "pubchem_id": 66577006,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "Cisplatin": {
        "pubchem_id": 84691,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "Cetuximab": {
        "pubchem_id": 85668777,
        "reference": "https://www.cancerrxgene.org/compounds",
    },
    "Nutlin-3a (-)": {
        "pubchem_id": 11433190,
        "reference": "",
    },
    "681640": {
        "pubchem_id": 10384072,
        "reference": "",
    },
    "MPS-1-IN-1": {
        "pubchem_id": 25195352,
        "reference": "",
    },
    "KIN001-266": {
        "pubchem_id": 44143370,
        "reference": "",
    },
    "Vinorelbine": {
        "pubchem_id": 44424639,
        "reference": "https://www.drugbank.ca/drugs/DB00361",
    },
    "Paclitaxel": {
        "pubchem_id": 36314,
        "reference": "https://www.drugbank.ca/drugs/DB01229",
    },
    "Bleomycin": {
        "pubchem_id": 5360373,
        "reference": "https://www.drugbank.ca/drugs/DB00290",
    },
    "Vinblastine": {
        "pubchem_id": 13342,
        "reference": "https://www.drugbank.ca/drugs/DB00570",
    },
    # THZ-2-102-1 and THZ-2-49 are both mapped to the same ID (78357763).
    "THZ-2-102-1": {
        "pubchem_id": 78357763,
        "reference": [
            "https://www.cancerrxgene.org/compounds",
            "https://www.medchemexpress.com/THZ2.html",
            "https://pubchem.ncbi.nlm.nih.gov/compound/78357763",
        ],
    },
    "THZ-2-49": {
        "pubchem_id": 78357763,
        "reference": [
            "https://www.cancerrxgene.org/compounds",
            "https://www.medchemexpress.com/THZ2.html",
            "https://pubchem.ncbi.nlm.nih.gov/compound/78357763",
        ],
    },
    "QL-XII-47": {
        "pubchem_id": 71748056,
        "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/",
    },
    "BMS-345541": {
        "pubchem_id": 9813758,
        "reference": "",
    },
    "Temsirolimus": {
        "pubchem_id": 23724530,
        "reference": "https://www.drugbank.ca/drugs/DB06287",
    },
    "SB590885": {
        "pubchem_id": 135398506,
        "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=SB590885",
    },
    "WZ3105": {
        "pubchem_id": 42628507,
        "reference": "https://lincs.hms.harvard.edu/db/sm/10084-101/",
    },
    "NPK76-II-72-1": {
        "pubchem_id": 46843648,
        "reference": "https://lincs.hms.harvard.edu/db/sm/10070-101/",
    },
    "JW-7-24-1": {
        "pubchem_id": 69923936,
        "reference": "https://lincs.hms.harvard.edu/db/sm/10019-101/",
    },
    "Bryostatin 1": {
        "pubchem_id": 6435419,
        "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=Bryostatin%201",
    },
}

# Vinorelbine: pubchem_id 46507772, DrugBank DB00361 (PubChem Substance)
# Vinorelbine Ditartrate:  pubchem_id 44424639 PubChem Compound  ***

# Paclitaxel: PubChem Compound 36314 ***, PubChem Substance 46506910

# Bleomycin PubChem Compound 5360373 *** PubChem Substance 46509116

# Vinblastine PubChem Compound 13342 *** PubChem Substance 46504550

# THZ-2-102-1 https://www.cancerrxgene.org/compounds PubCHEM_Id: None
# THZ-2-49 https://www.cancerrxgene.org/compounds PubCHEM_Id: None
# https://www.medchemexpress.com/THZ2.html - CAS1604810-84-5 PubCHEM_Id 78357763 THZ2
# https://pubchem.ncbi.nlm.nih.gov/compound/78357763
# https://pharmacodb.ca/drugs/687

# "QL-XII-47": https://lincs.hms.harvard.edu/db/sm/10077-101-1/ "PubCHEM_Id": 71748056, 
# https://pubchem.ncbi.nlm.nih.gov/compound/71748056  - QL47

# BMS-345541 PubCHEM_Id 9813758

# Temsirolimus PubChem Compound 23724530 *** PubChem Substance 99443243

# SB590885 (E) PubCHEM_Id 135398506  or (Z) PubCHEM_Id 135421339 

# PubChem_ID

In [232]:
# Names for which the PubChem lookup returned more than one compound.
ambiguous_drug_names = drug_features.loc[drug_features["deriv_found"] > 1, "Drug_Name"].values

# For each ambiguous name, show the manually chosen ID next to all candidates.
for name in ambiguous_drug_names:
    rows = drug_features[drug_features["Drug_Name"] == name]
    chosen_id = manual_corrections[name]["pubchem_id"]
    print("Drug ID:", rows["DRUG_ID"].values[0],
          "Drug name:", name, "Chosen_PubChem_ID:", chosen_id)
    print("Other PubChem_ids:", rows["PubChem_ID"].values)

    print("")

Drug ID: 11 Drug name: Paclitaxel Chosen_PubChem_ID: 36314
Other PubChem_ids: ['36314, 441276, 4666, 23636632, 44155032, 73759704, 91885464, 5352019, 6713921, 6915727, 16760674, 24791027, 53313247']

Drug ID: 89 Drug name: Parthenolide Chosen_PubChem_ID: 6473881
Other PubChem_ids: ['7251185, 6473881, 5420804, 5420805, 4692, 108068, 5353864, 927704, 5702252, 5320420, 11972532, 91873711, 23631982, 73759808, 123134657, 124463365']

Drug ID: 140 Drug name: Vinorelbine Chosen_PubChem_ID: 44424639
Other PubChem_ids: ['5311497, 60780, 13200033, 5672, 44424639, 71752961']

Drug ID: 182 Drug name: Obatoclax Mesylate Chosen_PubChem_ID: 347828476
Other PubChem_ids: ['16681698, 46930996, 16727411, 70700268, 71816154, 56843157, 122362431']

Drug ID: 190 Drug name: Bleomycin Chosen_PubChem_ID: 5360373
Other PubChem_ids: ['5360373, 84068, 5460769, 456190, 72467']

Drug ID: 197 Drug name: Bryostatin 1 Chosen_PubChem_ID: 6435419
Other PubChem_ids: ['5280757, 433902, 22524140, 44277532, 5458769, 5477635

In [233]:
# Inspect the drug_features row(s) whose Drug_Name is "Bryostatin 1".
drug_features[drug_features["Drug_Name"]=="Bryostatin 1"]

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID
81,197,Bryostatin 1,Bryostatin,PKC,"Other, kinases",16.0,"5280757, 433902, 22524140, 44277532, 5458769, ..."


### Apply manual corrections

In [234]:
# Overwrite the automatic lookup result with the hand-curated PubChem ID for
# every drug listed in manual_corrections, and mark it as having exactly one match.
# Keys that match no Drug_Name are collected and reported instead of being
# silently skipped, so a misspelled key cannot go unnoticed.
unmatched_corrections = []
for drug_name in manual_corrections:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    if len(drug_index) == 0:
        unmatched_corrections.append(drug_name)
        continue
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= manual_corrections[drug_name]["pubchem_id"]

if unmatched_corrections:
    print("Manual corrections without a matching Drug_Name:", unmatched_corrections)

In [235]:
# Re-check the PubChem ID lookup for the drugs in the filtered dataset
# (DRUG_IDs in drugs_123) now that the manual corrections have been applied.
drug_features.set_index("DRUG_ID").loc[drugs_123, "deriv_found"].value_counts()

1.0    139
0.0      6
Name: deriv_found, dtype: int64

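**Conclusion:** After the manual corrections, 6 drugs in the filtered dataset still have no PubChem ID (an earlier count was 19).
Next step: find out how important these drugs are.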

In [236]:
cell_features = pd.read_csv("data/Cell_Line_Features_PANCAN_simple_MOBEM.tsv", sep="\t")


def _count_cell_lines(profiles):
    """Count the COSMIC_ID entries per DRUG_ID, largest count first."""
    per_drug = profiles.groupby(["DRUG_ID"])[["COSMIC_ID"]].count()
    per_drug = per_drug.rename(columns={"COSMIC_ID": "count_cell_lines"})
    return per_drug.sort_values("count_cell_lines", ascending=False)


def _attach_drug_features(counts):
    """Left-join drug_features onto the per-drug counts and sort by count again."""
    joined = pd.merge(left=counts, right=drug_features, how="left", on="DRUG_ID")
    # The second sort is kept on purpose: it determines the final order of ties.
    return joined.sort_values("count_cell_lines", ascending=False)


stat_data123 = _count_cell_lines(df123)
stat_data23 = _count_cell_lines(df23)

statistics123 = _attach_drug_features(stat_data123)
statistics23 = _attach_drug_features(stat_data23)

In [239]:
# Drugs in the [2,3] statistics table whose deriv_found is not exactly 1,
# i.e. still without a single resolved PubChem match.
statistics23[statistics23["deriv_found"]!=1.0]

Unnamed: 0,DRUG_ID,count_cell_lines,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID
63,164,13,JQ12,-,"HDAC1, HDAC2",Chromatin histone acetylation,0.0,0
77,225,7,Genentech Cpd 10,-,"AURKA, AURKB",Mitosis,0.0,0
94,330,4,XMD13-2,-,RIPK1,Apoptosis regulation,0.0,0
113,1037,2,BX796,BX-796,"TBK1, PDK1 (PDPK1), IKK, AURKB, AURKC",Other,0.0,0
138,329,1,QL-XI-92,-,DDR1,Other,0.0,0
133,211,1,TL-2-105,-,not defined,ERK MAPK signaling,0.0,0
