In [1]:
import pandas as pd
import numpy as np
import os
# pip install PubChemPy
import pubchempy as pcp
import re
from pubchempy import Compound
import warnings
warnings.filterwarnings("ignore")
import time
import tqdm

In [2]:
os.listdir("data")

['Drug_Features2.csv',
 'Cell_Lines_Details.csv',
 'Drug_Features.csv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.xlsx',
 'Cell_Line_Features_PANCAN_simple_MOBEM.tsv',
 'normalised_dose_response_data.csv']

In [3]:
drug_features = pd.read_csv('data/Drug_Features.csv').rename(columns={"Drug ID": "DRUG_ID", 
                                                                      "Drug Name" : "Drug_Name",
                                                                      "Target Pathway": "Target_Pathway"})

### get drug features from pubchempy

In [4]:
%%time

for drug_id in drug_features["DRUG_ID"].unique():
    drug_index = drug_features[drug_features["DRUG_ID"]==drug_id].index
    drug_name = drug_features.loc[drug_index, "Drug_Name"].values[0]
    deriv = pcp.get_compounds(drug_name, 'name')
    drug_features.loc[drug_index, "deriv_found"] = len(deriv)
    try:
        drug_features.loc[drug_index, "PubChem_ID"]= re.findall(r'\((.*?)\)', str(deriv))
    except:
        if len(deriv)>1:
            drug_features.loc[drug_index, "PubChem_ID"]= str([np.int(x) for x in re.findall(r'\((.*?)\)', str(deriv))]).strip("[").strip("]")
        else:
            drug_features.loc[drug_index, "PubChem_ID"]= 0

CPU times: user 7.23 s, sys: 719 ms, total: 7.95 s
Wall time: 5min 53s


### Manual matching for drugs with missing or mutiple data

In [5]:
error_names_dict={"Lestauritinib": "Lestaurtinib"}
error_name = "Lestauritinib"

# correct the search results
error_drug_index = drug_features[drug_features["Drug_Name"]==error_name].index
correct_drug_name = error_names_dict[error_name]
drug_features.loc[error_drug_index, "Drug_Name"] = correct_drug_name

In [6]:
new_synonyms = {"Y-39983": {"Synonyms": "Y-33075",
                           "reference": ["https://www.medchemexpress.com/Y-33075.html",
                            "https://www.nature.com/articles/s41467-019-13781-3"]}}

manual_corrections = {
    "Lestaurtinib":{"pubchem_id" : 126565,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "WZ-1-84": {"pubchem_id" : 49821040,
               "reference" : "http://lincs.hms.harvard.edu/db/datasets/20119/smallmolecules"},
    
    "GW441756": {"pubchem_id" : 9943465 ,
               "reference" : "",
               "note": "no result in drugbank"},
    
    "Parthenolide" : {"pubchem_id" : 6473881,
               "reference" : "https://www.drugbank.ca/drugs/DB13063"},
    
    "Obatoclax Mesylate": {"pubchem_id" : 347828476,
               "reference" : "https://www.drugbank.ca/drugs/DB12191"},
    
    "Bleomycine": {"pubchem_id" : 72467,
               "reference" : "https://www.drugbank.ca/drugs/DB00290"},
    
    "Y-39983": {"pubchem_id" : 20601328,
               "reference" : "https://www.medchemexpress.com/Y-33075.html"},
    
    "JW-7-52-1": {"pubchem_id" : 20822503,
               "reference" : "https://pharmacodb.ca/drugs/392"},
    
    "VNLG/124": { "pubchem_id": 24894414, 
                  "reference": "https://www.cancerrxgene.org/compounds" },
    
    "PDK1 inhibitor 7": { "pubchem_id": 56965967, 
                         "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-260": {"pubchem_id": 10451420, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "SB52334": {"pubchem_id": 9967941, 
                "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-270": { "pubchem_id": 66577006, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cisplatin": {"pubchem_id": 84691, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cetuximab": {"pubchem_id": 85668777, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Nutlin-3a (-)": { "pubchem_id": 
                      11433190, "reference": ""},
    
    "681640": { "pubchem_id": 10384072, 
               "reference": ""},
    
    "MPS-1-IN-1": {"pubchem_id": 25195352, 
                   "reference": ""},
    
    "KIN001-266": { "pubchem_id": 44143370, 
                   "reference": ""},
    
    "JW-7-52-1" : {"pubchem_id": 49836027, 
                   "reference": ""},
    
    "Vinorelbine": {"pubchem_id": 44424639, 
                   "reference": "https://www.drugbank.ca/drugs/DB00361"},
    
    "Paclitaxel": {"pubchem_id": 36314, 
                   "reference": "https://www.drugbank.ca/drugs/DB01229"},
    
    "Bleomycin": {"pubchem_id": 5360373, 
                   "reference": "https://www.drugbank.ca/drugs/DB00290"},
    
    "Vinblastine": {"pubchem_id": 13342, 
                   "reference": "https://www.drugbank.ca/drugs/DB00570"},
    
    
    "THZ-2-102-1" : {"pubchem_id": 146011539, 
                   "reference": "Katjusa Koler's suggestion"},
    
    "THZ-2-49" : {"pubchem_id": 78357763 , 
                   "reference": ["https://www.cancerrxgene.org/compounds", 
                                "https://www.medchemexpress.com/THZ2.html",
                                "https://pubchem.ncbi.nlm.nih.gov/compound/78357763"]},
    
    "QL-XII-47": {"pubchem_id": 71748056, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/"},
    
    "BMS-345541" : {"pubchem_id": 9813758, 
                   "reference": ""},
    
    "Temsirolimus" : {"pubchem_id": 23724530, 
                   "reference": "https://www.drugbank.ca/drugs/DB06287"},
    
    "SB590885" : {"pubchem_id": 135398506, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=SB590885"},
    
    "WZ3105" : {"pubchem_id": 42628507, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10084-101/"},
    
    "NPK76-II-72-1" : {"pubchem_id": 46843648, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10070-101/"},
    
    "JW-7-24-1" : {"pubchem_id": 69923936, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10019-101/"},
    "Bryostatin 1" : {"pubchem_id": 6435419, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=Bryostatin%201"},
    "QL-XI-92": {"pubchem_id": 73265214,
                 "reference": "Katjusa Koler's & Dennis Wang's database"},
    
    "SL0101": {"pubchem_id": 10459196,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "Z-LLNle-CHO": {"pubchem_id": 16760646  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "JNK-9L": {"pubchem_id": 25222038  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "KIN001-244": {"pubchem_id": 56965967  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "RO-3306":  {"pubchem_id": 44450571  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "EHT-1864": {"pubchem_id": 9938202  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    
        
    }
    
corrections_pubchem_id = {
    "Temsirolimus": 6918289,
    "Vinorelbine": 5311497,
    "Y-39983": 9810884,
    "GW441756": 9943465, 
    "Vinblastine": 6710780,
    "Bryostatin 1": 5280757,
    "Parthenolide": 7251185,
    "Obatoclax Mesylate": 11404337,
    "Bleomycin (50 uM)": 5460769,
    "SB590885": 11316960,
    "Paclitaxel" :36314,
    "BMS-345541": 9813758 
}

for drug_name in manual_corrections:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= manual_corrections[drug_name]["pubchem_id"]

# more_corrections
for drug_name in corrections_pubchem_id:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= corrections_pubchem_id[drug_name]

In [7]:
# write the data
drug_features.to_csv("results/drug_features_pubchem_id.csv")

### Extracting properties from PubChem

In [31]:
%%time
# for i, PubChem_id in tqdm(list(enumerate(drug_features["PubChem_ID"].values))):
for PubChem_id in drug_features["PubChem_ID"].values:
    try:
        drug_index = drug_features[drug_features["PubChem_ID"]==PubChem_id].index
        
        c = Compound.from_cid(PubChem_id)
        
        drug_features.loc[drug_index, "molecular_weight"] = c.molecular_weight
   
        drug_features.loc[drug_index, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_index, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_index, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_index, "xlogp"] = c.xlogp
        drug_features.loc[drug_index, "formal_charge"] = c.charge
    
        drug_features.loc[drug_index, "surface_area"] = c.tpsa

        drug_features.loc[drug_index, "complexity"] = c.complexity

        drug_features.loc[drug_index, "h_bond_donor_count"] = c.h_bond_donor_count

        drug_features.loc[drug_index, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_index, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_index, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_index, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_index, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_index, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_index, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_index, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_index, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_index, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_index, "inchi_string"] = c.inchi

        drug_features.loc[drug_index, "inchi_key"] = c.inchikey
    except:
        pass

# # fingerprint
# # Raw padded and hex-encoded fingerprint, as returned by the PUG REST API.

# # cactvs_fingerprint
# # PubChem CACTVS fingerprint.
# # Each bit in the fingerprint represents the presence or absence of one of 881 chemical substructures.
# # More information at ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt


CPU times: user 11.5 s, sys: 587 ms, total: 12.1 s
Wall time: 5min 48s


In [32]:
#pubchem_id is none
drug_features[drug_features["molecular_weight"].isnull()].shape[0]

16

### Additional features from splitting columns in drug_features

In this section, we are going to have some dumnies columns for Target and Target_Pathway

Converting of Target Pathway resulted in 26 new columns

It is also worth considering elements columns and that deleting columns with C and H which are present in all the compounds

### Dumnies for Target (229) and Target_Pathway (23)

In [147]:
d1 = pd.read_csv('data/Drug_Features.csv').rename(columns={"Drug ID": "DRUG_ID", 
                                                                      "Drug Name" : "Drug_Name",
                                                                      "Target Pathway": "Target_Pathway"})
d1.columns

Index(['DRUG_ID', 'Drug_Name', 'Synonyms', 'Target', 'Target_Pathway'], dtype='object')

In [148]:
targets = ""
for x in drug_features["Target"].values:
    targets = targets + ", " + x
targets = list(set(targets.split(", ")[1:]))
len(targets)

229

In [149]:
df_target = pd.DataFrame(data = np.int32(np.zeros([drug_features.shape[0], len(targets)])), index = drug_features["DRUG_ID"], columns = targets)
d1.set_index("DRUG_ID", inplace=True)
for index in drug_features["DRUG_ID"]:
    targets_i = d1.loc[index, "Target"].split(", ")
    df_target.loc[index, targets_i]=1
df_target.shape

(265, 229)

In [150]:
# column Target_Pathway has only one value inside it
# for ind, x  in list(enumerate(d1["Target_Pathway"].values)):
#     if "," in x:
#         print(ind, x)
# for ind, x  in list(enumerate(d1["Target_Pathway"].values)):
#     if "/" in x:
#         print(ind, x)

In [151]:
d1["Target_Pathway"].nunique()

23

In [152]:
df_target_target_pathway = pd.concat([df_target, pd.get_dummies(d1["Target_Pathway"])], axis=1)
df_target_target_pathway.shape

(265, 252)

In [153]:
229+23

252

In [154]:
for col in df_target_target_pathway.columns:
    if df_target_target_pathway[col].nunique()>2:
        print(col)

### Write names of uniques Targets and Target_Pathway
### Save DataFrame with Splitted Data from Targets and Target_Pathway

In [155]:
with open("results/X_features_Targets.txt", 'w') as f:
    for s in targets:
        f.write(str(s) + '\n')
        
with open("results/X_features_Target_Pathway.txt", 'w') as f:
    for s in d1["Target_Pathway"].unique():
        f.write(str(s) + '\n')   
        
df_target_target_pathway.reset_index().to_csv("results/target_target_pathway_df.csv")

In [157]:
df_target_target_pathway.reset_index()

Unnamed: 0,DRUG_ID,LOK,PDK1 (PDPK1),PDGFRB,AURKB,IRAK1,Amyloid beta40,PPARdelta,VEGFR2,ERBB2,...,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,WNT signaling,p53 pathway
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,1498,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261,1502,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262,1526,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263,1527,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
#As in the Target column only several drugs have more than 3 values, only three dumny columns were created

# drug_features["first_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[0]
# drug_features["second_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[1]
# drug_features["third_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[2]

### Presence of some elements (11 elements)

In [52]:
%%time
all_elements = list(set(drug_features["elements"].str.split(",", expand=True).fillna(0).values.flatten())- set([0," 'C'", "'C'", " 'H'"]))
elements_in_drugs= list(set([atom.strip(" ").strip("'") for atom in all_elements]))
exceptions =[]
for drug_index in drug_features.index:
    compound_elements = drug_features.loc[drug_index, "elements"]
    try:
        for i, atom in list(enumerate(elements_in_drugs)):
            if all_elements[i] in compound_elements:
                drug_features.loc[drug_index, atom] = 1
            else:
                drug_features.loc[drug_index, atom] = 0
    except:
        exceptions.append(drug_index)
        drug_features.loc[drug_index, atom] = 0

CPU times: user 1.41 s, sys: 61.8 ms, total: 1.48 s
Wall time: 1.62 s


In [None]:
drug_features.loc[exceptions, :].shape[0]

In [None]:
#drug_features.drop("Unnamed: 0", axis=1).to_csv("results/drug_features_with_pubchem_properties.csv")
# drug_features.to_csv("results/drug_features_with_pubchem_properties.csv")

In [None]:
# drug_features= pd.read_csv("results/drug_features_with_pubchem_properties.csv")

In [132]:
drug_features.head(3)

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,3bonds,...,O,N,S,B,Pt,P,I,Br,H,h_bond_donor_count
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,1.0,176870,393.4,"'H', 'O', 'N', 'C'",8.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,1.0,5284616,914.2,"'H', 'O', 'N', 'C'",9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",1.0,5329102,398.5,"'H', 'O', 'N', 'F', 'C'",8.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0


In [134]:
drug_features["N"].value_counts()

0.0    248
1.0      1
Name: N, dtype: int64

In [None]:
drug_features.columns


In [None]:
sum(drug_features["inchi_key"].value_counts()>1), sum(drug_features["canonical_smiles"].value_counts()>1), sum(drug_features["inchi_string"].value_counts()>1)

In [None]:
drug_features["complexity"].value_counts()

In [None]:
column_not_to_use = ['DRUG_ID', 'Drug_Name', 'Synonyms', "Target", "deriv_found", "PubChem_ID",
                     "elements", "inchi_key", "canonical_smiles", "inchi_string"]
len(drug_features.columns)

In [None]:
drug_features_to_normalise = list(set(['molecular_weight','rotatable_bond_count', 'h_bond_acceptor_count',
 'undefined_atom_stereo_count', 'bond_stereo_count', 'defined_atom_stereo_count',
 'complexity', 'atom_stereo_count','covalent_unit_count','2bonds',
 'surface_area', 'xlogp', 'heavy_atom_count', "x_conc", '2bonds',
 '3bonds', 'h_bond_acceptor_count', 'H', 'Br', 'I', 'O', 'F', 'N', 'Cl', 'S', 'Pt', 'P']))

with open("results/X_drug_features_to_normalise_easy_read.txt", 'w') as f:
    for s in drug_features_to_normalise:
        f.write(str(s) + '\n')

with open("results/X_drug_features_to_normalise_easy_read.txt", 'r') as f:
    drug_features_to_normalise = [line.rstrip('\n') for line in f]
len(drug_features_to_normalise)

In [None]:
with open("results/X_features_1122_easy_read.txt", 'r') as f:
    X_columns = [line.rstrip('\n') for line in f]
len(X_columns)

In [None]:
df = pd.read_csv('results/merged_fitted_sigmoid4_123_with_drugs_properties.csv')

In [None]:
with open("results/X_features_cancer_cell_lines_easy_read.txt", 'r') as f:
    X_cells = [line.rstrip('\n') for line in f]
len(X_cells)

In [None]:
drugs_columns = list(set(X_columns)-set(X_cells))
len(drugs_columns)

In [None]:
# final features
- cell lines features - 1073
- PubChem drug features -26
- drug description features 23 Target_Pathway + 229 targets

In [55]:
elements_in_drugs

['B', 'I', 'Br', 'Cl', 'H', 'O', 'N', 'F', 'P', 'S', 'Pt']

In [131]:
PubChem_features = ["molecular_weight","2bonds", "3bonds", "xlogp", "formal_charge", 
    "surface_area", "complexity", "h_bond_donor_count", 
    "h_bond_acceptor_count", "rotatable_bond_count",
    "heavy_atom_count", "atom_stereo_count", "defined_atom_stereo_count",
    "undefined_atom_stereo_count", "bond_stereo_count", "covalent_unit_count",
    'B', 'I', 'Br', 'Cl', 'O', 'N', 'F', 'P', 'S', 'Pt']

with open("results/X_PubChem_features.txt", 'w') as f:
    for s in PubChem_features:
        f.write(str(s) + '\n')

with open("results/X_PubChem_features.txt", 'r') as f:
    PubChem_features = [line.rstrip('\n') for line in f]
len(PubChem_features)

26

In [54]:
len(elements_in_drugs)

11

In [57]:
1073+52

1125

In [129]:
d2 = pd.read_csv("results/drug_features_with_pubchem_properties.csv").drop(["Unnamed: 0", "Target"], axis=1)

In [130]:
d2.columns

Index(['DRUG_ID', 'Drug_Name', 'Synonyms', 'deriv_found', 'PubChem_ID',
       'molecular_weight', 'elements', '2bonds', '3bonds', 'xlogp',
       'formal_charge', 'surface_area', 'complexity', 'h_bond_acceptor_count',
       'rotatable_bond_count', 'heavy_atom_count', 'atom_stereo_count',
       'defined_atom_stereo_count', 'undefined_atom_stereo_count',
       'bond_stereo_count', 'covalent_unit_count', 'molecular_formula',
       'canonical_smiles', 'inchi_string', 'inchi_key', 'first_target',
       'second_target', 'third_target', 'ABL signaling',
       'Apoptosis regulation', 'Cell cycle', 'Chromatin histone acetylation',
       'Chromatin histone methylation', 'Chromatin other', 'Cytoskeleton',
       'DNA replication', 'EGFR signaling', 'ERK MAPK signaling',
       'Genome integrity', 'Hormone-related', 'IGFR signaling',
       'JNK and p38 signaling', 'Metabolism', 'Mitosis', 'Other',
       'Other, kinases', 'PI3K/MTOR signaling',
       'Protein stability and degradation', 

In [135]:
drug_features[['DRUG_ID', 'Drug_Name', 'Synonyms', 'PubChem_ID',
               'molecular_formula','canonical_smiles', 'inchi_string', 'inchi_key']
              + PubChem_features].to_csv("results/drug_features_with_pubchem_properties.csv")