In [1]:
import pandas as pd
import numpy as np
import os
# pip install PubChemPy
import pubchempy as pcp
import re
from pubchempy import Compound
import warnings
warnings.filterwarnings("ignore")
import time

In [2]:
os.listdir("results")

['drug_features_with_properties.csv',
 'merged_drug_profiles_sigmoid4_23.csv',
 'drug_features_pubchem.csv',
 'filtered_drug_profiles_13.csv',
 'filtered_drug_profiles_12.csv',
 'filtered_drug_profiles_23.csv',
 'filtered_drug_profiles.csv',
 'merged_drug_profiles_sigmoid4_123.csv',
 'filtered_drug_profiles_123.csv']

In [3]:
drug_features = pd.read_csv('data/Drug_Features.csv').rename(columns={"Drug ID": "DRUG_ID", 
                                                                      "Drug Name" : "Drug_Name",
                                                                      "Target Pathway": "Target_Pathway"})

### get drug features from pubchempy

In [8]:
%%time

for drug_id in drug_features["DRUG_ID"].unique():
    drug_index = drug_features[drug_features["DRUG_ID"]==drug_id].index
    drug_name = drug_features.loc[drug_index, "Drug_Name"].values[0]
    deriv = pcp.get_compounds(drug_name, 'name')
    drug_features.loc[drug_index, "deriv_found"] = len(deriv)
    try:
        drug_features.loc[drug_index, "PubChem_ID"]= re.findall(r'\((.*?)\)', str(deriv))
    except:
        if len(deriv)>1:
            drug_features.loc[drug_index, "PubChem_ID"]= str([np.int(x) for x in re.findall(r'\((.*?)\)', str(deriv))]).strip("[").strip("]")
        else:
            drug_features.loc[drug_index, "PubChem_ID"]= 0

CPU times: user 6.89 s, sys: 432 ms, total: 7.32 s
Wall time: 3min 9s


### Manual matching for drugs with missing or mutiple data

In [12]:
error_names_dict={"Lestauritinib": "Lestaurtinib"}
error_name = "Lestauritinib"

# correct the search results
error_drug_index = drug_features[drug_features["Drug_Name"]==error_name].index
correct_drug_name = error_names_dict[error_name]
drug_features.loc[error_drug_index, "Drug_Name"] = correct_drug_name

new_synonyms = {"Y-39983": {"Synonyms": "Y-33075",
                           "reference": ["https://www.medchemexpress.com/Y-33075.html",
                            "https://www.nature.com/articles/s41467-019-13781-3"]}}

manual_corrections = {
    "Lestaurtinib":{"pubchem_id" : 126565,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "WZ-1-84": {"pubchem_id" : 49821040,
               "reference" : "http://lincs.hms.harvard.edu/db/datasets/20119/smallmolecules"},
    
    "GW441756": {"pubchem_id" : 9943465 ,
               "reference" : "",
               "note": "no result in drugbank"},
    
    "Parthenolide" : {"pubchem_id" : 6473881,
               "reference" : "https://www.drugbank.ca/drugs/DB13063"},
    
    "Obatoclax Mesylate": {"pubchem_id" : 347828476,
               "reference" : "https://www.drugbank.ca/drugs/DB12191"},
    
    "Bleomycine": {"pubchem_id" : 72467,
               "reference" : "https://www.drugbank.ca/drugs/DB00290"},
    
    "Y-39983": {"pubchem_id" : 20601328,
               "reference" : "https://www.medchemexpress.com/Y-33075.html"},
    
    "JW-7-52-1": {"pubchem_id" : 20822503,
               "reference" : "https://pharmacodb.ca/drugs/392"},
    
    "VNLG/124": { "pubchem_id": 24894414, 
                  "reference": "https://www.cancerrxgene.org/compounds" },
    
    "PDK1 inhibitor 7": { "pubchem_id": 56965967, 
                         "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-260": {"pubchem_id": 10451420, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "SB52334": {"pubchem_id": 9967941, 
                "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-270": { "pubchem_id": 66577006, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cisplatin": {"pubchem_id": 84691, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cetuximab": {"pubchem_id": 85668777, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Nutlin-3a (-)": { "pubchem_id": 
                      11433190, "reference": ""},
    
    "681640": { "pubchem_id": 10384072, 
               "reference": ""},
    
    "MPS-1-IN-1": {"pubchem_id": 25195352, 
                   "reference": ""},
    
    "KIN001-266": { "pubchem_id": 44143370, 
                   "reference": ""},
    
    "JW-7-52-1" : {"pubchem_id": 49836027, 
                   "reference": ""},
    
    "Vinorelbine": {"pubchem_id": 44424639, 
                   "reference": "https://www.drugbank.ca/drugs/DB00361"},
    
    "Paclitaxel": {"pubchem_id": 36314, 
                   "reference": "https://www.drugbank.ca/drugs/DB01229"},
    
    "Bleomycin": {"pubchem_id": 5360373, 
                   "reference": "https://www.drugbank.ca/drugs/DB00290"},
    
    "Vinblastine": {"pubchem_id": 13342, 
                   "reference": "https://www.drugbank.ca/drugs/DB00570"},
    
    
    "THZ-2-102-1" : {"pubchem_id": 146011539, 
                   "reference": "Katjusa Koler's suggestion"},
    
    "THZ-2-49" : {"pubchem_id": 78357763 , 
                   "reference": ["https://www.cancerrxgene.org/compounds", 
                                "https://www.medchemexpress.com/THZ2.html",
                                "https://pubchem.ncbi.nlm.nih.gov/compound/78357763"]},
    
    "QL-XII-47": {"pubchem_id": 71748056, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/"},
    
    "BMS-345541" : {"pubchem_id": 9813758, 
                   "reference": ""},
    
    "Temsirolimus" : {"pubchem_id": 23724530, 
                   "reference": "https://www.drugbank.ca/drugs/DB06287"},
    
    "SB590885" : {"pubchem_id": 135398506, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=SB590885"},
    
    "WZ3105" : {"pubchem_id": 42628507, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10084-101/"},
    
    "NPK76-II-72-1" : {"pubchem_id": 46843648, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10070-101/"},
    
    "JW-7-24-1" : {"pubchem_id": 69923936, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10019-101/"},
    "Bryostatin 1" : {"pubchem_id": 6435419, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=Bryostatin%201"},
    "QL-XI-92": {"pubchem_id": "73265214",
                 "reference": "Katjusa Koler's & Dennis Wang's database"}
        
    }
    
corrections_pubchem_id = {
    "Temsirolimus": 6918289,
    "Vinorelbine": 5311497,
    "Y-39983": 9810884,
    "GW441756": 9943465, 
    "Vinblastine": 6710780,
    "Bryostatin 1": 5280757,
    "Parthenolide": 7251185,
    "Obatoclax Mesylate": 11404337,
    "Bleomycin (50 uM)": 5460769,
    "SB590885": 11316960,
    "Paclitaxel" :36314,
    "BMS-345541": 9813758 
}


for drug_name in manual_corrections:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= corrections_pubchem_id[drug_name]

# more_corrections
for drug_name in corrections_pubchem_id:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= corrections_pubchem_id[drug_name]

In [13]:
# write the data
drug_features.to_csv("results/drug_features_pubchem.csv")

### Extracting properties from PubChem

In [20]:
%%time
for i, PubChem_id in tqdm(list(enumerate(drug_features["PubChem_ID"].values))):
# for PubChem_id in drug_features["PubChem_ID"].values[:5]:
    try:
        drug_index = drug_features[drug_features["PubChem_ID"]==PubChem_id].index
        
        c = Compound.from_cid(PubChem_id)
        
        drug_features.loc[drug_index, "molecular_weight"] = c.molecular_weight
   
        drug_features.loc[drug_index, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_index, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_index, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_index, "xlogp"] = c.xlogp
        drug_features.loc[drug_index, "formal_charge"] = c.charge
    
        drug_features.loc[drug_index, "surface_area"] = c.tpsa

        drug_features.loc[drug_index, "complexity"] = c.complexity

        drug_features.loc[drug_index, "complexity"] = c.h_bond_donor_count

        drug_features.loc[drug_index, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_index, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_index, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_index, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_index, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_index, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_index, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_index, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_index, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_index, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_index, "inchi_string"] = c.inchi

        drug_features.loc[drug_index, "inchi_key"] = c.inchikey
    except:
        pass

# # fingerprint
# # Raw padded and hex-encoded fingerprint, as returned by the PUG REST API.

# # cactvs_fingerprint
# # PubChem CACTVS fingerprint.
# # Each bit in the fingerprint represents the presence or absence of one of 881 chemical substructures.
# # More information at ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt


100%|██████████| 265/265 [02:46<00:00,  1.59it/s]

CPU times: user 11.2 s, sys: 582 ms, total: 11.8 s
Wall time: 2min 46s





### Additional features from splitting columns in drug_features

In this section, we are going to have some dumnies columns for Target and Target_Pathway
As in Target column only several drugs have more than 3 values, only three dumny columns were created

Converting of Target Pathway resulted in 29 new columns

It is also worth considering elements columns and that deleting columns with C and H which are present in all the compounds


In [21]:
drug_features["first_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[0]
drug_features["second_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[1]
drug_features["third_target"] = drug_features["Target"].str.split(",", expand=True).fillna(0)[2]

#### dumnies columns for Target Pathway (29 values)

In [22]:
drug_features = pd.concat([drug_features, pd.get_dummies(drug_features["Target_Pathway"])], axis=1).drop("Target_Pathway", axis=1)
drug_features.shape

(265, 52)

In [103]:
drug_features = pd.read_csv("results/drug_features_with_properties.csv")
drug_features.shape

(265, 53)

In [112]:
%%time
for drug_index in drug_features.index:
    compound_elements = drug_features.loc[drug_index, "elements"]
    try:
        for i, atom in list(enumerate(elements_in_drugs)):
            if all_elements[i] in compound_elements:
                drug_features.loc[drug_index, atom] = 1
            else:
                drug_features.loc[drug_index, atom] = 0
    except:
        print(drug_index)
        drug_features.loc[drug_index, atom] = 0

58
72
92
98
109
116
134
135
158
159
161
199
225
226
228
229
231
237
248
257
CPU times: user 1.38 s, sys: 77.5 ms, total: 1.46 s
Wall time: 1.41 s


In [41]:
drug_features.drop("Unnamed: 0", axis=1).to_csv("results/drug_features_with_properties2.csv")