## Get Pubchem drug feautures

1. finding corresponding Pubchem ids for the drugs 
2. call Pubchem to get chemical properties of the drugs
3. Preprocess text Drug description from the original datasets
4. Preprocess some text characteristics from PubChem properties

In [1]:
import pandas as pd
import numpy as np
import os
# pip install PubChemPy
import pubchempy as pcp
import re
from pubchempy import Compound
import warnings
warnings.filterwarnings("ignore")
import time
import tqdm

_FOLDER = "data/"
_FOLDER_2 ="results/"

In [2]:
os.listdir(_FOLDER)

['Cell_Lines_Details.csv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.tsv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.xlsx',
 'dataSplit.json',
 'dataSplit.txt',
 'Drug_Features.csv',
 'filteredResponses.csv',
 'filteredResponsesWithCCL.csv',
 'filteredResponsesWithCCLAndParams.csv',
 'normalised_dose_response_data.csv']

In [3]:
os.listdir(_FOLDER_2)

['.ipynb_checkpoints',
 'baggingPredicted.csv',
 'baggingTrue.csv',
 'boostingPredicted.csv',
 'boostingPredicted2.csv',
 'boostingPredictedNoTuning.csv',
 'boostingPredictedTuned.csv',
 'boostingPredictedTunedGPMin.csv',
 'boostingTrue.csv',
 'file_name.csv',
 'filtered_drug_profiles_123.csv',
 'fit_filtered_drug_profiles_123.csv',
 'lassoPredicted.csv',
 'ridgePredicted.csv',
 'sigmoid_01.csv',
 'sigmoid_02.csv',
 'stackingPredicted.csv',
 'svmPredicted.csv',
 'threeLearnerStackingPredicted.csv',
 'trueResults.csv',
 'X_features_cancer_cell_lines_easy_read.txt']

In [4]:
drug_features = pd.read_csv(_FOLDER + "Drug_Features.csv").rename(columns={"Drug ID": "DRUG_ID", 
                                                                           "Drug Name": "Drug_Name",
                                                                          "Target Pathway": "Target_Pathway"})
drug_features.set_index("DRUG_ID", inplace= True)
drug_features.head()

Unnamed: 0_level_0,Drug_Name,Synonyms,Target,Target_Pathway
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


## Part 1: Get drug features from PubChempy

In [6]:
%%time

for drug_id in drug_features.index: 
    drug_name = drug_features.loc[drug_id, "Drug_Name"]
    deriv = pcp.get_compounds(drug_name, 'name')
    drug_features.loc[drug_id, "deriv_found"] = len(deriv)
    try:
        # Cleaning and simplifying the extraction of the first PubChem ID
        drug_features.loc[drug_id, "PubChem_ID"] = [c.cid for c in deriv if c.cid][0] if deriv else 0
    except Exception as e:
        print(f"Error for {drug_name}: {e}")  # Catching the problem, more of a netting all approach

CPU times: total: 3.03 s
Wall time: 2min 48s


### Manual matching for drugs with missing or mutiple data

In [7]:
error_names_dict={"Lestauritinib": "Lestaurtinib"}
error_name = "Lestauritinib"

# correct the search results
error_drug_index = drug_features[drug_features["Drug_Name"]==error_name].index
correct_drug_name = error_names_dict[error_name]
drug_features.loc[error_drug_index, "Drug_Name"] = correct_drug_name

In [8]:
new_synonyms = {"Y-39983": {"Synonyms": "Y-33075",
                           "reference": ["https://www.medchemexpress.com/Y-33075.html",
                            "https://www.nature.com/articles/s41467-019-13781-3"]}}

manual_corrections = {
    "Lestaurtinib":{"pubchem_id" : 126565,
               "reference" : "https://www.cancerrxgene.org/compounds"},
    
    "WZ-1-84": {"pubchem_id" : 49821040,
               "reference" : "http://lincs.hms.harvard.edu/db/datasets/20119/smallmolecules"},
    
    "GW441756": {"pubchem_id" : 9943465 ,
               "reference" : "",
               "note": "no result in drugbank"},
    
    "Parthenolide" : {"pubchem_id" : 6473881,
               "reference" : "https://www.drugbank.ca/drugs/DB13063"},
    
    "Obatoclax Mesylate": {"pubchem_id" : 347828476,
               "reference" : "https://www.drugbank.ca/drugs/DB12191"},
    
    "Bleomycine": {"pubchem_id" : 72467,
               "reference" : "https://www.drugbank.ca/drugs/DB00290"},
    
    "Y-39983": {"pubchem_id" : 20601328,
               "reference" : "https://www.medchemexpress.com/Y-33075.html"},
    
    "JW-7-52-1": {"pubchem_id" : 20822503,
               "reference" : "https://pharmacodb.ca/drugs/392"},
    
    "VNLG/124": { "pubchem_id": 24894414, 
                  "reference": "https://www.cancerrxgene.org/compounds" },
    
    "PDK1 inhibitor 7": { "pubchem_id": 56965967, 
                         "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-260": {"pubchem_id": 10451420, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "SB52334": {"pubchem_id": 9967941, 
                "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-270": { "pubchem_id": 66577006, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cisplatin": {"pubchem_id": 84691, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Cetuximab": {"pubchem_id": 85668777, 
                  "reference": "https://www.cancerrxgene.org/compounds"},
    
    "Nutlin-3a (-)": { "pubchem_id": 
                      11433190, "reference": ""},
    
    "681640": { "pubchem_id": 10384072, 
               "reference": ""},
    
    "MPS-1-IN-1": {"pubchem_id": 25195352, 
                   "reference": ""},
    
    "KIN001-266": { "pubchem_id": 44143370, 
                   "reference": ""},
    
    "JW-7-52-1" : {"pubchem_id": 49836027, 
                   "reference": ""},
    
    "Vinorelbine": {"pubchem_id": 44424639, 
                   "reference": "https://www.drugbank.ca/drugs/DB00361"},
    
    "Paclitaxel": {"pubchem_id": 36314, 
                   "reference": "https://www.drugbank.ca/drugs/DB01229"},
    
    "Bleomycin": {"pubchem_id": 5360373, 
                   "reference": "https://www.drugbank.ca/drugs/DB00290"},
    
    "Vinblastine": {"pubchem_id": 13342, 
                   "reference": "https://www.drugbank.ca/drugs/DB00570"},
    
    
    "THZ-2-102-1" : {"pubchem_id": 146011539, 
                   "reference": "Katjusa Koler's suggestion"},
    
    "THZ-2-49" : {"pubchem_id": 78357763 , 
                   "reference": ["https://www.cancerrxgene.org/compounds", 
                                "https://www.medchemexpress.com/THZ2.html",
                                "https://pubchem.ncbi.nlm.nih.gov/compound/78357763"]},
    
    "QL-XII-47": {"pubchem_id": 71748056, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/"},
    
    "BMS-345541" : {"pubchem_id": 9813758, 
                   "reference": ""},
    
    "Temsirolimus" : {"pubchem_id": 23724530, 
                   "reference": "https://www.drugbank.ca/drugs/DB06287"},
    
    "SB590885" : {"pubchem_id": 135398506, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=SB590885"},
    
    "WZ3105" : {"pubchem_id": 42628507, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10084-101/"},
    
    "NPK76-II-72-1" : {"pubchem_id": 46843648, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10070-101/"},
    
    "JW-7-24-1" : {"pubchem_id": 69923936, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10019-101/"},
    "Bryostatin 1" : {"pubchem_id": 6435419, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=Bryostatin%201"},
    "QL-XI-92": {"pubchem_id": 73265214,
                 "reference": "Katjusa Koler's & Dennis Wang's database"},
    
    "SL0101": {"pubchem_id": 10459196,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "Z-LLNle-CHO": {"pubchem_id": 16760646  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "JNK-9L": {"pubchem_id": 25222038  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "KIN001-244": {"pubchem_id": 56965967  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "RO-3306":  {"pubchem_id": 44450571  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "EHT-1864": {"pubchem_id": 9938202  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    
        
    }
    
corrections_pubchem_id = {
    "Temsirolimus": 6918289,
    "Vinorelbine": 5311497,
    "Y-39983": 9810884,
    "GW441756": 9943465, 
    "Vinblastine": 6710780,
    "Bryostatin 1": 5280757,
    "Parthenolide": 7251185,
    "Obatoclax Mesylate": 11404337,
    "Bleomycin (50 uM)": 5460769,
    "SB590885": 11316960,
    "Paclitaxel" :36314,
    "BMS-345541": 9813758,
    "YM201636" :  9956222, 
}

for drug_name in manual_corrections:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= manual_corrections[drug_name]["pubchem_id"]

# more_corrections
for drug_name in corrections_pubchem_id:
    drug_index = drug_features[drug_features["Drug_Name"]==drug_name].index
    drug_features.loc[drug_index, "deriv_found"] = 1
    drug_features.loc[drug_index, "PubChem_ID"]= corrections_pubchem_id[drug_name]

In [9]:
drugs_with_pubchem_id = drug_features[drug_features["PubChem_ID"]!=0].index
print("All drugs: %d, With known PubChem_Id: %d" % (drug_features.shape[0], len(drugs_with_pubchem_id)))

All drugs: 265, With known PubChem_Id: 256


In [10]:
with open(_FOLDER_2 +"drugs_with_pubchem_id_NEW.txt", 'w') as f:
    for s in drugs_with_pubchem_id:
        f.write(str(s) + '\n')

## Getting properties by PubChem API

In [11]:
%%time
# for i, PubChem_id in tqdm(list(enumerate(drug_features["PubChem_ID"].values))):
for PubChem_id in drug_features["PubChem_ID"].values:
    try:
        drug_index = drug_features[drug_features["PubChem_ID"]==PubChem_id].index
        
        c = Compound.from_cid(PubChem_id)
        
        drug_features.loc[drug_index, "molecular_weight"] = c.molecular_weight
   
        drug_features.loc[drug_index, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_index, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_index, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_index, "xlogp"] = c.xlogp
        drug_features.loc[drug_index, "formal_charge"] = c.charge
    
        drug_features.loc[drug_index, "surface_area"] = c.tpsa

        drug_features.loc[drug_index, "complexity"] = c.complexity

        drug_features.loc[drug_index, "h_bond_donor_count"] = c.h_bond_donor_count

        drug_features.loc[drug_index, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_index, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_index, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_index, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_index, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_index, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_index, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_index, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_index, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_index, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_index, "inchi_string"] = c.inchi

        drug_features.loc[drug_index, "inchi_key"] = c.inchikey
    except:
        pass

# # fingerprint
# # Raw padded and hex-encoded fingerprint, as returned by the PUG REST API.

# # cactvs_fingerprint
# # PubChem CACTVS fingerprint.
# # Each bit in the fingerprint represents the presence or absence of one of 881 chemical substructures.
# # More information at ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt


CPU times: total: 3.88 s
Wall time: 2min 10s


In [12]:
#pubchem_id is none
drug_features[drug_features["molecular_weight"].isnull()].shape[0]

0

In [13]:
len(manual_corrections)+len(corrections_pubchem_id)

53

In [14]:
drugs_with_pubchem_id = drug_features[drug_features["PubChem_ID"]!=0].index
print("All drugs: %d, With known PubChem_Id: %d" % (drug_features.shape[0], len(drugs_with_pubchem_id)))

All drugs: 265, With known PubChem_Id: 256


## Preprocessing Text PubChem characteristics

### Presence of some elements (11 elements)

In [15]:
%%time
all_elements = list(set(drug_features["elements"].str.split(",", expand=True).fillna(0).values.flatten())- set([0," 'C'", "'C'", " 'H'"]))
all_elements

elements_in_drugs= list(set([atom.strip(" ").strip("'") for atom in all_elements]))
exceptions =[]
for drug_index in drug_features.index:
    compound_elements = drug_features.loc[drug_index, "elements"]
    print(compound_elements)
    try:
        for i, atom in list(enumerate(elements_in_drugs)):
            if atom in compound_elements:
                drug_features.loc[drug_index, atom] = 1
                print(atom, "Yes")
            else:
                drug_features.loc[drug_index, atom] = 0
                print(atom, "No")
    except:
        exceptions.append(drug_index)
        drug_features.loc[drug_index, atom] = 0

print("Exceptions:", drug_features.loc[exceptions, :].shape[0])
print("Elements in drugs:", len(elements_in_drugs), elements_in_drugs)

'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'N', 'H', 'C', 'F', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F Yes
Pt No
B No
O Yes
'S', 'Cl', 'N', 'H', 'C', 'O'
S Yes
P No
Cl Yes
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'Cl', 'N', 'H', 'C', 'F', 'O'
S No
P No
Cl Yes
Br No
N Yes
I No
nan No
F Yes
Pt No
B No
O Yes
'S', 'N', 'H', 'C', 'O'
S Yes
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'C', 'N', 'H', 'O'
S No
P No
Cl No
Br No
N Yes
I No
nan No
F No
Pt No
B No
O Yes
'S', 'Cl', 'N', 'H', 'C', 'O'
S Yes
P No
Cl Yes
Br No
N Yes
I No
nan No

In [16]:
drug_features["Br"].value_counts()

Br
0.0    258
1.0      7
Name: count, dtype: int64

In [17]:
drug_features.to_csv(_FOLDER_2 + "drug_features_with_pubchem_properties.csv")

### Write PubChem names

In [18]:
PubChem_features = ["molecular_weight","2bonds", "3bonds", "xlogp", "formal_charge", 
    "surface_area", "complexity", "h_bond_donor_count", 
    "h_bond_acceptor_count", "rotatable_bond_count",
    "heavy_atom_count", "atom_stereo_count", "defined_atom_stereo_count",
    "undefined_atom_stereo_count", "bond_stereo_count", "covalent_unit_count",
    'B', 'I', 'Br', 'Cl', 'O', 'N', 'F', 'P', 'S', 'Pt']

with open(_FOLDER_2 + "X_PubChem_properties.txt", 'w') as f:
    for s in PubChem_features:
        f.write(str(s) + '\n')

print("Number of PubChem features:", len(PubChem_features))

Number of PubChem features: 26


## Part 2: Preprocessing Drugs description from original data

In this section, we are going to have some dumnies columns for Target and Target_Pathway

Converting of Target Pathway resulted in 26 new columns

It is also worth considering elements columns and that deleting columns with C and H which are present in all the compounds

### Dumnies for Target (229) and Target_Pathway (23)

In [19]:
drug_features.head(3)

Unnamed: 0_level_0,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,3bonds,...,P,Cl,Br,N,I,nan,F,Pt,B,O
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling,1.0,176870,393.4,"'C', 'N', 'H', 'O'",8.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling,1.0,5284616,914.2,"'C', 'N', 'H', 'O'",9.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling,1.0,5329102,398.5,"'N', 'H', 'C', 'F', 'O'",8.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
targets = ""
for x in drug_features["Target"].values:
    targets = targets + ", " + x
targets = list(set(targets.split(", ")[1:]))
print("Number of targets:", len(targets))

df_target = pd.DataFrame(data = np.int32(np.zeros([drug_features.shape[0], len(targets)])), 
                         index = drug_features.index, 
                         columns = targets)

Number of targets: 229


In [21]:
for index in drug_features.index:
    targets_i = drug_features.loc[index, "Target"].split(", ")
    df_target.loc[index, targets_i]=1
df_target.shape

(265, 230)

In [36]:
print("Number of unique pathways:", drug_features["Target_Pathway"].nunique())

# Concatenating the target dataframe with one-hot encoded pathway columns
df_target_target_pathway = pd.concat([df_target, pd.get_dummies(drug_features["Target_Pathway"])], axis=1)
print("Shape of dataframe after concatenation:", df_target_target_pathway.shape)

# Assuming these are the columns that ended up being boolean
boolean_columns = [
    'ABL signaling', 'Apoptosis regulation', 'Cell cycle',
    'Chromatin histone acetylation', 'Chromatin histone methylation',
    'Chromatin other', 'Cytoskeleton', 'DNA replication', 
    'EGFR signaling', 'ERK MAPK signaling', 'Genome integrity',
    'Hormone-related', 'IGFR signaling', 'JNK and p38 signaling',
    'Metabolism', 'Mitosis', 'Other', 'Other, kinases',
    'PI3K/MTOR signaling', 'Protein stability and degradation', 
    'RTK signaling', 'WNT signaling', 'p53 pathway'
]

# Convert boolean columns back to integers (0 and 1)
df_target_target_pathway[boolean_columns] = df_target_target_pathway[boolean_columns].astype(int)


Number of unique pathways: 23
Shape of dataframe after concatenation: (265, 253)


In [39]:
df_target_target_pathway.to_csv(_FOLDER_2+"target_target_pathway_df.csv")

### Write names of uniques Targets and Target_Pathway

In [40]:
with open(_FOLDER_2 + "X_features_Targets.txt", 'w') as f:
    for s in targets:
        f.write(str(s) + '\n')
        
with open(_FOLDER_2 + "X_features_Target_Pathway.txt", 'w') as f:
    for s in drug_features["Target_Pathway"].unique():
        f.write(str(s) + '\n')   

In [41]:
df_target_target_pathway.reset_index()

Unnamed: 0,DRUG_ID,SERCA,ERK5,ROCK2,FAK,CDK1,BRD2,Anthracycline,MAPK7,Microtubule destabiliser,...,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,WNT signaling,p53 pathway
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,1498,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261,1502,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262,1526,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
263,1527,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [42]:
with open(_FOLDER_2+"X_features_cancer_cell_lines_easy_read.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]

In [43]:
print("Final Features: \n")
print("Cell lines (CCL) features:", len(X_cancer_cell_lines))
print("PubChem drug features:", len(PubChem_features))
print("Drug description features - Targets: %d, Target_Pathway: %d" % (len(targets), drug_features["Target_Pathway"].nunique()))

Final Features: 

Cell lines (CCL) features: 1073
PubChem drug features: 26
Drug description features - Targets: 229, Target_Pathway: 23


In [44]:
all_elements

["'Pt'",
 " 'Br'",
 " 'F'",
 " 'Cl'",
 "'N'",
 " 'B'",
 "'P'",
 "'S'",
 " 'O'",
 " 'N'",
 'nan',
 " 'I'",
 "'Cl'",
 "'Br'",
 " 'S'"]