## Merge with Drug Properties

Add preprocessed earlier drug description and characteristics

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
_FOLDER_2 = "results/"

In [2]:
os.listdir(_FOLDER_2)

['.DS_Store',
 'merged_drug_profiles_cells_sigmoid4_123.csv',
 'drugs_with_pubchem_id_NEW.txt',
 'drug_features_with_pubchem_properties.csv',
 'target_target_pathway_df.csv',
 'fit_filtered_drug_profiles_123.csv',
 'X_features_Targets.txt',
 '.ipynb_checkpoints',
 'filtered_drug_profiles_123.csv',
 'X_PubChem_properties.txt',
 'X_features_Target_Pathway.txt',
 'X_features_cancer_cell_lines.txt']

In [3]:
df_profiles = pd.read_csv(_FOLDER_2+"merged_drug_profiles_cells_sigmoid4_123.csv").drop("Unnamed: 0", axis=1)
df_drugs_properties = pd.read_csv(_FOLDER_2+"drug_features_with_pubchem_properties.csv")
df_targets = pd.read_csv(_FOLDER_2+"target_target_pathway_df.csv")

df_profiles.shape, df_drugs_properties.shape, df_targets.shape

((2612, 1100), (265, 39), (265, 253))

In [4]:
df_drugs_properties.head(2)

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,...,I,Cl,S,B,N,H,Pt,P,F,O
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling,1.0,176870,393.4,"'C', 'N', 'O', 'H'",8.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling,1.0,5284616,914.2,"'C', 'N', 'O', 'H'",9.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [5]:
df_drugs_properties[df_drugs_properties["PubChem_ID"]=="0"]

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,deriv_found,PubChem_ID,molecular_weight,elements,2bonds,...,I,Cl,S,B,N,H,Pt,P,F,O
58,164,JQ12,-,"HDAC1, HDAC2",Chromatin histone acetylation,0.0,0,,,,...,,,,,,,,,,
92,211,TL-2-105,-,not defined,ERK MAPK signaling,0.0,0,,,,...,,,,,,,,,,
98,225,Genentech Cpd 10,-,"AURKA, AURKB",Mitosis,0.0,0,,,,...,,,,,,,,,,
109,253,XMD14-99,-,"ALK, CDK7, LTK, others",Other,0.0,0,,,,...,,,,,,,,,,
116,261,TL-1-85,-,TAK,"Other, kinases",0.0,0,,,,...,,,,,,,,,,
134,286,KIN001-236,-,Angiopoietin-1 receptor,Other,0.0,0,,,,...,,,,,,,,,,
159,330,XMD13-2,-,RIPK1,Apoptosis regulation,0.0,0,,,,...,,,,,,,,,,
161,332,XMD15-27,-,CAMK2,"Other, kinases",0.0,0,,,,...,,,,,,,,,,
199,1037,BX796,BX-796,"TBK1, PDK1 (PDPK1), IKK, AURKB, AURKC",Other,0.0,0,,,,...,,,,,,,,,,
225,1142,HG-5-113-01,-,"LOK, LTK, TRCB, ABL(T315I)",Other,0.0,0,,,,...,,,,,,,,,,


### Merge data

In [6]:
column_not_to_use = ["Synonyms", "deriv_found", "PubChem_ID",
                    "inchi_key", "canonical_smiles", "inchi_string"]
columns_to_use = set(df_drugs_properties.columns) - set(column_not_to_use)


merged_df = pd.merge(left = df_profiles, right = df_drugs_properties[columns_to_use], on = "DRUG_ID") 
merged_df = pd.merge(left = merged_df, right = df_targets, on = "DRUG_ID") 
 
merged_df.shape

(2612, 1384)

In [7]:
# potential features for ML:
# - X_cancer_cell_lines - 1073
# - X_PubChem_properties - 26
# - X_targets - 229
# - X_target_pathway

with open(_FOLDER_2+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]
print("Number of cancer cell lines features:", len(X_cancer_cell_lines))
# *****************************************

with open(_FOLDER_2+"X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]
print("Number of PubChem drug properties:", len(X_PubChem_properties))
# *****************************************

with open(_FOLDER_2+"X_features_Targets.txt", 'r') as f:
    X_targets = [line.rstrip('\n') for line in f]
print("Number of possible targets:", len(X_targets))
# *****************************************

with open(_FOLDER_2+"X_features_Target_Pathway.txt", 'r') as f:
    X_target_pathway = [line.rstrip('\n') for line in f]
print("Number of possible target pathways:", len(X_target_pathway))
# *****************************************

print("\n Maximum number of features:",len(X_cancer_cell_lines)+ len(X_PubChem_properties)+len(X_targets) + len(X_target_pathway))

Number of cancer cell lines features: 1073
Number of PubChem drug properties: 26
Number of possible targets: 229
Number of possible target pathways: 23

 Maximum number of features: 1351


In [8]:
difference = set(merged_df.columns) - set(X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway)
len(difference), difference

(33,
 {'COSMIC_ID',
  'DRUG_ID',
  'Drug_Name',
  'H',
  'MAX_CONC',
  'Target',
  'Target_Pathway',
  'elements',
  'fd_num_0',
  'fd_num_1',
  'fd_num_2',
  'fd_num_3',
  'fd_num_4',
  'fd_num_5',
  'fd_num_6',
  'fd_num_7',
  'fd_num_8',
  'fd_num_9',
  'molecular_formula',
  'norm_cells_0',
  'norm_cells_1',
  'norm_cells_2',
  'norm_cells_3',
  'norm_cells_4',
  'norm_cells_5',
  'norm_cells_6',
  'norm_cells_7',
  'norm_cells_8',
  'norm_cells_9',
  'param_1',
  'param_2',
  'param_3',
  'param_4'})

### Exclude drugs that don't have PubChem_id

In [9]:
drugs_with_pubchem_id = df_drugs_properties[df_drugs_properties["PubChem_ID"]!="0"]["DRUG_ID"].values
drugs_with_no_pubchem_id = df_drugs_properties[df_drugs_properties["PubChem_ID"]=="0"]["DRUG_ID"].values

with open(_FOLDER_2 + "drugs_with_pubchem_id.txt", 'w') as f:
    for s in drugs_with_pubchem_id:
        f.write(str(s) + '\n')
with open(_FOLDER_2 + "drugs_with_no_pubchem_id.txt", 'w') as f:
    for s in drugs_with_no_pubchem_id:
        f.write(str(s) + '\n')
print("All drugs: %d, With known PubChem_Id: %d" % (df_drugs_properties.shape[0], len(drugs_with_pubchem_id)))

All drugs: 265, With known PubChem_Id: 250


In [10]:
filtered_drugs_with_pubchem_id = set(drugs_with_pubchem_id) & set(df_profiles["DRUG_ID"].unique())

merged_df = merged_df.set_index("DRUG_ID").loc[filtered_drugs_with_pubchem_id, :].reset_index()
merged_df.shape

(2585, 1384)

### Exclude drugs that don't have PubChem_id

In [11]:
drugs_with_pubchem_id = df_drugs_properties[df_drugs_properties["PubChem_ID"]!=0]["DRUG_ID"]
print("All drugs: %d, With known PubChem_Id: %d" % (df_drugs_properties.shape[0], len(drugs_with_pubchem_id)))

All drugs: 265, With known PubChem_Id: 265


In [12]:
drugs_with_pubchem_id = df_drugs_properties[df_drugs_properties["PubChem_ID"]!=0]["DRUG_ID"].values
merged_df = merged_df.set_index("DRUG_ID").loc[filtered_drugs_with_pubchem_id, :].reset_index()
merged_df.shape

(2585, 1384)

### Save the data

In [13]:
merged_df.to_csv(_FOLDER_2 +"merged_fitted_sigmoid4_123_with_drugs_properties.csv")

In [14]:
merged_df["Br"].value_counts()

0.0    2554
1.0      31
Name: Br, dtype: int64