In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import os

from scipy.stats import norm

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display
# Base directory that is prefixed to most of the file names read below.
_FOLDER = "results/"
# Alternative absolute location, currently disabled:
# _FOLDER = "/home/acq18mk/master/results/"

In [2]:
# Show which files are present in the results folder.
os.listdir(_FOLDER)

['drug_cells_kernels_best_parameters.csv',
 'X_features_1122_easy_read.txt',
 'drug_cells_PubChem_scaled_kernels_best_parameters.csv',
 '.DS_Store',
 'merged_drug_profiles_cells_sigmoid4_123.csv',
 'drugs_with_pubchem_id.txt',
 'drug_features_with_pubchem_properties.csv',
 'drugs_with_no_pubchem_id.txt',
 'statistics_of_sigmoid_coefficients.csv',
 'target_target_pathway_df.csv',
 'X_features_Targets.txt',
 'X_PubChem_features.txt',
 '.ipynb_checkpoints',
 'X_columns_1094.txt',
 'drug_cells_description_kernels_best_parameters.csv',
 'filtered_drug_profiles_123.csv',
 'X_PubChem_properties.txt',
 'X_features_Target_Pathway.txt',
 'merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'drug_cells_PubChem_kernels_best_parameters.csv',
 'X_features_cancer_cell_lines.txt']

In [3]:
# potential features for ML:
# - X_cancer_cell_lines - 1073
# - X_PubChem_properties - 26
# - X_targets - 229
# - X_target_pathway

def _read_feature_names(file_name):
    """Return the lines of ``_FOLDER + file_name`` with the trailing newline stripped.

    One helper replaces four copy-pasted read blocks, so each feature list
    is loaded exactly the same way.
    """
    with open(_FOLDER + file_name, 'r') as f:
        return [line.rstrip('\n') for line in f]


X_cancer_cell_lines = _read_feature_names("X_features_cancer_cell_lines.txt")
print("Number of cancer cell lines features:", len(X_cancer_cell_lines))

X_PubChem_properties = _read_feature_names("X_PubChem_properties.txt")
print("Number of PubChem drug properties:", len(X_PubChem_properties))

X_targets = _read_feature_names("X_features_Targets.txt")
print("Number of possible targets:", len(X_targets))

X_target_pathway = _read_feature_names("X_features_Target_Pathway.txt")
print("Number of possible target pathways:", len(X_target_pathway))

# Upper bound on the feature count: the four lists taken together.
print("\n Maximum number of features:",
      len(X_cancer_cell_lines) + len(X_PubChem_properties) + len(X_targets) + len(X_target_pathway))

Number of cancer cell lines features: 1073
Number of PubChem drug properties: 26
Number of possible targets: 229
Number of possible target pathways: 23

 Maximum number of features: 1351


### Data Sparsity

In [13]:
# Load the merged table and discard its "Unnamed: 0" column.
merged_path = _FOLDER + "merged_fitted_sigmoid4_123_with_drugs_properties.csv"
df = pd.read_csv(merged_path).drop(columns=["Unnamed: 0"])
df.shape

(2585, 1380)

In [15]:
# Columns of df that appear in none of the four feature lists.
difference = set(df.columns).difference(
    X_cancer_cell_lines, X_PubChem_properties, X_targets, X_target_pathway
)
len(difference), difference

(29,
 {'COSMIC_ID',
  'DRUG_ID',
  'Drug_Name',
  'MAX_CONC',
  'fd_num_0',
  'fd_num_1',
  'fd_num_2',
  'fd_num_3',
  'fd_num_4',
  'fd_num_5',
  'fd_num_6',
  'fd_num_7',
  'fd_num_8',
  'fd_num_9',
  'molecular_formula',
  'norm_cells_0',
  'norm_cells_1',
  'norm_cells_2',
  'norm_cells_3',
  'norm_cells_4',
  'norm_cells_5',
  'norm_cells_6',
  'norm_cells_7',
  'norm_cells_8',
  'norm_cells_9',
  'param_1',
  'param_2',
  'param_3',
  'param_4'})

In [16]:
# dataset1 -  only cancer cell lines features
columns = X_cancer_cell_lines
subset = df[columns]
n_rows, n_cols = subset.shape
print("Dataset 1 - only cancer cell lines features:", subset.shape)

# Count the entries equal to zero and relate them to the total entry count.
zero_elements = (subset.values == 0).sum()
all_elements = n_rows * n_cols
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 1 - only cancer cell lines features: (2585, 1073)

Number of zero elements: 2,676,495.00
Number of all elements: 2,773,705.00
Sparsity of data in %: 96.495


In [17]:
# dataset2 -  cancer cell lines features + drug description
columns = X_cancer_cell_lines + X_targets + X_target_pathway +["MAX_CONC"]
subset = df[columns]
n_rows, n_cols = subset.shape
print("Dataset 2 - cancer cell lines features + drug description:", subset.shape)

# Count the entries equal to zero and relate them to the total entry count.
zero_elements = (subset.values == 0).sum()
all_elements = n_rows * n_cols
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 2 - cancer cell lines features + drug description: (2585, 1326)

Number of zero elements: 3,320,870.00
Number of all elements: 3,427,710.00
Sparsity of data in %: 96.883


In [18]:
# dataset3 -  cancer cell lines features + PubChem drug properties
columns = X_cancer_cell_lines + X_PubChem_properties +["MAX_CONC"]
subset = df[columns]
n_rows, n_cols = subset.shape
print("Dataset 3 - cancer cell lines features + PubChem drug properties:", subset.shape)

# Count the entries equal to zero and relate them to the total entry count.
zero_elements = (subset.values == 0).sum()
all_elements = n_rows * n_cols
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 3 - cancer cell lines features + PubChem drug properties: (2585, 1100)

Number of zero elements: 2,709,336.00
Number of all elements: 2,843,500.00
Sparsity of data in %: 95.282


In [19]:
# dataset4 -  cancer cell lines features + drug description + PubChem drug properties
columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]
# The label used to read "Dataset 3 - ... PubChem drug properties" (copied from the
# previous cell); it now names the dataset that is actually being measured.
print("Dataset 4 - cancer cell lines features + drug description + PubChem drug properties:", df[columns].shape)

# Count the entries equal to zero and relate them to the total entry count.
zero_elements = (df[columns].values == 0).sum()
print("\nNumber of zero elements:", '{:,.2f}'.format(zero_elements))
all_elements = df[columns].shape[0] * df[columns].shape[1]
print("Number of all elements:", '{:,.2f}'.format(all_elements))
print("Sparsity of data in %:", round(100*zero_elements/all_elements, 3))

Dataset 3 - cancer cell lines features + PubChem drug properties: (2585, 1352)

Number of zero elements: 3,353,711.00
Number of all elements: 3,494,920.00
Sparsity of data in %: 95.96


### Train and test data

In [33]:
# drugs with >50 profiles:

# NOTE(review): this re-reads the merged table into df without dropping
# "Unnamed: 0", unlike the earlier load of the same file — confirm this is intended.
df = pd.read_csv(_FOLDER+"merged_fitted_sigmoid4_123_with_drugs_properties.csv")

In [37]:
# Read the drug ids stored one integer per line.
# The path is built from _FOLDER, like the other reads, instead of a hard-coded "results/".
with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    # Skip blank lines (e.g. a trailing empty line) so np.int32 is never given "".
    drug_ids_50 = [np.int32(line.strip()) for line in f if line.strip()]

len(drug_ids_50), drug_ids_50

(11, [170, 173, 180, 200, 219, 272, 273, 274, 276, 328, 346])

In [26]:
# Load the train and test tables and drop the two "Unnamed" columns from each.
leftover_columns = ["Unnamed: 0", "Unnamed: 0.1"]
train_df = pd.read_csv(
    _FOLDER + "train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv"
).drop(columns=leftover_columns)
test_df = pd.read_csv(
    _FOLDER + "test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv"
).drop(columns=leftover_columns)

In [58]:
# Column-name groups: fd_num_0..9, norm_cells_0..9 and param_1..4.
conc_columns = ["fd_num_{}".format(idx) for idx in range(10)]
norm_response = ["norm_cells_{}".format(idx) for idx in range(10)]
param = ["param_{}".format(idx) for idx in range(1, 5)]

In [32]:
# Keep the feature columns that take more than two distinct values in df.
all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]
columns_to_normalise = [col for col in all_columns if df[col].nunique() > 2]

# The original referenced `col_to_normalise`, a name this notebook never defines
# (NameError on a fresh kernel); report the list that was just built.
len(columns_to_normalise), len(X_PubChem_properties +["MAX_CONC"]), columns_to_normalise

(15,
 27,
 ['molecular_weight',
  '2bonds',
  'xlogp',
  'surface_area',
  'complexity',
  'h_bond_donor_count',
  'h_bond_acceptor_count',
  'rotatable_bond_count',
  'heavy_atom_count',
  'atom_stereo_count',
  'defined_atom_stereo_count',
  'undefined_atom_stereo_count',
  'bond_stereo_count',
  'covalent_unit_count',
  'MAX_CONC'])

In [38]:
# Save the selected column names, one per line.
with open("results/columns_to_normalise.txt", 'w') as out_file:
    out_file.writelines(str(name) + '\n' for name in columns_to_normalise)

In [35]:
# Check whether MAX_CONC is a column of df and of train_df.
"MAX_CONC" in df.columns, "MAX_CONC" in train_df.columns

(True, True)

### Previous results

In [59]:
# NOTE(review): drug_features is not defined in the visible part of this notebook,
# so this cell appears to depend on leftover kernel state — confirm its source.
# It also overwrites the columns_to_normalise list computed earlier.
columns_to_normalise = drug_features +["MAX_CONC"]

In [36]:
# Read the two saved feature-name lists, one name per line.
X_columns_1123 = []
with open(_FOLDER + "X_features_1123_easy_read.txt", 'r') as f:
    for raw_line in f:
        X_columns_1123.append(raw_line.rstrip('\n'))

X_columns_1122 = []
with open(_FOLDER + "X_features_1122_easy_read.txt", 'r') as f:
    for raw_line in f:
        X_columns_1122.append(raw_line.rstrip('\n'))

len(X_columns_1122), len(X_columns_1123)

(1122, 1123)

In [37]:
# Names present in the 1123-entry list but not in the 1122-entry list.
set(X_columns_1123) - set(X_columns_1122)

{'MAX_CONC'}

In [60]:
# Re-read the merged table into df (replacing the earlier df) and count its columns.
df = pd.read_csv(_FOLDER+'merged_fitted_sigmoid4_123_with_drugs_properties.csv')
len(df.columns)

1154

In [40]:
# Check whether MAX_CONC is a column of the currently loaded df.
"MAX_CONC" in df.columns

False

In [44]:
# Load the train table with drug descriptions and drop its two "Unnamed" columns.
df3 = pd.read_csv(
    _FOLDER + "train08_merged_fitted_sigmoid4_123_with_drugs_description.csv"
).drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [45]:
# Display the column index of df3.
df3.columns

Index(['DRUG_ID', 'COSMIC_ID', 'fd_num_0', 'fd_num_1', 'fd_num_2', 'fd_num_3',
       'fd_num_4', 'fd_num_5', 'fd_num_6', 'fd_num_7',
       ...
       'chr9:123555399-123555899(FBXW2)_HypMET',
       'chr9:140310894-140312457(EXD3)_HypMET',
       'chr9:21974578-21975306(CDKN2A)_HypMET',
       'chr9:35756948-35757339(MSMP)_HypMET',
       'chr9:35791584-35791924(NPR2)_HypMET',
       'chr9:4984543-4985630(JAK2)_HypMET',
       'chr9:86571047-86572027(C9orf64)_HypMET',
       'chr9:98783216-98784364(NCRNA00092)_HypMET', 'Target_Pathway',
       'Drug_Name'],
      dtype='object', length=1101)

In [48]:
# List the contents of the "results" directory (path is hard-coded here rather than taken from _FOLDER).
os.listdir("results")

['drug_cells_kernels_best_parameters.csv',
 'X_features_1122_easy_read.txt',
 'test02_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'drug_cells_PubChem_scaled_kernels_best_parameters.csv',
 '.DS_Store',
 'test02_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'drug_features_with_pubchem_properties.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties_split_target.csv',
 'X_PubChem_features_easy_read.txt',
 'merged_fitted_sigmoid4_123_with_drugs_description_split_target.csv',
 'statistics_of_sigmoid_coefficients.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'X_features_cancer_cell_lines_easy_read.txt',
 'train08_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'X_target_and_pathway_columns_easy_read.txt',
 'merged_drug_profiles_sigmoid4_123.csv',
 '.ipynb_checkpoints',
 'drug_cells_description_kernels_best_parameters.csv',
 'X_features_1123_easy_read.txt',
 'merged_fitted_si

In [49]:
# Read the cancer-cell-line feature names, one per line.
with open(_FOLDER + "X_features_cancer_cell_lines_easy_read.txt", 'r') as f:
    X_cancer_cells = list(map(lambda row: row.rstrip('\n'), f))
len(X_cancer_cells)

1073

In [50]:
# Read the PubChem feature names, one per line.
with open("results/X_PubChem_features_easy_read.txt", 'r') as f:
    X_PubChem_features = list(map(lambda row: row.rstrip('\n'), f))
len(X_PubChem_features)

26

In [51]:
# Read the target / target-pathway column names, one per line.
with open("results/X_target_and_pathway_columns_easy_read.txt", 'r') as f:
    X_target_and_pathway = list(map(lambda row: row.rstrip('\n'), f))
len(X_target_and_pathway)

26

In [53]:
# Sizes of the concatenated lists: without and with the PubChem features.
len(X_cancer_cells+X_target_and_pathway), len(X_cancer_cells+X_target_and_pathway+X_PubChem_features)

(1099, 1125)

### Dataset 2 with drug description

In [94]:
def RunCrossValidation(merged_df, drug_ids, number_coefficients, train_ratio=0.8, column_not_to_use =[],
                       kernel='linear', param_tested = "C", param_tested_values = [],
                       degree=3, gamma="scale", coef0=0.0, C=1.0, epsilon=0.1, cache_size=200,
                       features_to_scale=[], scaling=False, print_results=True):
    """Select the feature columns of ``merged_df`` and save their names to disk.

    As written here, the function only derives the X columns: every column of
    ``merged_df`` except the ``param_*`` / ``param*`` / ``norm_cells_*`` /
    ``fd_num_*`` columns (indices 0-9) and the names in ``column_not_to_use``.
    It prints how many columns remain and writes them, one per line, to
    ``results/X_columns_1094.txt``.

    Parameters
    ----------
    merged_df : object exposing ``.columns`` (used here as a pandas DataFrame)
        Table whose columns are filtered.
    column_not_to_use : list
        Extra column names to exclude. Only read, never mutated.
    drug_ids, number_coefficients, train_ratio, kernel, param_tested,
    param_tested_values, degree, gamma, coef0, C, epsilon, cache_size,
    features_to_scale, scaling, print_results
        Accepted but not used by the code shown here.

    Returns
    -------
    None
        There is no return statement in this version.
    """
    param1 = ["param_" + str(i) for i in range(10)]
    param2 = ["param" + str(i) for i in range(10)]
    norm_response = ["norm_cells_" + str(i) for i in range(10)]
    con_columns = ["fd_num_" + str(i) for i in range(10)]
    not_X_columns = set(param1 + param2 + norm_response + con_columns + column_not_to_use)

    # Keep the unique remaining names in DataFrame column order. Iterating a
    # set (as before) made the order of the saved file differ between runs.
    X_columns = []
    seen = set()
    for col in merged_df.columns:
        if col in not_X_columns or col in seen:
            continue
        seen.add(col)
        X_columns.append(col)
    print("Number of X_columns:", len(X_columns))

    with open("results/X_columns_1094.txt", 'w') as f:
        for s in X_columns:
            f.write(str(s) + '\n')

def TuneParameters(merged_df, drug_ids, number_coefficients, kernels = [], column_not_to_use =[], 
                   param_tested = "C", param_tested_values = [], 
                   degree=3, gamma='scale', coef0=0.0, C=1.0, epsilon=0.1, cache_size=200,
                   features_to_scale=[], scaling=False, print_results=True):
    """Run the epsilon search through RunCrossValidation for each "linear" kernel.

    Every entry of ``kernels`` equal to ``"linear"`` triggers one call to
    ``RunCrossValidation`` with ``param_tested="epsilon"`` and a fixed grid of
    candidate values; any other kernel name is skipped. The remaining keyword
    arguments of this function are not used by the code shown here, and
    nothing is returned.
    """
    epsilon_grid = (0.001, 0.01, 0.1, 1, 2, 5)

    for kernel in kernels:
        if kernel != "linear":
            # No search is defined here for other kernels.
            continue
        best_epsilon = RunCrossValidation(
            merged_df, drug_ids, number_coefficients,
            kernel=kernel,
            column_not_to_use=column_not_to_use,
            param_tested="epsilon",
            param_tested_values=list(epsilon_grid),  # fresh list for every call
            features_to_scale=features_to_scale,
            scaling=scaling,
            print_results=print_results,
        )


print("\n2. Finding optimal parameters for drug profiles, cell lines and drug description\n")
# Replaces df with the table that carries the drug description columns.
df = pd.read_csv(_FOLDER+'merged_fitted_sigmoid4_123_with_drugs_description.csv')

# One-hot encode Target_Pathway as dummy columns (original note: 21 new columns)
df = pd.concat([df, pd.get_dummies(df["Target_Pathway"])], axis=1)

conc_columns= ["fd_num_"+str(i) for i in range(10)]
response_norm = ['norm_cells_'+str(i) for i in range(10)]

# Keep the drugs whose DRUG_ID group has more than 50 non-null COSMIC_ID entries.
gr = df.groupby(["DRUG_ID"])["COSMIC_ID"].count()
drug_ids = list(gr[gr > 50].index)
print("Number of drugs for training:", len(drug_ids))

kernels_to_test = ["linear"]
# NOTE(review): column_not_to_use is not defined at top level anywhere in the visible
# notebook — this call appears to rely on leftover kernel state; confirm its value.
# NOTE(review): TuneParameters as defined in this cell has no return statement,
# so results is None.
results = TuneParameters(df, drug_ids, 4, kernels = kernels_to_test, 
                         column_not_to_use=column_not_to_use, print_results=False)


2. Finding optimal parameters for drug profiles, cell lines and drug description

Number of drugs for training: 11
Number of X_columns: 1094


In [85]:
# Read back the column names saved by RunCrossValidation, one per line.
X_columns_1094 = []
with open("results/X_columns_1094.txt", 'r') as f:
    for raw_line in f:
        X_columns_1094.append(raw_line.rstrip('\n'))
len(X_columns_1094)

1094

In [89]:
# Saved X columns that are not cancer-cell-line features ...
drug_description = set(X_columns_1094).difference(X_cancer_cells)
# ... and the target/pathway names that are absent from them.
missing = [name for name in X_target_and_pathway if name not in drug_description]
missing

['ABL signaling',
 'Hormone-related',
 'first_target',
 'second_target',
 'third_target']

In [93]:
# Is 'ABL signaling' among the saved X column names?
'ABL signaling' in  X_columns_1094

False

In [95]:
# Is 'ABL signaling' a column of df?
'ABL signaling' in df.columns

False

In [96]:
# List the files in the "data/" directory.
os.listdir("data/")

['Drug_Features2.csv',
 'Cell_Lines_Details.csv',
 'Drug_Features.csv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.xlsx',
 'Cell_Line_Features_PANCAN_simple_MOBEM.tsv',
 'normalised_dose_response_data.csv']

In [101]:
# Load the drug features file and show its columns.
df4 = pd.read_csv("data/Drug_Features.csv")
df4.columns

Index(['Drug ID', 'Drug Name', 'Synonyms', 'Target', 'Target Pathway'], dtype='object')

In [107]:
# Number of distinct values in the Target column.
df4["Target"].nunique()

198

In [105]:
# Is 'ABL signaling' one of the values of the "Target Pathway" column?
'ABL signaling' in df4["Target Pathway"].values

False

In [65]:
# Column count of df and whether MAX_CONC is one of its columns.
len(df.columns), "MAX_CONC" in df.columns

(1101, False)

In [64]:
# Columns of df3 that are neither profile/parameter columns nor in any feature list.
set(df3.columns).difference(
    conc_columns, norm_response, param,
    X_cancer_cells, X_target_and_pathway, X_PubChem_features,
)

{'COSMIC_ID', 'DRUG_ID', 'Drug_Name', 'Target_Pathway'}

In [76]:
# Columns of df3 left over after removing the profile/parameter and cell-line columns.
other_data = set(df3.columns).difference(conc_columns, norm_response, param, X_cancer_cells)
print(len(other_data))
other_data

4


{'COSMIC_ID', 'DRUG_ID', 'Drug_Name', 'Target_Pathway'}

In [83]:
# Compare the summed sizes of the column groups with the number of df3 columns.
len(conc_columns + norm_response + param + X_cancer_cells + list(other_data)), len(df3.columns)

(1101, 1101)

In [74]:
# PubChem features that are not columns of df3.
# The previous expression iterated over a subset of df3.columns and kept only the
# elements "not in df3.columns", which can never yield anything but an empty list.
# Iterating X_PubChem_features instead reproduces the recorded ordered output.
missing_data = [x for x in X_PubChem_features if x not in df3.columns]
missing_data

['molecular_weight',
 '2bonds',
 '3bonds',
 'xlogp',
 'formal_charge',
 'surface_area',
 'complexity',
 'h_bond_donor_count',
 'h_bond_acceptor_count',
 'rotatable_bond_count',
 'heavy_atom_count',
 'atom_stereo_count',
 'defined_atom_stereo_count',
 'undefined_atom_stereo_count',
 'bond_stereo_count',
 'covalent_unit_count',
 'B',
 'I',
 'Br',
 'Cl',
 'O',
 'N',
 'F',
 'P',
 'S',
 'Pt']

In [67]:
# Cell-line and target/pathway names that are not columns of df3.
set(X_cancer_cells).union(X_target_and_pathway).difference(df3.columns)

{'ABL signaling',
 'Apoptosis regulation',
 'Cell cycle',
 'Chromatin histone acetylation',
 'Chromatin histone methylation',
 'Chromatin other',
 'Cytoskeleton',
 'DNA replication',
 'EGFR signaling',
 'ERK MAPK signaling',
 'Genome integrity',
 'Hormone-related',
 'IGFR signaling',
 'JNK and p38 signaling',
 'Metabolism',
 'Mitosis',
 'Other',
 'Other, kinases',
 'PI3K/MTOR signaling',
 'Protein stability and degradation',
 'RTK signaling',
 'WNT signaling',
 'first_target',
 'p53 pathway',
 'second_target',
 'third_target'}

In [68]:
# Display the column index of df3 again.
df3.columns

Index(['DRUG_ID', 'COSMIC_ID', 'fd_num_0', 'fd_num_1', 'fd_num_2', 'fd_num_3',
       'fd_num_4', 'fd_num_5', 'fd_num_6', 'fd_num_7',
       ...
       'chr9:123555399-123555899(FBXW2)_HypMET',
       'chr9:140310894-140312457(EXD3)_HypMET',
       'chr9:21974578-21975306(CDKN2A)_HypMET',
       'chr9:35756948-35757339(MSMP)_HypMET',
       'chr9:35791584-35791924(NPR2)_HypMET',
       'chr9:4984543-4985630(JAK2)_HypMET',
       'chr9:86571047-86572027(C9orf64)_HypMET',
       'chr9:98783216-98784364(NCRNA00092)_HypMET', 'Target_Pathway',
       'Drug_Name'],
      dtype='object', length=1101)