# Lasso Regression as feature selection

The idea is to test Lasso regression with the highest possible regularisation parameter 
<br> and compare the results with those obtained via cross-validation


Training for all the datasets by each coefficients in sigmoid_4_param

**Results:** All the coefficients of Lasso turned to be 0, and nothing helped to improve this situation:<br>
-alpha was tested up to the values 2000,
-option for normalisation was turnd on/off

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
import os
from data_preprocessing import FilteringCurves, ShowResponseCurves
from fitting_curves import FittingColumn, ShowResponseCurvesWithFitting, compute_r2_score
# _FOLDER = "/home/acq18mk/master/results/"
_FOLDER = "../results/"

In [2]:
with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
drug_ids_50

[170, 173, 180, 200, 219, 272, 273, 274, 276, 328, 346]

In [3]:
# potential features for ML:
# - X_cancer_cell_lines - 1073
# - X_PubChem_properties - 26
# - X_targets - 229
# - X_target_pathway -23

#columns to normalise:
with open(_FOLDER+"columns_to_normalise.txt", 'r') as f:
    columns_to_normalise = [line.rstrip('\n') for line in f]
print("Number of cancer cell lines features:", len(columns_to_normalise))
# *****************************************

with open(_FOLDER+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]
print("Number of cancer cell lines features:", len(X_cancer_cell_lines))
# *****************************************

with open(_FOLDER+"X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]
print("Number of PubChem drug properties:", len(X_PubChem_properties))
# *****************************************

with open(_FOLDER+"X_features_Targets.txt", 'r') as f:
    X_targets = [line.rstrip('\n') for line in f]
print("Number of possible targets:", len(X_targets))
# *****************************************

with open(_FOLDER+"X_features_Target_Pathway.txt", 'r') as f:
    X_target_pathway = [line.rstrip('\n') for line in f]
print("Number of possible target pathways:", len(X_target_pathway))
# *****************************************

all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]
print("\nMaximum number of features:",len(X_cancer_cell_lines)+ len(X_PubChem_properties)+len(X_targets) + len(X_target_pathway), len(all_columns))

Number of cancer cell lines features: 15
Number of cancer cell lines features: 1073
Number of PubChem drug properties: 26
Number of possible targets: 229
Number of possible target pathways: 23

Maximum number of features: 1351 1352


In [4]:
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

In [6]:
train_df.shape, test_df.shape, train_df_50.shape, test_df_50.shape

((1843, 1380), (496, 1380), (781, 1379), (202, 1379))

In [7]:
set(train_df.columns) - set(all_columns)

{'COSMIC_ID',
 'DRUG_ID',
 'Drug_Name',
 'fd_num_0',
 'fd_num_1',
 'fd_num_2',
 'fd_num_3',
 'fd_num_4',
 'fd_num_5',
 'fd_num_6',
 'fd_num_7',
 'fd_num_8',
 'fd_num_9',
 'molecular_formula',
 'norm_cells_0',
 'norm_cells_1',
 'norm_cells_2',
 'norm_cells_3',
 'norm_cells_4',
 'norm_cells_5',
 'norm_cells_6',
 'norm_cells_7',
 'norm_cells_8',
 'norm_cells_9',
 'param_1',
 'param_2',
 'param_3',
 'param_4'}

In [8]:
for drug_id in drug_ids_50:
    print(drug_id, len(np.unique(train_df.loc[drug_id, "MAX_CONC"])), np.unique(train_df.loc[drug_id, "MAX_CONC"]))

170 1 [1.024]
173 1 [1.024]
180 1 [1.024]
200 1 [16.]
219 1 [1.024]
272 1 [0.064]
273 1 [0.064]
274 1 [0.064]
276 1 [0.064]
328 1 [5.12]
346 1 [2.56]


In [9]:
# It is no sense to include MAX_CONC in drug-by-drug training
# Actually, in drug-by-drug training all drug features are the same!!!
# so it is no reason for MinMaxScaling?
# it only makes sense to check negative values

In [10]:
X_feat_dict = {"Dataset 1": X_cancer_cell_lines ,
               "Dataset 2": ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines ,
               "Dataset 3": ["MAX_CONC"] + X_PubChem_properties +  X_cancer_cell_lines,
               "Dataset 4": ["MAX_CONC"] + X_PubChem_properties +  X_targets + X_target_pathway + X_cancer_cell_lines}

### Coefficient 1

In [11]:
drug_ids_50

[170, 173, 180, 200, 219, 272, 273, 274, 276, 328, 346]

In [12]:
datasets = ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4"]

In [13]:
results = pd.DataFrame(index = drug_ids_50)

for drug_id in drug_ids_50:
    drug_name = train_df_50.loc[drug_id, "Drug_Name"].values[0]
    print(drug_id, drug_name)
    results.loc[drug_id, "Drug_Name"] =  drug_name
    train_drug = train_df_50.loc[drug_id,:].copy()
    test_drug = test_df_50.loc[drug_id,:].copy()

    for i, data_set in list(enumerate(datasets)):
        X_columns = X_feat_dict[data_set]
#         print("\t\t", data_set, "\t\t", train_df_50[X_columns].shape)
 
        X_train = train_drug[X_columns].values
        y_train = train_drug["param_1"].values

        X_test = test_drug[X_columns].values
        y_test = test_drug["param_1"].values

        alpha = 500
        model = Lasso(alpha=alpha, normalize=True)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        results.loc[drug_id, "mae_" + str(i)] = mae
        results.loc[drug_id, "mse_" + str(i)] = mse
        results.loc[drug_id, "intercept_" + str(i)] = model.intercept_
        results.loc[drug_id, "sum_coef_" + str(i)] = sum(model.coef_)

#         print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
#         print("\t\t Sum of model coefficients:", sum(model.coef_))
#         non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
#         print("\t\t Non zero features with values:", non_zero_features_dict)
#         print("")
    del train_drug
    del test_drug

170 Shikonin
173 FH535
180 Thapsigargin
200 Dacinostat
219 AT-7519
272 AR-42
273 CUDC-101
274 Belinostat
276 CAY10603
328 SNX-2112
346 THZ-2-102-1


In [14]:
results[["Drug_Name"]+["sum_coef_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,sum_coef_0,sum_coef_1,sum_coef_2,sum_coef_3
170,Shikonin,0.0,0.0,0.0,0.0
173,FH535,0.0,0.0,0.0,0.0
180,Thapsigargin,0.0,0.0,0.0,0.0
200,Dacinostat,0.0,0.0,0.0,0.0
219,AT-7519,0.0,0.0,0.0,0.0
272,AR-42,0.0,0.0,0.0,0.0
273,CUDC-101,0.0,0.0,0.0,0.0
274,Belinostat,0.0,0.0,0.0,0.0
276,CAY10603,0.0,0.0,0.0,0.0
328,SNX-2112,0.0,0.0,0.0,0.0


In [15]:
results[["Drug_Name"]+["intercept_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,intercept_0,intercept_1,intercept_2,intercept_3
170,Shikonin,0.580835,0.580835,0.580835,0.580835
173,FH535,0.614222,0.614222,0.614222,0.614222
180,Thapsigargin,1.571673,1.571673,1.571673,1.571673
200,Dacinostat,0.594152,0.594152,0.594152,0.594152
219,AT-7519,0.669953,0.669953,0.669953,0.669953
272,AR-42,0.664511,0.664511,0.664511,0.664511
273,CUDC-101,0.594359,0.594359,0.594359,0.594359
274,Belinostat,0.604637,0.604637,0.604637,0.604637
276,CAY10603,0.52804,0.52804,0.52804,0.52804
328,SNX-2112,0.477085,0.477085,0.477085,0.477085


In [16]:
results[["Drug_Name"]+["mae_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mae_0,mae_1,mae_2,mae_3
170,Shikonin,0.124879,0.124879,0.124879,0.124879
173,FH535,0.13535,0.13535,0.13535,0.13535
180,Thapsigargin,2.502282,2.502282,2.502282,2.502282
200,Dacinostat,0.170154,0.170154,0.170154,0.170154
219,AT-7519,0.192086,0.192086,0.192086,0.192086
272,AR-42,0.159842,0.159842,0.159842,0.159842
273,CUDC-101,0.062858,0.062858,0.062858,0.062858
274,Belinostat,0.184657,0.184657,0.184657,0.184657
276,CAY10603,0.122949,0.122949,0.122949,0.122949
328,SNX-2112,0.131667,0.131667,0.131667,0.131667


In [17]:
results[["Drug_Name"]+["mse_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mse_0,mse_1,mse_2,mse_3
170,Shikonin,0.022347,0.022347,0.022347,0.022347
173,FH535,0.050504,0.050504,0.050504,0.050504
180,Thapsigargin,18.146173,18.146173,18.146173,18.146173
200,Dacinostat,0.040978,0.040978,0.040978,0.040978
219,AT-7519,0.044297,0.044297,0.044297,0.044297
272,AR-42,0.034059,0.034059,0.034059,0.034059
273,CUDC-101,0.005848,0.005848,0.005848,0.005848
274,Belinostat,0.040335,0.040335,0.040335,0.040335
276,CAY10603,0.019273,0.019273,0.019273,0.019273
328,SNX-2112,0.021644,0.021644,0.021644,0.021644


In [18]:
mean_mae = round(results["mae_0"].mean(), 3)
std_mae = round(results["mae_0"].std(), 3)

mean_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].mean(), 3)
std_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].std(), 3)

print("Mean MAE:", mean_mae, mean_mae_no_180)
print("std MAE:", std_mae, std_mae_no_180)

mean_mse = round(results["mse_0"].mean(), 3)
std_mse = round(results["mse_0"].std(), 3)

mean_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].mean(), 3)
std_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].std(), 3)

print("\nMean MSE:", mean_mse, mean_mse_no_180)
print("std MSE:", std_mse, std_mse_no_180)

Mean MAE: 0.351 0.136
std MAE: 0.715 0.042

Mean MSE: 1.676 0.029
std MSE: 5.463 0.015


In [19]:
# Conclusion: In the whole diapason range of alpha [0.001-2000], all the coefficients were set to 0.
# Without intercept coefficients remained 0, but the mae and mse increased almost 10 times
# build-in option normalize=True also didn't help

### Coefficient 2

In [20]:
results = pd.DataFrame(index = drug_ids_50)
for drug_id in drug_ids_50:
    drug_name = train_df_50.loc[drug_id, "Drug_Name"].values[0]
    print(drug_id, drug_name)
    results.loc[drug_id, "Drug_Name"] =  drug_name
    train_drug = train_df_50.loc[drug_id,:].copy()
    test_drug = test_df_50.loc[drug_id,:].copy()

    for i, data_set in list(enumerate(datasets)):
        X_columns = X_feat_dict[data_set]
#         print("\t\t", data_set, "\t\t", train_df_50[X_columns].shape)
 
        X_train = train_drug[X_columns].values
        y_train = train_drug["param_2"].values

        X_test = test_drug[X_columns].values
        y_test = test_drug["param_2"].values

        alpha = 500
        model = Lasso(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        results.loc[drug_id, "mae_" + str(i)] = mae
        results.loc[drug_id, "mse_" + str(i)] = mse
        results.loc[drug_id, "intercept_" + str(i)] = model.intercept_
        results.loc[drug_id, "sum_coef_" + str(i)] = sum(model.coef_)

#         print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
#         print("\t\t Sum of model coefficients:", sum(model.coef_))
#         non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
#         print("\t\t Non zero features with values:", non_zero_features_dict)
#         print("")
    del train_drug
    del test_drug

170 Shikonin
173 FH535
180 Thapsigargin
200 Dacinostat
219 AT-7519
272 AR-42
273 CUDC-101
274 Belinostat
276 CAY10603
328 SNX-2112
346 THZ-2-102-1


In [21]:
results[["Drug_Name"]+["sum_coef_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,sum_coef_0,sum_coef_1,sum_coef_2,sum_coef_3
170,Shikonin,0.0,0.0,0.0,0.0
173,FH535,0.0,0.0,0.0,0.0
180,Thapsigargin,0.0,0.0,0.0,0.0
200,Dacinostat,0.0,0.0,0.0,0.0
219,AT-7519,0.0,0.0,0.0,0.0
272,AR-42,0.0,0.0,0.0,0.0
273,CUDC-101,0.0,0.0,0.0,0.0
274,Belinostat,0.0,0.0,0.0,0.0
276,CAY10603,0.0,0.0,0.0,0.0
328,SNX-2112,0.0,0.0,0.0,0.0


In [22]:
results[["Drug_Name"]+["intercept_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,intercept_0,intercept_1,intercept_2,intercept_3
170,Shikonin,0.776455,0.776455,0.776455,0.776455
173,FH535,1.143384,1.143384,1.143384,1.143384
180,Thapsigargin,-0.176774,-0.176774,-0.176774,-0.176774
200,Dacinostat,0.921471,0.921471,0.921471,0.921471
219,AT-7519,1.030339,1.030339,1.030339,1.030339
272,AR-42,1.014484,1.014484,1.014484,1.014484
273,CUDC-101,1.06975,1.06975,1.06975,1.06975
274,Belinostat,0.826107,0.826107,0.826107,0.826107
276,CAY10603,1.033434,1.033434,1.033434,1.033434
328,SNX-2112,1.140798,1.140798,1.140798,1.140798


In [23]:
results[["Drug_Name"]+["mae_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mae_0,mae_1,mae_2,mae_3
170,Shikonin,0.269558,0.269558,0.269558,0.269558
173,FH535,0.222711,0.222711,0.222711,0.222711
180,Thapsigargin,2.785129,2.785129,2.785129,2.785129
200,Dacinostat,0.123186,0.123186,0.123186,0.123186
219,AT-7519,0.183067,0.183067,0.183067,0.183067
272,AR-42,0.079124,0.079124,0.079124,0.079124
273,CUDC-101,0.083109,0.083109,0.083109,0.083109
274,Belinostat,0.250585,0.250585,0.250585,0.250585
276,CAY10603,0.039935,0.039935,0.039935,0.039935
328,SNX-2112,0.180893,0.180893,0.180893,0.180893


In [24]:
results[["Drug_Name"]+["mse_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mse_0,mse_1,mse_2,mse_3
170,Shikonin,0.074066,0.074066,0.074066,0.074066
173,FH535,0.087835,0.087835,0.087835,0.087835
180,Thapsigargin,30.956782,30.956782,30.956782,30.956782
200,Dacinostat,0.016993,0.016993,0.016993,0.016993
219,AT-7519,0.058246,0.058246,0.058246,0.058246
272,AR-42,0.013926,0.013926,0.013926,0.013926
273,CUDC-101,0.018349,0.018349,0.018349,0.018349
274,Belinostat,0.079037,0.079037,0.079037,0.079037
276,CAY10603,0.002344,0.002344,0.002344,0.002344
328,SNX-2112,0.053936,0.053936,0.053936,0.053936


In [25]:
mean_mae = round(results["mae_0"].mean(), 3)
std_mae = round(results["mae_0"].std(), 3)

mean_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].mean(), 3)
std_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].std(), 3)

print("Mean MAE:", mean_mae, mean_mae_no_180)
print("std MAE:", std_mae, std_mae_no_180)

mean_mse = round(results["mse_0"].mean(), 3)
std_mse = round(results["mse_0"].std(), 3)

mean_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].mean(), 3)
std_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].std(), 3)

print("\nMean MSE:", mean_mse, mean_mse_no_180)
print("std MSE:", std_mse, std_mse_no_180)

Mean MAE: 0.398 0.16
std MAE: 0.795 0.077

Mean MSE: 2.855 0.045
std MSE: 9.32 0.031


### Coefficient 3

In [26]:
results = pd.DataFrame(index = drug_ids_50)
for drug_id in drug_ids_50:
    drug_name = train_df_50.loc[drug_id, "Drug_Name"].values[0]
    print(drug_id, drug_name)
    results.loc[drug_id, "Drug_Name"] =  drug_name
    train_drug = train_df_50.loc[drug_id,:].copy()
    test_drug = test_df_50.loc[drug_id,:].copy()

    for i, data_set in list(enumerate(datasets)):
        X_columns = X_feat_dict[data_set]
#         print("\t\t", data_set, "\t\t", train_df_50[X_columns].shape)
 
        X_train = train_drug[X_columns].values
        y_train = train_drug["param_1"].values

        X_test = test_drug[X_columns].values
        y_test = test_drug["param_1"].values

        alpha = 500
        model = Lasso(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        results.loc[drug_id, "mae_" + str(i)] = mae
        results.loc[drug_id, "mse_" + str(i)] = mse
        results.loc[drug_id, "intercept_" + str(i)] = model.intercept_
        results.loc[drug_id, "sum_coef_" + str(i)] = sum(model.coef_)

#         print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
#         print("\t\t Sum of model coefficients:", sum(model.coef_))
#         non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
#         print("\t\t Non zero features with values:", non_zero_features_dict)
#         print("")
    del train_drug
    del test_drug

170 Shikonin
173 FH535
180 Thapsigargin
200 Dacinostat
219 AT-7519
272 AR-42
273 CUDC-101
274 Belinostat
276 CAY10603
328 SNX-2112
346 THZ-2-102-1


In [27]:
results[["Drug_Name"]+["sum_coef_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,sum_coef_0,sum_coef_1,sum_coef_2,sum_coef_3
170,Shikonin,0.0,0.0,0.0,0.0
173,FH535,0.0,0.0,0.0,0.0
180,Thapsigargin,0.0,0.0,0.0,0.0
200,Dacinostat,0.0,0.0,0.0,0.0
219,AT-7519,0.0,0.0,0.0,0.0
272,AR-42,0.0,0.0,0.0,0.0
273,CUDC-101,0.0,0.0,0.0,0.0
274,Belinostat,0.0,0.0,0.0,0.0
276,CAY10603,0.0,0.0,0.0,0.0
328,SNX-2112,0.0,0.0,0.0,0.0


In [28]:
results[["Drug_Name"]+["intercept_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,intercept_0,intercept_1,intercept_2,intercept_3
170,Shikonin,0.580835,0.580835,0.580835,0.580835
173,FH535,0.614222,0.614222,0.614222,0.614222
180,Thapsigargin,1.571673,1.571673,1.571673,1.571673
200,Dacinostat,0.594152,0.594152,0.594152,0.594152
219,AT-7519,0.669953,0.669953,0.669953,0.669953
272,AR-42,0.664511,0.664511,0.664511,0.664511
273,CUDC-101,0.594359,0.594359,0.594359,0.594359
274,Belinostat,0.604637,0.604637,0.604637,0.604637
276,CAY10603,0.52804,0.52804,0.52804,0.52804
328,SNX-2112,0.477085,0.477085,0.477085,0.477085


In [29]:
results[["Drug_Name"]+["mae_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mae_0,mae_1,mae_2,mae_3
170,Shikonin,0.124879,0.124879,0.124879,0.124879
173,FH535,0.13535,0.13535,0.13535,0.13535
180,Thapsigargin,2.502282,2.502282,2.502282,2.502282
200,Dacinostat,0.170154,0.170154,0.170154,0.170154
219,AT-7519,0.192086,0.192086,0.192086,0.192086
272,AR-42,0.159842,0.159842,0.159842,0.159842
273,CUDC-101,0.062858,0.062858,0.062858,0.062858
274,Belinostat,0.184657,0.184657,0.184657,0.184657
276,CAY10603,0.122949,0.122949,0.122949,0.122949
328,SNX-2112,0.131667,0.131667,0.131667,0.131667


In [30]:
results[["Drug_Name"]+["mse_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mse_0,mse_1,mse_2,mse_3
170,Shikonin,0.022347,0.022347,0.022347,0.022347
173,FH535,0.050504,0.050504,0.050504,0.050504
180,Thapsigargin,18.146173,18.146173,18.146173,18.146173
200,Dacinostat,0.040978,0.040978,0.040978,0.040978
219,AT-7519,0.044297,0.044297,0.044297,0.044297
272,AR-42,0.034059,0.034059,0.034059,0.034059
273,CUDC-101,0.005848,0.005848,0.005848,0.005848
274,Belinostat,0.040335,0.040335,0.040335,0.040335
276,CAY10603,0.019273,0.019273,0.019273,0.019273
328,SNX-2112,0.021644,0.021644,0.021644,0.021644


In [31]:
mean_mae = round(results["mae_0"].mean(), 3)
std_mae = round(results["mae_0"].std(), 3)

mean_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].mean(), 3)
std_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].std(), 3)

print("Mean MAE:", mean_mae, mean_mae_no_180)
print("std MAE:", std_mae, std_mae_no_180)

mean_mse = round(results["mse_0"].mean(), 3)
std_mse = round(results["mse_0"].std(), 3)

mean_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].mean(), 3)
std_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].std(), 3)

print("\nMean MSE:", mean_mse, mean_mse_no_180)
print("std MSE:", std_mse, std_mse_no_180)

Mean MAE: 0.351 0.136
std MAE: 0.715 0.042

Mean MSE: 1.676 0.029
std MSE: 5.463 0.015


### Coefficient 4

In [32]:
results = pd.DataFrame(index = drug_ids_50)
for drug_id in drug_ids_50:
    drug_name = train_df_50.loc[drug_id, "Drug_Name"].values[0]
    print(drug_id, drug_name)
    results.loc[drug_id, "Drug_Name"] =  drug_name
    train_drug = train_df_50.loc[drug_id,:].copy()
    test_drug = test_df_50.loc[drug_id,:].copy()

    for i, data_set in list(enumerate(datasets)):
        X_columns = X_feat_dict[data_set]
#         print("\t\t", data_set, "\t\t", train_df_50[X_columns].shape)
 
        X_train = train_drug[X_columns].values
        y_train = train_drug["param_1"].values

        X_test = test_drug[X_columns].values
        y_test = test_drug["param_1"].values

        alpha = 500
        model = Lasso(alpha=alpha)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        
        results.loc[drug_id, "mae_" + str(i)] = mae
        results.loc[drug_id, "mse_" + str(i)] = mse
        results.loc[drug_id, "intercept_" + str(i)] = model.intercept_
        results.loc[drug_id, "sum_coef_" + str(i)] = sum(model.coef_)

#         print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
#         print("\t\t Sum of model coefficients:", sum(model.coef_))
#         non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
#         print("\t\t Non zero features with values:", non_zero_features_dict)
#         print("")
    del train_drug
    del test_drug

170 Shikonin
173 FH535
180 Thapsigargin
200 Dacinostat
219 AT-7519
272 AR-42
273 CUDC-101
274 Belinostat
276 CAY10603
328 SNX-2112
346 THZ-2-102-1


In [33]:
results[["Drug_Name"]+["sum_coef_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,sum_coef_0,sum_coef_1,sum_coef_2,sum_coef_3
170,Shikonin,0.0,0.0,0.0,0.0
173,FH535,0.0,0.0,0.0,0.0
180,Thapsigargin,0.0,0.0,0.0,0.0
200,Dacinostat,0.0,0.0,0.0,0.0
219,AT-7519,0.0,0.0,0.0,0.0
272,AR-42,0.0,0.0,0.0,0.0
273,CUDC-101,0.0,0.0,0.0,0.0
274,Belinostat,0.0,0.0,0.0,0.0
276,CAY10603,0.0,0.0,0.0,0.0
328,SNX-2112,0.0,0.0,0.0,0.0


In [34]:
results[["Drug_Name"]+["intercept_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,intercept_0,intercept_1,intercept_2,intercept_3
170,Shikonin,0.580835,0.580835,0.580835,0.580835
173,FH535,0.614222,0.614222,0.614222,0.614222
180,Thapsigargin,1.571673,1.571673,1.571673,1.571673
200,Dacinostat,0.594152,0.594152,0.594152,0.594152
219,AT-7519,0.669953,0.669953,0.669953,0.669953
272,AR-42,0.664511,0.664511,0.664511,0.664511
273,CUDC-101,0.594359,0.594359,0.594359,0.594359
274,Belinostat,0.604637,0.604637,0.604637,0.604637
276,CAY10603,0.52804,0.52804,0.52804,0.52804
328,SNX-2112,0.477085,0.477085,0.477085,0.477085


In [35]:
results[["Drug_Name"]+["mae_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mae_0,mae_1,mae_2,mae_3
170,Shikonin,0.124879,0.124879,0.124879,0.124879
173,FH535,0.13535,0.13535,0.13535,0.13535
180,Thapsigargin,2.502282,2.502282,2.502282,2.502282
200,Dacinostat,0.170154,0.170154,0.170154,0.170154
219,AT-7519,0.192086,0.192086,0.192086,0.192086
272,AR-42,0.159842,0.159842,0.159842,0.159842
273,CUDC-101,0.062858,0.062858,0.062858,0.062858
274,Belinostat,0.184657,0.184657,0.184657,0.184657
276,CAY10603,0.122949,0.122949,0.122949,0.122949
328,SNX-2112,0.131667,0.131667,0.131667,0.131667


In [36]:
results[["Drug_Name"]+["mse_"+str(i) for i in range(4)]]

Unnamed: 0,Drug_Name,mse_0,mse_1,mse_2,mse_3
170,Shikonin,0.022347,0.022347,0.022347,0.022347
173,FH535,0.050504,0.050504,0.050504,0.050504
180,Thapsigargin,18.146173,18.146173,18.146173,18.146173
200,Dacinostat,0.040978,0.040978,0.040978,0.040978
219,AT-7519,0.044297,0.044297,0.044297,0.044297
272,AR-42,0.034059,0.034059,0.034059,0.034059
273,CUDC-101,0.005848,0.005848,0.005848,0.005848
274,Belinostat,0.040335,0.040335,0.040335,0.040335
276,CAY10603,0.019273,0.019273,0.019273,0.019273
328,SNX-2112,0.021644,0.021644,0.021644,0.021644


In [37]:
mean_mae = round(results["mae_0"].mean(), 3)
std_mae = round(results["mae_0"].std(), 3)

mean_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].mean(), 3)
std_mae_no_180 = round(results.drop(180, axis=0)["mae_0"].std(), 3)

print("Mean MAE:", mean_mae, mean_mae_no_180)
print("std MAE:", std_mae, std_mae_no_180)

mean_mse = round(results["mse_0"].mean(), 3)
std_mse = round(results["mse_0"].std(), 3)

mean_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].mean(), 3)
std_mse_no_180 = round(results.drop(180, axis=0)["mse_0"].std(), 3)

print("\nMean MSE:", mean_mse, mean_mse_no_180)
print("std MSE:", std_mse, std_mse_no_180)

Mean MAE: 0.351 0.136
std MAE: 0.715 0.042

Mean MSE: 1.676 0.029
std MSE: 5.463 0.015


### Training for all drugs together

In [38]:
train_drug = train_df_50.loc[drug_id,:].copy()
test_drug = test_df_50.loc[drug_id,:].copy()

for i, data_set in list(enumerate(datasets)):
    
    if ((data_set == "Dataset 3") | (data_set == "Dataset 4")):
        scaler = MinMaxScaler()
        train_drug[columns_to_normalise] = scaler.fit_transform(train_drug[columns_to_normalise])
        test_drug[columns_to_normalise] = scaler.transform(test_drug[columns_to_normalise])
        
    X_columns = X_feat_dict[data_set]
 
    X_train = train_drug[X_columns].values
    y_train = train_drug["param_1"].values

    X_test = test_drug[X_columns].values
    y_test = test_drug["param_1"].values

    alpha = 500
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
    print("\t\t Sum of model coefficients:", sum(model.coef_))
    non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
    print("\t\t Non zero features with values:", non_zero_features_dict)
    print("")
del train_drug
del test_drug

		 MAE 0.078, MSE: 0.010
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.078, MSE: 0.010
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.078, MSE: 0.010
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.078, MSE: 0.010
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}



In [39]:
train_drug = train_df_50.loc[drug_id,:].copy()
test_drug = test_df_50.loc[drug_id,:].copy()

for i, data_set in list(enumerate(datasets)):
    
    if ((data_set == "Dataset 3") | (data_set == "Dataset 4")):
        scaler = MinMaxScaler()
        train_drug[columns_to_normalise] = scaler.fit_transform(train_drug[columns_to_normalise])
        test_drug[columns_to_normalise] = scaler.transform(test_drug[columns_to_normalise])
        
    X_columns = X_feat_dict[data_set]
 
    X_train = train_drug[X_columns].values
    y_train = train_drug["param_2"].values

    X_test = test_drug[X_columns].values
    y_test = test_drug["param_2"].values

    alpha = 500
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
    print("\t\t Sum of model coefficients:", sum(model.coef_))
    non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
    print("\t\t Non zero features with values:", non_zero_features_dict)
    print("")
del train_drug
del test_drug

		 MAE 0.166, MSE: 0.043
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.166, MSE: 0.043
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.166, MSE: 0.043
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.166, MSE: 0.043
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}



In [40]:
train_drug = train_df_50.loc[drug_id,:].copy()
test_drug = test_df_50.loc[drug_id,:].copy()

for i, data_set in list(enumerate(datasets)):
    
    if ((data_set == "Dataset 3") | (data_set == "Dataset 4")):
        scaler = MinMaxScaler()
        train_drug[columns_to_normalise] = scaler.fit_transform(train_drug[columns_to_normalise])
        test_drug[columns_to_normalise] = scaler.transform(test_drug[columns_to_normalise])
        
    X_columns = X_feat_dict[data_set]
 
    X_train = train_drug[X_columns].values
    y_train = train_drug["param_3"].values

    X_test = test_drug[X_columns].values
    y_test = test_drug["param_3"].values

    alpha = 500
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
    print("\t\t Sum of model coefficients:", sum(model.coef_))
    non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
    print("\t\t Non zero features with values:", non_zero_features_dict)
    print("")
del train_drug
del test_drug

		 MAE 7.265, MSE: 75.490
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 7.265, MSE: 75.490
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 7.265, MSE: 75.490
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 7.265, MSE: 75.490
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}



In [41]:
train_drug = train_df_50.loc[drug_id,:].copy()
test_drug = test_df_50.loc[drug_id,:].copy()

for i, data_set in list(enumerate(datasets)):
    
    if ((data_set == "Dataset 3") | (data_set == "Dataset 4")):
        scaler = MinMaxScaler()
        train_drug[columns_to_normalise] = scaler.fit_transform(train_drug[columns_to_normalise])
        test_drug[columns_to_normalise] = scaler.transform(test_drug[columns_to_normalise])
        
    X_columns = X_feat_dict[data_set]
 
    X_train = train_drug[X_columns].values
    y_train = train_drug["param_4"].values

    X_test = test_drug[X_columns].values
    y_test = test_drug["param_4"].values

    alpha = 500
    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
    print("\t\t Sum of model coefficients:", sum(model.coef_))
    non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
    print("\t\t Non zero features with values:", non_zero_features_dict)
    print("")
del train_drug
del test_drug

		 MAE 0.110, MSE: 0.017
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.110, MSE: 0.017
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.110, MSE: 0.017
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}

		 MAE 0.110, MSE: 0.017
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}



### For one drug

In [50]:
data_set = "Dataset 3"
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

X_columns = X_feat_dict[data_set]

In [51]:
train_drug["molecular_weight"].value_counts()

464.50    92
312.40    85
434.50    84
288.29    81
318.30    80
446.50    80
379.50    69
382.20    68
650.80    57
553.00    45
361.20    40
Name: molecular_weight, dtype: int64

In [52]:
train_drug[columns_to_normalise].head()

Unnamed: 0_level_0,molecular_weight,2bonds,xlogp,surface_area,complexity,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,MAX_CONC
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
170,288.29,7.0,3.0,94.8,501.0,3.0,5.0,3.0,21.0,1.0,1.0,0.0,0.0,1.0,16.0
170,288.29,7.0,3.0,94.8,501.0,3.0,5.0,3.0,21.0,1.0,1.0,0.0,0.0,1.0,16.0
170,288.29,7.0,3.0,94.8,501.0,3.0,5.0,3.0,21.0,1.0,1.0,0.0,0.0,1.0,16.0
170,288.29,7.0,3.0,94.8,501.0,3.0,5.0,3.0,21.0,1.0,1.0,0.0,0.0,1.0,16.0
170,288.29,7.0,3.0,94.8,501.0,3.0,5.0,3.0,21.0,1.0,1.0,0.0,0.0,1.0,16.0


In [44]:
scaler = MinMaxScaler()
train_drug[columns_to_normalise] = scaler.fit_transform(train_drug[columns_to_normalise])
test_drug[columns_to_normalise] = scaler.transform(test_drug[columns_to_normalise])

In [45]:
train_drug[columns_to_normalise].head()

Unnamed: 0_level_0,molecular_weight,2bonds,xlogp,surface_area,complexity,h_bond_donor_count,h_bond_acceptor_count,rotatable_bond_count,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,MAX_CONC
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
170,0.0,0.0,0.206349,0.175214,0.119129,0.666667,0.222222,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.49187
170,0.0,0.0,0.206349,0.175214,0.119129,0.666667,0.222222,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.49187
170,0.0,0.0,0.206349,0.175214,0.119129,0.666667,0.222222,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.49187
170,0.0,0.0,0.206349,0.175214,0.119129,0.666667,0.222222,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.49187
170,0.0,0.0,0.206349,0.175214,0.119129,0.666667,0.222222,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.49187


In [46]:
train_drug["molecular_weight"].value_counts()

0.486083    92
0.066509    85
0.403327    84
0.000000    81
0.082784    80
0.436429    80
0.251607    69
0.259055    68
1.000000    57
0.730214    45
0.201125    40
Name: molecular_weight, dtype: int64

In [47]:
data_set = "Dataset 3"
 
X_train = train_drug[X_columns].values
y_train = train_drug["param_1"].values

X_test = test_drug[X_columns].values
y_test = test_drug["param_1"].values

alpha = 500
model = Lasso(alpha=alpha)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [48]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\t\t MAE %0.3f, MSE: %0.3f" % (mae, mse))
print("\t\t Sum of model coefficients:", sum(model.coef_))
non_zero_features_dict = dict([(X_columns[i], model.coef_[i]) for i in range(len(model.coef_)) if model.coef_[i]!=0])
print("\t\t Non zero features with values:", non_zero_features_dict)
print("")
del train_drug
del test_drug

		 MAE 0.308, MSE: 1.510
		 Sum of model coefficients: 0.0
		 Non zero features with values: {}



In [49]:
model.intercept_

0.6465477106380434