### Predicting separate y-points vs predicting the coefficients of the fitted curve

The idea is to predict the response to each dosage of a drug
- input - x and a constant set of cell line and drug features
- output - just a response
<br><br> Because the data were filtered that way, we know that there is a sigmoid dependency between x and y
<br>Therefore we need a model that can capture a sigmoid dependency
- Generalised linear regression? SVR and KernelRidge with sigmoid kernel?
<br><br> Transform the dataset so that each row has only one concentration and one response, i.e. increase the dataset size 10-fold
<br> Assumption: as the features are essentially unchanged, we can assume that the best hyperparameters found by grid search will also work here

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")
_FOLDER ="results/"

In [2]:
def ReadDataSets(dict_data_type, training_data_type):
    """Load the train/test tables and the drug-id list for one data variant.

    Parameters
    ----------
    dict_data_type : dict
        Maps a data-variant name to a dict with the keys ``"drug_ids_list"``,
        ``"train_df"`` and ``"test_df"``, each holding a file path.
    training_data_type : str
        Key of ``dict_data_type`` selecting the variant to load.

    Returns
    -------
    train, test : pandas.DataFrame
        The two CSV files, indexed by their ``DRUG_ID`` column.
    drug_ids_50 : list of numpy.int32
        One id per non-blank line of the drug-id file, in file order.

    Raises
    ------
    KeyError
        If ``training_data_type`` or one of the three path keys is missing.
    ValueError
        If a non-blank line of the drug-id file is not an integer.
    """
    paths = dict_data_type[training_data_type]

    with open(paths["drug_ids_list"], 'r') as f:
        # Blank lines (e.g. a trailing empty line) cannot be parsed as an
        # integer, so they are skipped instead of aborting the whole load.
        drug_ids_50 = [np.int32(line) for line in map(str.strip, f) if line]

    train = pd.read_csv(paths["train_df"]).set_index("DRUG_ID")
    test = pd.read_csv(paths["test_df"]).set_index("DRUG_ID")

    return train, test, drug_ids_50

In [3]:
def _read_feature_names(file_name):
    """Return the lines of ``_FOLDER + file_name`` without their trailing newline."""
    with open(_FOLDER + file_name, 'r') as handle:
        return [row.rstrip('\n') for row in handle]


# Each text file lists one feature (column) name per line.
X_cancer_cell_lines = _read_feature_names("X_features_cancer_cell_lines.txt")
X_PubChem_properties = _read_feature_names("X_PubChem_properties.txt")
X_targets = _read_feature_names("X_features_Targets.txt")
X_target_pathway = _read_feature_names("X_features_Target_Pathway.txt")

# Every feature column, with "MAX_CONC" appended last.
all_columns = (
    X_cancer_cell_lines
    + X_PubChem_properties
    + X_targets
    + X_target_pathway
    + ["MAX_CONC"]
)

datasets = ["Dataset_1", "Dataset_2", "Dataset_3", "Dataset_4"]

# Feature columns per dataset variant. "Dataset_1" is the cell-line list
# itself (same list object); the other variants are newly built lists that
# start with "MAX_CONC".
X_feat_dict = {
    "Dataset_1": X_cancer_cell_lines,
    "Dataset_2": ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines,
    "Dataset_3": ["MAX_CONC"] + X_PubChem_properties + X_cancer_cell_lines,
    "Dataset_4": (
        ["MAX_CONC"]
        + X_PubChem_properties
        + X_targets
        + X_target_pathway
        + X_cancer_cell_lines
    ),
}

# Input file paths for each data variant accepted by ReadDataSets.
dict_data_type = {
    "original_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50.txt",
        "train_df": _FOLDER + "large_train_sigmoid4.csv",
        "test_df": _FOLDER + "large_test_sigmoid4.csv",
    },
    "reduced_by_R2_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50_restr.txt",
        "train_df": _FOLDER + "large_train_sigmoid4_restr.csv",
        "test_df": _FOLDER + "large_test_sigmoid4_restr.csv",
    },
}

## Training on the original data

In [4]:
# Fit SVR and KernelRidge on the original data, once per drug and once on all
# drugs together, and print the mean/std of the absolute test errors.
print("Training and testing on the original data\n")

df_train, df_test, drug_ids_50 = ReadDataSets(dict_data_type, training_data_type="original_data")

# Every experiment in this cell uses the same feature columns.
data_set = "Dataset_4"
X_columns = X_feat_dict[data_set] + ["scaled_x"]

# SVR hyperparameters; only "param_1" is used below.
# NOTE(review): per the scikit-learn SVR docs, coef0 only matters for the
# 'poly' and 'sigmoid' kernels, so it has no effect with kernel="rbf".
best_param_dict = {"param_1" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01},
                   "param_2" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01},
                   "param_3" : {"kernel" : "linear", "C" : 0.01, "epsilon": 0.001, "coef0" : 0.0},
                   "param_4" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01}}

### drug-by-drug training SVR
print("\nDrug-by-drug training and testing SVR")

# The true responses are collected together with the predictions so that both
# are in the same (drug-by-drug) order, whatever the row order of df_test is.
y_true = []
y_pred = []

for drug_id in drug_ids_50:
    # A list selector always yields a DataFrame, even for a single-row drug.
    train_drug = df_train.loc[[drug_id]]
    test_drug = df_test.loc[[drug_id]]
    y_train_drug = train_drug["norm_y"].values
    y_test_drug = test_drug["norm_y"].values

    # The scaler is fitted on this drug's training rows only.
    scaler = MinMaxScaler().fit(train_drug[X_columns])
    Xtrain_drug = scaler.transform(train_drug[X_columns])
    Xtest_drug = scaler.transform(test_drug[X_columns])

    model = SVR(kernel = best_param_dict["param_1"]["kernel"],
                C = best_param_dict["param_1"]["C"],
                epsilon = best_param_dict["param_1"]["epsilon"],
                coef0 = best_param_dict["param_1"]["coef0"])

    model.fit(Xtrain_drug, y_train_drug)
    y_true.extend(y_test_drug)
    y_pred.extend(model.predict(Xtest_drug))

# A pandas Series keeps the sample standard deviation (ddof=1) in .std().
abs_errors = pd.Series(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))


### All drug training SVR
print("\nAll drugs training and testing SVR")

y_train_drug = df_train["norm_y"].values
y_test_drug = df_test["norm_y"].values

scaler = MinMaxScaler().fit(df_train[X_columns])
Xtrain_drug = scaler.transform(df_train[X_columns])
Xtest_drug = scaler.transform(df_test[X_columns])

model = SVR(kernel = best_param_dict["param_1"]["kernel"],
            C = best_param_dict["param_1"]["C"],
            epsilon = best_param_dict["param_1"]["epsilon"],
            coef0 = best_param_dict["param_1"]["coef0"])

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug)

# Predictions are in df_test row order here, so a direct subtraction is aligned.
abs_errors = abs(df_test["norm_y"] - y_pred)
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))

### Drug-by-drug training KernelRidge

# KernelRidge hyperparameters; only "param_1" is used below.
# NOTE(review): the "gamma" entries are never passed to KernelRidge, so the
# estimator's default gamma is used - confirm that this is intended.
best_param_dict = {"param_1" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_2" : {"kernel" : "rbf", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_3" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_4" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01}}

### drug-by-drug training
print("\nDrug-by-drug training and testing KernelRidge\n")

y_true = []
y_pred = []
for drug_id in drug_ids_50:
    # A list selector always yields a DataFrame, even for a single-row drug.
    train_drug = df_train.loc[[drug_id]]
    test_drug = df_test.loc[[drug_id]]
    y_train_drug = train_drug["norm_y"].values
    y_test_drug = test_drug["norm_y"].values

    scaler = MinMaxScaler().fit(train_drug[X_columns])
    Xtrain_drug = scaler.transform(train_drug[X_columns])
    Xtest_drug = scaler.transform(test_drug[X_columns])

    model = KernelRidge(kernel = best_param_dict["param_1"]["kernel"],
                        alpha = best_param_dict["param_1"]["alpha"],
                        coef0 = best_param_dict["param_1"]["coef0"])

    model.fit(Xtrain_drug, y_train_drug)
    y_true.extend(y_test_drug)
    y_pred.extend(model.predict(Xtest_drug))

abs_errors = pd.Series(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))


### All drugs training and testing KernelRidge
print("\nAll drugs training and testing KernelRidge")

y_train_drug = df_train["norm_y"].values
y_test_drug = df_test["norm_y"].values

scaler = MinMaxScaler().fit(df_train[X_columns])
Xtrain_drug = scaler.transform(df_train[X_columns])
Xtest_drug = scaler.transform(df_test[X_columns])

model = KernelRidge(kernel = best_param_dict["param_1"]["kernel"],
                    alpha = best_param_dict["param_1"]["alpha"],
                    coef0 = best_param_dict["param_1"]["coef0"])

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug)

abs_errors = abs(df_test["norm_y"] - y_pred)
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))

Training and testing on the original data


Drug-by-drug training and testing SVR
MAE for reconstruction of responses: 0.387 +/- 0.196

All drugs training and testing SVR
MAE for reconstruction of responses: 0.293 +/- 0.123

Drug-by-drug training and testing KernelRidge

MAE for reconstruction of responses: 0.371 +/- 0.181

All drugs training and testing KernelRidge
MAE for reconstruction of responses: 0.152 +/- 0.112


## Training on the data reduced by R2

In [5]:
# Fit SVR and KernelRidge on the R2-reduced data, once per drug and once on
# all drugs together, and print the mean/std of the absolute test errors.
print("Training and testing on the reduced by R2  data\n")

df_train, df_test, drug_ids_50 = ReadDataSets(dict_data_type, training_data_type="reduced_by_R2_data")

# Every experiment in this cell uses the same feature columns.
data_set = "Dataset_4"
X_columns = X_feat_dict[data_set] + ["scaled_x"]

# SVR hyperparameters; only "param_1" is used below.
# NOTE(review): per the scikit-learn SVR docs, coef0 only matters for the
# 'poly' and 'sigmoid' kernels, so it has no effect with kernel="rbf".
best_param_dict = {"param_1" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01},
                   "param_2" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01},
                   "param_3" : {"kernel" : "linear", "C" : 0.01, "epsilon": 0.001, "coef0" : 0.0},
                   "param_4" : {"kernel" : "rbf", "C" : 0.1, "epsilon": 0.001, "coef0" : 0.01}}

### drug-by-drug training SVR
print("\nDrug-by-drug training and testing SVR")

# The true responses are collected together with the predictions so that both
# are in the same (drug-by-drug) order, whatever the row order of df_test is.
y_true = []
y_pred = []

for drug_id in drug_ids_50:
    # A list selector always yields a DataFrame, even for a single-row drug.
    train_drug = df_train.loc[[drug_id]]
    test_drug = df_test.loc[[drug_id]]
    y_train_drug = train_drug["norm_y"].values
    y_test_drug = test_drug["norm_y"].values

    # The scaler is fitted on this drug's training rows only.
    scaler = MinMaxScaler().fit(train_drug[X_columns])
    Xtrain_drug = scaler.transform(train_drug[X_columns])
    Xtest_drug = scaler.transform(test_drug[X_columns])

    model = SVR(kernel = best_param_dict["param_1"]["kernel"],
                C = best_param_dict["param_1"]["C"],
                epsilon = best_param_dict["param_1"]["epsilon"],
                coef0 = best_param_dict["param_1"]["coef0"])

    model.fit(Xtrain_drug, y_train_drug)
    y_true.extend(y_test_drug)
    y_pred.extend(model.predict(Xtest_drug))

# A pandas Series keeps the sample standard deviation (ddof=1) in .std().
abs_errors = pd.Series(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))


### All drug training SVR
print("\nAll drugs training and testing SVR")

y_train_drug = df_train["norm_y"].values
y_test_drug = df_test["norm_y"].values

scaler = MinMaxScaler().fit(df_train[X_columns])
Xtrain_drug = scaler.transform(df_train[X_columns])
Xtest_drug = scaler.transform(df_test[X_columns])

model = SVR(kernel = best_param_dict["param_1"]["kernel"],
            C = best_param_dict["param_1"]["C"],
            epsilon = best_param_dict["param_1"]["epsilon"],
            coef0 = best_param_dict["param_1"]["coef0"])

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug)

# Predictions are in df_test row order here, so a direct subtraction is aligned.
abs_errors = abs(df_test["norm_y"] - y_pred)
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))

### Drug-by-drug training KernelRidge

# KernelRidge hyperparameters; only "param_1" is used below.
# NOTE(review): the "gamma" entries are never passed to KernelRidge, so the
# estimator's default gamma is used - confirm that this is intended.
best_param_dict = {"param_1" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_2" : {"kernel" : "rbf", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_3" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01},
                   "param_4" : {"kernel" : "sigmoid", "alpha" : 0.1, "gamma": 0.00001, "coef0" : 0.01}}

### drug-by-drug training
print("\nDrug-by-drug training and testing KernelRidge\n")

y_true = []
y_pred = []
for drug_id in drug_ids_50:
    # A list selector always yields a DataFrame, even for a single-row drug.
    train_drug = df_train.loc[[drug_id]]
    test_drug = df_test.loc[[drug_id]]
    y_train_drug = train_drug["norm_y"].values
    y_test_drug = test_drug["norm_y"].values

    scaler = MinMaxScaler().fit(train_drug[X_columns])
    Xtrain_drug = scaler.transform(train_drug[X_columns])
    Xtest_drug = scaler.transform(test_drug[X_columns])

    model = KernelRidge(kernel = best_param_dict["param_1"]["kernel"],
                        alpha = best_param_dict["param_1"]["alpha"],
                        coef0 = best_param_dict["param_1"]["coef0"])

    model.fit(Xtrain_drug, y_train_drug)
    y_true.extend(y_test_drug)
    y_pred.extend(model.predict(Xtest_drug))

abs_errors = pd.Series(np.abs(np.asarray(y_true) - np.asarray(y_pred)))
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))


### All drugs training and testing KernelRidge
print("\nAll drugs training and testing KernelRidge")

y_train_drug = df_train["norm_y"].values
y_test_drug = df_test["norm_y"].values

scaler = MinMaxScaler().fit(df_train[X_columns])
Xtrain_drug = scaler.transform(df_train[X_columns])
Xtest_drug = scaler.transform(df_test[X_columns])

model = KernelRidge(kernel = best_param_dict["param_1"]["kernel"],
                    alpha = best_param_dict["param_1"]["alpha"],
                    coef0 = best_param_dict["param_1"]["coef0"])

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug)

abs_errors = abs(df_test["norm_y"] - y_pred)
print("MAE for reconstruction of responses: %0.3f +/- %0.3f" % (abs_errors.mean(), abs_errors.std()))

Training and testing on the reduced by R2  data


Drug-by-drug training and testing SVR
MAE for reconstruction of responses: 0.384 +/- 0.190

All drugs training and testing SVR
MAE for reconstruction of responses: 0.293 +/- 0.129

Drug-by-drug training and testing KernelRidge

MAE for reconstruction of responses: 0.378 +/- 0.188

All drugs training and testing KernelRidge
MAE for reconstruction of responses: 0.152 +/- 0.113
