## Predict one-by-one

The main idea is to follow the same hyperparameter grid-search procedure
that was used for the training on all drugs.
The differences are:
    - the transformation of the datasets,
    - the use of a drug dosage column as an additional feature,
    - a focus on the separate normalised responses as target variables.
 

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_absolute_error
import time

np.random.seed(123)

# _FOLDER = "/home/acq18mk/master/results/results/"
_FOLDER = "results/"

In [2]:
def ReadDataSets(dict_data_type, training_data_type):
    """Load the train/test frames and the drug id list for one dataset variant.

    Parameters
    ----------
    dict_data_type : dict
        Maps a dataset name to a dict holding the file paths under the keys
        "drug_ids_list", "train_df" and "test_df".
    training_data_type : str
        Key of ``dict_data_type`` selecting which dataset variant to load.

    Returns
    -------
    tuple
        ``(train, test, drug_ids_50)``: the two CSV files as DataFrames
        indexed by their "DRUG_ID" column, and the drug ids file parsed
        line by line into a list of ``np.int32`` values.
    """
    paths = dict_data_type[training_data_type]

    # One drug id per line; only the trailing newline is stripped before parsing.
    drug_ids_50 = []
    with open(paths["drug_ids_list"], 'r') as ids_file:
        for raw_line in ids_file:
            drug_ids_50.append(np.int32(raw_line.rstrip('\n')))

    train = pd.read_csv(paths["train_df"])
    train = train.set_index("DRUG_ID")

    test = pd.read_csv(paths["test_df"])
    test = test.set_index("DRUG_ID")

    return train, test, drug_ids_50

def mean_relative_error(y_true, y_pred):
    """Return the mean relative error of ``y_pred`` w.r.t. ``y_true``, in percent.

    Computes ``mean(|y_pred - y_true| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like of numbers
        Reference values (list, numpy array or pandas Series).
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean relative error in percent. Entries where ``y_true`` is zero
        follow numpy floating-point division and yield ``inf``/``nan``.

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Converting up front makes lists and index-carrying Series behave the
    # same: the comparison is purely positional.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.size == 0:
        raise ValueError("mean_relative_error requires at least one sample")

    # Divide by |y_true|: with a signed denominator a negative target would
    # contribute a negative "error" and cancel out genuine errors elsewhere.
    return np.mean(np.abs(y_pred - y_true) * 100 / np.abs(y_true))

In [3]:
def _read_feature_names(file_name):
    """Return the lines of ``_FOLDER + file_name`` with the trailing newline removed."""
    with open(_FOLDER + file_name, 'r') as handle:
        return [row.rstrip('\n') for row in handle]


# Column-name lists, one text file per feature group (one name per line).
X_cancer_cell_lines = _read_feature_names("X_features_cancer_cell_lines.txt")
X_PubChem_properties = _read_feature_names("X_PubChem_properties.txt")
X_targets = _read_feature_names("X_features_Targets.txt")
X_target_pathway = _read_feature_names("X_features_Target_Pathway.txt")

# Every feature group concatenated, with the "MAX_CONC" column appended last.
all_columns = (
    X_cancer_cell_lines
    + X_PubChem_properties
    + X_targets
    + X_target_pathway
    + ["MAX_CONC"]
)

# File locations for each dataset variant, in the layout ReadDataSets expects.
dict_data_type = {
    "original_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50.txt",
        "train_df": _FOLDER + "large_train_sigmoid4.csv",
        "test_df": _FOLDER + "large_test_sigmoid4.csv",
    },
    "reduced_by_R2_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50_restr.txt",
        "train_df": _FOLDER + "large_train_sigmoid4_restr.csv",
        "test_df": _FOLDER + "large_test_sigmoid4_restr.csv",
    },
}

## Original data

In [4]:
# Load the "original_data" train/test split together with its drug id list.
df_train, df_test, drug_ids_50 = ReadDataSets(
    dict_data_type,
    training_data_type="original_data",
)

# Model inputs: the two dosage-related columns first, then each feature group.
X_columns = [
    "scaled_x",
    "MAX_CONC",
    *X_PubChem_properties,
    *X_targets,
    *X_target_pathway,
    *X_cancer_cell_lines,
]

# The scaler is fitted on the training features only and then reused
# unchanged for the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(df_train[X_columns])
X_test = scaler.transform(df_test[X_columns])

# Targets: the "norm_y" column as plain numpy arrays.
y_train = df_train["norm_y"].to_numpy()
y_test = df_test["norm_y"].to_numpy()

## RBF Kernel Ridge (KR), same settings as for Coef 4

In [5]:
# RBF KernelRidge: fit on the scaled training data, predict the test set,
# then report per-drug MAE/MRE and absolute errors per band of norm_y.
kernel = "rbf"
alpha = 0.1
gamma = 0.0001
# NOTE: scikit-learn documents coef0 as used only by the polynomial and
# sigmoid kernels; it is still passed so the estimator's parameters are unchanged.
coef0 = 0.01
model = KernelRidge(kernel=kernel,
                    alpha=alpha,
                    gamma=gamma,
                    coef0=coef0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Row-wise errors over the whole test set; relative error is in percent.
abs_errors = abs(y_test - y_pred)
rel_errors = abs((y_test - y_pred) * 100 / y_test)

df_test["abs_error"] = abs_errors
df_test["rel_error"] = rel_errors
df_test["pred_norm_y"] = y_pred

# One MAE / MRE value per drug in drug_ids_50 (df_test is indexed by DRUG_ID).
mae = np.zeros(len(drug_ids_50))
mre = np.zeros(len(drug_ids_50))
for i, drug_id in enumerate(drug_ids_50):
    # A list-like label keeps the selection a DataFrame even when the drug
    # has a single test row; a scalar label would return scalars there and
    # `.values` would fail.
    drug_rows = df_test.loc[[drug_id]]
    y_test_drug = drug_rows["norm_y"].values
    # Separate name so the full prediction vector `y_pred` is not clobbered.
    y_pred_drug = drug_rows["pred_norm_y"].values
    mae[i] = mean_absolute_error(y_test_drug, y_pred_drug)
    mre[i] = mean_relative_error(y_test_drug, y_pred_drug)

print("MAE: %0.3f +/- %0.3f" % (mae.mean(), mae.std()))
print("MRE: %0.1f +/- %0.1f" % (mre.mean(), mre.std()))
print("\nAbsolute Erros:")

# Absolute errors in three bands of the true response; each filter is
# evaluated once and reused for both the mean and the standard deviation.
ae_low = df_test[df_test["norm_y"] < 0.2]["abs_error"]
ae_mid = df_test[(df_test["norm_y"] > 0.4) & (df_test["norm_y"] < 0.6)]["abs_error"]
ae_high = df_test[df_test["norm_y"] > 0.8]["abs_error"]

print("AE for y<0.2: %.3f +/- %.3f" % (ae_low.mean(), ae_low.std()))

print("AE for y=[0.4,0.6]: %.3f +/- %.3f" % (ae_mid.mean(), ae_mid.std()))

print("AE for y>0.8: %.3f +/- %.3f" % (ae_high.mean(), ae_high.std()))

MAE: 0.200 +/- 0.023
MRE: 386.2 +/- 229.2
Absolute Erros:
AE for y<0.2: 0.239 +/- 0.105
AE for y=[0.4,0.6]: 0.065 +/- 0.052
AE for y>0.8: 0.227 +/- 0.084


## RFE on the original data

In [6]:
from sklearn.feature_selection import RFE

## Training on the data reduced by R2

In [7]:
# Load the "reduced_by_R2_data" train/test split together with its drug id list.
df_train, df_test, drug_ids_50 = ReadDataSets(
    dict_data_type,
    training_data_type="reduced_by_R2_data",
)

# Model inputs: the two dosage-related columns first, then each feature group.
X_columns = [
    "scaled_x",
    "MAX_CONC",
    *X_PubChem_properties,
    *X_targets,
    *X_target_pathway,
    *X_cancer_cell_lines,
]

# The scaler is fitted on the training features only and then reused
# unchanged for the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(df_train[X_columns])
X_test = scaler.transform(df_test[X_columns])

# Targets: the "norm_y" column as plain numpy arrays.
y_train = df_train["norm_y"].to_numpy()
y_test = df_test["norm_y"].to_numpy()

# RBF KernelRidge: fit on the scaled training data, predict the test set,
# then report per-drug MAE/MRE and absolute errors per band of norm_y.
kernel = "rbf"
alpha = 0.1
gamma = 0.0001
# NOTE: scikit-learn documents coef0 as used only by the polynomial and
# sigmoid kernels; it is still passed so the estimator's parameters are unchanged.
coef0 = 0.01
model = KernelRidge(kernel=kernel,
                    alpha=alpha,
                    gamma=gamma,
                    coef0=coef0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Row-wise errors over the whole test set; relative error is in percent.
abs_errors = abs(y_test - y_pred)
rel_errors = abs((y_test - y_pred) * 100 / y_test)

df_test["abs_error"] = abs_errors
df_test["rel_error"] = rel_errors
df_test["pred_norm_y"] = y_pred

# One MAE / MRE value per drug in drug_ids_50 (df_test is indexed by DRUG_ID).
mae = np.zeros(len(drug_ids_50))
mre = np.zeros(len(drug_ids_50))
for i, drug_id in enumerate(drug_ids_50):
    # A list-like label keeps the selection a DataFrame even when the drug
    # has a single test row; a scalar label would return scalars there and
    # `.values` would fail.
    drug_rows = df_test.loc[[drug_id]]
    y_test_drug = drug_rows["norm_y"].values
    # Separate name so the full prediction vector `y_pred` is not clobbered.
    y_pred_drug = drug_rows["pred_norm_y"].values
    mae[i] = mean_absolute_error(y_test_drug, y_pred_drug)
    mre[i] = mean_relative_error(y_test_drug, y_pred_drug)

print("MAE: %0.3f +/- %0.3f" % (mae.mean(), mae.std()))
print("MRE: %0.1f +/- %0.1f" % (mre.mean(), mre.std()))
print("\nAbsolute Erros:")

# Absolute errors in three bands of the true response; each filter is
# evaluated once and reused for both the mean and the standard deviation.
ae_low = df_test[df_test["norm_y"] < 0.2]["abs_error"]
ae_mid = df_test[(df_test["norm_y"] > 0.4) & (df_test["norm_y"] < 0.6)]["abs_error"]
ae_high = df_test[df_test["norm_y"] > 0.8]["abs_error"]

print("AE for y<0.2: %.3f +/- %.3f" % (ae_low.mean(), ae_low.std()))

print("AE for y=[0.4,0.6]: %.3f +/- %.3f" % (ae_mid.mean(), ae_mid.std()))

print("AE for y>0.8: %.3f +/- %.3f" % (ae_high.mean(), ae_high.std()))

MAE: 0.201 +/- 0.025
MRE: 397.2 +/- 289.5
Absolute Erros:
AE for y<0.2: 0.238 +/- 0.104
AE for y=[0.4,0.6]: 0.073 +/- 0.055
AE for y>0.8: 0.231 +/- 0.085


## RFE on the reduced data