## Parameter tuning for Support Vector Regression

1. Select the drugs which have more than 50 records (drug profiles) => 11 drugs
2. For each of these drugs, run a grid search over the hyperparameters using cross-validation

In [112]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tqdm
from scipy.stats import norm
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR

%matplotlib inline

_FOLDER = "../results/"

In [8]:
# List the files available in the results folder.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# displayed listing stable between runs and machines.
sorted(os.listdir(_FOLDER))

['drug_cells_kernels_best_parameters.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'drug_cells_PubChem_scaled_kernels_best_parameters.csv',
 'kernel_learning_01_2.csv',
 '.DS_Store',
 'kernel_learning_03_2.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'drug_features_with_pubchem_properties.csv',
 'kernel_learning_03.csv',
 'kernel_learning_02.csv',
 'kernel_ridge_model_parameters_drug_cells_PubChem.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'filtered_drug_profiles_13.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties_split_target.csv',
 'kernel_learning_01.csv',
 'filtered_drug_profiles_12.csv',
 'svr_model_parameters_drug_cells_PubChem_Scaling.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description_split_target.csv',
 'statistics_of_sigmoid_coefficients.csv',
 'kernel_ridge_model_parameters_drug_cells_PubChem_Scaling.csv',
 'kernel_learning_04.csv',
 'svr_model_parameters_drug_cells_description.csv',
 'dr

In [9]:
# 1. Just drug profiles and cell lines properties
# Load the merged table and keep the drugs that have more than 50 rows
# (counted on non-null COSMIC_ID values per DRUG_ID).

print("\n1. Finding optimal parameters for just drug profiles and cell lines\n")
df = pd.read_csv(_FOLDER + "merged_fitted_sigmoid4_123_with_drugs_description.csv")

# Names of the 10 concentration columns and the 10 normalised response columns.
conc_columns = ["fd_num_{}".format(i) for i in range(10)]
response_norm = ["norm_cells_{}".format(i) for i in range(10)]

# Number of records per drug, then the ids whose count exceeds the threshold.
gr = df.groupby(["DRUG_ID"])["COSMIC_ID"].count()
drug_ids = gr.loc[gr > 50].index.tolist()
print("Number of drugs for training:", len(drug_ids))


1. Finding optimal parameters for just drug profiles and cell lines

Number of drugs for training: 11


In [10]:
# Display the DRUG_ID values that passed the "more than 50 records" filter.
drug_ids

[170, 173, 180, 200, 219, 272, 273, 274, 276, 328, 346]

(102, 1102)

In [54]:
# Build the list of feature columns: every column of `df` that is not an
# identifier/annotation column, a fitted sigmoid coefficient, a concentration
# or a normalised response.

# Identifier and free-text/annotation columns that must not be used as features.
column_not_to_use = ["Unnamed: 0", "COSMIC_ID", "DRUG_ID", "Drug_Name", "Synonyms", "Target",
                     "deriv_found", "PubChem_ID", "elements", "inchi_key", "canonical_smiles",
                     "inchi_string", "molecular_formula",
                     "third_target", "first_target", "second_target", "Target_Pathway"]

param1 = ["param_" + str(i) for i in range(1, 5)]          # param_1 .. param_4
con_columns = ["fd_num_" + str(i) for i in range(10)]       # fd_num_0 .. fd_num_9
norm_response = ["norm_cells_" + str(i) for i in range(10)]  # norm_cells_0 .. norm_cells_9

not_X_columns = param1 + norm_response + con_columns + column_not_to_use

# Keep X_columns as a list in the DataFrame's own column order:
#  - a set has no stable order, so the feature matrix columns would be arranged
#    differently from one kernel session to the next;
#  - pandas >= 2.0 raises TypeError when a set is used as an indexer (df[set]).
_excluded_columns = set(not_X_columns)
X_columns = [column for column in df.columns if column not in _excluded_columns]
len(X_columns)

1073

In [None]:
# First, fix a single drug and a single coefficient of sigmoid_4_param to use as the regression target.

In [58]:
# Build the feature matrix X and target vector y for one drug and one
# sigmoid coefficient.
drug_id = 170
y_sigmoid = "param_1"

# Filter with `drug_id` rather than repeating the literal, so changing the
# value above is enough to switch drug. The frame keeps its historical name.
df_170 = df[df["DRUG_ID"] == drug_id]

# Index with a sorted list: X_columns may be a set, which has no stable order
# and is rejected as an indexer by pandas >= 2.0.
X = df_170[sorted(X_columns)].values
y = df_170[y_sigmoid].values

In [117]:
%%time

number_splits = 10
list_train_index = []
list_test_index = []

loo_spliter = LeaveOneOut()
for train_index, test_index in loo_spliter.split(X, y):
    list_train_index.append(train_index)
    list_test_index.append(test_index)

n_C = 10
n_epsion = 5
n_coef = 5
C_range = np.logspace(-2, 10, n_C)
kernels = np.array(["linear", "poly", "sigmoid","rbf"])
degree_range = np.arange(1,6)

epsilon_range = np.logspace(0.001, 5, n_epsion)
coef0_range = np.logspace(0.001, 5, n_coef)


param_grid = {}

param_grid["linear"] = dict(C = C_range, epsilon = epsilon_range)

param_grid["poly"] = dict(C = C_range, degree = degree_range, 
                  epsilon = epsilon_range, coef0 = coef0_range)

param_grid["sigmoid"] = dict(C = C_range, epsilon = epsilon_range, coef0 = coef0_range)

cv = LeaveOneOut()
accuracies = {}
for kernel in kernels:
    grid = GridSearchCV(SVR(kernel = kernel), param_grid=param_grid[kernel], cv=cv, scoring= "neg_mean_squared_error")
    accuracies[kernel] = np.zeros(len(list_train_index))
    for i in range(len(list_train_index)):
        # We first do the grid search
        Xtrain_i = X[list_train_index[i]]
        ytrain_i = y[list_train_index[i]]
        grid.fit(Xtrain_i, ytrain_i)
        
        # Now with the best parameter we can evaluate the test set
        if kernel == "linear":
            model = SVR(kernel = kernel, C=grid.best_params_["C"], epsilon = grid.best_params_["epsilon"])
        elif kernel == "poly":
            model = SVR(kernel = kernel, degree = grid.best_params_["degree"], coef0 = grid.best_params_["coef0"],
                        C=grid.best_params_["C"], epsilon = grid.best_params_["epsilon"])
        else:
            model = SVR(kernel = kernel, coef0 = grid.best_params_["coef0"],
                        C=grid.best_params_["C"], epsilon = grid.best_params_["epsilon"])
        model.fit(Xtrain_i, ytrain_i)
        Xtest_i = X[list_test_index[i]]
        ytest_i = y[list_test_index[i]]
        ypred = model.predict(Xtest_i)    
        accuracies[kernel][i] = mean_squared_error(ytest_i, ypred)

KeyboardInterrupt: 

In [None]:
# Template: nested cross-validation grid search for an SVC *classifier*.
#   outer loop: stratified K-fold into `number_splits` train/test splits;
#   inner loop: GridSearchCV over (gamma, C) using `nsplits_val` stratified
#               50/50 shuffle splits of the training part.
# Features are standardised with statistics computed on the training part only.
# accuracies[i] is the classification accuracy on the test part of outer fold i.
#
# NOTE(review): the stratified splitters and SVC need y to contain class labels.
# The y built earlier in this notebook is a sigmoid coefficient, so X and y
# presumably have to be rebound to a classification problem before running
# this cell -- confirm.
# NOTE(review): this cell overwrites C_range, param_grid, cv, grid, accuracies
# and the fold index lists created by the SVR grid-search cell.
number_splits = 10
nC = 10
nGamma = 10
nsplits_val = 2
C_range = np.logspace(-2, 10, nC)
gamma_range = np.logspace(-9, 3, nGamma)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=nsplits_val, test_size=0.5, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
accuracies = np.zeros(number_splits)

list_train_index = []
list_test_index = []

# random_state only has an effect when shuffle=True; current scikit-learn
# raises ValueError if random_state is set while shuffle is left at False.
skf = StratifiedKFold(n_splits=number_splits, shuffle=True, random_state=10)
for train_index, test_index in skf.split(X, y):
    list_train_index.append(train_index)
    list_test_index.append(test_index)

for i in range(number_splits):
    # Fit the scaler on the training part only, then run the grid search on it.
    sscaler = StandardScaler().fit(X[list_train_index[i], :])
    Xtrain_local = sscaler.transform(X[list_train_index[i], :])
    ytrain_local = y[list_train_index[i]]
    grid.fit(Xtrain_local, ytrain_local)

    # GridSearchCV (refit=True by default) has already refitted the best
    # (gamma, C) on the whole training part; reuse that estimator.
    clf = grid.best_estimator_

    # Evaluate on the test part, scaled with the training statistics.
    Xtest_local = sscaler.transform(X[list_test_index[i], :])
    ytest_local = y[list_test_index[i]]
    ypred = clf.predict(Xtest_local)
    accuracies[i] = accuracy_score(ytest_local, ypred)