In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import gc

from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
import os
from data_preprocessing import FilteringCurves, ShowResponseCurves
from fitting_curves import FittingColumn, ShowResponseCurvesWithFitting, compute_r2_score

# from IPython.display import display
_FOLDER = "results/"
# _FOLDER = "/home/acq18mk/master/results/"

In [None]:
def TestTunedKernels(merged_df, drug_ids, number_coefficients, kernel, train_ratio =0.8, column_not_to_use =[], 
                     alpha=1, gamma=None, degree=3, coef0=1, 
                     features_to_scale=[], scaling=False
                     columns_to_use= []):
    """Training and testing Kernels with the best found hyperparameters"""
    
    param1 = ["param_" +str(i) for i in range(10)]
    param2 = ["param" +str(i) for i in range(10)] 
    norm_response  = ["norm_cells_"+str(i) for i in range(10)]
    con_columns  = ["fd_num_"+str(i) for i in range(10)]

    not_X_columns = param1 + param2 + norm_response + con_columns+column_not_to_use
    X_columns = set(df.columns) - set(not_X_columns)
    print("Number of X_columns:", len(X_columns))
    
    df_errors_test = pd.DataFrame()
    df_model_coef = pd.DataFrame(index=X_columns)

    for drug_id in drug_ids:
        
        merged_df_i = merged_df[merged_df["DRUG_ID"]==drug_id]
        np.random.seed(123)
        indexes = np.random.permutation(merged_df_i.index)
        train_size = int(merged_df_i.shape[0]*train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test= indexes[train_size:]
        
        if scaling:
            train = merged_df_i.loc[indexes_train, X_columns].copy()
            test = merged_df_i.loc[indexes_test, X_columns].copy()
            scaler = StandardScaler()
            scaler.fit(train[columns_for_normalisation])
            train[columns_for_normalisation] = scaler.transform(train[columns_for_normalisation])
            X_train = train.values  
            test[columns_for_normalisation] = scaler.transform(test[columns_for_normalisation])
            X_test = test.values
        else:
            X_train = merged_df_i.loc[indexes_train, X_columns].values
            X_test = merged_df_i.loc[indexes_test, X_columns].values
    
        for i in range(number_coefficients):

            y_train = merged_df_i.loc[indexes_train, "param_"+str(i+1)].values
            y_test = merged_df_i.loc[indexes_test, "param_"+str(i+1)].values
            
            #check whether each coefficient needs its own parameters
            if type(alpha)==dict:
                alpha_value = alpha[i+1]
            else:
                alpha_value = alpha
                
            if type(gamma)==dict:
                gamma_value = gamma[i+1]
            else:
                gamma_value = gamma
            
            if type(degree)==dict:
                degree_value = degree[i+1]
            else:
                degree_value = degree
                
            if type(coef0)==dict:
                coef0_value = coef0[i+1]
            else:
                coef0_value = coef0
                
            kr_lin = KernelRidge(kernel = kernel, alpha = alpha_value, gamma=gamma_value, 
                                 degree=degree_value, coef0=coef0_value)
            kr_lin.fit(X_train, y_train)
            y_pred = np.exp(kr_lin.predict(X_test))
            
            merged_df_i.loc[indexes_test, "pred_param_"+str(i+1)] = y_pred
            df_model_coef[drug_id] = 
                           
    return merged_df[columns_to_use]

In [None]:
# что мне надо:
предсказанные коэффициенты кривых для тестовых данных
метрика мне не нужна! - можно посчитать позже и мсе и мае.

In [None]:
вывести предсказанные коэффициенты модели
отдельно для каждого лекарства?
потом сравнить с обобщенными данными

In [None]:
то же самое как для kernelRidge, сделать для SVM
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

In [2]:
os.listdir("results")

['test02_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'kernel_learning_01_2.csv',
 'kernel_learning_03_2.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'drug_features_with_pubchem_properties.csv',
 'kernel_learning_03.csv',
 'kernel_learning_02.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'filtered_drug_profiles_13.csv',
 'merged_fitted_sigmoid4_123_with_drugs_properties_split_target.csv',
 'kernel_learning_01.csv',
 'filtered_drug_profiles_12.csv',
 'merged_fitted_sigmoid4_123_with_drugs_description_split_target.csv',
 'kernel_learning_04.csv',
 'drug_features_pubchem_id.csv',
 'kernel_learning_02_2.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_description.csv',
 'Lasso_2.csv',
 'kernel_learning_2.csv',
 'kernel_learning_3.csv',
 'Lasso_3.csv',
 'Lasso_1.csv',
 'kernel_learning_1.csv',
 'kernel_learning_3_2.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'filtered_drug_profiles_23.csv',
 'Lasso_4.c

In [3]:
df = pd.read_csv("results/train08_merged_fitted_sigmoid4_123_with_drugs_description.csv")

In [5]:
df.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DRUG_ID,COSMIC_ID,fd_num_0,fd_num_1,fd_num_2,fd_num_3,fd_num_4,fd_num_5,...,chr9:123555399-123555899(FBXW2)_HypMET,chr9:140310894-140312457(EXD3)_HypMET,chr9:21974578-21975306(CDKN2A)_HypMET,chr9:35756948-35757339(MSMP)_HypMET,chr9:35791584-35791924(NPR2)_HypMET,chr9:4984543-4985630(JAK2)_HypMET,chr9:86571047-86572027(C9orf64)_HypMET,chr9:98783216-98784364(NCRNA00092)_HypMET,Target_Pathway,Drug_Name
0,87,87,11,1322218,0,0.111111,0.222222,0.333333,0.444444,0.555556,...,0,0,0,0,0,0,0,0,Mitosis,Paclitaxel
1,73,73,11,753614,0,0.111111,0.222222,0.333333,0.444444,0.555556,...,0,0,0,0,0,0,0,0,Mitosis,Paclitaxel


In [6]:
column_not_to_use = ["Unnamed: 0", "COSMIC_ID", "DRUG_ID", "Drug_Name", "Synonyms", "Target", "deriv_found", "PubChem_ID",
                     "elements", "inchi_key", "canonical_smiles", "inchi_string", "third_target", "first_target", "molecular_formula", "second_target", "Target_Pathway"]
param1 = ["param_" +str(i) for i in range(10)]
param2 = ["param" +str(i) for i in range(10)] 
norm_response  = ["norm_cells_"+str(i) for i in range(10)]
con_columns  = ["fd_num_"+str(i) for i in range(10)]

not_X_columns = param1 + param2 + norm_response + con_columns + column_not_to_use
X_columns = set(df.columns) - set(not_X_columns)

In [8]:
y = df["param_1"]
X=df[X_columns]

In [9]:
model = KernelRidge()

In [11]:
model.fit(X,y)

KernelRidge(alpha=1, coef0=1, degree=3, gamma=None, kernel='linear',
            kernel_params=None)

In [13]:
len(model.dual_coef_)

1817

In [14]:
len(X_columns)

1074

In [15]:
model.kernel

'linear'