In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
from scipy.optimize import curve_fit
from scipy.optimize import leastsq
import scipy.optimize as opt
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import os
from data_preprocessing import FilteringCurves, ShowResponseCurves
from fitting_curves import FittingColumn, ShowResponseCurvesWithFitting, compute_r2_score
_FOLDER = "./data/"

# NEED TO CHANGE - what file to read
# Table of drug response profiles; the code below reads the DRUG_ID, COSMIC_ID,
# fd_num_*, norm_cells_* and param_* columns from it.
df = pd.read_csv("./results/merged_drug_profiles_sigmoid4_123.csv")

# Column families used throughout the cell: ten "fd_num_*" columns (passed as
# x_columns to compute_r2_score) and ten "norm_cells_*" columns (passed as y_columns).
conc_columns = ["fd_num_%d" % i for i in range(10)]
response_norm = ["norm_cells_%d" % i for i in range(10)]

# Fail early with a readable message instead of a late KeyError when the
# selected file does not carry the expected columns.
missing_columns = [c for c in conc_columns + response_norm if c not in df.columns]
if missing_columns:
    raise KeyError("Input file is missing expected columns: %s" % missing_columns)

### Training and testing tuned kernels

# Keep only the drugs that have more than `min_records` rows; each of them is
# later split into its own train and test subset, and the subsets are stacked.
min_records = 50

# Number of non-null COSMIC_ID entries per drug, indexed by DRUG_ID.
gr = df.groupby("DRUG_ID")["COSMIC_ID"].count()
enough_records = (gr > min_records).values
good_drug_ids = gr.index[enough_records]

print("Number of drugs with more than %d records: %d" % (min_records, len(good_drug_ids)))

# Build the train and test sets: every selected drug is shuffled and split
# separately, then the per-drug pieces are stacked into one frame each.
train_fraction = 0.7  # share of each drug's rows that goes to the train set
train_parts = []
test_parts = []

for drug_id in good_drug_ids:
    df_i = df[df["DRUG_ID"]==drug_id]
    # Re-seeding right before each shuffle makes a drug's permutation depend
    # only on the seed and its own rows, not on the drugs processed before it.
    np.random.seed(123)
    indexes = np.random.permutation(df_i.index)
    train_size = int(df_i.shape[0]*train_fraction)
    indexes_train = indexes[:train_size]
    indexes_test = indexes[train_size:]

    train_set = df_i.loc[indexes_train, :]
    test_set = df_i.loc[indexes_test, :]

    train_parts.append(train_set)
    test_parts.append(test_set)

# Concatenate once at the end: this avoids re-copying all accumulated rows on
# every iteration and avoids mixing an empty object-dtype placeholder frame
# into the result. With no selected drugs, fall back to empty frames that
# still carry the columns of `df`.
if train_parts:
    train = pd.concat(train_parts, axis=0)
    test = pd.concat(test_parts, axis=0)
else:
    train = pd.DataFrame(columns=df.columns)
    test = pd.DataFrame(columns=df.columns)

print("Maid train and test sets:", train.shape, test.shape)

# Training and testing: for every selected drug, fit one linear kernel ridge
# model per sigmoid parameter (param_1..param_4), record the test MSE/MAE in
# `df_errors`, and store the predicted parameters in `short_test`.

# One row per drug; the error columns are added while the loop runs.
df_errors = pd.DataFrame({"DRUG_ID": good_drug_ids}).set_index("DRUG_ID")

param_columns = ["param_"+str(i) for i in range(1,5)]
test_columns_to_use = ['COSMIC_ID', 'DRUG_ID']+conc_columns+response_norm+param_columns
short_test = test[test_columns_to_use].copy()

# Model inputs are selected by position: everything from column 26 up to (not
# including) the last four columns. NOTE(review): presumably the leading 26
# columns are profile/metadata and the trailing four are param_1..param_4 —
# confirm against the layout of the input file.
# `train` and `test` are both built from rows of `df`, so one slice serves both.
feature_columns = train.columns[26:-4]

for drug_id in df_errors.index:

    train_i = train[train["DRUG_ID"]==drug_id]
    test_i = test[test["DRUG_ID"]==drug_id]

    X_train = train_i[feature_columns].values
    X_test = test_i[feature_columns].values

    for param in param_columns:
        y_train = train_i[param]
        y_test = test_i[param]

        # TODO: replace the default linear kernel with the tuned kernel and
        # hyper-parameters once they are selected.
        kr_lin = KernelRidge(kernel='linear')
        kr_lin.fit(X_train, y_train)
        y_pred = kr_lin.predict(X_test)

        # collect errors
        mse = mean_squared_error(y_test, y_pred)
        df_errors.loc[drug_id, "mse_"+param] = mse
        mae = mean_absolute_error(y_test, y_pred)
        df_errors.loc[drug_id, "mae_"+param] = mae

        # Keep the predictions next to the fitted parameters so that fitted and
        # predicted curves can be compared later; reuse y_pred rather than
        # running the prediction a second time.
        short_test.loc[test_i.index, "pred_"+param] = y_pred

# Analysis of the results: score the response curves once with the fitted
# sigmoid parameters and once with the predicted ones, then summarise the
# per-drug errors.
# NOTE(review): compute_r2_score comes from fitting_curves; presumably it
# returns one R^2 value per row of the frame it is given — confirm there.
fitting_cols = ["param_%d" % i for i in range(1, 5)]
pred_fitting_cols = ["pred_param_%d" % i for i in range(1, 5)]
fitting_function = "sigmoid_4_param"

# Same call for both parameter sets; only the parameter columns differ.
for r2_column, parameter_columns in (("r2_fitted", fitting_cols),
                                     ("r2_predicted", pred_fitting_cols)):
    short_test[r2_column] = compute_r2_score(
        short_test,
        x_columns=conc_columns,
        y_columns=response_norm,
        fitting_parameters=parameter_columns,
        fitting_function=fitting_function,
    )

# Last expression of the cell: summary statistics of the per-drug MSE/MAE.
df_errors.describe()

Number of drugs with more than 50 records: 11
Maid train and test sets: (683, 1100) (300, 1100)


Unnamed: 0,mse_param_1,mae_param_1,mse_param_2,mae_param_2,mse_param_3,mae_param_3,mse_param_4,mae_param_4
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,2.90407,0.678699,6.018416,0.894316,591.589625,14.209816,0.030484,0.089112
std,4.69501,0.69708,9.992729,0.896737,664.961714,8.015762,0.057176,0.033918
min,0.021081,0.113052,0.093379,0.236605,66.275078,6.7482,0.003616,0.029845
25%,0.055201,0.194046,0.249604,0.408432,115.489903,8.511401,0.009314,0.065713
50%,0.935599,0.546047,0.476037,0.534229,360.710725,11.234837,0.013485,0.090251
75%,3.005612,0.741883,8.446279,0.967509,788.979109,17.781224,0.019686,0.110931
max,14.729717,2.561256,31.784409,3.344451,1941.714281,33.002168,0.201778,0.144722
