## Pipeline for Data preprocessing and modelling

In [1]:
# Options explored in this comparison: curve-fitting functions, filtering
# scenarios (lists of filtering step numbers) and R2 restriction values.
# NOTE(review): the cells visible in this section hard-code their own settings
# instead of reading these lists — confirm whether they are used further down.
fitting_functions = [
    "sigmoid_4_param",
    "logistic4",
    "logLogistR",
]
filtering_scenarios = [
    [1, 2, 3],
    [1, 2, 3, 4],
]
r2_restrictions = [None, 0, 0.9]

In [2]:
import pandas as pd
import numpy as np
import os
import gc

import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Folder passed to DataPreprocessing as `folder_with_original_data`.
_FOLDER = "data/"
# Folder passed to DataPreprocessing as `folder_with_results`; the feature-name
# text files (X_*.txt) are also read from this folder.
_FOLDER_2 = "results/"

In [3]:
# helper functions are imported from all_functions.py and training_testing.py

from all_functions import DataPreprocessing, TrainTestSplit, r2_score_fitting, mae_score_reconstruct
from training_testing import TrainTest_Alg1, TrainTest_Alg2, ShowErrors

In [4]:
def _read_feature_names(file_name):
    """Return the lines of ``_FOLDER_2 + file_name`` with each trailing newline removed."""
    with open(_FOLDER_2 + file_name, 'r') as feature_file:
        return [raw_line.rstrip('\n') for raw_line in feature_file]


# One list of feature names per feature group; every line of a file becomes one entry.
X_cancer_cell_lines = _read_feature_names("X_features_cancer_cell_lines.txt")
X_PubChem_properties = _read_feature_names("X_PubChem_properties.txt")
X_targets = _read_feature_names("X_features_Targets.txt")
X_target_pathway = _read_feature_names("X_features_Target_Pathway.txt")

# Complete feature list: "MAX_CONC" first, then the four groups in a fixed order.
X_columns = [
    "MAX_CONC",
    *X_PubChem_properties,
    *X_targets,
    *X_target_pathway,
    *X_cancer_cell_lines,
]


In [5]:
def CompareDataPreprocessing_Alg2(train_df, test_df, drug_ids_list, fitting_function="sigmoid_4_param",
                                  X_columns = X_columns, n_param=4):
    """Train one SVR per curve coefficient, then score the reconstructed response curves.

    For each coefficient ``param_<i>`` the function calls ``TrainTest_Alg2`` with
    model name "SVR" and a coefficient-specific set of hyper-parameters.
    Coefficients 1, 2 and 3 are always trained; coefficient 4 is trained only
    when ``n_param == 4``. Afterwards the curves in ``test_df`` are scored with
    ``r2_score_fitting`` and ``mae_score_reconstruct``.

    Parameters
    ----------
    train_df, test_df :
        Train and test data, forwarded unchanged to ``TrainTest_Alg2``.
        ``test_df`` is also the frame scored afterwards, using the columns
        ``fd_num_0..9``, ``norm_cells_0..9``, ``param_1..n_param`` and
        ``pred_param_1..n_param``.
        NOTE(review): the ``pred_param_<i>`` columns are presumably added to
        ``test_df`` by ``TrainTest_Alg2`` — confirm in training_testing.py.
    drug_ids_list :
        Forwarded to ``TrainTest_Alg2``.
    fitting_function : str
        Name of the curve function, forwarded to the scoring helpers.
    X_columns : list
        Feature column names, forwarded to ``TrainTest_Alg2``.
    n_param : int
        Number of curve coefficients used for scoring; the 4th coefficient is
        trained only when this equals 4.

    Returns
    -------
    tuple
        ``(r2_fitting_predicted, r2_direct_fitting, mae_reconstruct)``: the
        results of ``r2_score_fitting`` with predicted coefficients,
        ``r2_score_fitting`` with the fitted coefficients, and
        ``mae_score_reconstruct`` with predicted coefficients.
    """
    # Hyper-parameters per coefficient, listed in training order.
    # NOTE(review): if TrainTest_Alg2 builds a scikit-learn SVR, "coef0" has no
    # effect with the rbf kernel — confirm whether it is intentional.
    svr_settings = [
        (1, {"kernel": "rbf", "C": 0.5, "epsilon": 0.01, "coef0": 0.01}),
        (2, {"kernel": "rbf", "C": 0.1, "epsilon": 0.01, "coef0": 0.01}),
        (3, {"kernel": "linear", "C": 0.1, "epsilon": 1}),
    ]
    if n_param == 4:
        svr_settings.append((4, {"kernel": "rbf", "C": 0.1, "epsilon": 0.001, "coef0": 0.01}))

    for coefficient, parameters in svr_settings:
        print("Coefficient ", coefficient)
        # The returned metrics are not used here; only the call itself matters.
        TrainTest_Alg2(train_df, test_df, "param_" + str(coefficient), drug_ids_list,
                       X_columns=X_columns, model_name="SVR", model_parameters=parameters)

    print("\n****************************************")
    print("Reconstruction of Drug Response curves:\n")

    conc_columns = ["fd_num_" + str(k) for k in range(10)]
    response_norm = ["norm_cells_" + str(k) for k in range(10)]
    fitted_param_columns = ["param_" + str(k) for k in range(1, n_param + 1)]
    predicted_param_columns = ["pred_" + name for name in fitted_param_columns]

    # R2 of the curves rebuilt from the predicted coefficients.
    r2_fitting_predicted = r2_score_fitting(df=test_df, x_columns=conc_columns, y_columns=response_norm,
                                            fitting_function=fitting_function,
                                            param_columns=predicted_param_columns)

    # R2 of the curves rebuilt from the directly fitted coefficients (reference value).
    r2_direct_fitting = r2_score_fitting(df=test_df, x_columns=conc_columns, y_columns=response_norm,
                                         fitting_function=fitting_function,
                                         param_columns=fitted_param_columns)

    mae_reconstruct = mae_score_reconstruct(df=test_df, x_columns=conc_columns,
                                            fitting_function=fitting_function,
                                            y_columns=response_norm,
                                            param_columns=predicted_param_columns)

    return r2_fitting_predicted, r2_direct_fitting, mae_reconstruct

## ML training

In [6]:
# Scenario: filtering steps [1,2,3] with the sigmoid_4_param curve.
# NOTE(review): the label says "no restriction r2", yet r2_restriction=0 (not
# None) is passed to TrainTestSplit below — confirm which one is intended.
ml_scenario = "Filtering [1,2,3], no restriction r2, sigmoid_4_param / previous parameters"
print(ml_scenario)

df = DataPreprocessing(
    folder_with_original_data=_FOLDER,
    folder_with_results=_FOLDER_2,
    filtering_scenario=[1, 2, 3],
    first_points_lower_limit=0.8,
    last_points_upper_limit=0.4,
    middle_points_limit=-0.1,
    fitting_function="sigmoid_4_param",
    keep_r2_column=True,
    print_progress_info=False,
)

drug_ids_limit, train_df_limit, test_df_limit = TrainTestSplit(
    df,
    min_number_drug_profiles=50,
    train_ratio=0.8,
    r2_restriction=0,
    print_progress_info=False,
)

# fitting_function and n_param are left at the function defaults
# ("sigmoid_4_param" and 4).
r2_fitting_predicted, r2_direct_fitting, mae_reconstruct = CompareDataPreprocessing_Alg2(
    train_df_limit,
    test_df_limit,
    drug_ids_list=drug_ids_limit,
    X_columns=X_columns,
)

# Repeat the label so the summary is identifiable below the training log.
print(ml_scenario)
print(
    "\nMAE for reconstructed points: %0.3f" % mae_reconstruct.mean(),
    "R2 for direct fitting: %0.3f" % r2_direct_fitting.mean(),
    "R2 with predicted parameters: %0.3f" % r2_fitting_predicted.mean(),
    sep="\n",
)

Filtering [1,2,3], no restriction r2, sigmoid_4_param / previous parameters


100%|██████████| 2776/2776 [00:09<00:00, 279.14it/s]


Coefficient  1

MAE: 0.464 +/- 0.835
MRE: 25.6 +/- 9.9

(781, 1370) (202, 1371)
Coefficient  2

MAE: 0.469 +/- 0.944
MRE: 8.3 +/- 19.4

(781, 1370) (202, 1372)
Coefficient  3

MAE: 9.008 +/- 5.635
MRE: -43.2 +/- 12.8

(781, 1370) (202, 1373)
Coefficient  4

MAE: 0.125 +/- 0.137
MRE: 93.1 +/- 132.9

(781, 1370) (202, 1374)

****************************************
Reconstruction of Drug Response curves:

Filtering [1,2,3], no restriction r2, sigmoid_4_param / previous parameters

MAE for reconstructed points: 0.105
R2 for direct fitting: 0.957
R2 with predicted parameters: 0.757


In [7]:
# Scenario: filtering steps [1,2,3,4] with the sigmoid_4_param curve.
# NOTE(review): the label says "no restriction r2", yet r2_restriction=0 (not
# None) is passed to TrainTestSplit below — confirm which one is intended.
ml_scenario = "Filtering [1,2,3,4], no restriction r2, sigmoid_4_param / previous parameters"
print(ml_scenario)

df = DataPreprocessing(
    folder_with_original_data=_FOLDER,
    folder_with_results=_FOLDER_2,
    filtering_scenario=[1, 2, 3, 4],
    first_points_lower_limit=0.8,
    last_points_upper_limit=0.4,
    middle_points_limit=-0.1,
    fitting_function="sigmoid_4_param",
    keep_r2_column=True,
    print_progress_info=False,
)

drug_ids_limit, train_df_limit, test_df_limit = TrainTestSplit(
    df,
    min_number_drug_profiles=50,
    train_ratio=0.8,
    r2_restriction=0,
    print_progress_info=False,
)

# fitting_function and n_param are left at the function defaults
# ("sigmoid_4_param" and 4).
r2_fitting_predicted, r2_direct_fitting, mae_reconstruct = CompareDataPreprocessing_Alg2(
    train_df_limit,
    test_df_limit,
    drug_ids_list=drug_ids_limit,
    X_columns=X_columns,
)

# Repeat the label so the summary is identifiable below the training log.
print(ml_scenario)
print(
    "\nMAE for reconstructed points: %0.3f" % mae_reconstruct.mean(),
    "R2 for direct fitting: %0.3f" % r2_direct_fitting.mean(),
    "R2 with predicted parameters: %0.3f" % r2_fitting_predicted.mean(),
    sep="\n",
)

Filtering [1,2,3,4], no restriction r2, sigmoid_4_param / previous parameters


100%|██████████| 2600/2600 [00:08<00:00, 309.11it/s]


Coefficient  1

MAE: 0.443 +/- 0.801
MRE: 24.5 +/- 11.6

(703, 1370) (180, 1371)
Coefficient  2

MAE: 0.432 +/- 0.773
MRE: 3.0 +/- 11.6

(703, 1370) (180, 1372)
Coefficient  3

MAE: 6.011 +/- 3.291
MRE: -35.7 +/- 8.1

(703, 1370) (180, 1373)
Coefficient  4

MAE: 0.074 +/- 0.043
MRE: 265.7 +/- 825.4

(703, 1370) (180, 1374)

****************************************
Reconstruction of Drug Response curves:

Filtering [1,2,3,4], no restriction r2, sigmoid_4_param / previous parameters

MAE for reconstructed points: 0.104
R2 for direct fitting: 0.966
R2 with predicted parameters: 0.753


In [8]:
# Scenario: filtering steps [1,2,3,4] with the logistic4 curve.
# NOTE(review): the label says "no restriction r2", yet r2_restriction=0 (not
# None) is passed to TrainTestSplit below — confirm which one is intended.
ml_scenario = "Filtering [1,2,3,4], no restriction r2, logistic4 / previous parameters"
fitting_function = "logistic4"
print(ml_scenario)

# The same `fitting_function` variable drives both preprocessing and curve
# reconstruction, so the two steps cannot silently use different functions.
df = DataPreprocessing(
    folder_with_original_data=_FOLDER,
    folder_with_results=_FOLDER_2,
    filtering_scenario=[1, 2, 3, 4],
    first_points_lower_limit=0.8,
    last_points_upper_limit=0.4,
    middle_points_limit=-0.1,
    fitting_function=fitting_function,
    keep_r2_column=True,
    print_progress_info=False,
)

drug_ids_limit, train_df_limit, test_df_limit = TrainTestSplit(
    df,
    min_number_drug_profiles=50,
    train_ratio=0.8,
    r2_restriction=0,
    print_progress_info=False,
)

# n_param is left at the function default (4).
r2_fitting_predicted, r2_direct_fitting, mae_reconstruct = CompareDataPreprocessing_Alg2(
    train_df_limit,
    test_df_limit,
    fitting_function=fitting_function,
    drug_ids_list=drug_ids_limit,
    X_columns=X_columns,
)

# Repeat the label so the summary is identifiable below the training log.
print(ml_scenario)
print(
    "\nMAE for reconstructed points: %0.3f" % mae_reconstruct.mean(),
    "R2 for direct fitting: %0.3f" % r2_direct_fitting.mean(),
    "R2 with predicted parameters: %0.3f" % r2_fitting_predicted.mean(),
    sep="\n",
)

Filtering [1,2,3,4], no restriction r2, logistic4 / previous parameters


100%|██████████| 2600/2600 [00:13<00:00, 192.99it/s]


Coefficient  1

MAE: 0.015 +/- 0.003
MRE: 1.6 +/- 0.3

(705, 1370) (180, 1371)
Coefficient  2

MAE: 5.192 +/- 2.757
MRE: 45.5 +/- 6.7

(705, 1370) (180, 1372)
Coefficient  3

MAE: 0.662 +/- 0.072
MRE: 170.4 +/- 42.9

(705, 1370) (180, 1373)
Coefficient  4

MAE: 0.084 +/- 0.05
MRE: 3881.0 +/- 11656.2

(705, 1370) (180, 1374)

****************************************
Reconstruction of Drug Response curves:

Filtering [1,2,3,4], no restriction r2, logistic4 / previous parameters

MAE for reconstructed points: 0.409
R2 for direct fitting: 0.994
R2 with predicted parameters: -0.929


In [9]:
# Scenario: filtering steps [1,2,3,4] with the logLogistR curve, scored with
# three coefficients (n_param=3).
# NOTE(review): the label says "no restriction r2", yet r2_restriction=0 (not
# None) is passed to TrainTestSplit below — confirm which one is intended.
ml_scenario = "Filtering [1,2,3,4], no restriction r2, logLogistR / previous parameters"
fitting_function = "logLogistR"
print(ml_scenario)

# The same `fitting_function` variable drives both preprocessing and curve
# reconstruction, so the two steps cannot silently use different functions.
df = DataPreprocessing(
    folder_with_original_data=_FOLDER,
    folder_with_results=_FOLDER_2,
    filtering_scenario=[1, 2, 3, 4],
    first_points_lower_limit=0.8,
    last_points_upper_limit=0.4,
    middle_points_limit=-0.1,
    fitting_function=fitting_function,
    keep_r2_column=True,
    print_progress_info=False,
)

drug_ids_limit, train_df_limit, test_df_limit = TrainTestSplit(
    df,
    min_number_drug_profiles=50,
    train_ratio=0.8,
    r2_restriction=0,
    print_progress_info=False,
)

r2_fitting_predicted, r2_direct_fitting, mae_reconstruct = CompareDataPreprocessing_Alg2(
    train_df_limit,
    test_df_limit,
    fitting_function=fitting_function,
    drug_ids_list=drug_ids_limit,
    X_columns=X_columns,
    n_param=3,
)

# Repeat the label so the summary is identifiable below the training log.
print(ml_scenario)
print(
    "\nMAE for reconstructed points: %0.3f" % mae_reconstruct.mean(),
    "R2 for direct fitting: %0.3f" % r2_direct_fitting.mean(),
    "R2 with predicted parameters: %0.3f" % r2_fitting_predicted.mean(),
    sep="\n",
)

Filtering [1,2,3,4], no restriction r2, logLogistR / previous parameters


100%|██████████| 2600/2600 [00:18<00:00, 141.05it/s]


Coefficient  1

MAE: 0.091 +/- 0.034
MRE: -29.1 +/- 12.1

(705, 1369) (180, 1370)
Coefficient  2

MAE: 2.333 +/- 1.181
MRE: -35.1 +/- 7.0

(705, 1369) (180, 1371)
Coefficient  3

MAE: 0.132 +/- 0.025
MRE: -74.3 +/- 1209.0

(705, 1369) (180, 1372)

****************************************
Reconstruction of Drug Response curves:

Filtering [1,2,3,4], no restriction r2, logLogistR / previous parameters

MAE for reconstructed points: 0.123
R2 for direct fitting: 0.994
R2 with predicted parameters: 0.779
