## All drugs training and RFE

The main task - repeat the procedure of evaluation of feature importance performed for drug-by-drug 
<br> but apply to all drug training

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error


import warnings
warnings.filterwarnings("ignore")

_FOLDER = "results/"

In [2]:
def mean_relative_error(y_true, y_pred):
    return sum(abs(y_pred-y_true)*100/y_true)/len(y_true)

def TrainTest_SVR(dict_data_type, training_data_type, X_columns, coefficient, kernel, epsilon, C, coef0, feature_subset = False):
    
    with open(dict_data_type[training_data_type]["drug_ids_list"], 'r') as f:
        drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
    train_df = pd.read_csv(dict_data_type[training_data_type]["train_df"]).drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
    test_df = pd.read_csv(dict_data_type[training_data_type]["test_df"]).drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

    train = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
    test = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
    
    y_train = train["param_"+str(coefficient)].values
    y_test =  test["param_"+str(coefficient)].values
    
    scaler = MinMaxScaler().fit(train[X_columns])
    X_train = scaler.transform(train[X_columns])
    X_test = scaler.transform(test[X_columns])

    model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
    model.fit(X_train, y_train)
    test["pred_param_"+str(coefficient)] = model.predict(X_test) 
    
    #evaluate mae and mre for each drug profile
    mae = np.zeros(len(drug_ids_50))
    mre = np.zeros(len(drug_ids_50))
    for i, drug_id in list(enumerate(drug_ids_50)):
        y_test_drug = test.loc[drug_id, "param_"+str(coefficient)].values
        y_pred = test.loc[drug_id, "pred_param_"+str(coefficient)]
        mae[i] = mean_absolute_error(y_test_drug, y_pred)
        mre[i] = mean_relative_error(y_test_drug, y_pred)
    
    if feature_subset:
        print("\nCoefficient %d, Training on the %s and top50 feature subset\n"% (coefficient, training_data_type))
    else:
        print("\nCoefficient %d, Training on the %s\n"% (coefficient, training_data_type))
    print("MAE: %0.3f +/- %0.3f" % (mae.mean(), mae.std()))
    print("MRE: %0.1f +/- %0.1f" % (mre.mean(), mre.std()))

def PrepareData(dict_data_type, training_data_type, X_columns, coefficient):
    with open(dict_data_type[training_data_type]["drug_ids_list"], 'r') as f:
        drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
    train_df = pd.read_csv(dict_data_type[training_data_type]["train_df"]).drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
    test_df = pd.read_csv(dict_data_type[training_data_type]["test_df"]).drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

    train = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
    test = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
    
    y_train = train["param_"+str(coefficient)].values
    y_test =  test["param_"+str(coefficient)].values
    
    scaler = MinMaxScaler().fit(train[X_columns])
    X_train = scaler.transform(train[X_columns])
    X_test = scaler.transform(test[X_columns])
    return X_train, X_test, y_train, y_test

def RFE_most_important(dict_data_type, training_data_type, X_columns, coefficient, C, epsilon):
    
    X_train, X_test, y_train, y_test = PrepareData(dict_data_type= dict_data_type, training_data_type = training_data_type, 
                                               X_columns =X_columns, coefficient=coefficient)
    most_important_features = []

    estimator = SVR(kernel = "linear", C = C, epsilon = epsilon)

    # feature selector
    selector = RFE(estimator, n_features_to_select=50, step=10)
    selector = selector.fit(X_train, y_train)
    most_important_features = np.array(X_columns)[selector.support_]
    
    #models parameters
    estimator.fit(X_train, y_train)
    print("Number of zero features:", len(estimator.coef_[estimator.coef_==0]))
    print("Maximal importance:", round(abs(estimator.coef_[0]).max(), 3))
    print("Number of features with importance more_0_01 :", sum(abs(estimator.coef_[0])>0.01))

    from_CCLE = set(most_important_features) & set(X_cancer_cell_lines)
    print("from CCL:", len(set(most_important_features) & set(X_cancer_cell_lines)))
    print("from PubChem_properties",len( set(most_important_features) & set(X_PubChem_properties)))
    print("from targets:", len(set(most_important_features) & set(X_targets)))
    print("from target pathway:", len(set(most_important_features) & set(X_target_pathway)))
    return most_important_features

### Reading the default parameters

In [3]:
with open(_FOLDER+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Targets.txt", 'r') as f:
    X_targets = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Target_Pathway.txt", 'r') as f:
    X_target_pathway = [line.rstrip('\n') for line in f]
# *****************************************

all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]

datasets = ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4"]

X_feat_dict = {"Dataset 1": X_cancer_cell_lines ,
               "Dataset 2": ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines ,
               "Dataset 3": ["MAX_CONC"] + X_PubChem_properties +  X_cancer_cell_lines,
               "Dataset 4": ["MAX_CONC"] + X_PubChem_properties +  X_targets + X_target_pathway + X_cancer_cell_lines}

dict_data_type = {
    "original_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50.txt", 
        "train_df" : _FOLDER + "train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv",
        "test_df" : _FOLDER + "test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv"},
    
    "reduced_by_R2_data": {
        "drug_ids_list": _FOLDER +"drug_ids_50_restr.txt", 
        "train_df" : _FOLDER + "train08_min10_restr.csv",
        "test_df" : _FOLDER + "test02_min10_restr.csv"}
}

## Coefficient 1

## RFE on the original data

need to apply Linear SVR to get model weights and feature importance atteibutes 

In [4]:
kernel ="linear"
C= 0.01
epsilon = 0.1

most_important = RFE_most_important(training_data_type = "original_data", coefficient=1, 
                                    dict_data_type= dict_data_type, C=C, epsilon=epsilon,
                                    X_columns =X_feat_dict["Dataset 4"])

Number of zero features: 334
Maximal importance: 0.045
Number of features with importance more_0_01 : 211
from CCL: 34
from PubChem_properties 5
from targets: 8
from target pathway: 3


In [5]:
most_important

array(['3bonds', 'complexity', 'h_bond_acceptor_count',
       'bond_stereo_count', 'F', 'HDAC1', 'HSP90', 'CDK7', 'HDAC1-10',
       'PPARdelta', 'PPARgamma', 'ERBB2', 'EGFR',
       'Protein stability and degradation',
       'Chromatin histone acetylation', 'WNT signaling', 'EWSR1-FLI1_mut',
       'MLL2_mut', 'MLL3_mut', 'PGR_mut', 'PIK3CB_mut', 'XRN1_mut',
       'loss:cnaPANCAN6', 'loss:cnaPANCAN20', 'gain:cnaPANCAN61',
       'loss:cnaPANCAN112 (CREBBP)', 'loss:cnaPANCAN113',
       'loss:cnaPANCAN115', 'gain:cnaPANCAN139',
       'gain:cnaPANCAN141 (GNAQ,NTRK2,PCSK5,TJP2)', 'loss:cnaPANCAN203',
       'gain:cnaPANCAN214', 'gain:cnaPANCAN239 (FOXP1,MITF)',
       'loss:cnaPANCAN263', 'loss:cnaPANCAN265', 'loss:cnaPANCAN294',
       'loss:cnaPANCAN310 (MAP2K4)', 'gain:cnaPANCAN367 (ARFGAP1,GNAS)',
       'gain:cnaPANCAN383', 'gain:cnaPANCAN384 (ERCC5,ING1,IRS2,TFDP1)',
       'loss:cnaPANCAN386', 'chr1:150266476-150266689(MRPS21)_HypMET',
       'chr1:181451311-181452049()_HypMET

## Training on the orginal and reduced data: Coef 1

In [6]:
#RBF SVR
i=1
kernel ="rbf"
C= 0.5
epsilon = 0.01
coef0 = 0.01

X_columns = X_feat_dict["Dataset 4"]
## Training on the original data 

TrainTest_SVR(training_data_type = "original_data", coefficient = 1, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the Reduced by R2 data

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 1, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


## Training on the reduced data and top50 features subset

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 1, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


## Training on the original data and top50 features subset

TrainTest_SVR(training_data_type = "original_data", coefficient = 1, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


Coefficient 1, Training on the original_data

MAE: 0.239 +/- 0.464
MRE: 21.6 +/- 8.1

Coefficient 1, Training on the reduced_by_R2_data

MAE: 0.087 +/- 0.018
MRE: 19.7 +/- 3.9

Coefficient 1, Training on the reduced_by_R2_data and top50 feature subset

MAE: 0.080 +/- 0.021
MRE: 17.8 +/- 4.1

Coefficient 1, Training on the original_data and top50 feature subset

MAE: 0.243 +/- 0.467
MRE: 22.5 +/- 7.7


## Coefficient 2

## RFE on the original data

In [7]:
kernel ="linear"
C= 0.01
epsilon = 0.001

most_important = RFE_most_important(training_data_type = "original_data", coefficient = 2, 
                                    dict_data_type= dict_data_type, C=C, epsilon=epsilon,
                                    X_columns =X_feat_dict["Dataset 4"])

Number of zero features: 283
Maximal importance: 0.053
Number of features with importance more_0_01 : 400
from CCL: 38
from PubChem_properties 3
from targets: 6
from target pathway: 3


## Training on the original and reduced data: Coef 2

In [8]:
#RBF SVR
i=2
kernel ="rbf"
C= 0.1
epsilon = 0.01
coef0 = 0.01

X_columns = X_feat_dict["Dataset 4"]
## Training on the original data 

TrainTest_SVR(training_data_type = "original_data", coefficient = 2, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the Reduced by R2 data

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 2, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the reduced data and top50 features subset

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 2, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


## Training on the original data and top50 features subset

TrainTest_SVR(training_data_type = "original_data", coefficient = 2, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


Coefficient 2, Training on the original_data

MAE: 0.272 +/- 0.525
MRE: 7.6 +/- 7.6

Coefficient 2, Training on the reduced_by_R2_data

MAE: 0.125 +/- 0.079
MRE: 10.2 +/- 4.7

Coefficient 2, Training on the reduced_by_R2_data and top50 feature subset

MAE: 0.115 +/- 0.070
MRE: 9.5 +/- 4.4

Coefficient 2, Training on the original_data and top50 feature subset

MAE: 0.271 +/- 0.524
MRE: 7.8 +/- 8.1


## Coefficient 3

## RFE on the original data

In [9]:
kernel ="linear"
C= 0.1
epsilon = 1

most_important = RFE_most_important(training_data_type = "original_data", coefficient = 3, 
                                    dict_data_type= dict_data_type, C=C, epsilon=epsilon,
                                    X_columns =X_feat_dict["Dataset 4"])

Number of zero features: 302
Maximal importance: 2.608
Number of features with importance more_0_01 : 983
from CCL: 29
from PubChem_properties 8
from targets: 9
from target pathway: 4


## Training on the original and reduced data: Coef 3

In [10]:
#Linear SVR
i=3
kernel ="linear"
C = 0.1

X_columns = X_feat_dict["Dataset 4"]
## Training on the original data 

TrainTest_SVR(training_data_type = "original_data", coefficient = 3, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the Reduced by R2 data

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 3, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the reduced data and top50 features subset

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 3, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the original data and top50 features subset

TrainTest_SVR(training_data_type = "original_data", coefficient = 3, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


Coefficient 3, Training on the original_data

MAE: 9.813 +/- 8.526
MRE: -42.2 +/- 19.1

Coefficient 3, Training on the reduced_by_R2_data

MAE: 11.271 +/- 7.949
MRE: -44.5 +/- 15.4

Coefficient 3, Training on the reduced_by_R2_data and top50 feature subset

MAE: 11.082 +/- 8.037
MRE: -41.8 +/- 13.6

Coefficient 3, Training on the original_data and top50 feature subset

MAE: 9.791 +/- 8.433
MRE: -42.2 +/- 18.4


## Coefficient 4

## RFE on the original data

In [11]:
kernel ="linear"
C= 0.01
epsilon = 0.01

most_important = RFE_most_important(training_data_type = "original_data", coefficient = 4, 
                                    dict_data_type= dict_data_type, C=C, epsilon=epsilon,
                                    X_columns =X_feat_dict["Dataset 4"])

Number of zero features: 284
Maximal importance: 0.034
Number of features with importance more_0_01 : 325
from CCL: 38
from PubChem_properties 3
from targets: 8
from target pathway: 1


## Data Reduction: Coef 4

In [12]:
#RBF SVR
i=4
kernel ="rbf"
C= 0.1
epsilon = 0.001
coef0 = 0.01

X_columns = X_feat_dict["Dataset 4"]
## Training on the original data 

TrainTest_SVR(training_data_type = "original_data", coefficient = 4, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the Reduced by R2 data

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 4, 
              dict_data_type = dict_data_type, X_columns=X_columns, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)

## Training on the reduced data and top50 features subset

TrainTest_SVR(training_data_type = "reduced_by_R2_data", coefficient = 4, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


## Training on the original data and top50 features subset

TrainTest_SVR(training_data_type = "original_data", coefficient = 4, feature_subset=True,
              dict_data_type = dict_data_type, X_columns=most_important, 
              kernel= kernel, epsilon = epsilon, C= C, coef0= coef0)


Coefficient 4, Training on the original_data

MAE: 0.078 +/- 0.045
MRE: -31.5 +/- 286.9

Coefficient 4, Training on the reduced_by_R2_data

MAE: 0.080 +/- 0.048
MRE: 259.9 +/- 732.6

Coefficient 4, Training on the reduced_by_R2_data and top50 feature subset

MAE: 0.071 +/- 0.043
MRE: 114.5 +/- 260.6

Coefficient 4, Training on the original_data and top50 feature subset

MAE: 0.075 +/- 0.041
MRE: -131.4 +/- 575.6
