### Feature Importance

The idea - repeat the procedure of evaluation of feature importance performed for drug-by-drug 
<br> but apply to all drug training

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.feature_selection import RFE

import warnings
warnings.filterwarnings("ignore")
_FOLDER = "results/"

## Training on the Original data

In [2]:
with open(_FOLDER+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Targets.txt", 'r') as f:
    X_targets = [line.rstrip('\n') for line in f]
# *****************************************

with open(_FOLDER+"X_features_Target_Pathway.txt", 'r') as f:
    X_target_pathway = [line.rstrip('\n') for line in f]
# *****************************************

all_columns = X_cancer_cell_lines + X_PubChem_properties + X_targets + X_target_pathway +["MAX_CONC"]

datasets = ["Dataset 1", "Dataset 2", "Dataset 3", "Dataset 4"]

X_feat_dict = {"Dataset 1": X_cancer_cell_lines ,
               "Dataset 2": ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines ,
               "Dataset 3": ["MAX_CONC"] + X_PubChem_properties +  X_cancer_cell_lines,
               "Dataset 4": ["MAX_CONC"] + X_PubChem_properties +  X_targets + X_target_pathway + X_cancer_cell_lines}

In [3]:
with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

## Coefficient 1

need to apply Linear SVR to get model weights and feature importance atteibutes 

Coefficient_1 ....
Linear SVR
Dataset: 1 best C: 0.01
Dataset: 1 best_epsilon 0.1

In [4]:
i=1
y = test_df_50["param_1"]
y_train_drug = train_drug["param_1"].values
y_test_drug =  test_drug["param_1"].values

kernel ="linear"
C= 0.01
epsilon = 0.1

most_important = []

estimator = SVR(kernel = "linear", C = C, epsilon = epsilon)

# feature selector
selector = RFE(estimator, n_features_to_select=50, step=10)
selector = selector.fit(Xtrain_drug, y_train_drug)
most_important = np.array(X_columns)[selector.support_]
    
#models parameters
estimator.fit(Xtrain_drug, y_train_drug)
print("Number of zero features:", len(estimator.coef_[estimator.coef_==0]))
print("Maximal importance:", round(abs(estimator.coef_[0]).max(), 3))
print("Number of features with importance more_0_01 :", sum(abs(estimator.coef_[0])>0.01))

from_CCLE = set(most_important) & set(X_cancer_cell_lines)
print("from CCL:", len(set(most_important) & set(X_cancer_cell_lines)))
print("from PubChem_properties",len( set(most_important) & set(X_PubChem_properties)))
print("from targets:", len(set(most_important) & set(X_targets)))
print("from target pathway:", len(set(most_important) & set(X_target_pathway)))

Number of zero features: 334
Maximal importance: 0.045
Number of features with importance more_0_01 : 211
from CCL: 34
from PubChem_properties 5
from targets: 8
from target pathway: 3


In [5]:
most_important

array(['3bonds', 'complexity', 'h_bond_acceptor_count',
       'bond_stereo_count', 'F', 'HDAC1', 'HSP90', 'CDK7', 'HDAC1-10',
       'PPARdelta', 'PPARgamma', 'ERBB2', 'EGFR',
       'Protein stability and degradation',
       'Chromatin histone acetylation', 'WNT signaling', 'EWSR1-FLI1_mut',
       'MLL2_mut', 'MLL3_mut', 'PGR_mut', 'PIK3CB_mut', 'XRN1_mut',
       'loss:cnaPANCAN6', 'loss:cnaPANCAN20', 'gain:cnaPANCAN61',
       'loss:cnaPANCAN112 (CREBBP)', 'loss:cnaPANCAN113',
       'loss:cnaPANCAN115', 'gain:cnaPANCAN139',
       'gain:cnaPANCAN141 (GNAQ,NTRK2,PCSK5,TJP2)', 'loss:cnaPANCAN203',
       'gain:cnaPANCAN214', 'gain:cnaPANCAN239 (FOXP1,MITF)',
       'loss:cnaPANCAN263', 'loss:cnaPANCAN265', 'loss:cnaPANCAN294',
       'loss:cnaPANCAN310 (MAP2K4)', 'gain:cnaPANCAN367 (ARFGAP1,GNAS)',
       'gain:cnaPANCAN383', 'gain:cnaPANCAN384 (ERCC5,ING1,IRS2,TFDP1)',
       'loss:cnaPANCAN386', 'chr1:150266476-150266689(MRPS21)_HypMET',
       'chr1:181451311-181452049()_HypMET

## Data Reduction

In [6]:
## Training on the original data 

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=1
y = test_df_50["param_1"]
y_train_drug = train_drug["param_1"].values
y_test_drug =  test_drug["param_1"].values
#RBF SVR
i=1
kernel ="rbf"
C= 0.5
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_1"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 1, Training on original data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data: Coef 1

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=1
y = test_df_50["param_1"]
y_train_drug = train_drug["param_1"].values
y_test_drug =  test_drug["param_1"].values
#RBF SVR
i=1
kernel ="rbf"
C= 0.5
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_1"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 1, Training on the reduced data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the original data and Feature subset: Coef 1

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=1
y = test_df_50["param_1"]
y_train_drug = train_drug["param_1"].values
y_test_drug =  test_drug["param_1"].values
#RBF SVR
i=1
kernel ="rbf"
C= 0.5
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_1"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 1, Training on original data and Feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data and Feature subset: Coef 1

#reduced by R2 data

with open(_FOLDER+ "drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=1
y = test_df_50["param_1"]
y_train_drug = train_drug["param_1"].values
y_test_drug =  test_drug["param_1"].values
#RBF SVR
i=1
kernel ="rbf"
C= 0.5
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_1"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 1, Training on the reduced data and feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))


 Coefficient 1, Training on original data

MAE: 0.208 +/- 1.230
MRE: -3.3 +/- 27.5

 Coefficient 1, Training on the reduced data

MAE: 0.087 +/- 0.064
MRE: -2.6 +/- 24.6

 Coefficient 1, Training on original data and Feature subset

MAE: 0.212 +/- 1.235
MRE: -4.3 +/- 28.5

 Coefficient 1, Training on the reduced data and feature subset

MAE: 0.080 +/- 0.058
MRE: -1.5 +/- 22.5


## Coefficient 2

Coefficient_2 ....
Linear SVR
Dataset: 1 best C: 0.01
Dataset: 1 best_epsilon 0.001

In [7]:
with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])


i=2
y = test_df_50["param_2"]
y_train_drug = train_drug["param_2"].values
y_test_drug =  test_drug["param_2"].values

kernel ="linear"
C= 0.01
epsilon = 0.001

most_important = []

estimator = SVR(kernel = "linear", C = C, epsilon = epsilon)

# feature selector
selector = RFE(estimator, n_features_to_select=50, step=10)
selector = selector.fit(Xtrain_drug, y_train_drug)
most_important = np.array(X_columns)[selector.support_]
    
#models parameters
estimator.fit(Xtrain_drug, y_train_drug)
print("Number of zero features:", len(estimator.coef_[estimator.coef_==0]))
print("Maximal importance:", round(abs(estimator.coef_[0]).max(), 3))
print("Number of features with importance more_0_01 :", sum(abs(estimator.coef_[0])>0.01))

from_CCLE = set(most_important) & set(X_cancer_cell_lines)
print("from CCL:", len(set(most_important) & set(X_cancer_cell_lines)))
print("from PubChem_properties",len( set(most_important) & set(X_PubChem_properties)))
print("from targets:", len(set(most_important) & set(X_targets)))
print("from target pathway:", len(set(most_important) & set(X_target_pathway)))

Number of zero features: 283
Maximal importance: 0.053
Number of features with importance more_0_01 : 400
from CCL: 38
from PubChem_properties 3
from targets: 6
from target pathway: 3


## Data reduction: Coef 2

In [8]:
## Training on the original data 

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=2
y = test_df_50["param_2"]
y_train_drug = train_drug["param_2"].values
y_test_drug =  test_drug["param_2"].values
#RBF SVR
i=2
kernel ="rbf"
C= 0.1
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_2"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 2, Training on original data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data: Coef 2

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=2
y = test_df_50["param_2"]
y_train_drug = train_drug["param_2"].values
y_test_drug =  test_drug["param_2"].values
#RBF SVR
i=2
kernel ="rbf"
C= 0.1
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_2"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 2, Training on the reduced data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the original data and Feature subset: Coef 2

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=2
y = test_df_50["param_2"]
y_train_drug = train_drug["param_2"].values
y_test_drug =  test_drug["param_2"].values
#RBF SVR
i=2
kernel ="rbf"
C= 0.1
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_2"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 2, Training on original data and Feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data and Feature subset: Coef 2

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=2
y = test_df_50["param_2"]
y_train_drug = train_drug["param_2"].values
y_test_drug =  test_drug["param_2"].values
#RBF SVR
i=2
kernel ="rbf"
C= 0.1
epsilon = 0.01
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_2"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 2, Training on the reduced data and feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))


 Coefficient 2, Training on original data

MAE: 0.236 +/- 1.585
MRE: 2.9 +/- 18.3

 Coefficient 2, Training on the reduced data

MAE: 0.121 +/- 0.158
MRE: 2.6 +/- 15.7

 Coefficient 2, Training on original data and Feature subset

MAE: 0.234 +/- 1.582
MRE: 1.6 +/- 18.7

 Coefficient 2, Training on the reduced data and feature subset

MAE: 0.111 +/- 0.142
MRE: 1.4 +/- 14.8


## Coefficient 3

Coefficient_3 ....
Linear SVR
Dataset:4, best C: 0.1
Dataset:4, best_epsilon 1

In [9]:
i=3
y = test_df_50["param_3"]
y_train_drug = train_drug["param_3"].values
y_test_drug =  test_drug["param_3"].values

kernel ="linear"
C= 0.1
epsilon = 1

most_important = []

estimator = SVR(kernel = "linear", C = C, epsilon = epsilon)

# feature selector
selector = RFE(estimator, n_features_to_select=50, step=10)
selector = selector.fit(Xtrain_drug, y_train_drug)
most_important = np.array(X_columns)[selector.support_]
    
#models parameters
estimator.fit(Xtrain_drug, y_train_drug)
print("Number of zero features:", len(estimator.coef_[estimator.coef_==0]))
print("Maximal importance:", round(abs(estimator.coef_[0]).max(), 3))
print("Number of features with importance more_0_01 :", sum(abs(estimator.coef_[0])>0.01))

from_CCLE = set(most_important) & set(X_cancer_cell_lines)
print("from CCL:", len(set(most_important) & set(X_cancer_cell_lines)))
print("from PubChem_properties",len( set(most_important) & set(X_PubChem_properties)))
print("from targets:", len(set(most_important) & set(X_targets)))
print("from target pathway:", len(set(most_important) & set(X_target_pathway)))

Number of zero features: 3
Maximal importance: 2.802
Number of features with importance more_0_01 : 47
from CCL: 38
from PubChem_properties 5
from targets: 6
from target pathway: 1


## Data Reduction: Coef 3

In [10]:
## Training on the original data 

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=3
y = test_df_50["param_3"]
y_train_drug = train_drug["param_3"].values
y_test_drug =  test_drug["param_3"].values 

#Linear SVR
i=3
kernel ="linear"
C = 0.1
epsilon = 1
model = SVR(kernel = kernel, epsilon = epsilon, C=C)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_3"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 3, Training on original data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data: Coef 3

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=3
y = test_df_50["param_3"]
y_train_drug = train_drug["param_3"].values
y_test_drug =  test_drug["param_3"].values 

#Linear SVR
i=3
kernel ="linear"
C = 0.1
epsilon = 1
model = SVR(kernel = kernel, epsilon = epsilon, C=C)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_3"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 3, Training on the reduced data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the original data and Feature subset: Coef 3

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=3
y = test_df_50["param_3"]
y_train_drug = train_drug["param_3"].values
y_test_drug =  test_drug["param_3"].values 

#Linear SVR
i=3
kernel ="linear"
C = 0.1
epsilon = 1
model = SVR(kernel = kernel, epsilon = epsilon, C=C)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_3"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 3, Training on original data and Feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data and Feature subset: Coef 3

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=3
y = test_df_50["param_3"]
y_train_drug = train_drug["param_3"].values
y_test_drug =  test_drug["param_3"].values 

#Linear SVR
i=3
kernel ="linear"
C = 0.1
epsilon = 1
model = SVR(kernel = kernel, epsilon = epsilon, C=C)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_3"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 3, Training on the reduced data and feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))


 Coefficient 3, Training on original data

MAE: 8.955 +/- 22.108
MRE: -8.1 +/- 64.1

 Coefficient 3, Training on the reduced data

MAE: 7.148 +/- 11.786
MRE: -12.2 +/- 58.8

 Coefficient 3, Training on original data and Feature subset

MAE: 9.584 +/- 22.378
MRE: -9.7 +/- 72.6

 Coefficient 3, Training on the reduced data and feature subset

MAE: 7.893 +/- 12.535
MRE: -10.9 +/- 64.6


## Coefficient 4

Coefficient_4 ....
Linear SVR
Dataset: 1 best C: 0.01
Dataset: 1 best_epsilon 0.01

In [11]:
i=4
y = test_df_50["param_4"]
y_train_drug = train_drug["param_4"].values
y_test_drug =  test_drug["param_4"].values

kernel ="linear"
C= 0.01
epsilon = 0.01

most_important = []

estimator = SVR(kernel = "linear", C = C, epsilon = epsilon)

# feature selector
selector = RFE(estimator, n_features_to_select=50, step=10)
selector = selector.fit(Xtrain_drug, y_train_drug)
most_important = np.array(X_columns)[selector.support_]
    
#models parameters
estimator.fit(Xtrain_drug, y_train_drug)
print("Number of zero features:", len(estimator.coef_[estimator.coef_==0]))
print("Maximal importance:", round(abs(estimator.coef_[0]).max(), 3))
print("Number of features with importance more_0_01 :", sum(abs(estimator.coef_[0])>0.01))

from_CCLE = set(most_important) & set(X_cancer_cell_lines)
print("from CCL:", len(set(most_important) & set(X_cancer_cell_lines)))
print("from PubChem_properties",len( set(most_important) & set(X_PubChem_properties)))
print("from targets:", len(set(most_important) & set(X_targets)))
print("from target pathway:", len(set(most_important) & set(X_target_pathway)))

Number of zero features: 3
Maximal importance: 0.091
Number of features with importance more_0_01 : 20
from CCL: 38
from PubChem_properties 5
from targets: 6
from target pathway: 1


## Data Reduction: Coef 4

In [12]:
## Training on the original data 

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=4
y = test_df_50["param_4"]
y_train_drug = train_drug["param_4"].values
y_test_drug =  test_drug["param_4"].values

#RBF SVR
i=4
kernel ="rbf"
C= 0.1
epsilon = 0.001
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_4"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 4, Training on original data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data: Coef 4

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# original X features
X_columns = X_feat_dict["Dataset 4"]
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=4
y = test_df_50["param_4"]
y_train_drug = train_drug["param_4"].values
y_test_drug =  test_drug["param_4"].values

#RBF SVR
i=4
kernel ="rbf"
C= 0.1
epsilon = 0.001
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)
model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_4"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 4, Training on the reduced data\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the original data and Feature subset: Coef 4

_FOLDER = _FOLDER = "../drug_results/"

with open(_FOLDER + "drug_ids_50.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    
train_df = pd.read_csv(_FOLDER+"train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv(_FOLDER+"test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=4
y = test_df_50["param_4"]
y_train_drug = train_drug["param_4"].values
y_test_drug =  test_drug["param_4"].values

#RBF SVR
i=4
kernel ="rbf"
C= 0.1
epsilon = 0.001
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_4"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 4, Training on original data and Feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))

## Training on the Reduced data and Feature subset: Coef 4

_FOLDER = _FOLDER = "../drug_results/"

#reduced by R2 data

with open("results/drug_ids_50_restr.txt", 'r') as f:
    drug_ids_50 = [np.int32(line.rstrip('\n')) for line in f]
    

train_df = pd.read_csv("results/train08_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)
test_df = pd.read_csv("results/test02_min10_restr.csv").drop(["Unnamed: 0","Unnamed: 0.1"], axis=1)               

train_df_50 = train_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()
test_df_50 = test_df.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

### All drugs training
train_drug = train_df_50.copy()
test_drug = test_df_50.copy()

# feature subset
X_columns = most_important
scaler = MinMaxScaler().fit(train_drug[X_columns])
Xtrain_drug = scaler.transform(train_drug[X_columns])
Xtest_drug = scaler.transform(test_drug[X_columns])

i=4
y = test_df_50["param_4"]
y_train_drug = train_drug["param_4"].values
y_test_drug =  test_drug["param_4"].values

#RBF SVR
i=4
kernel ="rbf"
C= 0.1
epsilon = 0.001
coef0 = 0.01
model = SVR(kernel = kernel, epsilon = epsilon, C=C, coef0 = coef0)

model.fit(Xtrain_drug, y_train_drug)
y_pred = model.predict(Xtest_drug) 
test_df_50["pred_param_4"] = y_pred
abs_error = abs(y - y_pred)
rel_error = (y-y_pred)*100/y
print("\n Coefficient 4, Training on the reduced data and feature subset\n")
print("MAE: %0.3f +/- %0.3f" % (abs_error.mean(), abs_error.std()))
print("MRE: %0.1f +/- %0.1f" % (rel_error.mean(), rel_error.std()))


 Coefficient 4, Training on original data

MAE: 0.072 +/- 0.099
MRE: 67.0 +/- 1404.6

 Coefficient 4, Training on the reduced data

MAE: 0.072 +/- 0.086
MRE: 34.9 +/- 976.1

 Coefficient 4, Training on original data and Feature subset

MAE: 0.072 +/- 0.091
MRE: 22.2 +/- 1876.6

 Coefficient 4, Training on the reduced data and feature subset

MAE: 0.068 +/- 0.084
MRE: 58.6 +/- 1227.4
