## Import File

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from rdkit import Chem

from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.metrics import mean_squared_error

from statistics import mean

from math import sqrt

In [3]:
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model

In [4]:
 #These are stored after running the "Data preprocessing" notebook

%store -r X_train         
%store  -r X_test

%store -r y_train
%store -r  y_test


In [6]:
print(X_train.shape)

(194, 929)


## Scaling and Training 

In [9]:
# scale x_train
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

scaler = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)



## Dictionary of regressors

In [23]:
# define a dictionary regressors() were all regressors are stored

DCAA_regressors = {"RF" : RandomForestRegressor(random_state = 17,
                                    n_estimators = 100,
                                    max_features = 100,
                                    min_samples_split = 0.1,
                                    min_samples_leaf = 0.03,
                                    max_depth = 10,
                                    max_leaf_nodes = 300),
              "SVR_{rbf}" : svm.SVR(C = 30,
                                    epsilon = 0.3,
                                    gamma = 0.0006),
              "SVR_{linear}" : svm.SVR(kernel = "linear"),
                  "MLP" : MLPRegressor(solver = "lbfgs",max_iter = 400, random_state = 17),
                  "MLR" : linear_model.LinearRegression()}
    

In [24]:
# define a dictionary regressors() were all regressors are stored

TCAA_regressors = {"RF" : RandomForestRegressor(max_depth = 10,
                                         max_features = 'auto',
                                         max_leaf_nodes = 10,
                                         min_samples_leaf= 0.03,
                                         min_samples_split = 0.1,
                                         n_estimators = 1000),
              "SVR_{rbf}" : svm.SVR(C = 10,
                                    epsilon = 0.1,
                                    gamma = 0.001),
              "SVR_{linear}" : svm.SVR(kernel = "linear"),
                  "MLP" : MLPRegressor(solver = "lbfgs",max_iter = 400, random_state = 17),
                  "MLR" : linear_model.LinearRegression()}

In [25]:
# get scores for DCAA

y_pred_DCAA = dict()
r2_DCAA_cv = dict()
rmse_DCAA_cv = dict()
r2_DCAA_ext = dict ()
rmse_DCAA_ext = dict()


reg_list_DCAA = DCAA_regressors.keys()

for a in reg_list_DCAA:
    DCAA_regressors[a] = DCAA_regressors[a].fit(X_train_scaled, y_train[:,0])
    y_pred_DCAA[a] = DCAA_regressors[a].predict(X_test_scaled)
    
    r2_DCAA_cv[a] = model_selection.cross_validate(DCAA_regressors[a], X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
    r2_DCAA_cv[a] = mean(r2_DCAA_cv[a]["test_score"])
    rmse_DCAA_cv[a] = model_selection.cross_validate(DCAA_regressors[a], X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
    rmse_DCAA_cv[a] = -mean(rmse_DCAA_cv[a]["test_score"])
    
    r2_DCAA_ext[a] = DCAA_regressors[a].score(X_test_scaled, y_test[:,0])
    rmse_DCAA_ext[a] = sqrt(mean_squared_error(y_test[:,0], (y_pred_DCAA[a])))



In [26]:
# get scores for TCAA

y_pred_TCAA = dict()

r2_TCAA_cv = dict()
rmse_TCAA_cv = dict()
r2_TCAA_ext = dict ()
rmse_TCAA_ext = dict()


reg_list_TCAA = TCAA_regressors.keys()

for a in reg_list_TCAA:
    TCAA_regressors[a] = TCAA_regressors[a].fit(X_train_scaled, y_train[:,1])
    y_pred_TCAA[a] = TCAA_regressors[a].predict(X_test_scaled)
    
    r2_TCAA_cv[a] = model_selection.cross_validate(TCAA_regressors[a], X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
    r2_TCAA_cv[a] = mean(r2_TCAA_cv[a]["test_score"])
    rmse_TCAA_cv[a] = model_selection.cross_validate(TCAA_regressors[a], X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
    rmse_TCAA_cv[a] = -mean(rmse_TCAA_cv[a]["test_score"])
    r2_TCAA_ext[a] = TCAA_regressors[a].score(X_test_scaled, y_test[:,1])
    rmse_TCAA_ext[a] = sqrt(mean_squared_error(y_test[:,1], (y_pred_TCAA[a])))

In [27]:
print(r2_DCAA_cv)
print(rmse_DCAA_cv)
print(r2_DCAA_ext)
print(rmse_DCAA_ext)

{'RF': 0.18143943633705648, 'SVR_{rbf}': 0.21932276902185824, 'SVR_{linear}': -3.5667067692779093, 'MLP': -0.4917573171597518, 'MLR': -1.7832187838355346e+24}
{'RF': 1.2148145740425185, 'SVR_{rbf}': 1.155126601344284, 'SVR_{linear}': 2.4255885290873795, 'MLP': 1.5104533104870868, 'MLR': 686014979619.023}
{'RF': 0.3631855069397599, 'SVR_{rbf}': 0.4896203220903964, 'SVR_{linear}': -5.837223655627317, 'MLP': 0.12804587501826914, 'MLR': -7.904012819174876e+22}
{'RF': 1.0609381801689326, 'SVR_{rbf}': 0.9497959121188009, 'SVR_{linear}': 3.4763500616892227, 'MLP': 1.2414534153037067, 'MLR': 373772570507.972}


In [28]:
print(r2_TCAA_cv)
print(rmse_TCAA_cv)
print(r2_TCAA_ext)
print(rmse_TCAA_ext)

{'RF': 0.5672337518479714, 'SVR_{rbf}': 0.5177181576383716, 'SVR_{linear}': -3.3765006419623194, 'MLP': 0.156825287662926, 'MLR': -2.5190593521206196e+23}
{'RF': 1.2029191568160351, 'SVR_{rbf}': 1.258792623899648, 'SVR_{linear}': 3.4822958723233337, 'MLP': 1.6593527684764997, 'MLR': 500751802977.23016}
{'RF': 0.4773382788506608, 'SVR_{rbf}': 0.6481488494104946, 'SVR_{linear}': -2.419134457221161, 'MLP': 0.13319932390565714, 'MLR': -9.319042025577477e+23}
{'RF': 1.296242799122037, 'SVR_{rbf}': 1.063544087170145, 'SVR_{linear}': 3.315386002383371, 'MLP': 1.6693047546331774, 'MLR': 1730858817364.3389}


In [29]:
DCAA = {"RMSE_{CV}" : rmse_DCAA_cv,
        "Q^2" :r2_DCAA_cv,
       "RMSE_{ext}" : rmse_DCAA_ext,
       "R^2_{ext}" : r2_DCAA_ext}

DCAA_df = pd.DataFrame.from_dict(DCAA)

DCAA_df

Unnamed: 0,RMSE_{CV},Q^2,RMSE_{ext},R^2_{ext}
RF,1.214815,0.1814394,1.060938,0.3631855
SVR_{rbf},1.155127,0.2193228,0.9497959,0.4896203
SVR_{linear},2.425589,-3.566707,3.47635,-5.837224
MLP,1.510453,-0.4917573,1.241453,0.1280459
MLR,686015000000.0,-1.783219e+24,373772600000.0,-7.904013e+22


In [30]:
TCAA = {"RMSE_{CV}" : rmse_TCAA_cv,
        "Q^2" :r2_TCAA_cv,
       "RMSE_{ext}" : rmse_TCAA_ext,
       "R^2_{ext}" : r2_TCAA_ext}

TCAA_df = pd.DataFrame.from_dict(TCAA)

TCAA_df

Unnamed: 0,RMSE_{CV},Q^2,RMSE_{ext},R^2_{ext}
RF,1.202919,0.5672338,1.296243,0.4773383
SVR_{rbf},1.258793,0.5177182,1.063544,0.6481488
SVR_{linear},3.482296,-3.376501,3.315386,-2.419134
MLP,1.659353,0.1568253,1.669305,0.1331993
MLR,500751800000.0,-2.519059e+23,1730859000000.0,-9.319042e+23


In [31]:
table_dict = {"DCAA ": DCAA,
             "TCAA" : TCAA}

In [32]:
print(pd.DataFrame.to_latex(DCAA_df, index = True))

\begin{tabular}{lrrrr}
\toprule
{} &     RMSE\_\{CV\} &           Q\textasciicircum 2 &    RMSE\_\{ext\} &     R\textasciicircum 2\_\{ext\} \\
\midrule
RF           &  1.214815e+00 &  1.814394e-01 &  1.060938e+00 &  3.631855e-01 \\
SVR\_\{rbf\}    &  1.155127e+00 &  2.193228e-01 &  9.497959e-01 &  4.896203e-01 \\
SVR\_\{linear\} &  2.425589e+00 & -3.566707e+00 &  3.476350e+00 & -5.837224e+00 \\
MLP          &  1.510453e+00 & -4.917573e-01 &  1.241453e+00 &  1.280459e-01 \\
MLR          &  6.860150e+11 & -1.783219e+24 &  3.737726e+11 & -7.904013e+22 \\
\bottomrule
\end{tabular}



In [None]:
\begin{array}{ccccc}
\hline & & DCAA & & \\
\hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
RF           &  1.21 &  0.18&  1.06 &  0.36 \\
SVR_{rbf}    &  1.16 &  0.32 &  0.95 &  0.52 \\
SVR_{linear} &  2.43 & -3.57 &  3.47 & -5.84 \\
\end{array}

In [33]:
print(pd.DataFrame.to_latex(TCAA_df, index = True))

\begin{tabular}{lrrrr}
\toprule
{} &     RMSE\_\{CV\} &           Q\textasciicircum 2 &    RMSE\_\{ext\} &     R\textasciicircum 2\_\{ext\} \\
\midrule
RF           &  1.202919e+00 &  5.672338e-01 &  1.296243e+00 &  4.773383e-01 \\
SVR\_\{rbf\}    &  1.258793e+00 &  5.177182e-01 &  1.063544e+00 &  6.481488e-01 \\
SVR\_\{linear\} &  3.482296e+00 & -3.376501e+00 &  3.315386e+00 & -2.419134e+00 \\
MLP          &  1.659353e+00 &  1.568253e-01 &  1.669305e+00 &  1.331993e-01 \\
MLR          &  5.007518e+11 & -2.519059e+23 &  1.730859e+12 & -9.319042e+23 \\
\bottomrule
\end{tabular}



In [None]:
\begin{array}{ccccc}
\hline & & TCAA & & \\
\hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
RF           &  1.20 &  0.57 &  1.27 &  0.48 \\
SVR_{rbf}    &  1.26 &  0.52 &  1.06 &  0.65 \\
\end{array}

In [None]:
\begin{array}{ccccc}
\hline & & DCAA & & \\
\hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
RF           &  1.21 &  0.18&  1.06 &  0.36 \\
SVR_{rbf}    &  1.16 &  0.32 &  0.95 &  0.52 \\
\hline & & TCAA & & \\\hline
RF           &  1.20 &  0.57 &  1.27 &  0.48 \\
SVR_{rbf}    &  1.26 &  0.52 &  1.06 &  0.65 \\\hline
\end{array}

\begin{array}{ccccc}
\hline & & DCAA (optimized) & &  & & DCAA (default) & & \\
\hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
RF           &  1.21 &  0.18&  1.06 &  0.36  &  1.26 &  0.08 &  1.01 &  0.42 \\
SVR_{rbf}    &  1.16 &  0.32 &  0.95 &  0.52  &  1.19 &  0.26 &  0.99 &  0.44 \\
\hline & & TCAA (optimized) & &  & & TCAA (default) & & \\\hline
RF           &  1.20 &  0.57 &  1.27 &  0.48  &  1.24 &  0.54 &  1.27 &  0.50 \\
SVR_{rbf}    &  1.26 &  0.52 &  1.06 &  0.65 &  1.38 &  0.44 &  1.20 &  0.55 \\\hline
\end{array}

# Scale & Support Vector Regression - Linear

In [None]:
#%%time
# training using scaled data

DCAA_regressor = svm.SVR(gamma = "scale", kernel = "linear")
DCAA_regressor.fit(X_train_scaled, y_train[:,0])

TCAA_regressor = svm.SVR(gamma = "scale" , kernel = "linear")
TCAA_regressor.fit(X_train_scaled, y_train[:,1])

HAAFP_regressor = svm.SVR(gamma = "scale" , kernel = "linear")
HAAFP_regressor.fit(X_train_scaled, y_train[:,2])

In [None]:
print(DCAA_regressor.fit_status_)    # 0 if correctly fitted
print(TCAA_regressor.fit_status_)
print(HAAFP_regressor.fit_status_)

## Evaluation of training set (regression parameters)

In [None]:
rmse_DCAA_train = model_selection.cross_validate(DCAA_regressor, X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_DCAA_train = -mean(rmse_DCAA_train["test_score"])

rmse_TCAA_train = model_selection.cross_validate(TCAA_regressor, X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_TCAA_train = -mean(rmse_TCAA_train["test_score"])

rmse_HAA_train = model_selection.cross_validate(HAAFP_regressor, X_train_scaled,y_train[:,2], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_HAA_train = -mean(rmse_HAA_train["test_score"])


In [None]:
r2_DCAA_train = model_selection.cross_validate(DCAA_regressor, X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
r2_DCAA_train = mean(r2_DCAA_train["test_score"])

r2_TCAA_train = model_selection.cross_validate(TCAA_regressor, X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
r2_TCAA_train = mean(r2_TCAA_train["test_score"])

r2_HAAFP_train = model_selection.cross_validate(HAAFP_regressor, X_train_scaled,y_train[:,2], scoring = "r2", cv =10 )
r2_HAAFP_train = mean(r2_HAAFP_train["test_score"])

In [None]:

print("DCAA_Q :",  r2_DCAA_train)                 
print("TCAA_Q :", r2_TCAA_train)
print("HAAs_Q :", r2_HAAFP_train)

print("DCAA_RMSE :",  rmse_DCAA_train)                 
print("TCAA_RMSE :", rmse_TCAA_train)
print("HAAs_RMSE :", rmse_HAA_train) 

                                                        #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_regressor.get_params()              # Return the parameters used in this estimator
parameters

## Evaluation of test set (regression parameters) -  External Validation

In [None]:
y_pred_DCAA = DCAA_regressor.predict(X_test_scaled)
y_pred_TCAA = TCAA_regressor.predict(X_test_scaled)
y_pred_HAAFP = HAAFP_regressor.predict(X_test_scaled)

In [None]:
# r2 ext

score_DCAA = DCAA_regressor.score(X_test_scaled, y_test[:,0])           #Return the coefficient of determination R^2 of the prediction.
score_TCAA = TCAA_regressor.score(X_test_scaled, y_test[:,1])
score_HAAFP = HAAFP_regressor.score(X_test_scaled, y_test[:,2])
print("DCAA :",  score_DCAA)                 
print("TCAA :", score_TCAA)
print("HAAs :", score_HAAFP)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_regressor.get_params()              # Return the parameters used in this estimator
parameters

In [None]:

# RMSE ext
DCAA_rmse = sqrt(mean_squared_error(y_test[:,0], (y_pred_DCAA)))
TCAA_rmse = sqrt(mean_squared_error(y_test[:,1], (y_pred_TCAA)))
HAAFP_rmse = sqrt(mean_squared_error(y_test[:,2], (y_pred_HAAFP)))


print("DCAA: RMSE =  %f " %DCAA_rmse)
print("TCAA: RMSE =  %f" %TCAA_rmse)
print("HAAFP: RMSE =  %f" %HAAFP_rmse)

In [None]:

fig, axs = plt.subplots(2, sharex=True, sharey=True, figsize = (15,4))


p1 = plt.subplot(131)
p1.axis([-.2, 8, -.2, 8])
p1.scatter((y_train[:,0]), (DCAA_regressor.predict(scaler.transform(X_train))), color='orange')
p1.scatter((y_test[:,0]), (y_pred_DCAA), color = "blue")
p1.plot([-1, 8], [-1, 8], color='r')
p1.set_title('DCAAFP')
p1.text(4,7,"RMSE$_{CV}$ = %f" %rmse_DCAA_train, horizontalalignment = "center", fontsize = 10)
p1.text(4,6.5,"Q$^{2}$ = %f" %r2_DCAA_train, horizontalalignment = "center", fontsize = 10)

plt.ylabel("Predicted values (log scale)")

p2 = plt.subplot(132)
p2.axis([-.5, 8, -.5, 8])
p2.scatter((y_train[:,1]), (TCAA_regressor.predict(scaler.transform(X_train))), color='orange')
p2.scatter((y_test[:,1]), (y_pred_TCAA), color = "blue")
p2.plot([-1, 8], [-1, 8], color='r')
p2.legend(("_fit","Train","Test" ), edgecolor = "black", mode = "none", loc = "upper left")
p2.set_title('TCAAFP')

p2.text(4,7,"RMSE$_{CV}$ = %f" %rmse_TCAA_train, horizontalalignment = "center", fontsize = 10)
p2.text(4,6.5,"Q$^{2}$ = %f" %r2_TCAA_train, horizontalalignment = "center", fontsize = 10)



plt.text(-2,-2,"Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)


plt.savefig('../Jupyter/results/figures/simplified_SVR_linear_20200331.pdf')
plt.show()

# Scale & Support Vector Regression - RBF

In [None]:
# training using scaled data

DCAA_regressor = svm.SVR(gamma = "scale")
DCAA_regressor.fit(X_train_scaled, y_train[:,0])

TCAA_regressor = svm.SVR(gamma = "scale")
TCAA_regressor.fit(X_train_scaled, y_train[:,1])

HAAFP_regressor = svm.SVR(gamma = "scale")
HAAFP_regressor.fit(X_train_scaled, y_train[:,2])

In [None]:
print(DCAA_regressor.fit_status_)    # 0 if correctly fitted
print(TCAA_regressor.fit_status_)
print(HAAFP_regressor.fit_status_)

# Evaluation of training set (regression parameters)

In [None]:
rmse_DCAA_train = model_selection.cross_validate(DCAA_regressor, X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_DCAA_train = -mean(rmse_DCAA_train["test_score"])

rmse_TCAA_train = model_selection.cross_validate(TCAA_regressor, X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_TCAA_train = -mean(rmse_TCAA_train["test_score"])

rmse_HAA_train = model_selection.cross_validate(HAAFP_regressor, X_train_scaled,y_train[:,2], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_HAA_train = -mean(rmse_HAA_train["test_score"])


In [None]:
r2_DCAA_train = model_selection.cross_validate(DCAA_regressor, X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
r2_DCAA_train = mean(r2_DCAA_train["test_score"])

r2_TCAA_train = model_selection.cross_validate(TCAA_regressor, X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
r2_TCAA_train = mean(r2_TCAA_train["test_score"])

r2_HAAFP_train = model_selection.cross_validate(HAAFP_regressor, X_train_scaled,y_train[:,2], scoring = "r2", cv =10 )
r2_HAAFP_train = mean(r2_HAAFP_train["test_score"])

In [None]:

print("DCAA_Q :",  r2_DCAA_train)                 
print("TCAA_Q :", r2_TCAA_train)
print("HAAs_Q :", r2_HAAFP_train)

print("DCAA_RMSE :",  rmse_DCAA_train)                 
print("TCAA_RMSE :", rmse_TCAA_train)
print("HAAs_RMSE :", rmse_HAA_train) 

                                                        #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_regressor.get_params()              # Return the parameters used in this estimator
parameters

# Evaluation of test set (regression parameters) -  External Validation

In [None]:
y_pred_DCAA = DCAA_regressor.predict(X_test_scaled)
y_pred_TCAA = TCAA_regressor.predict(X_test_scaled)
y_pred_HAAFP = HAAFP_regressor.predict(X_test_scaled)

In [None]:
#R2 ext

score_DCAA = DCAA_regressor.score(X_test_scaled, y_test[:,0])           #Return the coefficient of determination R^2 of the prediction.
score_TCAA = TCAA_regressor.score(X_test_scaled, y_test[:,1])
score_HAAFP = HAAFP_regressor.score(X_test_scaled, y_test[:,2])
print("DCAA :",  score_DCAA)                 
print("TCAA :", score_TCAA)
print("HAAs :", score_HAAFP)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_regressor.get_params()              # Return the parameters used in this estimator
parameters

In [None]:

# RMSE ext
DCAA_rmse = sqrt(mean_squared_error(y_test[:,0], (y_pred_DCAA)))
TCAA_rmse = sqrt(mean_squared_error(y_test[:,1], (y_pred_TCAA)))
HAAFP_rmse = sqrt(mean_squared_error(y_test[:,2], (y_pred_HAAFP)))


print("DCAA: RMSE =  %f" %DCAA_rmse)
print("TCAA: RMSE =  %f" %TCAA_rmse)
print("HAAFP: RMSE =  %f" %HAAFP_rmse)

In [None]:
fig, axs = plt.subplots(2, sharex=True, sharey=True, figsize = (15,4))


p1 = plt.subplot(131)
p1.axis([-.2, 8, -.2, 8])
p1.scatter((y_train[:,0]), (DCAA_regressor.predict(scaler.transform(X_train))), color='orange')
p1.scatter((y_test[:,0]), (y_pred_DCAA), color = "blue")
p1.plot([-1, 8], [-1, 8], color='r')
p1.set_title('DCAAFP')
p1.text(4,7,"RMSE$_{CV}$ = %f" %rmse_DCAA_train, horizontalalignment = "center", fontsize = 10)
p1.text(4,6.5,"Q$^{2}$ = %f" %r2_DCAA_train, horizontalalignment = "center", fontsize = 10)

plt.ylabel("Predicted values (log scale)")

p2 = plt.subplot(132)
p2.axis([-.5, 8, -.5, 8])
p2.scatter((y_train[:,1]), (TCAA_regressor.predict(scaler.transform(X_train))), color='orange')
p2.scatter((y_test[:,1]), (y_pred_TCAA), color = "blue")
p2.plot([-1, 8], [-1, 8], color='r')
p2.legend(("_fit","Train","Test" ), edgecolor = "black", mode = "none", loc = "upper left")
p2.set_title('TCAAFP')

p2.text(4,7,"RMSE$_{CV}$ = %f" %rmse_TCAA_train, horizontalalignment = "center", fontsize = 10)
p2.text(4,6.5,"Q$^{2}$ = %f" %r2_TCAA_train, horizontalalignment = "center", fontsize = 10)



plt.text(-2,-2,"Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)



plt.savefig('../Jupyter/results/figures/simplified_SVR_rbf_20200331.pdf')
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
DCAA_RF_regressor = RandomForestRegressor(random_state = 17)
DCAA_RF_regressor.fit(X_train, y_train[:,0])

TCAA_RF_regressor = RandomForestRegressor(random_state = 17)
TCAA_RF_regressor.fit(X_train, y_train[:,1])

HAAFP_RF_regressor = RandomForestRegressor(random_state = 17)
HAAFP_RF_regressor.fit(X_train, y_train[:,2])

# Evaluation of training set (regression parameters) - Random Forest

In [None]:
# R2 CV
r2_DCAA_RF_train = model_selection.cross_validate(DCAA_RF_regressor, X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
r2_DCAA_RF_train = mean(r2_DCAA_RF_train["test_score"])

r2_TCAA_RF_train = model_selection.cross_validate(TCAA_RF_regressor, X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
r2_TCAA_RF_train = mean(r2_TCAA_RF_train["test_score"])

r2_HAAFP_RF_train = model_selection.cross_validate(HAAFP_RF_regressor, X_train_scaled,y_train[:,2], scoring = "r2", cv =10 )
r2_HAAFP_RF_train = mean(r2_HAAFP_RF_train["test_score"])

In [None]:
# RMSE CV
rmse_DCAA_RF_train = model_selection.cross_validate(DCAA_RF_regressor, X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_DCAA_RF_train = -mean(rmse_DCAA_RF_train["test_score"])

rmse_TCAA_RF_train = model_selection.cross_validate(TCAA_RF_regressor, X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_TCAA_RF_train = -mean(rmse_TCAA_RF_train["test_score"])

rmse_HAAFP_RF_train = model_selection.cross_validate(HAAFP_RF_regressor, X_train_scaled,y_train[:,2], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_HAAFP_RF_train = -mean(rmse_HAAFP_RF_train["test_score"])

In [None]:

print("DCAA :",  r2_DCAA_RF_train)                 
print("TCAA :", r2_TCAA_RF_train)
print("HAAs :", r2_HAAFP_RF_train)  

print("DCAA :",  rmse_DCAA_RF_train)                 
print("TCAA :", rmse_TCAA_RF_train)
print("HAAs :", rmse_HAAFP_RF_train)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_RF_regressor.get_params()              # Return the parameters used in this estimator
parameters

# Evaluation of test set (regression parameters) -  External Validation

In [None]:
y_pred_RF_DCAA = DCAA_RF_regressor.predict(X_test)
y_pred_RF_TCAA = TCAA_RF_regressor.predict(X_test)
y_pred_RF_HAAFP = HAAFP_RF_regressor.predict(X_test)

In [None]:
# R2 of prediction with the test set

score_DCAA_RF = DCAA_RF_regressor.score(X_test, y_test[:,0])           #Return the coefficient of determination R^2 of the prediction.
score_TCAA_RF = TCAA_RF_regressor.score(X_test, y_test[:,1])
score_HAAFP_RF = HAAFP_RF_regressor.score(X_test, y_test[:,2])
print("DCAA :",  score_DCAA_RF)                 
print("TCAA :", score_TCAA_RF)
print("HAAs :", score_HAAFP_RF)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_RF_regressor.get_params()              # Return the parameters used in this estimator
parameters

In [None]:

# RMSE external
DCAA_rmse = sqrt(mean_squared_error(y_test[:,0], (y_pred_RF_DCAA)))
TCAA_rmse = sqrt(mean_squared_error(y_test[:,1], (y_pred_RF_TCAA)))
HAAFP_rmse = sqrt(mean_squared_error(y_test[:,2], (y_pred_RF_HAAFP)))


print("DCAA: RMSE =  %f" %DCAA_rmse)
print("TCAA: RMSE =  %f " %TCAA_rmse)
print("HAAFP: RMSE =  %f" %HAAFP_rmse)

In [None]:
fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize = (15,4))


p1 = plt.subplot(131)
p1.axis([-0.5, 7, -0.5, 7])
p1.scatter(y_train[:,0], DCAA_RF_regressor.predict(X_train), color='orange')
p1.scatter(y_test[:,0], y_pred_RF_DCAA, color = "blue")
p1.plot([-0.5, 7], [-0.5, 7], color='r')
p1.set_title('DCAA')
p1.set_ylabel("Predicted values (log scale)")

p1.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_DCAA_RF_train, horizontalalignment = "center", fontsize = 10)
p1.text(4,5.7,"Q$^{2}$ = %f" %r2_DCAA_RF_train, horizontalalignment = "center", fontsize = 10)


p2 = plt.subplot(132)
p2.axis([-0.5, 7, -0.5, 7])
p2.scatter(y_train[:,1], TCAA_RF_regressor.predict(X_train), color='orange')
p2.scatter(y_test[:,1], y_pred_RF_TCAA, color='blue')
p2.plot([-0.5, 7], [-0.5, 7], color='r')
p2.set_title('TCAA')
p2.set_xlabel("Experimental values (log scale)")

p2.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_TCAA_RF_train, horizontalalignment = "center", fontsize = 10)
p2.text(4,5.7,"Q$^{2}$ = %f" %r2_TCAA_RF_train, horizontalalignment = "center", fontsize = 10)

p2.legend(("_fit","Train","Test" ), edgecolor = "black", mode = "none", loc = "upper left")

plt.text(-2,-2,"Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)

plt.savefig('../Jupyter/results/figures/simplified_RF_20200331.pdf')
plt.show()

# Multilayer Perceptron Regression

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
DCAA_mlp_regressor = MLPRegressor()
DCAA_mlp_regressor.fit(X_train_scaled, y_train[:,0])

TCAA_mlp_regressor = MLPRegressor()
TCAA_mlp_regressor.fit(X_train_scaled, y_train[:,1])

HAAFP_mlp_regressor = MLPRegressor()
HAAFP_mlp_regressor.fit(X_train_scaled, y_train[:,2])

# Evaluation of training set (regression parameters) - MLP

In [None]:
# Calculate Q2

r2_DCAA_mlp_train = model_selection.cross_validate(DCAA_mlp_regressor, X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
r2_DCAA_mlp_train = mean(r2_DCAA_mlp_train["test_score"])

r2_TCAA_mlp_train = model_selection.cross_validate(TCAA_mlp_regressor, X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
r2_TCAA_mlp_train = mean(r2_TCAA_mlp_train["test_score"])

r2_HAAFP_mlp_train = model_selection.cross_validate(HAAFP_mlp_regressor, X_train_scaled,y_train[:,2], scoring = "r2", cv =10 )
r2_HAAFP_mlp_train = mean(r2_HAAFP_mlp_train["test_score"])

In [None]:
# Calculate RMSE cv

rmse_DCAA_mlp_train = model_selection.cross_validate(DCAA_mlp_regressor, X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_DCAA_mlp_train = -mean(rmse_DCAA_mlp_train["test_score"])

rmse_TCAA_mlp_train = model_selection.cross_validate(TCAA_mlp_regressor, X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_TCAA_mlp_train = -mean(rmse_TCAA_mlp_train["test_score"])

rmse_HAAFP_mlp_train = model_selection.cross_validate(HAAFP_mlp_regressor, X_train_scaled,y_train[:,2], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_HAAFP_mlp_train = -mean(rmse_HAAFP_mlp_train["test_score"])

In [None]:
print("DCAA :",  r2_DCAA_mlp_train)                 
print("TCAA :", r2_TCAA_mlp_train)
print("HAAs :", r2_HAAFP_mlp_train) 


print("DCAA :",  rmse_DCAA_mlp_train)                 
print("TCAA :", rmse_TCAA_mlp_train)
print("HAAs :", rmse_HAAFP_mlp_train)

#The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_mlp_regressor.get_params()              # Return the parameters used in this estimator
parameters

# Evaluation of test set (regression parameters) -  External Validation

In [None]:
y_pred_mlp_DCAA = DCAA_mlp_regressor.predict(X_test_scaled)
y_pred_mlp_TCAA = TCAA_mlp_regressor.predict(X_test_scaled)
y_pred_mlp_HAAFP = HAAFP_mlp_regressor.predict(X_test_scaled)

In [None]:
score_DCAA_mlp = DCAA_mlp_regressor.score(X_test_scaled, y_test[:,0])           #Return the coefficient of determination R^2 of the prediction.
score_TCAA_mlp = TCAA_mlp_regressor.score(X_test_scaled, y_test[:,1])
score_HAAFP_mlp = HAAFP_mlp_regressor.score(X_test_scaled, y_test[:,2])
print("DCAA :",  score_DCAA_mlp)                 
print("TCAA :", score_TCAA_mlp)
print("HAAs :", score_HAAFP_mlp)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_mlp_regressor.get_params()              # Return the parameters used in this estimator
parameters

In [None]:

# RMSE
DCAA_rmse = sqrt(mean_squared_error(y_test[:,0], (y_pred_mlp_DCAA)))
TCAA_rmse = sqrt(mean_squared_error(y_test[:,1], (y_pred_mlp_TCAA)))
HAAFP_rmse = sqrt(mean_squared_error(y_test[:,2], (y_pred_mlp_HAAFP)))


print("DCAA: RMSE =  %f" %DCAA_rmse)
print("TCAA: RMSE =  %f" %TCAA_rmse)
print("HAAFP: RMSE =  %f" %HAAFP_rmse)

In [None]:
fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize = (15,4))


p1 = plt.subplot(131)
p1.axis([-0.5, 7, -0.5, 7])
p1.scatter(y_train[:,0], DCAA_mlp_regressor.predict(X_train_scaled), color='orange')
p1.scatter(y_test[:,0], y_pred_mlp_DCAA, color = "blue")
p1.plot([-0.5, 7], [-0.5, 7], color='r')
p1.set_title('DCAA')
p1.set_ylabel("Predicted values (log scale)")

p1.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_DCAA_mlp_train, horizontalalignment = "center", fontsize = 10)
p1.text(4,5.7,"Q$^{2}$ = %f" %r2_DCAA_mlp_train, horizontalalignment = "center", fontsize = 10)


p2 = plt.subplot(132)
p2.axis([-0.5, 7, -0.5, 7])
p2.scatter(y_train[:,1], TCAA_mlp_regressor.predict(X_train_scaled), color='orange')
p2.scatter(y_test[:,1], y_pred_mlp_TCAA, color='blue')
p2.plot([-0.5, 7], [-0.5, 7], color='r')
p2.set_title('TCAA')
p2.set_xlabel("Experimental values (log scale)")

p2.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_TCAA_mlp_train, horizontalalignment = "center", fontsize = 10)
p2.text(4,5.7,"Q$^{2}$ = %f" %r2_TCAA_mlp_train, horizontalalignment = "center", fontsize = 10)

p2.legend(("_fit","Train","Test" ), edgecolor = "black", mode = "none", loc = "upper left")

plt.text(-2,-2,"Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)

plt.savefig('../Jupyter/results/figures/simplified_MLP_20200331.pdf')
plt.show()

# Multiple Linear Regression

In [None]:
from sklearn import linear_model

In [None]:
DCAA_mlr_regressor = linear_model.LinearRegression()
DCAA_mlr_regressor.fit(X_train_scaled, y_train[:,0])

TCAA_mlr_regressor = linear_model.LinearRegression()
TCAA_mlr_regressor.fit(X_train_scaled, y_train[:,1])

HAAFP_mlr_regressor = linear_model.LinearRegression()
HAAFP_mlr_regressor.fit(X_train_scaled, y_train[:,2])

# Evaluation of training set (regression parameters) - MLR

In [None]:
r2_DCAA_mlr_train = model_selection.cross_validate(DCAA_mlr_regressor, X_train_scaled,y_train[:,0], scoring = "r2", cv =10 )
r2_DCAA_mlr_train = mean(r2_DCAA_mlr_train["test_score"])

r2_TCAA_mlr_train = model_selection.cross_validate(TCAA_mlr_regressor, X_train_scaled,y_train[:,1], scoring = "r2", cv =10 )
r2_TCAA_mlr_train = mean(r2_TCAA_mlr_train["test_score"])

r2_HAAFP_mlr_train = model_selection.cross_validate(HAAFP_mlr_regressor, X_train_scaled,y_train[:,2], scoring = "r2", cv =10 )
r2_HAAFP_mlr_train = mean(r2_HAAFP_mlr_train["test_score"])

In [None]:
rmse_DCAA_mlr_train = model_selection.cross_validate(DCAA_mlr_regressor, X_train_scaled,y_train[:,0], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_DCAA_mlr_train = -mean(rmse_DCAA_mlr_train["test_score"])

rmse_TCAA_mlr_train = model_selection.cross_validate(TCAA_mlr_regressor, X_train_scaled,y_train[:,1], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_TCAA_mlr_train = -mean(rmse_TCAA_mlr_train["test_score"])

rmse_HAAFP_mlr_train = model_selection.cross_validate(HAAFP_mlr_regressor, X_train_scaled,y_train[:,2], scoring = "neg_root_mean_squared_error", cv =10 )
rmse_HAAFP_mlr_train = -mean(rmse_HAAFP_mlr_train["test_score"])

In [None]:
print("DCAA :",  r2_DCAA_mlr_train)                 
print("TCAA :", r2_TCAA_mlr_train)
print("HAAs :", r2_HAAFP_mlr_train) 

print("DCAA :",  rmse_DCAA_mlr_train)                 
print("TCAA :", rmse_TCAA_mlr_train)
print("HAAs :", rmse_HAAFP_mlr_train)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_mlr_regressor.get_params()              # Return the parameters used in this estimator
parameters

# Evaluation of test set (regression parameters) -  External Validation

In [None]:
y_pred_mlr_DCAA = DCAA_mlr_regressor.predict(X_test_scaled)
y_pred_mlr_TCAA = TCAA_mlr_regressor.predict(X_test_scaled)
y_pred_mlr_HAAFP = HAAFP_mlr_regressor.predict(X_test_scaled)

In [None]:
# R2 external

score_DCAA_mlr = DCAA_mlr_regressor.score(X_test_scaled, y_test[:,0])           #Return the coefficient of determination R^2 of the prediction.
score_TCAA_mlr = TCAA_mlr_regressor.score(X_test_scaled, y_test[:,1])
score_HAAFP_mlr = HAAFP_mlr_regressor.score(X_test_scaled, y_test[:,2])
print("DCAA :",  score_DCAA_mlr)                 
print("TCAA :", score_TCAA_mlr)
print("HAAs :", score_HAAFP_mlr)                           #The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). The best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
parameters = DCAA_mlr_regressor.get_params()              # Return the parameters used in this estimator
parameters

In [None]:

# RMSE external
DCAA_rmse = sqrt(mean_squared_error(y_test[:,0], (y_pred_mlr_DCAA)))
TCAA_rmse = sqrt(mean_squared_error(y_test[:,1], (y_pred_mlr_TCAA)))
HAAFP_rmse = sqrt(mean_squared_error(y_test[:,2], (y_pred_mlr_HAAFP)))


print("DCAA: RMSE =  %f" %DCAA_rmse)
print("TCAA: RMSE =  %f" %TCAA_rmse)
print("HAAFP: RMSE =  %f" %HAAFP_rmse)

In [None]:
fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize = (15,4))


p1 = plt.subplot(131)
p1.axis([-0.5, 7, -0.5, 7])
p1.scatter(y_train[:,0], DCAA_mlr_regressor.predict(X_train_scaled), color='orange')
p1.scatter(y_test[:,0], y_pred_mlr_DCAA, color = "blue")
p1.plot([-0.5, 7], [-0.5, 7], color='r')
p1.set_title('DCAA')
p1.set_ylabel("Predicted values (log scale)")

p1.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_DCAA_mlr_train, horizontalalignment = "center", fontsize = 10)
p1.text(4,5.7,"Q$^{2}$ = %f" %r2_DCAA_mlr_train, horizontalalignment = "center", fontsize = 10)


p2 = plt.subplot(132)
p2.axis([-0.5, 7, -0.5, 7])
p2.scatter(y_train[:,1], TCAA_mlr_regressor.predict(X_train_scaled), color='orange')
p2.scatter(y_test[:,1], y_pred_mlr_TCAA, color='blue')
p2.plot([-0.5, 7], [-0.5, 7], color='r')
p2.set_title('TCAA')
p2.set_xlabel("Experimental values (log scale)")

p2.text(4,6.2,"RMSE$_{CV}$ = %f" %rmse_TCAA_mlr_train, horizontalalignment = "center", fontsize = 10)
p2.text(4,5.7,"Q$^{2}$ = %f" %r2_TCAA_mlr_train, horizontalalignment = "center", fontsize = 10)

p2.legend(("_fit","Train","Test" ), edgecolor = "black", mode = "none", loc = "upper left")


plt.text(-2,-2,"Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)

plt.savefig('../Jupyter/results/figures/simplified_MLR_20200331.pdf')
plt.show()

# PCA+KNN

In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.fit_transform(X_test)

In [None]:
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_scaled_train)
sum(pca.explained_variance_ratio_)

In [None]:
pca.explained_variance_ratio_

#### It seems that qsar features have too strong correlationship and serious redundancy.  
### Fixed

Correlation Matrix

In [None]:
# df_qsar.corr()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
DCAA_regressor = KNeighborsRegressor(n_neighbors=3)
DCAA_regressor.fit(X_scaled_train, y_train[:,0])

TCAA_regressor = KNeighborsRegressor(n_neighbors=3)
TCAA_regressor.fit(X_scaled_train, y_train[:,1])

HAAFP_regressor = KNeighborsRegressor(n_neighbors=3)
HAAFP_regressor.fit(X_scaled_train, y_train[:,2])

In [None]:
y_pred_DCAA = DCAA_regressor.predict(X_scaled_test)
y_pred_TCAA = TCAA_regressor.predict(X_scaled_test)
y_pred_HAAFP = HAAFP_regressor.predict(X_scaled_test)

In [None]:
plt.figure(figsize = (15, 4))
p1 = plt.subplot(131)
p1.axis([-2, 7, -2, 7])
p1.scatter(np.log(y_train[:,0]+1e-4), np.log(DCAA_regressor.predict(X_scaled)+1e-4), color='black')
p1.scatter(np.log(y_test[:,0]+1e-4), np.log(y_pred_DCAA+1e-4))
p1.plot([-2, 7], [-2, 7], color='r')
p1.set_title('KNN on PCAA (Black: Trainset, Blue: Testset)')
p1.set_xlabel('Log True Value')
p1.set_ylabel('Log Predicted Value')

p2 = plt.subplot(132)
p2.axis([-3, 7, -3, 7])
p2.scatter(np.log(y_train[:,1]+1e-4), np.log(TCAA_regressor.predict(X_scaled)+1e-4), color='black')
p2.scatter(np.log(y_test[:,1]+1e-4), np.log(y_pred_TCAA+1e-4))
p2.plot([-3, 7], [-3, 7], color='r')
p2.set_title('KNN on TCAA (Black: Trainset, Blue: Testset)')
p2.set_xlabel('Log True Value')
p2.set_ylabel('Log Predicted Value')

p3 = plt.subplot(133)
p3.axis([-3, 7, -3, 7])
p3.scatter(np.log(y_train[:,2]+1e-4), np.log(HAAFP_regressor.predict(X_scaled)+1e-4), color='black')
p3.scatter(np.log(y_test[:,2]+1e-4), np.log(y_pred_HAAFP+1e-4))
p3.plot([-3, 7], [-3, 7], color='r')
p3.set_title('KNN on HAAFP (Black: Trainset, Blue: Testset)')
p3.set_xlabel('Log True Value')
p3.set_ylabel('Log Predicted Value')

plt.show()