## Import File

In [1]:
%matplotlib inline 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

from rdkit import Chem

from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics
from sklearn.metrics import mean_squared_error

from statistics import mean

from math import sqrt

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
from sklearn import svm
#from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model

In [5]:
 # Restore the variables that the "Data preprocessing" notebook persisted with
 # %store; that notebook has to be run first, otherwise these names are undefined.

 # Descriptor matrices for the train / test split.
%store -r X_train         
%store  -r X_test

 # Targets for the train / test split. Later cells index them as [:, 0] and
 # [:, 1] -- presumably DCAA and TCAA respectively; confirm against the
 # preprocessing notebook.
%store -r y_train
%store -r  y_test

 # Full (unsplit) descriptor matrix and targets, used by the leave-one-out cells.
 # NOTE(review): the name log_y suggests log-transformed targets -- confirm.
%store -r X
%store -r log_y


## Scaling and Training 

In [6]:
# Standardise the full descriptor matrix with statistics computed on all samples.
X_scaled = StandardScaler().fit(X).transform(X)

# Standardise the train / test split with statistics from the training set only,
# so nothing about the test set leaks into the scaling.
# `scaler` ends up holding the training-set scaler.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)



## Dictionary of regressors

In [7]:
# Regressors for the DCAA target, keyed by the short label used as dictionary
# key in the score tables and in figure file names. Insertion order is the
# order in which the models are later fitted and reported.

DCAA_regressors = {}
DCAA_regressors["RF"] = RandomForestRegressor(random_state = 17)
DCAA_regressors["SVR_{rbf}"] = svm.SVR()
DCAA_regressors["SVR_{linear}"] = svm.SVR(kernel = "linear")
DCAA_regressors["MLP"] = MLPRegressor(solver = "lbfgs", max_iter = 400, random_state = 17)
DCAA_regressors["MLR"] = linear_model.LinearRegression()
    

In [8]:
# Regressors for the TCAA target: a separate set of estimator instances with
# the same labels and hyper-parameters as the DCAA set, so that fitting one
# target never overwrites the models fitted for the other.

TCAA_regressors = dict([
    ("RF", RandomForestRegressor(random_state = 17)),
    ("SVR_{rbf}", svm.SVR()),
    ("SVR_{linear}", svm.SVR(kernel = "linear")),
    ("MLP", MLPRegressor(solver = "lbfgs", max_iter = 400, random_state = 17)),
    ("MLR", linear_model.LinearRegression()),
])
    

In [9]:
# Scores for DCAA (column 0 of y_train / y_test).
#
# For every regressor this cell stores, keyed by the regressor label:
#   y_pred_DCAA   - predictions on the scaled test split
#   r2_DCAA_cv    - mean R^2 over a 5-fold cross-validation of the training split
#   rmse_DCAA_cv  - mean RMSE over the same 5 folds
#   r2_DCAA_ext   - R^2 on the test split
#   rmse_DCAA_ext - RMSE on the test split

y_pred_DCAA = dict()
r2_DCAA_cv = dict()
rmse_DCAA_cv = dict()
r2_DCAA_ext = dict()
rmse_DCAA_ext = dict()

reg_list_DCAA = DCAA_regressors.keys()

dcaa_train = y_train[:, 0]
dcaa_test = y_test[:, 0]

for name in reg_list_DCAA:
    model = DCAA_regressors[name]

    # fit() trains the estimator in place, so the instance stored in
    # DCAA_regressors is the fitted model afterwards.
    model.fit(X_train_scaled, dcaa_train)
    y_pred_DCAA[name] = model.predict(X_test_scaled)

    # One cross-validation pass evaluates both metrics on the same folds
    # (cross_validate works on clones, the fitted model above is untouched).
    cv_scores = model_selection.cross_validate(
        model,
        X_train_scaled,
        dcaa_train,
        scoring = ["r2", "neg_root_mean_squared_error"],
        cv = 5,
    )
    r2_DCAA_cv[name] = mean(cv_scores["test_r2"])
    # The scorer reports the negated RMSE; flip the sign back.
    rmse_DCAA_cv[name] = -mean(cv_scores["test_neg_root_mean_squared_error"])

    r2_DCAA_ext[name] = model.score(X_test_scaled, dcaa_test)
    rmse_DCAA_ext[name] = sqrt(mean_squared_error(dcaa_test, y_pred_DCAA[name]))



In [10]:
# External-validation scores for DCAA: R^2 on the first line, RMSE on the second.
print(r2_DCAA_ext, rmse_DCAA_ext, sep = "\n")

{'RF': 0.07088550695793006, 'SVR_{rbf}': 0.4000501014173681, 'SVR_{linear}': -11.872446267696033, 'MLP': -4.319283237844951, 'MLR': -68.62458458252267}
{'RF': 1.235657742644714, 'SVR_{rbf}': 0.9929355720756857, 'SVR_{linear}': 4.599325236576214, 'MLP': 2.9565834089549172, 'MLR': 10.696578580816558}


In [11]:
# Scores for TCAA (column 1 of y_train / y_test).
#
# For every regressor this cell stores, keyed by the regressor label:
#   y_pred_TCAA   - predictions on the scaled test split
#   r2_TCAA_cv    - mean R^2 over a 5-fold cross-validation of the training split
#   rmse_TCAA_cv  - mean RMSE over the same 5 folds
#   r2_TCAA_ext   - R^2 on the test split
#   rmse_TCAA_ext - RMSE on the test split

y_pred_TCAA = dict()

r2_TCAA_cv = dict()
rmse_TCAA_cv = dict()
r2_TCAA_ext = dict()
rmse_TCAA_ext = dict()

reg_list_TCAA = TCAA_regressors.keys()

tcaa_train = y_train[:, 1]
tcaa_test = y_test[:, 1]

for name in reg_list_TCAA:
    model = TCAA_regressors[name]

    # fit() trains the estimator in place, so the instance stored in
    # TCAA_regressors is the fitted model afterwards.
    model.fit(X_train_scaled, tcaa_train)
    y_pred_TCAA[name] = model.predict(X_test_scaled)

    # One cross-validation pass evaluates both metrics on the same folds
    # (cross_validate works on clones, the fitted model above is untouched).
    cv_scores = model_selection.cross_validate(
        model,
        X_train_scaled,
        tcaa_train,
        scoring = ["r2", "neg_root_mean_squared_error"],
        cv = 5,
    )
    r2_TCAA_cv[name] = mean(cv_scores["test_r2"])
    # The scorer reports the negated RMSE; flip the sign back.
    rmse_TCAA_cv[name] = -mean(cv_scores["test_neg_root_mean_squared_error"])

    r2_TCAA_ext[name] = model.score(X_test_scaled, tcaa_test)
    rmse_TCAA_ext[name] = sqrt(mean_squared_error(tcaa_test, y_pred_TCAA[name]))

In [None]:
#print(r2_TCAA_ext)
#print(rmse_TCAA_ext)

In [None]:
# Leave-one-out cross-validation for DCAA on the full data set.
#
# Stores, keyed by the regressor label:
#   loo_q2_DCAA   - R^2 between observed values and their LOO predictions (Q^2)
#   loo_rmse_DCAA - RMSE between observed values and their LOO predictions
#
# NOTE(review): X_scaled was standardised with statistics from all samples, so
# each held-out sample has contributed to the scaling of its own fold.

loo_q2_DCAA = dict()
loo_rmse_DCAA = dict()

reg_list_DCAA = DCAA_regressors.keys()

Xr = X_scaled
yr = log_y[:, 0]

for name in reg_list_DCAA:
    # cross_val_predict fits a fresh clone of the estimator for every fold.
    # This keeps the models stored in DCAA_regressors exactly as they were
    # fitted before, and it does not overwrite the notebook-level
    # X_train / X_test / y_train / y_test that other cells rely on.
    ypreds = list(
        model_selection.cross_val_predict(DCAA_regressors[name], Xr, yr, cv = LeaveOneOut())
    )
    # Every sample is held out exactly once, so the observed values are simply yr.
    ytests = list(yr)

    loo_q2_DCAA[name] = metrics.r2_score(ytests, ypreds)
    # Same RMSE formula as the external-validation cells; avoids the
    # `squared=` keyword, which newer scikit-learn releases no longer accept.
    loo_rmse_DCAA[name] = sqrt(mean_squared_error(ytests, ypreds))

    print("Leave One Out Cross Validation " + str(name))
    print("LOO $Q^2$: {:.5f}, RMSE: {:.5f}".format(loo_q2_DCAA[name], loo_rmse_DCAA[name]))

In [None]:
# Leave-one-out cross-validation for TCAA on the full data set.
#
# Stores, keyed by the regressor label:
#   loo_q2_TCAA   - R^2 between observed values and their LOO predictions (Q^2)
#   loo_rmse_TCAA - RMSE between observed values and their LOO predictions
# and saves one observed-vs-predicted scatter plot per regressor.
#
# NOTE(review): X_scaled was standardised with statistics from all samples, so
# each held-out sample has contributed to the scaling of its own fold.

loo_q2_TCAA = dict()
loo_rmse_TCAA = dict()

reg_list_TCAA = TCAA_regressors.keys()

Xr = X_scaled
yr = log_y[:, 1]

# The loop variable `a` and the lists `ytests` / `ypreds` are deliberately left
# at notebook level: the following cell re-plots the last regressor from them.
for a in reg_list_TCAA:
    # cross_val_predict fits a fresh clone of the estimator for every fold.
    # This keeps the models stored in TCAA_regressors exactly as they were
    # fitted before, and it does not overwrite the notebook-level
    # X_train / X_test / y_train / y_test that other cells rely on.
    ypreds = list(
        model_selection.cross_val_predict(TCAA_regressors[a], Xr, yr, cv = LeaveOneOut())
    )
    # Every sample is held out exactly once, so the observed values are simply yr.
    ytests = list(yr)

    # One new figure per regressor; without it every scatter is drawn onto the
    # same axes and each saved PDF also contains the previous regressors' points.
    fig, ax = plt.subplots()
    ax.axis([-.5, 8, -.5, 8])
    ax.scatter(ytests, ypreds)
    ax.plot([-1, 8], [-1, 8], color='r')  # identity line: perfect prediction
    fig.savefig("../Jupyter/results/figures/LOO_TCAA_" + str(a) + "_20200505.pdf")
    plt.show()
    plt.close(fig)

    loo_q2_TCAA[a] = metrics.r2_score(ytests, ypreds)
    # Same RMSE formula as the external-validation cells; avoids the
    # `squared=` keyword, which newer scikit-learn releases no longer accept.
    loo_rmse_TCAA[a] = sqrt(mean_squared_error(ytests, ypreds))

    print("Leave One Out Cross Validation " + str(a))
    # Q^2 is a coefficient, not a percentage, so no '%' after the value.
    print("LOO $Q^2$: {:.5f}, RMSE: {:.5f}".format(loo_q2_TCAA[a], loo_rmse_TCAA[a]))

In [None]:

# Re-draw the observed-vs-predicted LOO plot from `ytests`, `ypreds` and `a`
# as they were left by the last iteration of the previous cell's loop.
# NOTE(review): the date stamp in this file name ("202000505") has one more 0
# than the stamp used by the other figure cells -- confirm before renaming.
fig, ax = plt.subplots()
ax.axis([-.5, 8, -.5, 8])
ax.scatter(ytests, ypreds)
ax.plot([-1, 8], [-1, 8], color='r')  # identity line: perfect prediction
fig.savefig("../Jupyter/results/figures/LOO_TCAA_" + str(a) + "_202000505.pdf")
# plt.show must be called; the bare name only displays the function's repr.
plt.show()



In [None]:
# Collect every DCAA metric into one table: each entry below becomes a column,
# and the regressor labels (the keys of the per-metric dictionaries) become rows.
# "Q^2" here is the mean R^2 of the 5-fold cross-validation.
DCAA = dict([
    ("RMSE_{CV}", rmse_DCAA_cv),
    ("Q^2", r2_DCAA_cv),
    ("RMSE_{ext}", rmse_DCAA_ext),
    ("R^2_{ext}", r2_DCAA_ext),
    ("LOO-RMSE", loo_rmse_DCAA),
    ("LOO-Q^2", loo_q2_DCAA),
])

DCAA_df = pd.DataFrame.from_dict(DCAA)

# Last expression of the cell: rendered as a rich table.
DCAA_df

In [None]:
# Collect every TCAA metric into one table: each entry below becomes a column,
# and the regressor labels (the keys of the per-metric dictionaries) become rows.
# "Q^2" here is the mean R^2 of the 5-fold cross-validation.
TCAA = {}
TCAA["RMSE_{CV}"] = rmse_TCAA_cv
TCAA["Q^2"] = r2_TCAA_cv
TCAA["RMSE_{ext}"] = rmse_TCAA_ext
TCAA["R^2_{ext}"] = r2_TCAA_ext
TCAA["LOO-RMSE"] = loo_rmse_TCAA
TCAA["LOO-Q^2"] = loo_q2_TCAA

TCAA_df = pd.DataFrame.from_dict(TCAA)

# Last expression of the cell: rendered as a rich table.
TCAA_df

In [None]:
# Both metric collections under one dictionary, keyed by target name.
# NOTE(review): the first key is "DCAA " with a trailing space -- looks
# unintentional; confirm nothing looks it up before changing it.
table_dict = dict()
table_dict["DCAA "] = DCAA
table_dict["TCAA"] = TCAA

In [None]:
# Emit the DCAA score table as LaTeX source (row labels included) for copy-paste.
dcaa_latex = DCAA_df.to_latex(index = True)
print(dcaa_latex)

In [None]:
# \begin{array}{ccccc}
# \hline & & DCAA & & \\
# \hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
# RF           &  1.26 &  0.08 &  1.01 &  0.42 \\
# SVR_{rbf}    &  1.19 &  0.26 &  0.99 &  0.44 \\
# SVR_{linear} &  2.43 & -3.57 &  3.48 & -5.84 \\
# MLP          &  1.51 & -0.49 &  1.24 &  0.13 \\
# MLR          &  6.86x10^{11} & -1.78x10^{24} &  3.74x10^{11} & -7.90x10^{22} \\
# \end{array}

In [None]:
# Emit the TCAA score table as LaTeX source (row labels included) for copy-paste.
tcaa_latex = TCAA_df.to_latex(index = True)
print(tcaa_latex)

In [None]:
# \begin{array}{ccccc}
# \hline & & TCAA & & \\
# \hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  \\\hline
# RF           &  1.24 &  0.54 &  1.27 &  0.50 \\
# SVR_{rbf}    &  1.38 &  0.44 &  1.20 &  0.55 \\
# SVR_{linear} &  3.48 & -3.38 &  3.32 & -2.42 \\
# MLP          &  1.66 &  0.16 &  1.67 &  0.13 \\
# MLR          &  5.01x10^{11} & -2.52x10^{23} &  1.73x10^{12} & -9.312x10^{23} \\
# \end{array}

\begin{array}{lcccccccc}
\hline & & DCAA & & & & TCAA & \\
\hline  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext}  & RMSE_{CV} & Q^2 & RMSE_{ext} & R^2_{ext} \\\hline
RF           &  1.26 &  0.08 &  1.01 &  0.42 &  1.24 &  0.54 &  1.27 &  0.50 \\
SVR_{rbf}    &  1.19 &  0.26 &  0.99 &  0.44 &  1.38 &  0.44 &  1.20 &  0.55 \\
SVR_{linear} &  2.43 & -3.57 &  3.48 & -5.84 &  3.48 & -3.38 &  3.32 & -2.42 \\
MLP          &  1.51 & -0.49 &  1.24 &  0.13 &  1.66 &  0.16 &  1.67 &  0.13 \\
MLR          &  6.86\times10^{11} & -1.78\times10^{24} &  3.74\times10^{11} & -7.90\times10^{22} &  5.01\times10^{11} & -2.52\times10^{23} &  1.73\times10^{12} & -9.32\times10^{23} \\\hline
\end{array}

# Scale & Support Vector Regression - Linear

In [None]:
#"too many indices for array" error - I guess I need to redefine and refit the regressors.
#Probably if I get the LOO-CV first and then the external validation it would be OK.
#Let's check for coherence.

In [None]:
# One figure per regressor: experimental vs. predicted values for DCAA (left)
# and TCAA (right), with training points in orange, test points in blue and
# the identity line in red. Each figure is saved as a PDF named after the regressor.
#
# This cell indexes y_train / y_test as [:, 0] and [:, 1], so they must still be
# the two-column arrays restored from the store when it runs.
for name in reg_list_DCAA:
    # Create exactly the two side-by-side axes that are drawn on. The axes are
    # not shared because the two panels use slightly different limits.
    fig, (p1, p2) = plt.subplots(1, 2, figsize = (12, 4))

    # X_train_scaled is the training split transformed by the training-set
    # scaler, i.e. the same input the regressors were fitted on.
    p1.axis([-.2, 8, -.2, 8])
    p1.scatter(y_train[:, 0], DCAA_regressors[name].predict(X_train_scaled), color = 'orange')
    p1.scatter(y_test[:, 0], y_pred_DCAA[name], color = "blue")
    p1.plot([-1, 8], [-1, 8], color = 'r')
    p1.set_title('DCAAFP')
    p1.text(4, 7, "R$^2_{ext}$ = %f" % r2_DCAA_ext[name], horizontalalignment = "center", fontsize = 10)
    p1.text(4, 6.5, "LOO-Q$^{2}$ = %f" % loo_q2_DCAA[name], horizontalalignment = "center", fontsize = 10)
    p1.set_ylabel("Predicted values (log scale)")

    # Legend entries are attached to the artists through `label=`, so they do
    # not depend on the order in which matplotlib enumerates artists. The
    # identity line carries no label and therefore gets no legend entry.
    p2.axis([-.5, 8, -.5, 8])
    p2.scatter(y_train[:, 1], TCAA_regressors[name].predict(X_train_scaled), color = 'orange', label = "Train")
    p2.scatter(y_test[:, 1], y_pred_TCAA[name], color = "blue", label = "Test")
    p2.plot([-1, 8], [-1, 8], color = 'r')
    p2.legend(edgecolor = "black", loc = "upper left")
    p2.set_title('TCAAFP')
    p2.text(4, 7, "R$^2_{ext}$ = %f" % r2_TCAA_ext[name], horizontalalignment = "center", fontsize = 10)
    p2.text(4, 6.5, "LOO-Q$^{2}$ = %f" % loo_q2_TCAA[name], horizontalalignment = "center", fontsize = 10)

    # Common x-axis caption, centred under both panels in figure coordinates.
    fig.text(0.5, 0.01, "Experimental values (log scale)", horizontalalignment = "center", fontsize = 10)

    fig.savefig("../Jupyter/results/figures/" + str(name) + "_20200505.pdf")
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)