## Import and Configure Everything We Need

In [23]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import deepchem as dc
import torch

from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*') 

import basic as b
import chemprop_ish as c

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation

# In many cases NaN
not_used_desc = ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge']

# Create a descriptor calculator for all RDKit descriptors except the ones above
desc_calc = MolecularDescriptorCalculator([x for x in [x[0] for x in Descriptors.descList] if x not in not_used_desc])

---
## Loading Precombined Dataset

In [64]:
data = pd.read_csv('combisolv_exp2.csv')
solute = data['smiles_solute'].tolist()
solvent = data['smiles_solvent'].tolist()
pka = data['dGsolv_avg [kcal/mol]'].tolist()
sol_solv = [[x,y] for x,y in zip(solute,solvent)]
#preprocess pka too

In [69]:
H_list = []
for x in range(len(sol_solv)):
    if sol_solv[x][0] in ["[H][H]","[2H][2H]","[HH]"]:
        H_list.append(x)
for x in sorted(H_list, reverse = True):
    del sol_solv[x]
    del pka[x]

In [70]:
sol_solv_mol = [[Chem.MolFromSmiles(x) for x in y] for y in sol_solv]

In [71]:
len(sol_solv_mol)

8700

In [72]:
#check for mols with 0 heavy atoms
for x in sol_solv_mol:
    for y in x:
        if y.GetNumHeavyAtoms() == 0:
            print(Chem.MolToSmiles(y))

## Calculating Descriptors and Fingerprints
- 196/200 RDKit descriptors
- Morgan FP with radius=3 and useFeatures=True (FMorgan3)

In [73]:
descs, fmorgan3, descs_fmorgan3 = b.calc_xy_data(sol_solv_mol)

---
## Training Random Forest, Support Vector Machine (two configurations) and Multilayer Perceptron (three configurations)
#### Using the following training sets with 5-fold cross-validation (shuffled)
1. RDKit descriptor set
2. FMorgan3
3. RDKit descriptor set + FMorgan3

### Prepare for Training

In [74]:
seed = 24

est_jobs = 12
verbose = False

y_data = np.array(pka)
desc_sets = [[descs, 'Desc'],[fmorgan3, 'FMorgan3'],[descs_fmorgan3, 'Desc_FMorgan3']]

models = ddict(odict)  # estimator => training set => [model, scaler]

def train_all_sets(est_cls, params, name, torch_model):
    for x_data, set_name in desc_sets:
        models[name][set_name] = b.train_cv_model(est_cls, x_data, y_data, params, seed, torch_model=torch_model)

In [75]:
def generate_score_board(name):
    print(f'{name} CV Scores:')
    for ts, m in models[name].items():
        print(f'\t{ts}')
        for k, v in m.cv_scores.items():
            print(f'\t\t- {k}: {np.mean(v):.3f} ± {np.std(v):.3f}')

---
### RandomForest (n_estimators=1000)

In [76]:
est_cls = RandomForestRegressor
rf_params = dict(n_estimators=1000, n_jobs=est_jobs, verbose=verbose, random_state=seed)
name = 'RandomForest (n_estimators=1000)'

train_all_sets(est_cls, rf_params, name, False)
generate_score_board(name)

RandomForest (n_estimators=1000) CV Scores:
	Desc
		- mean_absolute_error: 0.314 ± 0.008
		- rmse: 0.671 ± 0.034
		- r2_score: 0.978 ± 0.002
	FMorgan3
		- mean_absolute_error: 0.717 ± 0.022
		- rmse: 1.380 ± 0.049
		- r2_score: 0.907 ± 0.006
	Desc_FMorgan3
		- mean_absolute_error: 0.314 ± 0.008
		- rmse: 0.672 ± 0.036
		- r2_score: 0.978 ± 0.002


---
### SupportVectorMachine (gamma='scale')

In [77]:
est_cls = SVR
svr_params = dict(cache_size=4096, verbose=verbose)
name = 'SupportVectorMachine (gamma="scale")'

train_all_sets(est_cls, svr_params, name, False)
generate_score_board(name)

SupportVectorMachine (gamma="scale") CV Scores:
	Desc
		- mean_absolute_error: 3.095 ± 0.054
		- rmse: 4.689 ± 0.079
		- r2_score: -0.073 ± 0.014
	FMorgan3
		- mean_absolute_error: 0.938 ± 0.017
		- rmse: 1.918 ± 0.059
		- r2_score: 0.820 ± 0.009
	Desc_FMorgan3
		- mean_absolute_error: 3.095 ± 0.054
		- rmse: 4.689 ± 0.079
		- r2_score: -0.073 ± 0.014


---
### SupportVectorMachine (gamma='auto')

In [78]:
est_cls = SVR
svr_params = dict(cache_size=4096, verbose=verbose, gamma='auto')
name = 'SupportVectorMachine (gamma="auto")'

train_all_sets(est_cls, svr_params, name, False)
generate_score_board(name)

SupportVectorMachine (gamma="auto") CV Scores:
	Desc
		- mean_absolute_error: 2.929 ± 0.057
		- rmse: 4.541 ± 0.087
		- r2_score: -0.007 ± 0.018
	FMorgan3
		- mean_absolute_error: 2.659 ± 0.045
		- rmse: 4.111 ± 0.075
		- r2_score: 0.175 ± 0.013
	Desc_FMorgan3
		- mean_absolute_error: 1.210 ± 0.047
		- rmse: 2.613 ± 0.103
		- r2_score: 0.667 ± 0.019


---
### Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500))

In [79]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(500, 500), verbose=verbose, random_state=seed)
name = 'Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500))'

train_all_sets(est_cls, mlp_params, name, False)
generate_score_board(name)

Multi Layer Perceptron (early_stopping=False, hidden_layer_sizes=(500, 500)) CV Scores:
	Desc
		- mean_absolute_error: 164906611.176 ± 181177898.511
		- rmse: 1609817340.697 ± 1790354766.720
		- r2_score: -280813531084887296.000 ± 352954091191318848.000
	FMorgan3
		- mean_absolute_error: 0.661 ± 0.028
		- rmse: 1.272 ± 0.068
		- r2_score: 0.921 ± 0.008
	Desc_FMorgan3
		- mean_absolute_error: 76720537.574 ± 60967739.463
		- rmse: 779932876.426 ± 630328309.503
		- r2_score: -49246725664475824.000 ± 50634856180989016.000


---
### Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(500, 500))

In [80]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(500, 500), verbose=verbose, random_state=seed, early_stopping=True)
name = 'Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(500, 500))'

train_all_sets(est_cls, mlp_params, name, False)
generate_score_board(name)



AttributeError: 'MLPRegressor' object has no attribute '_best_coefs'

---
### Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250))

In [82]:
est_cls = MLPRegressor
mlp_params = dict(hidden_layer_sizes=(250, 250, 250), verbose=verbose, random_state=seed, early_stopping=True)
name = 'Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250))'

train_all_sets(est_cls, mlp_params, name, False)
generate_score_board(name)

Multi Layer Perceptron (early_stopping=True, hidden_layer_sizes=(250, 250, 250)) CV Scores:
	Desc
		- mean_absolute_error: 4165369.456 ± 6817569.897
		- rmse: 38747282.618 ± 61925880.287
		- r2_score: -257493886094999.188 ± 491086682662495.438
	FMorgan3
		- mean_absolute_error: 0.677 ± 0.033
		- rmse: 1.251 ± 0.043
		- r2_score: 0.923 ± 0.005
	Desc_FMorgan3
		- mean_absolute_error: 116006.029 ± 229890.703
		- rmse: 1010317.069 ± 1998715.688
		- r2_score: -237141818932.427 ± 474254063249.370


---
### XGradientBoost

In [81]:
est_cls = xgb.XGBRegressor
xgb_params = dict(verbosity=2 if verbose else 0, random_state=seed, n_jobs=est_jobs)
name = 'XGradientBoost'

train_all_sets(est_cls, xgb_params, name, False)
generate_score_board(name)

XGradientBoost CV Scores:
	Desc
		- mean_absolute_error: 0.301 ± 0.004
		- rmse: 0.614 ± 0.041
		- r2_score: 0.982 ± 0.002
	FMorgan3
		- mean_absolute_error: 0.870 ± 0.027
		- rmse: 1.429 ± 0.043
		- r2_score: 0.900 ± 0.006
	Desc_FMorgan3
		- mean_absolute_error: 0.335 ± 0.010
		- rmse: 0.643 ± 0.045
		- r2_score: 0.980 ± 0.003


---
## Training torch models
#### Using the following training sets with 5-fold cross-validation (shuffled)
1. Sol / solvent pairs

---
### Torch models

In [7]:
seed = 24
verbose = False

y_data = torch.Tensor(pka)
x_data = sol_solv

In [8]:
args = c.TrainArgs()
args.__dict__.update({"depth":3, "dropout":0.2})
est_cls = c.double_MPNN(args)
name = 'MPNN'

b.train_cv_model(est_cls, x_data, y_data, random_state=seed)
generate_score_board(name)

TypeError: train_cv_model() missing 1 required positional argument: 'random_state'