## Import and Configure Everything We Need

In [1]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
import deepchem as dc
import torch

from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*') 

import basic as b
import chemprop_ish as c

# Show floats with three fixed decimals rather than scientific notation.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Descriptors excluded from the calculator because they frequently come out as NaN.
not_used_desc = ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge']

# Calculator over every descriptor name listed in Descriptors.descList
# (first element of each entry), minus the excluded names above.
desc_calc = MolecularDescriptorCalculator(
    [entry[0] for entry in Descriptors.descList if entry[0] not in not_used_desc]
)



---
## Loading Precombined Dataset

In [2]:
# Read the precombined solute/solvent table.
data = pd.read_csv('combisolv_exp2.csv')

# Inputs: one [solute SMILES, solvent SMILES] pair per row of the table.
solute = data['smiles_solute'].tolist()
solvent = data['smiles_solvent'].tolist()
sol_solv = [list(pair) for pair in zip(solute, solvent)]

# Regression target. NOTE: despite the variable name `pka`, these values are
# read from the 'dGsolv_avg [kcal/mol]' column.
pka = data['dGsolv_avg [kcal/mol]'].tolist()
# TODO (carried over from the original note): preprocess pka too

In [3]:
# Row indices whose solute SMILES is one of the molecular hydrogen / deuterium
# spellings listed below.
H_list = [
    idx
    for idx, pair in enumerate(sol_solv)
    if pair[0] in ["[H][H]", "[2H][2H]", "[HH]"]
]

# Remove those rows from the inputs and the targets together so the two lists
# stay aligned. Slice assignment keeps the same list objects, matching the
# in-place `del` of the original loop.
_rows_to_drop = set(H_list)
sol_solv[:] = [pair for idx, pair in enumerate(sol_solv) if idx not in _rows_to_drop]
pka[:] = [value for idx, value in enumerate(pka) if idx not in _rows_to_drop]

In [38]:
# Re-import the local `basic` module so edits made to it on disk take effect
# without restarting the kernel.
# `importlib.reload` replaces `imp.reload`: the `imp` module is deprecated and was
# removed in Python 3.12, and it was not imported in any cell above this one.
import importlib
importlib.reload(b)

<module 'basic' from '/Users/u6676643/codes/diympnn/deepchemMPNN/basic.py'>

In [31]:
# Instantiate a double_MPNN from the current `args`.
# NOTE(review): no cell above this one (in file order) assigns `args`; the first
# visible assignment is `args = c.TrainArgs()` in a later training cell. On a fresh
# kernel this cell would raise NameError — confirm the intended configuration and
# define `args` before this point.
model = c.double_MPNN(args)

In [32]:
# Call the model built in the previous cell on the first 20 entries of `x_data`
# and display the returned tensor.
# NOTE(review): `x_data` is only assigned in a later cell (`x_data = sol_solv`), so
# this cell depends on out-of-order execution — confirm and reorder.
model(x_data[0:20])

tensor([[0.0147],
        [0.0161],
        [0.0159],
        [0.0269],
        [0.0282],
        [0.0153],
        [0.0087],
        [0.0073],
        [0.0197],
        [0.0215],
        [0.0276],
        [0.0308],
        [0.0122],
        [0.0153],
        [0.0335],
        [0.0090],
        [0.0233],
        [0.0183],
        [0.0240],
        [0.0147]], grad_fn=<AddmmBackward>)

---
## Training torch models
#### Using the following training sets with 5-fold cross-validation (shuffled)
1. Solute / solvent pairs

---
### Torch models

In [4]:
# Shared seed; the training cells pass it to b.train_cv_model as `random_state`.
seed = 24
verbose = False  # not referenced by any cell visible here

# Seed torch's global RNG as well so that torch-side randomness (e.g. weight
# initialisation) starts from a known state on every Restart & Run All.
# NOTE(review): if b.train_cv_model already seeds torch from `random_state`,
# this call is redundant but harmless — confirm against basic.py.
torch.manual_seed(seed)

# Targets as a torch tensor; inputs stay as the list of [solute, solvent] pairs.
y_data = torch.Tensor(pka)
x_data = sol_solv

# Results registry keyed by experiment name. The training cells overwrite
# models[name] with whatever b.train_cv_model returns; a missing key yields an
# empty OrderedDict.
models = ddict(odict)

In [5]:
def generate_score_board(name):
    """Print a mean ± std summary of the CV scores stored for one experiment.

    Args:
        name: Key into the global ``models`` mapping; ``models[name]`` must
            expose a ``cv_scores`` mapping of metric name -> collection of values.

    Prints a header line followed by one line per metric, showing ``np.mean``
    and ``np.std`` of that metric's values with three decimals. Returns None.
    """
    print(f'{name} CV Scores:')
    for metric, values in models[name].cv_scores.items():
        average = np.mean(values)
        spread = np.std(values)
        print(f'\t\t- {metric}: {average:.3f} ± {spread:.3f}')

In [46]:
# Spot check: feed the first solute/solvent pair, reshaped as
# [[solute], [solvent]], to the first entry of models[name].models and print
# the prediction.
# NOTE(review): `name` is only assigned in later training cells, so this cell
# depends on which experiment was run last — confirm which model is intended.
first_pair = x_data[0]
test = [[first_pair[0]], [first_pair[1]]]
print(models[name].models[0](test))

tensor([[-1.6097]], grad_fn=<AddmmBackward>)


In [52]:
# Experiment: double_MPNN with unmodified TrainArgs defaults.
name = 'MPNN, no interaction'
est_cls = c.double_MPNN
args = c.TrainArgs()

# Train via b.train_cv_model, store the result under `name`, then print its
# cross-validation score summary.
models[name] = b.train_cv_model(
    est_cls,
    x_data,
    y_data,
    params=args,
    random_state=seed,
)
generate_score_board(name)

Fold done
Fold done
Fold done
Fold done
Fold done
MPNN, no interaction CV Scores:
		- mean_absolute_error: 1.274 ± 0.052
		- rmse: 2.028 ± 0.099
		- r2_score: 0.799 ± 0.016


In [51]:
# Re-import the local `basic` module so edits made to it on disk take effect
# without restarting the kernel.
# `importlib` replaces `imp`, which is deprecated and was removed in Python 3.12.
import importlib
importlib.reload(b)

<module 'basic' from '/Users/u6676643/codes/diympnn/deepchemMPNN/basic.py'>

In [53]:
# Experiment: double_MPNN with TrainArgs defaults plus interaction=True.
name = 'MPNN, interaction'
est_cls = c.double_MPNN
args = c.TrainArgs()
# Override written straight into the instance dict (same effect as args.__dict__.update).
vars(args).update(interaction=True)

# Train via b.train_cv_model, store the result under `name`, then print its
# cross-validation score summary.
models[name] = b.train_cv_model(
    est_cls,
    x_data,
    y_data,
    params=args,
    random_state=seed,
)
generate_score_board(name)

Fold done
Fold done
Fold done
Fold done
Fold done
MPNN, interaction CV Scores:
		- mean_absolute_error: 0.722 ± 0.058
		- rmse: 1.093 ± 0.062
		- r2_score: 0.941 ± 0.008


In [54]:
# Experiment: double_MPNN with depth=3, dropout=0.2, atom_messages=False and
# interaction=True on top of the TrainArgs defaults.
name = 'D-MPNN, interaction'
est_cls = c.double_MPNN
args = c.TrainArgs()
# Overrides written straight into the instance dict (same effect as args.__dict__.update).
vars(args).update(depth=3, dropout=0.2, interaction=True, atom_messages=False)

# Train via b.train_cv_model, store the result under `name`, then print its
# cross-validation score summary.
models[name] = b.train_cv_model(
    est_cls,
    x_data,
    y_data,
    params=args,
    random_state=seed,
)
generate_score_board(name)

Fold done
Fold done
Fold done
Fold done
Fold done
D-MPNN, interaction CV Scores:
		- mean_absolute_error: 0.720 ± 0.040
		- rmse: 1.100 ± 0.055
		- r2_score: 0.941 ± 0.006


In [55]:
# Experiment: double_MPNN with depth=3, dropout=0.2, atom_messages=False and
# interaction=False on top of the TrainArgs defaults.
name = 'D-MPNN, no interaction'
est_cls = c.double_MPNN
args = c.TrainArgs()
# Overrides written straight into the instance dict (same effect as args.__dict__.update).
vars(args).update(depth=3, dropout=0.2, interaction=False, atom_messages=False)

# Train via b.train_cv_model, store the result under `name`, then print its
# cross-validation score summary.
models[name] = b.train_cv_model(
    est_cls,
    x_data,
    y_data,
    params=args,
    random_state=seed,
)
generate_score_board(name)

Fold done
Fold done
Fold done
Fold done
Fold done
D-MPNN, no interaction CV Scores:
		- mean_absolute_error: 1.245 ± 0.012
		- rmse: 2.009 ± 0.052
		- r2_score: 0.803 ± 0.009
