#### Testing for non-generic torch models:
1. Delfos (with and without attention)
2. MPNN (with and without attention)

In [646]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split

import basicest as b
import chemprop_ish as c
import delfos as d
import imp

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation

---
## Loading Dataset

In [679]:
data = pd.read_csv('full_pka_data.csv')
solute = data['Solute SMILES'].tolist()
solvent = data['Solvent SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
data_size = len(solute)

In [680]:
indices = list(range(data_size))
CV_ids, holdout_ids, _, _ = train_test_split(indices, solvent, test_size=0.2, random_state=1, stratify=solvent)
CV_datasets = b.data_maker(solute, solvent, pka, CV_ids)
datasets = b.data_maker(solute, solvent, pka)

In [693]:
desc = datasets['descriptors']
desc_scaler = sklearn.preprocessing.StandardScaler()
print(desc[0][[0,1,2,3,4]].shape)

(5, 400)


In [660]:
imp.reload(d)
imp.reload(c)

<module 'chemprop_ish' from '/Users/u6676643/codes/testing/chemprop_ish.py'>

---
## Hyperoptimisation

In [672]:
#RNN
model_dict = {'name':'RNN', 'model':d.dnet, 'model_type':'torch', 'data_type':'sentences'}
param_space = {'features':300,
               'interaction':None,
               'RNN_hidden':hp.choice('RNN_hidden', [128,256,512]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512,1024,2048]),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['features','interaction','RNN_hidden','NN_hidden','NN_depth','readout','activation']
training_param_names = ['lr','batch_size']

RNN_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RNN_hyp_res[0]

 10%|█         | 3/30 [15:40:38<141:05:44, 18812.76s/trial, best loss: 1.2543874979019165]


KeyboardInterrupt: 

In [None]:
#RNN with attention
model_dict = {'name':'RNN with attention', 'model':d.dnet, 'model_type':'torch', 'data_type':'sentences'}
param_space = {'features':300,
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'RNN_hidden':hp.choice('RNN_hidden', [128,256,512]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512,1024,2048]),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['features','interaction','RNN_hidden','NN_hidden','NN_depth','readout','activation']
training_param_names = ['lr','batch_size']

RNNatt_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RNNatt_hyp_res[0]

In [682]:
#DMPNN
model_dict = {'name':'DMPNN', 'model':c.double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':False,
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

DMPNN_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
DMPNN_hyp_res[0]

 10%|█         | 3/30 [1:01:20<9:12:04, 1226.82s/trial, best loss: 1.4481956958770752]


KeyboardInterrupt: 

In [None]:
#DMPNN with attention
model_dict = {'name':'DMPNN with attention', 'model':c.double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

DMPNNatt_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
DMPNNatt_hyp_res[0]

In [678]:
#MPNN
model_dict = {'name':'MPNN', 'model':c.double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':False,
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNN_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MPNN_hyp_res[0]

  0%|          | 0/30 [00:35<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
#MPNN with attention
model_dict = {'name':'MPNN with attention', 'model':c.double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNNatt_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MPNNatt_hyp_res[0]

In [None]:
#RF with descriptors
model_dict = {'name':'RF with descriptors', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', ['auto','sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFdesc_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RFdesc_hyp_res[0]

In [None]:
#RF with ECFP
model_dict = {'name':'RF with ECFP', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', ['auto','sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFecfp_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RFecfp_hyp_res[0]

In [699]:
#MLP with descriptors
model_dict = {'name':'MLP with descriptors', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128),(256,128),(512,256,128),(512,256),(256),(512),(64),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPdesc_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MLPdesc_hyp_res[0]

100%|██████████| 30/30 [23:05<00:00, 46.18s/trial, best loss: 1.2760181667972326]
Total training time (min): 23.091510160119892


{'loss': 1.2760181667972326,
 'params': {'activation': 'logistic',
  'batch_size': 32,
  'early_stopping': True,
  'hidden_layer_sizes': (512, 256, 128),
  'solver': 'adam'},
 'run_time': 128.43516844697297,
 'status': 'ok'}

In [None]:
#MLP with ECFP
model_dict = {'name':'MLP with ECFP', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128),(256,128),(512,256,128),(512,256),(256),(512),(64),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPecfp_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MLPecfp_hyp_res[0]

In [None]:
#XGB with descriptors
model_dict = {'name':'XGB with descriptors', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','gamma','n_jobs']
training_param_names = []

XGBdesc_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
XGBdesc_hyp_res[0]

In [None]:
#XGB with ECFP
model_dict = {'name':'XGB with ECFP', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','gamma','n_jobs']
training_param_names = []

XGBecfp_hyp_res = b.hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
XGBecfp_hyp_res[0]

---
## Training + testing

In [697]:
DMPNN = b.Model(name='D-MPNN',
                model=c.double_MPNN(atom_messages=False),
                model_type='torch',
                data_type='SMILES')
DMPNN_att = b.Model(name='D-MPNN with attention',
                    model=c.double_MPNN(atom_messages=False, interaction='exp'),
                    model_type='torch',
                    data_type='SMILES')
MPNN = b.Model(name='MPNN',
                model=c.double_MPNN(atom_messages=True),
                model_type='torch',
                data_type='SMILES')
MPNN_att = b.Model(name='MPNN with attention',
                    model=c.double_MPNN(atom_messages=True, interaction='exp'),
                    model_type='torch',
                    data_type='SMILES')
RNN = b.Model(name='RNN+NN',
              model=d.dnet(interaction=None),
              model_type='torch',
              data_type='sentences')
RNN_att = b.Model(name='RNN+NN with attention',
                  model=d.dnet(interaction='exp'),
                  model_type='torch',
                  data_type='sentences')
RF_desc = b.Model(name='Random forest with descriptors',
                  model=RandomForestRegressor(n_estimators=1000, n_jobs=12),
                  model_type='sklearn',
                  data_type='descriptors')
RF_ECFP = b.Model(name='Random forest with ECFP',
                  model=RandomForestRegressor(n_estimators=1000, n_jobs=12),
                  model_type='sklearn',
                  data_type='ECFP')
MLP_desc = b.Model(name='MLP with descriptors',
                  model=MLPRegressor(hidden_layer_sizes=(200, 100)),
                  model_type='sklearn',
                  data_type='descriptors')
MLP_ECFP = b.Model(name='MLP with ECFP',
                  model=MLPRegressor(hidden_layer_sizes=(256, 128)),
                  model_type='sklearn',
                  data_type='ECFP')
RGB_desc = b.Model(name='RGBoost with descriptors',
                  model=XGBRegressor(n_jobs=12),
                  model_type='sklearn',
                  data_type='descriptors')
RGB_ECFP = b.Model(name='RGBoost with ECFP',
                  model=XGBRegressor(n_jobs=12),
                  model_type='sklearn',
                  data_type='ECFP')

#list of all models for testing
models = [DMPNN, DMPNN_att, MPNN, MPNN_att, RNN, RNN_att, RF_desc, RF_ECFP, MLP_desc, MLP_ECFP, RGB_desc, RGB_ECFP]

In [405]:
#TODO: full CV training + testing
results = ddict(list)
for m in models:
    print('testing '+m.name+' ...')
    data = datasets[m.data_type]
    
    res, full_res = b.CV_fit(m, data)
    
    results['Model'].append(m.name)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])

full_CV_test = pd.DataFrame(results)
print(full_CV_test)

testing D-MPNN ...
testing D-MPNN with attention ...
testing MPNN ...
testing MPNN with attention ...
testing RNN+NN ...
testing RNN+NN with attention ...
testing Random forest with descriptors ...
testing Random forest with ECFP ...
testing MLP with descriptors ...




testing MLP with ECFP ...
testing RGBoost with descriptors ...
testing RGBoost with ECFP ...
                             Model    MAE   RMSE
0                           D-MPNN  9.060 11.765
1            D-MPNN with attention  9.345 12.045
2                             MPNN  9.145 11.799
3              MPNN with attention 13.104 18.482
4                           RNN+NN  9.038 11.766
5            RNN+NN with attention  9.025 11.758
6   Random forest with descriptors  3.062  4.810
7          Random forest with ECFP  4.021  5.861
8             MLP with descriptors  7.678 12.406
9                    MLP with ECFP  3.906  5.657
10        RGBoost with descriptors  2.848  4.659
11               RGBoost with ECFP  3.970  6.005


In [676]:
#TODO: holdout data test
exp_name = "Holdout test"
results = ddict(list)
for m in models:
    print('testing '+m.name+' ...')
    data = datasets[m.data_type]
    
    res = b.fit(m, data, holdout_ids, exp_name, datasets)
    
    results['Model'].append(m.name)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)

testing D-MPNN ...
testing D-MPNN with attention ...
testing MPNN ...
testing MPNN with attention ...
testing RNN+NN ...
testing RNN+NN with attention ...
testing Random forest with descriptors ...
testing Random forest with ECFP ...
testing MLP with descriptors ...
testing MLP with ECFP ...
testing RGBoost with descriptors ...
testing RGBoost with ECFP ...
                             Model    MAE   RMSE
0                           D-MPNN  1.811  2.479
1            D-MPNN with attention  1.740  2.587
2                             MPNN  1.825  2.601
3              MPNN with attention  1.843  2.677
4                           RNN+NN  1.534  2.310
5            RNN+NN with attention  1.189  1.804
6   Random forest with descriptors  1.179  2.059
7          Random forest with ECFP  1.484  2.541
8             MLP with descriptors 15.651 18.638
9                    MLP with ECFP  1.777  2.608
10        RGBoost with descriptors  1.190  1.999
11               RGBoost with ECFP  1.558  2.497


In [None]:
#TODO: dataset size vs accuracy

In [None]:
#TODO: pka split vs accuracy

In [677]:
#LOSO
results = ddict(list)
solvent_set = list(set(solvent))

for solv in solvent_set:
    test_ids = [i for i, x in enumerate(solvent) if x == solv]
    size = len(test_ids)
    exp_name = "LOSO "+solv
    print('testing '+solv+' ...')
    for m in models:
        data = datasets[m.data_type]

        res = b.fit(m, data, test_ids, exp_name, datasets)

        results['Model'].append(m.name)
        results['LOSO solvent'].append(solv)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOSO_test = pd.DataFrame(results)
print(LOSO_test)

testing C1CCOC1 ...


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [635]:
#LOEO
results = ddict(list)
element_set = ['N','O','F','P','S','Cl','Br']

for ele in element_set:
    test_ids = [i for i, x in enumerate(solute) if ele in x]
    size = len(test_ids)
    exp_name = "LOEO "+ele
    print('testing '+ele+' ...')
    for m in models:
        data = datasets[m.data_type]

        res = b.fit(m, data, test_ids, exp_name, datasets)

        results['Model'].append(m.name)
        results['LOEO element'].append(ele)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOEO_test = pd.DataFrame(results)
print(LOEO_test)

testing N ...
Stopping at epoch 56
Stopping at epoch 41
Stopping at epoch 93
Stopping at epoch 46
Stopping at epoch 40
Stopping at epoch 39




testing O ...
Stopping at epoch 60
Stopping at epoch 60
Stopping at epoch 66
Stopping at epoch 53
Stopping at epoch 19
Stopping at epoch 21




testing F ...
Stopping at epoch 34
Stopping at epoch 54
Stopping at epoch 44
Stopping at epoch 96
Stopping at epoch 28
Stopping at epoch 44
testing P ...
Stopping at epoch 43
Stopping at epoch 36
Stopping at epoch 31
Stopping at epoch 65
Stopping at epoch 22
Stopping at epoch 28
testing S ...
Stopping at epoch 50
Stopping at epoch 56
Stopping at epoch 49
Stopping at epoch 71
Stopping at epoch 24
Stopping at epoch 29
testing Cl ...
Stopping at epoch 73
Stopping at epoch 63
Stopping at epoch 64
Stopping at epoch 51
Stopping at epoch 28
Stopping at epoch 27
testing Br ...
Stopping at epoch 42
Stopping at epoch 34
Stopping at epoch 37
Stopping at epoch 38
Stopping at epoch 39
Stopping at epoch 40
                       Model LOEO element  Test size    MAE   RMSE
0                     D-MPNN            N        700  6.800  8.865
1      D-MPNN with attention            N        700  6.212  7.945
2                       MPNN            N        700  6.969  9.464
3        MPNN with attention  

In [636]:
#LOMO
results = ddict(list)
solute_masses = [MolWt(Chem.MolFromSmiles(mol)) for mol in solute]
mass_cutoffs = [100,150,200,250,300]

for mass in mass_cutoffs:
    test_ids = [i for i, x in enumerate(solute_masses) if x > mass]
    size = len(test_ids)
    exp_name = "LOMO >"+str(mass)+'g/mol'
    print('testing >'+str(mass)+'g/mol ...')
    for m in models:
        data = datasets[m.data_type]

        res = b.fit(m, data, test_ids, exp_name, datasets)

        results['Model'].append(m.name)
        results['LOMO mass cutoff'].append(mass)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOMO_test = pd.DataFrame(results)
print(LOMO_test)

testing >100g/mol ...
Stopping at epoch 48
Stopping at epoch 51
Stopping at epoch 42
Stopping at epoch 54
Stopping at epoch 24
Stopping at epoch 24
testing >150g/mol ...
Stopping at epoch 63
Stopping at epoch 80
Stopping at epoch 57
Stopping at epoch 46
Stopping at epoch 41
Stopping at epoch 45




testing >200g/mol ...
Stopping at epoch 51
Stopping at epoch 72
Stopping at epoch 49
Stopping at epoch 65
Stopping at epoch 20
Stopping at epoch 41
testing >250g/mol ...
Stopping at epoch 42
Stopping at epoch 86
Stopping at epoch 38
Stopping at epoch 67
Stopping at epoch 26
Stopping at epoch 26
testing >300g/mol ...
Stopping at epoch 37
Stopping at epoch 79
Stopping at epoch 43
Stopping at epoch 59
Stopping at epoch 29
Stopping at epoch 28




                             Model  LOMO mass cutoff  Test size    MAE   RMSE
0                           D-MPNN               100        750  5.927  7.644
1            D-MPNN with attention               100        750  6.389  9.159
2                             MPNN               100        750  5.804  7.650
3              MPNN with attention               100        750  6.807 10.643
4                           RNN+NN               100        750  6.081  7.822
5            RNN+NN with attention               100        750  6.049  7.877
6   Random forest with descriptors               100        750  6.464  8.631
7          Random forest with ECFP               100        750  5.731  7.364
8             MLP with descriptors               100        750 28.092 38.180
9                    MLP with ECFP               100        750  6.023  7.479
10        RGBoost with descriptors               100        750  6.962  9.505
11               RGBoost with ECFP               100        750 

In [637]:
#LOCO
results = ddict(list)
solute_charges = [GetFormalCharge(Chem.MolFromSmiles(mol)) for mol in solute]
charge_list = [0,1]

for charge in charge_list:
    test_ids = [i for i, x in enumerate(solute_charges) if x == charge]
    size = len(test_ids)
    exp_name = "LOCO "+str(charge)
    print('testing >'+str(charge)+' ...')
    for m in models:
        data = datasets[m.data_type]

        res = b.fit(m, data, test_ids, exp_name, datasets)

        results['Model'].append(m.name)
        results['LOCO charge'].append(charge)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOCO_test = pd.DataFrame(results)
print(LOCO_test)

testing >0 ...
Stopping at epoch 37
Stopping at epoch 53
Stopping at epoch 91
Stopping at epoch 69
Stopping at epoch 29
Stopping at epoch 66




testing >1 ...
Stopping at epoch 56
Stopping at epoch 57
Stopping at epoch 49
Stopping at epoch 96
Stopping at epoch 29
Stopping at epoch 14




                             Model  LOCO charge  Test size    MAE   RMSE
0                           D-MPNN            0        580 14.455 18.893
1            D-MPNN with attention            0        580 19.377 24.194
2                             MPNN            0        580 14.384 19.088
3              MPNN with attention            0        580 15.891 20.473
4                           RNN+NN            0        580 12.140 17.001
5            RNN+NN with attention            0        580 11.763 16.532
6   Random forest with descriptors            0        580 12.807 17.470
7          Random forest with ECFP            0        580 12.132 16.639
8             MLP with descriptors            0        580 29.541 38.871
9                    MLP with ECFP            0        580 11.263 15.948
10        RGBoost with descriptors            0        580 13.268 17.729
11               RGBoost with ECFP            0        580 12.453 17.110
12                          D-MPNN            1    

In [None]:
exp_name = "Holdout test"
results = ddict(list)
for m in models:
    print('testing '+m.name+' ...')
    data = datasets[m.data_type]
    
    res = b.fit(m, data, holdout_ids, exp_name, datasets)
    
    results['Model'].append(m.name)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)

In [585]:
outputs.reshape(893,).shape

(893,)

In [None]:
results = ddict(list)
results['targets'] = datasets['ECFP'][1]
for m in models:
    data = datasets[m.data_type]
    _, outputs = b.predict(m, m.experiments[0], data)
    results[m.name] = outputs.reshape(893,)
    
output_results = pd.DataFrame(results)
print(output_results)
output_results.to_csv('output_results.csv')

In [598]:
imp.reload(b)

<module 'basicest' from '/Users/u6676643/codes/testing/basicest.py'>