#### Testing for non-generic torch models:
1. Delfos (with and without attention)
2. MPNN (with and without attention)

In [1]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
from hyperopt import hp
import imp

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation



In [2]:
from modules.data import data_maker
from modules.RNN import double_RNN
from modules.fit import Model, fit
from modules.myhyperopt import hyperopt_func
from modules.MPNN import double_MPNN



---
## Loading Dataset

In [3]:
data = pd.read_csv('data/full_pka_data.csv')
solute = data['Solute SMILES'].tolist()
solvent = data['Solvent SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
data_size = len(solute)

In [4]:
indices = list(range(data_size))
CV_ids, holdout_ids, _, _ = train_test_split(indices, solvent, test_size=0.2, random_state=1, stratify=solvent)
CV_datasets = data_maker(solute, solvent, pka, CV_ids)
datasets = data_maker(solute, solvent, pka)

---
## Hyperoptimisation

In [24]:
#RNN
model_dict = {'name':'RNN', 'model':double_RNN, 'model_type':'torch', 'data_type':'sentences'}
param_space = {'features':300,
               'interaction':None,
               'RNN_hidden':hp.choice('RNN_hidden', [128,256,512]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512,1024,2048]),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['features','interaction','RNN_hidden','NN_hidden','NN_depth','readout','activation']
training_param_names = ['lr','batch_size']

RNN_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RNN_hyp_res[0]

NameError: name 'double_RNN' is not defined

In [None]:
#RNN with attention
model_dict = {'name':'RNN with attention', 'model':double_RNN, 'model_type':'torch', 'data_type':'sentences'}
param_space = {'features':300,
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'RNN_hidden':hp.choice('RNN_hidden', [128,256,512]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512,1024,2048]),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['features','interaction','RNN_hidden','NN_hidden','NN_depth','readout','activation']
training_param_names = ['lr','batch_size']

RNNatt_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RNNatt_hyp_res[0]

In [682]:
#DMPNN
model_dict = {'name':'DMPNN', 'model':double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':False,
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

DMPNN_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
DMPNN_hyp_res[0]

 10%|█         | 3/30 [1:01:20<9:12:04, 1226.82s/trial, best loss: 1.4481956958770752]


KeyboardInterrupt: 

In [None]:
#DMPNN with attention
model_dict = {'name':'DMPNN with attention', 'model':double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

DMPNNatt_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
DMPNNatt_hyp_res[0]

In [678]:
#MPNN
model_dict = {'name':'MPNN', 'model':double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':False,
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNN_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MPNN_hyp_res[0]

  0%|          | 0/30 [00:35<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
#MPNN with attention
model_dict = {'name':'MPNN with attention', 'model':double_MPNN, 'model_type':'torch', 'data_type':'SMILES'}
param_space = {'atom_messages':False,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNNatt_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MPNNatt_hyp_res[0]

In [5]:
#RF with descriptors
model_dict = {'name':'RF with descriptors', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', ['auto','sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RFdesc_hyp_res[0]

100%|██████████| 30/30 [44:37<00:00, 89.25s/trial, best loss: 1.3505516265758195] 
Total training time (min): 44.623752204316666


{'loss': 1.3505516265758195,
 'params': {'bootstrap': True,
  'max_depth': 512,
  'max_features': 'auto',
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 2048,
  'n_jobs': -1},
 'run_time': 226.25426462799987,
 'status': 'ok'}

In [5]:
#RF with ECFP
model_dict = {'name':'RF with ECFP', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', ['auto','sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
RFecfp_hyp_res[0]

100%|██████████| 30/30 [3:51:02<00:00, 462.08s/trial, best loss: 1.5674555405853379]   
Total training time (min): 231.0397540024


{'loss': 1.5674555405853379,
 'params': {'bootstrap': True,
  'max_depth': 512,
  'max_features': 'auto',
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 2048,
  'n_jobs': -1},
 'run_time': 1238.881594587,
 'status': 'ok'}

In [7]:
#MLP with descriptors
model_dict = {'name':'MLP with descriptors', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128),(256,128),(512,256,128),(512,256),(256),(512),(64),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MLPdesc_hyp_res[0]

100%|██████████| 30/30 [21:12<00:00, 42.40s/trial, best loss: 1.2326064571528836]
Total training time (min): 21.203471856133334


{'loss': 1.2326064571528836,
 'params': {'activation': 'logistic',
  'batch_size': 32,
  'early_stopping': True,
  'hidden_layer_sizes': (512, 256, 128),
  'solver': 'adam'},
 'run_time': 146.33090689399978,
 'status': 'ok'}

In [6]:
#MLP with ECFP
model_dict = {'name':'MLP with ECFP', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128),(256,128),(512,256,128),(512,256),(256),(512),(64),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
MLPecfp_hyp_res[0]

100%|██████████| 30/30 [2:23:49<00:00, 287.65s/trial, best loss: 1.5294755988767599] 
Total training time (min): 143.82353783585003


{'loss': 1.5294755988767599,
 'params': {'activation': 'relu',
  'batch_size': 16,
  'early_stopping': True,
  'hidden_layer_sizes': (256, 128, 64),
  'solver': 'adam'},
 'run_time': 461.70881920600004,
 'status': 'ok'}

In [9]:
#XGB with descriptors
model_dict = {'name':'XGB with descriptors', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','gamma','n_jobs']
training_param_names = []

XGBdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
XGBdesc_hyp_res[0]

100%|██████████| 30/30 [25:37<00:00, 51.26s/trial, best loss: 1.1962208378296566] 
Total training time (min): 25.630719183983334


{'loss': 1.1962208378296566,
 'params': {'colsample_bytree': 0.7,
  'eta': 0.1,
  'gamma': 0,
  'max_depth': 8,
  'min_child_weight': 2,
  'n_estimators': 2048,
  'n_jobs': -1,
  'subsample': 0.9},
 'run_time': 141.17223592899973,
 'status': 'ok'}

In [7]:
#XGB with ECFP
model_dict = {'name':'XGB with ECFP', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'n_jobs':-1}

model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','gamma','n_jobs']
training_param_names = []

XGBecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
XGBecfp_hyp_res[0]

100%|██████████| 30/30 [3:01:13<00:00, 362.46s/trial, best loss: 1.4146270334817639]  
Total training time (min): 181.2331836483667


{'loss': 1.4146270334817639,
 'params': {'colsample_bytree': 0.6,
  'eta': 0.1,
  'gamma': 0,
  'max_depth': 8,
  'min_child_weight': 2,
  'n_estimators': 2048,
  'n_jobs': -1,
  'subsample': 0.8},
 'run_time': 844.2109466850015,
 'status': 'ok'}

---
## Training + testing

In [5]:
DMPNN = Model(name='D-MPNN',
                model=double_MPNN(atom_messages=False),
                model_type='torch',
                data_type='graphs')
DMPNN_att = Model(name='D-MPNN with attention',
                    model=double_MPNN(atom_messages=False, interaction='exp'),
                    model_type='torch',
                    data_type='graphs')
MPNN = Model(name='MPNN',
                model=double_MPNN(atom_messages=True),
                model_type='torch',
                data_type='graphs')
MPNN_att = Model(name='MPNN with attention',
                    model=double_MPNN(atom_messages=True, interaction='exp'),
                    model_type='torch',
                    data_type='graphs')
RNN = Model(name='RNN',
              model=double_RNN(interaction=None),
              model_type='torch',
              data_type='sentences')
RNN_att = Model(name='RNN with attention',
                  model=double_RNN(interaction='exp'),
                  model_type='torch',
                  data_type='sentences')
RF_desc = Model(name='Random forest with descriptors',
                  model=RandomForestRegressor(bootstrap=True, max_depth=512, max_features='auto', min_samples_leaf=2, min_samples_split=2, n_estimators=2048, n_jobs=-1),
                  model_type='sklearn',
                  data_type='descriptors')
RF_ECFP = Model(name='Random forest with ECFP',
                  model=RandomForestRegressor(bootstrap=True, max_depth=512, max_features='auto', min_samples_leaf=2, min_samples_split=2, n_estimators=2048, n_jobs=-1),
                  model_type='sklearn',
                  data_type='ECFP')
MLP_desc = Model(name='MLP with descriptors',
                  model=MLPRegressor(activation='logistic', batch_size=32, early_stopping=True, hidden_layer_sizes=(512, 256, 128), solver='adam'),
                  model_type='sklearn',
                  data_type='descriptors')
MLP_ECFP = Model(name='MLP with ECFP',
                  model=MLPRegressor(activation='relu', batch_size=16, early_stopping=True, hidden_layer_sizes=(256, 128, 64), solver='adam'),
                  model_type='sklearn',
                  data_type='ECFP')
RGB_desc = Model(name='RGBoost with descriptors',
                  model=XGBRegressor(colsample_bytree=0.7, eta=0.1, gamma=0, max_depth=8, min_child_weight=2, n_estimators=2048, n_jobs=-1, subsample=0.9),
                  model_type='sklearn',
                  data_type='descriptors')
RGB_ECFP = Model(name='RGBoost with ECFP',
                  model=XGBRegressor(colsample_bytree=0.6, eta=0.1, gamma=0, max_depth=8, min_child_weight=2, n_estimators=2048, n_jobs=-1, subsample=0.8),
                  model_type='sklearn',
                  data_type='ECFP')
#list of all models for testing
models = [DMPNN, DMPNN_att, MPNN, MPNN_att, RNN, RNN_att, RF_desc, RF_ECFP, MLP_desc, MLP_ECFP, RGB_desc, RGB_ECFP]
#models = [RF_desc, RF_ECFP, MLP_desc, MLP_ECFP, RGB_desc, RGB_ECFP]

In [7]:
#TODO: holdout data test
exp_name = "Holdout test"
results = ddict(list)
for m in models:
    print('testing '+m.name+' ...')
    data = datasets[m.data_type]
    
    res = fit(m, data, holdout_ids, exp_name)
    
    results['Model'].append(m.name)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing MPNN ...
testing D-MPNN ...


KeyboardInterrupt: 

In [None]:
from statsmodels.api import qqlot_2samples
import pylab

data = datasets[MPNN.data_type]
targets, outputs = predict(MPNN, MPNN.experiments[0], data, holdout_ids)

sm.qqplot_2samples(targets, outputs, line='45')
pylab.show()

In [16]:
#TODO: dataset size vs accuracy
results = ddict(list)
proportions = [0.1,0.2,0.5,0.75]

for prop in proportions:
    solvents = [solvent[i] for i in CV_ids]
    train_ids,_,_,_ = train_test_split(CV_ids, solvents, test_size=1-prop, random_state=1, stratify=solvents)
    train_size = len(train_ids)
    exp_name = "Training data size "+str(prop)
    print('testing prop '+str(prop)+' ...')
    for m in models:
        data = datasets[m.data_type]

        res = fit(m, data, test_ids, exp_name, train_ids=train_ids)

        results['Model'].append(m.name)
        results['Train size'].append(train_size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

train_prop_test = pd.DataFrame(results)
print(train_prop_test)
train_prop_test.to_csv('results/train_prop_test.csv')

testing prop 0.1 ...
testing prop 0.2 ...
testing prop 0.5 ...
testing prop 0.75 ...
                             Model  Train size   MAE  RMSE
0   Random forest with descriptors         257 1.524 2.273
1          Random forest with ECFP         257 1.551 2.282
2             MLP with descriptors         257 1.938 2.641
3                    MLP with ECFP         257 1.997 2.753
4         RGBoost with descriptors         257 1.380 2.125
5                RGBoost with ECFP         257 1.519 2.229
6   Random forest with descriptors         515 1.276 2.061
7          Random forest with ECFP         515 1.303 2.031
8             MLP with descriptors         515 1.883 2.538
9                    MLP with ECFP         515 2.128 2.877
10        RGBoost with descriptors         515 1.078 1.809
11               RGBoost with ECFP         515 1.231 1.991
12  Random forest with descriptors        1288 0.818 1.523
13         Random forest with ECFP        1288 0.958 1.687
14            MLP with descrip

In [15]:
#LOSO
results = ddict(list)
solvent_set = list(set(solvent))

for solv in solvent_set:
    test_ids = [i for i, x in enumerate(solvent) if x == solv]
    size = len(test_ids)
    exp_name = "LOSO "+solv
    print('testing '+solv+' ...')
    for m in models:
        try:
            data = datasets[m.data_type]

            res = fit(m, data, test_ids, exp_name)

            results['Model'].append(m.name)
            results['LOSO solvent'].append(solv)
            results['Test size'].append(size)
            results['MAE'].append(res[0])
            results['RMSE'].append(res[1])
        except:
            print('error with model '+m.name)

LOSO_test = pd.DataFrame(results)
print(LOSO_test)
LOSO_test.to_csv('results/LOSO_test.csv')

testing CS(=O)C ...
testing C1CCOC1 ...
testing CC#N ...
testing CN(C)C=O ...
testing C(CCl)Cl ...
testing O ...
                             Model LOSO solvent  Test size    MAE   RMSE
0   Random forest with descriptors      CS(=O)C        130  7.217  7.937
1          Random forest with ECFP      CS(=O)C        130  3.785  5.088
2             MLP with descriptors      CS(=O)C        130 11.440 11.969
3                    MLP with ECFP      CS(=O)C        130  5.030  6.729
4         RGBoost with descriptors      CS(=O)C        130  3.370  3.996
5                RGBoost with ECFP      CS(=O)C        130  2.625  3.125
6   Random forest with descriptors      C1CCOC1         65  2.301  3.304
7          Random forest with ECFP      C1CCOC1         65  3.741  4.340
8             MLP with descriptors      C1CCOC1         65 13.904 14.879
9                    MLP with ECFP      C1CCOC1         65  4.351  5.081
10        RGBoost with descriptors      C1CCOC1         65  2.083  4.159
11         

In [14]:
#LOEO
results = ddict(list)
element_set = ['N','O','F','P','S','Cl','Br']

for ele in element_set:
    test_ids = [i for i, x in enumerate(solute) if ele in x]
    size = len(test_ids)
    exp_name = "LOEO "+ele
    print('testing '+ele+' ...')
    for m in models:
        data = datasets[m.data_type]

        res = fit(m, data, test_ids, exp_name)

        results['Model'].append(m.name)
        results['LOEO element'].append(ele)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOEO_test = pd.DataFrame(results)
print(LOEO_test)
LOEO_test.to_csv('results/LOEO_test.csv')

testing N ...
testing O ...
testing F ...
testing P ...
testing S ...
testing Cl ...
testing Br ...
                             Model LOEO element  Test size    MAE   RMSE
0   Random forest with descriptors            N       2311  2.858  4.082
1          Random forest with ECFP            N       2311  3.623  4.820
2             MLP with descriptors            N       2311  3.380  5.063
3                    MLP with ECFP            N       2311  3.369  4.622
4         RGBoost with descriptors            N       2311  2.974  4.326
5                RGBoost with ECFP            N       2311  3.287  4.570
6   Random forest with descriptors            O       2067  3.790  5.085
7          Random forest with ECFP            O       2067  3.476  4.674
8             MLP with descriptors            O       2067  3.620  4.840
9                    MLP with ECFP            O       2067  3.058  4.221
10        RGBoost with descriptors            O       2067  3.395  4.820
11               RGBoost

In [13]:
#LOMO
results = ddict(list)
solute_masses = [MolWt(Chem.MolFromSmiles(mol)) for mol in solute]
mass_cutoffs = [150,200,250,300,350]

for mass in mass_cutoffs:
    test_ids = [i for i, x in enumerate(solute_masses) if x > mass]
    size = len(test_ids)
    exp_name = "LOMO >"+str(mass)+'g/mol'
    print('testing >'+str(mass)+'g/mol ...')
    for m in models:
        try:
            data = datasets[m.data_type]

            res = fit(m, data, test_ids, exp_name)

            results['Model'].append(m.name)
            results['LOMO mass cutoff'].append(mass)
            results['Test size'].append(size)
            results['MAE'].append(res[0])
            results['RMSE'].append(res[1])
        except:
            print('error with model '+m.name)

LOMO_high_test = pd.DataFrame(results)
print(LOMO_high_test)
LOMO_high_test.to_csv('results/LOMO_high_test.csv')

testing >150g/mol ...
testing >200g/mol ...
testing >250g/mol ...
testing >300g/mol ...
testing >350g/mol ...
                             Model  LOMO mass cutoff  Test size   MAE  RMSE
0   Random forest with descriptors               150       2191 2.844 4.749
1          Random forest with ECFP               150       2191 2.819 4.505
2             MLP with descriptors               150       2191 3.263 5.797
3                    MLP with ECFP               150       2191 2.890 5.058
4         RGBoost with descriptors               150       2191 2.719 4.578
5                RGBoost with ECFP               150       2191 2.914 4.435
6   Random forest with descriptors               200       1307 2.827 4.497
7          Random forest with ECFP               200       1307 2.851 4.283
8             MLP with descriptors               200       1307 3.138 5.265
9                    MLP with ECFP               200       1307 2.890 4.533
10        RGBoost with descriptors               200  

In [12]:
#LOMO
results = ddict(list)
solute_masses = [MolWt(Chem.MolFromSmiles(mol)) for mol in solute]
mass_cutoffs = [50,100,150,200,250,300,350]

for mass in mass_cutoffs:
    test_ids = [i for i, x in enumerate(solute_masses) if x < mass]
    size = len(test_ids)
    exp_name = "LOMO <"+str(mass)+'g/mol'
    print('testing <'+str(mass)+'g/mol ...')
    for m in models:
        try:
            data = datasets[m.data_type]

            res = fit(m, data, test_ids, exp_name)

            results['Model'].append(m.name)
            results['LOMO mass cutoff'].append(mass)
            results['Test size'].append(size)
            results['MAE'].append(res[0])
            results['RMSE'].append(res[1])
        except:
            print('error with model '+m.name)

LOMO_low_test = pd.DataFrame(results)
print(LOMO_low_test)
LOMO_low_test.to_csv('results/LOMO_low_test.csv')

testing <50g/mol ...
testing <100g/mol ...
testing <150g/mol ...
testing <200g/mol ...
testing <250g/mol ...
testing <300g/mol ...
testing <350g/mol ...
                             Model  LOMO mass cutoff  Test size   MAE  RMSE
0   Random forest with descriptors                50         28 5.262 7.096
1          Random forest with ECFP                50         28 2.401 3.967
2             MLP with descriptors                50         28 2.518 3.526
3                    MLP with ECFP                50         28 4.154 5.884
4         RGBoost with descriptors                50         28 4.479 6.072
5                RGBoost with ECFP                50         28 2.285 3.779
6   Random forest with descriptors               100        272 2.692 3.609
7          Random forest with ECFP               100        272 2.183 3.589
8             MLP with descriptors               100        272 2.422 3.820
9                    MLP with ECFP               100        272 2.466 3.595
10        R

In [9]:
#LOCO
results = ddict(list)
solute_charges = [GetFormalCharge(Chem.MolFromSmiles(mol)) for mol in solute]
charge_list = [0,1]

for charge in charge_list:
    test_ids = [i for i, x in enumerate(solute_charges) if x == charge]
    size = len(test_ids)
    exp_name = "LOCO "+str(charge)
    print('testing >'+str(charge)+' ...')
    for m in models:
        try:
            data = datasets[m.data_type]

            res = fit(m, data, test_ids, exp_name)

            results['Model'].append(m.name)
            results['LOCO charge'].append(charge)
            results['Test size'].append(size)
            results['MAE'].append(res[0])
            results['RMSE'].append(res[1])
        except:
            print('error with model '+m.name)

LOCO_test = pd.DataFrame(results)
print(LOCO_test)
LOCO_test.to_csv('results/LOCO_test.csv')

testing >0 ...
testing >1 ...
                             Model  LOCO charge  Test size   MAE   RMSE
0   Random forest with descriptors            0       1487 6.218 10.726
1          Random forest with ECFP            0       1487 5.811 10.139
2             MLP with descriptors            0       1487 5.481 10.226
3                    MLP with ECFP            0       1487 5.714 10.110
4         RGBoost with descriptors            0       1487 6.038 10.630
5                RGBoost with ECFP            0       1487 5.871 10.180
6   Random forest with descriptors            1       1687 3.036  4.580
7          Random forest with ECFP            1       1687 3.577  4.935
8             MLP with descriptors            1       1687 3.730  5.675
9                    MLP with ECFP            1       1687 2.976  4.163
10        RGBoost with descriptors            1       1687 3.097  4.860
11               RGBoost with ECFP            1       1687 3.169  4.507


In [8]:
#LOFO
results = ddict(list)
pattern_list = [('proton donor','[!H0;F,Cl,Br,I,N+,$([OH]-*=[!#6]),+]'),
             ('H-bond donor','[N,n,O;!H0]'),
             ('protonated amine','[NH+,NH2+,NH3+,nH+,nH2+]'),
             ('carboxylic acid','[CX3](=O)[OX2H1]'),
             ('amide','[NX3][CX3](=[OX1])[#6]'),
             ('ketone','[#6][CX3](=O)[#6]'),
             ('ether','[OD2]([#6])[#6]'),
             ('amine','[NX3;H2,H1;!$(NC=O)]'),
             ('nitrile','[NX1]#[CX2]'),
             ('nitro','[$([NX3](=O)=O),$([NX3+](=O)[O-])][!#8]'),
             ('hydroxyl alcohol','[#6][OX2H]'),
             ('phenol','[OX2H][cX3]:[c]'),
             ('thiol','[#16X2H]'),
             ('phosphoric acid','[$(P(=[OX1])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)]),$([P+]([OX1-])([$([OX2H]),$([OX1-]),$([OX2]P)])([$([OX2H]),$([OX1-]),$([OX2]P)])[$([OX2H]),$([OX1-]),$([OX2]P)])]')]
solute_mols = [Chem.MolFromSmiles(sol) for sol in solute]

for name, smart in pattern_list:
    patt = Chem.MolFromSmarts(smart)
    test_ids = [i for i, x in enumerate(solute_mols) if x.HasSubstructMatch(patt)==True]
    size = len(test_ids)
    exp_name = "LOFO "+name
    print('testing '+name+' ...')
    for m in models:
        try:
            data = datasets[m.data_type]

            res = fit(m, data, test_ids, exp_name)

            results['Model'].append(m.name)
            results['LOFO group'].append(name)
            results['Test size'].append(size)
            results['MAE'].append(res[0])
            results['RMSE'].append(res[1])
        except:
            print('error with model '+m.name)

LOFO_test = pd.DataFrame(results)
print(LOFO_test)
LOFO_test.to_csv('results/LOFO_test.csv')

2250
testing proton donor ...
3024
testing H-bond donor ...
1734
testing protonated amine ...
500
testing carboxylic acid ...
189
testing amide ...
80
testing ketone ...
539
testing ether ...
427
testing amine ...
190
testing nitrile ...
312
testing nitro ...
1192
testing hydroxyl alcohol ...
484
testing phenol ...
72
testing thiol ...
13
testing phosphoric acid ...
                             Model       LOFO group  Test size   MAE  RMSE
0   Random forest with descriptors     proton donor       2250 4.022 5.267
1          Random forest with ECFP     proton donor       2250 3.816 4.962
2             MLP with descriptors     proton donor       2250 5.207 6.636
3                    MLP with ECFP     proton donor       2250 3.827 5.000
4         RGBoost with descriptors     proton donor       2250 4.449 5.553
..                             ...              ...        ...   ...   ...
79         Random forest with ECFP  phosphoric acid         13 3.379 4.058
80            MLP with descript