In [1]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
import sklearn
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
from hyperopt import hp
import pickle

# Render floats with exactly three decimals instead of scientific notation.
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
import sys
# Put the parent directory on the import path so the `modules` package imported
# below can be found. Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')
from modules.data import data_maker
from modules.RNN import double_RNN
from modules.fit import Model
from modules.myhyperopt import hyperopt_func
from modules.MPNN import double_MPNN



---
## Loading Dataset

In [3]:
# Read the full pKa table and pull the three columns used downstream out as
# plain Python lists (one entry per row, in file order).
data = pd.read_csv('../data/full_pka_data.csv')
solute, solvent, pka = (
    data[column].tolist()
    for column in ('Solute SMILES', 'Solvent SMILES', 'pKa (avg)')
)
data_size = len(solute)

In [6]:
# NOTE(review): CV_datasets is first assigned in the following cell (In [5]), so on a
# fresh kernel with "Restart & Run All" this cell raises NameError. It should be
# moved below that cell.
# Lists the data representations held in CV_datasets.
CV_datasets.keys()

dict_keys(['ECFP', 'descriptors', 'graphs', 'SMILES', 'sentences'])

In [5]:
# Split row indices 80/20 into a cross-validation pool and a hold-out set,
# stratified on the solvent SMILES with a fixed seed. Only the index lists are
# needed, so the solvent list is passed as the stratification key only.
indices = list(range(data_size))
CV_ids, holdout_ids = train_test_split(
    indices,
    test_size=0.2,
    random_state=1,
    stratify=solvent,
)
# Build the datasets for the cross-validation rows only.
CV_datasets = data_maker(solute, solvent, pka, CV_ids)

---
## Hyperparameter optimisation

In [7]:
def save_dict(obj, name):
    """Pickle ``obj`` to ``pickles/<name>.pkl`` with the highest pickle protocol.

    Parameters
    ----------
    obj : object
        Any picklable object (here: hyperparameter-search results).
    name : str
        File stem; ``.pkl`` is appended and the file is written under ``pickles/``
        relative to the current working directory.

    Notes
    -----
    The ``pickles`` directory is created when missing, so a finished search is
    not lost to a ``FileNotFoundError`` at save time. An existing file with the
    same name is overwritten.
    """
    import os  # local import keeps this cell self-contained

    os.makedirs('pickles', exist_ok=True)
    with open('pickles/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [15]:
# Reload a previously saved result file and display its first entry.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open('pickles/DMPNN_Full.pkl', 'rb') as pickle_file:
    content = pickle.load(pickle_file)
# The first entry is a dict with 'loss', 'params', 'run_time' and 'status' keys
# (see the output below).
content[0]

{'loss': 1.049818754196167,
 'params': {'MP_depth': 3,
  'MP_hidden': 256,
  'NN_depth': 2,
  'NN_hidden': 512,
  'activation': 'ReLU',
  'atom_messages': False,
  'batch_size': 64,
  'dropout': 0,
  'interaction': False,
  'lr': 0.001,
  'readout': 'sum'},
 'run_time': 655.1243992559976,
 'status': 'ok'}

In [None]:
# RNN: search over double_RNN on the 'sentences' data with 'interaction' fixed to None.
# NOTE(review): 'dropout' is sampled in param_space but appears in neither
# model_param_names nor training_param_names (the MPNN cells do list it) --
# confirm whether hyperopt_func / double_RNN actually consume it.
model_dict = dict(name='RNN', model=double_RNN, model_type='torch', data_type='sentences')
param_space = {
    'features': 300,
    'interaction': None,
    'RNN_hidden': hp.choice('RNN_hidden', [128, 256, 512]),
    'NN_hidden': hp.choice('NN_hidden', [64, 128, 256, 512, 1024, 2048]),
    'NN_depth': hp.choice('NN_depth', [1, 2, 3, 4]),
    'dropout': hp.choice('dropout', [0, 0.1, 0.2, 0.3]),
    'readout': hp.choice('readout', ['mean', 'sum', 'max']),
    'activation': hp.choice('activation', ['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU']),
    'lr': hp.choice('lr', [1e-2, 1e-3, 1e-4]),
    'batch_size': hp.choice('batch_size', [16, 32, 64]),
}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = [
    'features', 'interaction', 'RNN_hidden', 'NN_hidden',
    'NN_depth', 'readout', 'activation',
]
training_param_names = ['lr', 'batch_size']

RNN_hyp_res = hyperopt_func(
    model_dict, model_param_names, training_param_names, param_space, CV_datasets
)
# Show and persist the first element of the result.
print(RNN_hyp_res[0])
save_dict(RNN_hyp_res[0], 'RNN')

In [21]:
# RNN with attention: same search as the plain RNN, but 'interaction' is chosen
# from {'exp', 'tanh'} instead of being None.
# NOTE(review): 'dropout' is sampled in param_space but appears in neither
# model_param_names nor training_param_names (the MPNN cells do list it) --
# confirm whether hyperopt_func / double_RNN actually consume it.
model_dict = dict(
    name='RNN with attention', model=double_RNN, model_type='torch', data_type='sentences'
)
param_space = {
    'features': 300,
    'interaction': hp.choice('interaction', ['exp', 'tanh']),
    'RNN_hidden': hp.choice('RNN_hidden', [128, 256, 512]),
    'NN_hidden': hp.choice('NN_hidden', [64, 128, 256, 512, 1024, 2048]),
    'NN_depth': hp.choice('NN_depth', [1, 2, 3, 4]),
    'dropout': hp.choice('dropout', [0, 0.1, 0.2, 0.3]),
    'readout': hp.choice('readout', ['mean', 'sum', 'max']),
    'activation': hp.choice('activation', ['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU']),
    'lr': hp.choice('lr', [1e-2, 1e-3, 1e-4]),
    'batch_size': hp.choice('batch_size', [16, 32, 64]),
}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = [
    'features', 'interaction', 'RNN_hidden', 'NN_hidden',
    'NN_depth', 'readout', 'activation',
]
training_param_names = ['lr', 'batch_size']

RNNatt_hyp_res = hyperopt_func(
    model_dict, model_param_names, training_param_names, param_space, CV_datasets
)
# Show and persist the first element of the result.
print(RNNatt_hyp_res[0])
save_dict(RNNatt_hyp_res[0], 'RNNatt')

  0%|          | 0/30 [01:42<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [8]:
# DMPNN: search over double_MPNN on the 'graphs' data with atom_messages=False
# and 'interaction' fixed to None.
model_dict = dict(name='DMPNN', model=double_MPNN, model_type='torch', data_type='graphs')
param_space = {
    'atom_messages': False,
    'MP_hidden': hp.choice('MP_hidden', [64, 128, 256, 512]),
    'MP_depth': hp.choice('MP_depth', [2, 3, 4]),
    'readout': hp.choice('readout', ['mean', 'sum', 'max']),
    'dropout': hp.choice('dropout', [0, 0.1, 0.2, 0.3]),
    'interaction': None,
    'NN_depth': hp.choice('NN_depth', [1, 2, 3, 4]),
    'NN_hidden': hp.choice('NN_hidden', [64, 128, 256, 512]),
    'activation': hp.choice('activation', ['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU']),
    'lr': hp.choice('lr', [1e-2, 1e-3, 1e-4]),
    'batch_size': hp.choice('batch_size', [16, 32, 64]),
}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = [
    'atom_messages', 'MP_hidden', 'MP_depth', 'readout', 'dropout',
    'interaction', 'NN_depth', 'NN_hidden', 'activation',
]
training_param_names = ['lr', 'batch_size']

DMPNN_hyp_res = hyperopt_func(
    model_dict, model_param_names, training_param_names, param_space, CV_datasets
)
# Show and persist the first element of the result.
print(DMPNN_hyp_res[0])
save_dict(DMPNN_hyp_res[0], 'DMPNN')

  3%|▎         | 1/30 [08:52<4:17:18, 532.36s/trial, best loss: 1.42119300365448]


KeyboardInterrupt: 

In [None]:
# DMPNN with attention: atom_messages=False, with 'interaction' chosen from
# {'exp', 'tanh'} instead of being None.
model_dict = dict(
    name='DMPNN with attention', model=double_MPNN, model_type='torch', data_type='graphs'
)
param_space = {
    'atom_messages': False,
    'MP_hidden': hp.choice('MP_hidden', [64, 128, 256, 512]),
    'MP_depth': hp.choice('MP_depth', [2, 3, 4]),
    'readout': hp.choice('readout', ['mean', 'sum', 'max']),
    'dropout': hp.choice('dropout', [0, 0.1, 0.2, 0.3]),
    'interaction': hp.choice('interaction', ['exp', 'tanh']),
    'NN_depth': hp.choice('NN_depth', [1, 2, 3, 4]),
    'NN_hidden': hp.choice('NN_hidden', [64, 128, 256, 512]),
    'activation': hp.choice('activation', ['ReLU', 'LeakyReLU', 'PReLU', 'tanh', 'SELU', 'ELU']),
    'lr': hp.choice('lr', [1e-2, 1e-3, 1e-4]),
    'batch_size': hp.choice('batch_size', [16, 32, 64]),
}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = [
    'atom_messages', 'MP_hidden', 'MP_depth', 'readout', 'dropout',
    'interaction', 'NN_depth', 'NN_hidden', 'activation',
]
training_param_names = ['lr', 'batch_size']

DMPNNatt_hyp_res = hyperopt_func(
    model_dict, model_param_names, training_param_names, param_space, CV_datasets
)
# Show and persist the first element of the result.
print(DMPNNatt_hyp_res[0])
save_dict(DMPNNatt_hyp_res[0], 'DMPNNatt')

In [15]:
# MPNN: search over double_MPNN on the 'graphs' data with 'interaction' fixed to None.
# atom_messages is set to True here. With False this cell was an exact copy of
# the DMPNN search (same model class, same space), so "MPNN" and "DMPNN" would
# have tuned the same model twice.
# NOTE(review): assumes double_MPNN follows the chemprop convention where
# atom_messages=True means atom-centred messages (MPNN) and False means
# bond-centred messages (D-MPNN) -- confirm against modules/MPNN.py.
model_dict = {'name':'MPNN', 'model':double_MPNN, 'model_type':'torch', 'data_type':'graphs'}
param_space = {'atom_messages':True,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':None,
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNN_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(MPNN_hyp_res[0])
save_dict(MPNN_hyp_res[0],'MPNN')

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

job exception: 'graphs'



  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]


KeyError: 'graphs'

In [20]:
# MPNN with attention: 'interaction' is chosen from {'exp', 'tanh'}.
# atom_messages is set to True here. With False this cell was an exact copy of
# the "DMPNN with attention" search (same model class, same space), so both
# names would have tuned the same model.
# NOTE(review): assumes double_MPNN follows the chemprop convention where
# atom_messages=True means atom-centred messages (MPNN) and False means
# bond-centred messages (D-MPNN) -- confirm against modules/MPNN.py.
model_dict = {'name':'MPNN with attention', 'model':double_MPNN, 'model_type':'torch', 'data_type':'graphs'}
param_space = {'atom_messages':True,
               'MP_hidden':hp.choice('MP_hidden', [64,128,256,512]),
               'MP_depth':hp.choice('MP_depth', [2,3,4]),
               'readout':hp.choice('readout', ['mean','sum','max']),
               'dropout':hp.choice('dropout', [0,0.1,0.2,0.3]),
               'interaction':hp.choice('interaction', ['exp','tanh']),
               'NN_depth':hp.choice('NN_depth', [1,2,3,4]),
               'NN_hidden':hp.choice('NN_hidden', [64,128,256,512]),
               'activation':hp.choice('activation', ['ReLU','LeakyReLU','PReLU','tanh','SELU','ELU']),
               'lr':hp.choice('lr', [1e-2,1e-3,1e-4]),
               'batch_size':hp.choice('batch_size', [16,32,64])}
# Names handed to hyperopt_func to tell model arguments from training arguments.
model_param_names = ['atom_messages','MP_hidden','MP_depth','readout','dropout','interaction','NN_depth','NN_hidden','activation']
training_param_names = ['lr','batch_size']

MPNNatt_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(MPNNatt_hyp_res[0])
save_dict(MPNNatt_hyp_res[0],'MPNNatt')

  0%|          | 0/30 [00:52<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

In [None]:
# Random forest on the 'descriptors' data.
# max_features uses 1.0 instead of 'auto': for RandomForestRegressor 'auto'
# meant "all features", was deprecated in scikit-learn 1.1 and removed in 1.3.
# The float 1.0 selects the same behaviour and is accepted by old and new versions.
model_dict = {'name':'RF with descriptors', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', [1.0,'sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(RFdesc_hyp_res[0])
save_dict(RFdesc_hyp_res[0],'RFdesc')

In [None]:
# Random forest on the 'ECFP' data.
# max_features uses 1.0 instead of 'auto': for RandomForestRegressor 'auto'
# meant "all features", was deprecated in scikit-learn 1.1 and removed in 1.3.
# The float 1.0 selects the same behaviour and is accepted by old and new versions.
model_dict = {'name':'RF with ECFP', 'model':RandomForestRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048]),
               'max_depth':hp.choice('max_depth', [16,32,64,128,256,512,None]),
               'min_samples_split':hp.choice('min_samples_split', [2,4,8,16]),
               'min_samples_leaf':hp.choice('min_samples_leaf', [1,2,4,8]),
               'max_features':hp.choice('max_features', [1.0,'sqrt']),
               'bootstrap':hp.choice('bootstrap', [True,False]),
               'n_jobs':-1}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['n_estimators','max_depth','min_samples_split','min_samples_leaf','max_features','bootstrap','n_jobs']
training_param_names = []

RFecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(RFecfp_hyp_res[0])
save_dict(RFecfp_hyp_res[0],'RFecfp')

In [None]:
# MLP on the 'descriptors' data.
# Single-layer options are written as 1-tuples, e.g. (128,): a bare (128) is
# just the int 128, so the candidate list previously mixed ints and tuples.
# Order of the candidates is unchanged.
model_dict = {'name':'MLP with descriptors', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128,),(256,128),(512,256,128),(512,256),(256,),(512,),(64,),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(MLPdesc_hyp_res[0])
save_dict(MLPdesc_hyp_res[0],'MLPdesc')

In [None]:
# MLP on the 'ECFP' data.
# Single-layer options are written as 1-tuples, e.g. (128,): a bare (128) is
# just the int 128, so the candidate list previously mixed ints and tuples.
# Order of the candidates is unchanged.
model_dict = {'name':'MLP with ECFP', 'model':MLPRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'hidden_layer_sizes':hp.choice('hidden_layer_sizes', [(128,),(256,128),(512,256,128),(512,256),(256,),(512,),(64,),(256,128,64),(128,64,32),(128,256,128),(256,256),(128,128)]),
               'activation':hp.choice('activation', ['logistic','tanh','relu']),
               'solver':'adam',
               'batch_size':hp.choice('batch_size', [16,32,64,128,'auto']),
               'early_stopping':True}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['hidden_layer_sizes','activation','solver','batch_size','early_stopping']
training_param_names = []

MLPecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(MLPecfp_hyp_res[0])
save_dict(MLPecfp_hyp_res[0],'MLPecfp')

In [15]:
# XGBoost on the 'descriptors' data.
# 'gamma' used to appear twice, both in the dict literal (where the second entry
# silently replaced the first) and in model_param_names. The copy-paste
# duplicates are removed; the resulting search space is the same.
model_dict = {'name':'XGB with descriptors', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'descriptors'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'n_jobs':-1}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','n_jobs']
training_param_names = []

XGBdesc_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(XGBdesc_hyp_res[0])
save_dict(XGBdesc_hyp_res[0],'XGBdesc')

 13%|█▎        | 4/30 [02:34<16:46, 38.70s/trial, best loss: 1.4216405365432614]


KeyboardInterrupt: 

In [None]:
# XGBoost on the 'ECFP' data.
# 'gamma' used to appear twice, both in the dict literal (where the second entry
# silently replaced the first) and in model_param_names. The copy-paste
# duplicates are removed; the resulting search space is the same.
model_dict = {'name':'XGB with ECFP', 'model':XGBRegressor, 'model_type':'sklearn', 'data_type':'ECFP'}
param_space = {'n_estimators':hp.choice('n_estimators', [32,64,128,256,512,1024,2048,4096]),
               'max_depth':hp.choice('max_depth', [1,2,4,8]),
               'min_child_weight':hp.choice('min_child_weight', [1,2,4,8,16]),
               'eta':hp.choice('eta', [0.1,0.2,0.3,0.4]),
               'gamma':hp.choice('gamma', [0,0.1,0.2]),
               'subsample':hp.choice('subsample', [0.6,0.7,0.8,0.9]),
               'colsample_bytree':hp.choice('colsample_bytree', [0.6,0.7,0.8,0.9,1]),
               'n_jobs':-1}

# Every entry is a model argument; there are no separate training arguments.
model_param_names = ['n_estimators','max_depth','min_child_weight','eta','gamma','subsample','colsample_bytree','n_jobs']
training_param_names = []

XGBecfp_hyp_res = hyperopt_func(model_dict, model_param_names, training_param_names, param_space, CV_datasets)
# Show and persist the first element of the result.
print(XGBecfp_hyp_res[0])
save_dict(XGBecfp_hyp_res[0],'XGBecfp')