#### Testing for non-generic torch models:
1. Delfos (with and without attention)
2. MPNN (with and without attention)

In [248]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc

import basicest as b
import chemprop_ish as c
import delfos as d
import imp

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation

---
## Loading Dataset

In [54]:
imp.reload(d)

<module 'delfos' from '/Users/u6676643/codes/testing/delfos.py'>

In [204]:
data = pd.read_csv('non_aqueous_pka_data.csv')
solute = data['Solute SMILES'].tolist()
solvent = data['Solvent SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
size = len(solute)

In [319]:
#preprocessing
scaler = b.pka_scaler(pka)

In [205]:
#ECFP
featurizer = dc.feat.CircularFingerprint(size=512, radius=3)
sol = featurizer.featurize(solute)
solv = featurizer.featurize(solvent)
ECFP_data = [np.concatenate((sol,solv),axis=1),np.array(pka)]

In [290]:
#Descriptors
featurizer = dc.feat.RDKitDescriptors()
sol = featurizer.featurize(solute)
solv = featurizer.featurize(solvent)
desc_data = [np.concatenate((sol,solv),axis=1),np.array(pka)]

In [207]:
#SMILES
SMILES_pairs = [(solute[i],solvent[i]) for i in range(size)]
SMILES_data = [SMILES_pairs, torch.Tensor(pka)]

In [208]:
#Sentences
sentence_pairs = d.delfos_data(solute,solvent)
sentence_data = [sentence_pairs, torch.Tensor(pka)]

In [291]:
#collate datasets through a dictionary retrieval system
datasets = dict(ECFP=ECFP_data,
                descriptors=desc_data,
                SMILES=SMILES_data,
                sentences=sentence_data)

In [322]:
b.CV_fit(DMPNN, datasets['SMILES'])

(array([ 8.757932, 12.026693], dtype=float32),
 [[8.916032, 12.772387],
  [8.86487, 11.43323],
  [7.3435698, 9.186112],
  [7.291783, 10.901827],
  [11.373402, 15.839908]])

In [321]:
b.CV_fit(RF_desc, datasets['descriptors'])

(array([3.08091445, 4.84686281]),
 [[2.7228431128747257, 4.681402741515197],
  [3.547177836547824, 5.8009438611128346],
  [3.4963111924151824, 4.781696317197858],
  [2.4938124841647884, 3.9624368344756493],
  [3.144427611045456, 5.007834291194637]])

---
## Hyperoptimisation

In [None]:
def fitness(model, *args):
    params = dict(args)
    results = b.train_cv_model(model, x_data, y_data, params=params, random_state=seed)
    score = np.mean(results.cv_scores['mae'])
    return -score

pbounds = {'MP_hidden': (100,500), 'MP_depth': (2,4), 'NN_hidden': (100,500), 'NN_depth': (1,3)}

def optimizer(pbounds):
    return BayesianOptimization(f=fitness(model=model), pbounds=pbounds, verbose=2, random_state=1)

In [None]:
from timeit import default_timer as timer

class fitness:
    def __init__(self, model_dict, default_params, integer_params)
    self.m = model_dict
    self.default_params = default_params
    self.integer_params = integer_params
    
    def objective(params):
        #round off some parameters to integers
        for parameter_name in self.integer_params:
            params[parameter_name] = int(params[parameter_name])
            
        default_params.update(params)
        self.m['model'] = self.m['model'](*params)
        model = b.Model(*self.m)
        data = datasets[model.data_type]

        start = timer()
        res, full_res = b.CV_fit(model, data)
        run_time = timer()-start

        loss = -res[0]

In [None]:
pbounds = {'MP_hidden': (100,500), 'MP_depth': (2,4), 'NN_hidden': (100,500), 'NN_depth': (1,3)}

In [1]:
#TODO: pytorch hyperopt - perhaps skorch?
#TODO: sklearn hyperopt
#

---
## Training + testing

In [320]:
DMPNN = b.Model(name='D-MPNN',
                model=c.double_MPNN(atom_messages=False),
                model_type='torch',
                data_type='SMILES',
                scaler=scaler)
DMPNN_att = b.Model(name='D-MPNN with attention',
                    model=c.double_MPNN(atom_messages=False, interaction=True),
                    model_type='torch',
                    data_type='SMILES',
                    scaler=scaler)
RNN = b.Model(name='RNN+NN',
              model=d.dnet(interaction=False),
              model_type='torch',
              data_type='sentences',
              scaler=scaler)
RNN_att = b.Model(name='RNN+NN with attention',
                  model=d.dnet(),
                  model_type='torch',
                  data_type='sentences',
                  scaler=scaler)
RF_desc = b.Model(name='Random forest with descriptors',
                  model=RandomForestRegressor(n_estimators=1000, n_jobs=12),
                  model_type='sklearn',
                  data_type='descriptors',
                  scaler=scaler)
RF_ECFP = b.Model(name='Random forest with ECFP',
                  model=RandomForestRegressor(n_estimators=1000, n_jobs=12),
                  model_type='sklearn',
                  data_type='ECFP',
                  scaler=scaler)

#list of all models for testing
models = [DMPNN, DMPNN_att, RNN, RNN_att, RF_desc, RF_ECFP]

In [326]:
imp.reload(b)

<module 'basicest' from '/Users/u6676643/codes/testing/basicest.py'>

In [306]:
data = datasets['descriptors']
data[1][0]

8.8

In [324]:
#TODO: full CV training + testing
results = ddict(list)
for m in models:
    print('testing '+m.name+' ...')
    data = datasets[m.data_type]
    
    res, full_res = b.CV_fit(m, data)
    
    results['Model'].append(m.name)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])

full_CV_test = pd.DataFrame(results)
print(full_CV_test)

testing D-MPNN ...
testing D-MPNN with attention ...
testing RNN+NN ...
testing RNN+NN with attention ...
testing Random forest with descriptors ...
testing Random forest with ECFP ...
                            Model   MAE   RMSE
0                          D-MPNN 8.753 12.098
1           D-MPNN with attention 9.509 11.946
2                          RNN+NN 8.996 11.753
3           RNN+NN with attention 8.962 11.789
4  Random forest with descriptors 3.071  4.828
5         Random forest with ECFP 4.027  5.865


In [None]:
#TODO: dataset size vs accuracy

In [None]:
#TODO: pka split vs accuracy

In [327]:
#LOSO
results = ddict(list)
solvent_set = list(set(solvent))

for solv in solvent_set:
    test_ids = [i for i, x in enumerate(solvent) if x == solv]
    size = len(test_ids)
    print('testing '+solv+' ...')
    for m in models:
        data = datasets[m.data_type]

        trained_m, res = b.fit(m, data, test_ids)
        
        m.experiments.append(('LOSO '+solv,res,size,trained_m))
        results['Model'].append(m.name)
        results['LOSO solvent'].append(solv)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOSO_test = pd.DataFrame(results)
print(LOSO_test)

testing C1CCOC1 ...
testing O ...
testing CCCCCCC ...
testing COCCOC.COCCOC ...
testing CN(C)C=O ...
testing CS(=O)C ...
testing C(CCl)Cl ...
testing CC#N ...
                             Model   LOSO solvent  Test size    MAE   RMSE
0                           D-MPNN        C1CCOC1         65  3.971  5.773
1            D-MPNN with attention        C1CCOC1         65  5.325  6.927
2                           RNN+NN        C1CCOC1         65  5.106  6.674
3            RNN+NN with attention        C1CCOC1         65  4.749  6.416
4   Random forest with descriptors        C1CCOC1         65  3.063  4.744
5          Random forest with ECFP        C1CCOC1         65  4.260  5.926
6                           D-MPNN              O        134  6.708  8.585
7            D-MPNN with attention              O        134  9.097 10.695
8                           RNN+NN              O        134  9.115 10.690
9            RNN+NN with attention              O        134  8.781 10.387
10  Random fores

In [328]:
#LOEO
results = ddict(list)
element_set = ['F','N','S','O']

for ele in element_set:
    test_ids = [i for i, x in enumerate(solute) if ele in x]
    size = len(test_ids)
    print('testing '+ele+' ...')
    for m in models:
        data = datasets[m.data_type]

        trained_m, res = b.fit(m, data, test_ids)
        
        m.experiments.append(('LOEO '+ele,res,size,trained_m))
        results['Model'].append(m.name)
        results['LOEO element'].append(ele)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOEO_test = pd.DataFrame(results)
print(LOEO_test)

testing F ...
testing N ...
testing S ...
testing O ...
                             Model LOEO element  Test size    MAE   RMSE
0                           D-MPNN            F        282 11.457 15.841
1            D-MPNN with attention            F        282 12.615 15.401
2                           RNN+NN            F        282 11.272 15.039
3            RNN+NN with attention            F        282 11.321 15.159
4   Random forest with descriptors            F        282  6.214  8.324
5          Random forest with ECFP            F        282  5.806  7.714
6                           D-MPNN            N        700  8.598 12.201
7            D-MPNN with attention            N        700  9.806 12.289
8                           RNN+NN            N        700  8.985 11.927
9            RNN+NN with attention            N        700  8.930 11.962
10  Random forest with descriptors            N        700  6.129  7.759
11         Random forest with ECFP            N        700  7.335  9

In [329]:
#LOMO
results = ddict(list)
solute_masses = [MolWt(Chem.MolFromSmiles(mol)) for mol in solute]
mass_cutoffs = [100,150,200]

for mass in mass_cutoffs:
    test_ids = [i for i, x in enumerate(solute_masses) if x > mass]
    size = len(test_ids)
    print('testing >'+str(mass)+'g/mol ...')
    for m in models:
        data = datasets[m.data_type]

        trained_m, res = b.fit(m, data, test_ids)
        
        m.experiments.append(('LOMO '+str(mass),res,size,trained_m))
        results['Model'].append(m.name)
        results['LOMO mass cutoff'].append(mass)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOMO_test = pd.DataFrame(results)
print(LOMO_test)

testing >100g/mol ...
testing >150g/mol ...
testing >200g/mol ...
                             Model  LOMO mass cutoff  Test size    MAE   RMSE
0                           D-MPNN               100        750  9.260 12.888
1            D-MPNN with attention               100        750  9.871 12.603
2                           RNN+NN               100        750  9.310 12.341
3            RNN+NN with attention               100        750  9.307 12.405
4   Random forest with descriptors               100        750  6.423  8.692
5          Random forest with ECFP               100        750  5.760  7.383
6                           D-MPNN               150        578 10.044 13.915
7            D-MPNN with attention               150        578 10.601 13.597
8                           RNN+NN               150        578  9.819 13.063
9            RNN+NN with attention               150        578  9.858 13.167
10  Random forest with descriptors               150        578  7.465 10.13

In [330]:
#LOCO
results = ddict(list)
solute_charges = [GetFormalCharge(Chem.MolFromSmiles(mol)) for mol in solute]
charge_list = [0,1]

for charge in charge_list:
    test_ids = [i for i, x in enumerate(solute_charges) if x == charge]
    size = len(test_ids)
    print('testing >'+str(charge)+' ...')
    for m in models:
        data = datasets[m.data_type]

        trained_m, res = b.fit(m, data, test_ids)
        
        m.experiments.append(('LOCO '+str(charge),res,size,trained_m))
        results['Model'].append(m.name)
        results['LOCO charge'].append(charge)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

LOCO_test = pd.DataFrame(results)
print(LOCO_test)

testing >0 ...
testing >1 ...
                             Model  LOCO charge  Test size    MAE   RMSE
0                           D-MPNN            0        580 10.696 14.438
1            D-MPNN with attention            0        580 10.687 13.587
2                           RNN+NN            0        580 10.337 13.625
3            RNN+NN with attention            0        580 10.400 13.739
4   Random forest with descriptors            0        580 12.824 17.494
5          Random forest with ECFP            0        580 12.121 16.635
6                           D-MPNN            1        313  4.911  6.415
7            D-MPNN with attention            1        313  6.469  7.782
8                           RNN+NN            1        313  6.510  7.882
9            RNN+NN with attention            1        313  6.290  7.677
10  Random forest with descriptors            1        313  8.094  9.813
11         Random forest with ECFP            1        313  9.982 11.263
