In [1]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
import imp
from os import listdir
from os.path import isfile, join



In [2]:
import sys
sys.path.append('../')
from modules.RNN import double_RNN
from modules.MPNN import double_MPNN
from modules.fit import *
from modules.transfer import transfer_weights



---
## Data preparation

In [3]:
data = pd.read_csv('../data/full_pka_data.csv')
solute = data['Solute SMILES'].tolist()
solvent = data['Solvent SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
data_size = len(solute)

In [4]:
indices = list(range(data_size))
CV_ids, holdout_ids, _, _ = train_test_split(indices, solvent, test_size=0.2, random_state=1, stratify=solvent)
datasets = data_maker(solute, solvent, pka)

---
## Training + testing

In [5]:
DMPNN = Model(name='DMPNN',
              model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='ReLU', 
                                atom_messages=False, dropout=0, interaction=None, readout='sum'),
              lr=0.001,
              batch_size=64,
              model_type='torch',
              data_type='graphs')
DMPNN_att = Model(name='DMPNN with attention',
              model=double_MPNN(MP_depth=4, MP_hidden=128, NN_depth=4, NN_hidden=64, activation='ELU', 
                                atom_messages=False, dropout=0, interaction='tanh', readout='mean'),
              lr=0.001,
              batch_size=64,
              model_type='torch',
              data_type='graphs')
MPNN = Model(name='MPNN',
             model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='LeakyReLU', 
                                  atom_messages=True, dropout=0, interaction=None, readout='sum'),
             lr=0.001,
             batch_size=64,
             model_type='torch',
             data_type='graphs')
MPNN_att = Model(name='MPNN with attention',
             model=double_MPNN(MP_depth=2, MP_hidden=64, NN_depth=4, NN_hidden=512, activation='ReLU', 
                                  atom_messages=True, dropout=0, interaction='tanh', readout='max'),
             lr=0.001,
             batch_size=64,
             model_type='torch',
             data_type='graphs')
RNN = Model(name='RNN',
            model=double_RNN(NN_depth=3, NN_hidden=512, RNN_hidden=512, activation='ReLU', dropout=0.3,
                             features=300, interaction=None, readout='max'),
            lr=0.001,
            batch_size=32,
            model_type='torch',
            data_type='sentences')
RNN_att = Model(name='RNN with attention',
                model=double_RNN(NN_depth=1, NN_hidden=1024, RNN_hidden=512, activation='PReLU', dropout=0.1,
                                 features=300, interaction='exp', readout='max'),
                lr=0.001,
                batch_size=32,
                model_type='torch',
                data_type='sentences')
#list of all models for testing
models = [DMPNN, DMPNN_att, MPNN, MPNN_att, RNN, RNN_att]

In [6]:
trained_models = [f for f in listdir('trained/') if isfile(join('trained/', f))]

In [14]:
def task_func(file):
    if 'Water' in file:
        task = 'Water pKa'
    if 'Gsolv' in file:
        task = 'Gsolv'
    else:
        task = file[-11:-3]
    return task

model_weights = []
for file in trained_models:
    task = task_func(file)
    if 'RNN_w' in file:
        model_weights.append((RNN_att,file,task))
    elif 'DMPNN_w' in file:
        model_weights.append((DMPNN_att,file,task))        
    elif 'MPNN_w' in file:
        model_weights.append((MPNN_att,file,task))
    elif 'RNN' in file:
        model_weights.append((RNN,file,task))
    elif 'DMPNN' in file:
        model_weights.append((DMPNN,file,task))
    elif 'MPNN' in file:
        model_weights.append((MPNN,file,task))

In [12]:
exp_name = "Holdout test"
results = ddict(list)
for m, file, task in model_weights:
    if task == 'Water pKa':
        pass
    else:
        print('testing '+file+' ...')
        data = datasets[m.data_type]

        transfer_weights(m, file)

        res = fit(m, data, holdout_ids, exp_name)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing MPNN_with_attention_Water_pka.pt ...
testing RNN_Water_pka.pt ...
testing DMPNN_Water_pka.pt ...
testing MPNN_Water_pka.pt ...
testing RNN_with_attention_Water_pka.pt ...
testing DMPNN_with_attention_Water_pka.pt ...
                  Model       MAE      RMSE
0   MPNN with attention  1.047276  1.502790
1                   RNN  1.375980  2.083664
2                 DMPNN  1.077354  1.640946
3                  MPNN  1.133315  1.685496
4    RNN with attention  1.296481  2.258889
5  DMPNN with attention  1.153409  1.706847
