In [26]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
import imp
from os import listdir
from os.path import isfile, join

In [27]:
import sys
sys.path.append('../')
from modules.RNN import double_RNN
from modules.MPNN import double_MPNN
from modules.fit import *
from modules.transfer import transfer_weights, finetune, transfer_finetune
from modules.data import *

---
## Data preparation

In [28]:
data = pd.read_csv('../data/full_pka_data.csv')
solute = data['Solute SMILES'].tolist()
solvent = data['Solvent SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
data_size = len(solute)

In [None]:
indices = list(range(data_size))
CV_ids, holdout_ids, _, _ = train_test_split(indices, solvent, test_size=0.2, random_state=1, stratify=solvent)
datasets = data_maker(solute, solvent, pka)

---
## Training + testing

In [None]:
DMPNN = Model(name='DMPNN',
              model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='ReLU', 
                                atom_messages=False, dropout=0, interaction=None, readout='sum'),
              lr=0.001,
              batch_size=64,
              model_type='torch',
              data_type='graphs')
DMPNN_att = Model(name='DMPNN with attention',
              model=double_MPNN(MP_depth=4, MP_hidden=128, NN_depth=4, NN_hidden=64, activation='ELU', 
                                atom_messages=False, dropout=0, interaction='tanh', readout='mean'),
              lr=0.001,
              batch_size=64,
              model_type='torch',
              data_type='graphs')
MPNN = Model(name='MPNN',
             model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='LeakyReLU', 
                                  atom_messages=True, dropout=0, interaction=None, readout='sum'),
             lr=0.001,
             batch_size=64,
             model_type='torch',
             data_type='graphs')
MPNN_att = Model(name='MPNN with attention',
             model=double_MPNN(MP_depth=2, MP_hidden=64, NN_depth=4, NN_hidden=512, activation='ReLU', 
                                  atom_messages=True, dropout=0, interaction='tanh', readout='max'),
             lr=0.001,
             batch_size=64,
             model_type='torch',
             data_type='graphs')
RNN = Model(name='RNN',
            model=double_RNN(NN_depth=3, NN_hidden=512, RNN_hidden=512, activation='ReLU', dropout=0.3,
                             features=300, interaction=None, readout='max'),
            lr=0.001,
            batch_size=32,
            model_type='torch',
            data_type='sentences')
RNN_att = Model(name='RNN with attention',
                model=double_RNN(NN_depth=1, NN_hidden=1024, RNN_hidden=512, activation='PReLU', dropout=0.1,
                                 features=300, interaction='exp', readout='max'),
                lr=0.001,
                batch_size=32,
                model_type='torch',
                data_type='sentences')
#list of all models for testing
models = [DMPNN, DMPNN_att, MPNN, MPNN_att, RNN, RNN_att]

In [None]:
trained_models = [f for f in listdir('trained/') if isfile(join('trained/', f))]

In [None]:
def task_func(file):
    if 'Water' in file:
        task = 'Water pKa'
    elif 'Gsolv' in file:
        task = 'Gsolv'
    else:
        task = file[-11:-3]
    return task

model_weights = []
for file in trained_models:
    task = task_func(file)
    if 'RNN_w' in file:
        model_weights.append((RNN_att,file,task))
    elif 'DMPNN_w' in file:
        model_weights.append((DMPNN_att,file,task))        
    elif 'MPNN_w' in file:
        model_weights.append((MPNN_att,file,task))
    elif 'RNN' in file:
        model_weights.append((RNN,file,task))
    elif 'DMPNN' in file:
        model_weights.append((DMPNN,file,task))
    elif 'MPNN' in file:
        model_weights.append((MPNN,file,task))

In [11]:
exp_name = "Holdout test"
results = ddict(list)
for m, file, task in model_weights:
    if task == 'Water pKa':
        pass
    else:
        print('testing '+file+' ...')
        data = datasets[m.data_type]

        transfer_weights(m, file)

        res = fit(m, data, holdout_ids, exp_name)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing DMPNN_Gsolv.pt ...
testing MPNN_with_attention_Gsolv.pt ...
testing MPNN_Gsolv.pt ...
testing RNN_Gsolv.pt ...
testing RNN_with_attention_Gsolv.pt ...
testing DMPNN_with_attention_Gsolv.pt ...
                  Model Pretraining task       MAE      RMSE
0                 DMPNN            Gsolv  1.711399  2.430409
1   MPNN with attention            Gsolv  1.618414  2.458323
2                  MPNN            Gsolv  2.169771  2.914583
3                   RNN            Gsolv  2.424599  3.430700
4    RNN with attention            Gsolv  2.812207  3.793522
5  DMPNN with attention            Gsolv  2.601547  3.374790


In [18]:
exp_name = "Holdout test"
results = ddict(list)
for m, file, task in model_weights:
    if 'Water' in file:
        print('testing '+file+' ...')
        data = datasets[m.data_type]

        transfer_weights(m, file)

        res = fit(m, data, holdout_ids, exp_name)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing MPNN_with_attention_Water_pka.pt ...
testing RNN_Water_pka.pt ...
testing DMPNN_Water_pka.pt ...
testing MPNN_Water_pka.pt ...
testing RNN_with_attention_Water_pka.pt ...
testing DMPNN_with_attention_Water_pka.pt ...
                  Model Pretraining task       MAE      RMSE
0   MPNN with attention        Water pKa  1.160310  1.686413
1                   RNN        Water pKa  1.281696  1.872694
2                 DMPNN        Water pKa  0.920751  1.435663
3                  MPNN        Water pKa  0.927955  1.447016
4    RNN with attention        Water pKa  1.194385  1.808636
5  DMPNN with attention        Water pKa  0.999161  1.545486


In [8]:
exp_name = "Holdout test"
results = ddict(list)
for m, file, task in model_weights:
    if 'Water' in file:
        print('testing '+file+' ...')
        data = datasets[m.data_type]

        transfer_weights(m, file)

        res = fit(m, data, holdout_ids, exp_name)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        results['Finetuning'].append(False)
        print(res)
        
        print('finetuning...')
        res = finetune(m, data, holdout_ids, exp_name, new_lr=0.0001)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        results['Finetuning'].append(True)
        print(res)

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing MPNN_with_attention_Water_pka.pt ...
[0.9440844, 1.4092151]
finetuning...
[0.8747762, 1.335492]
testing RNN_Water_pka.pt ...
[1.3927959, 2.016345]
finetuning...
[1.1193192, 1.7541449]
testing DMPNN_Water_pka.pt ...
[0.9408062, 1.5067396]
finetuning...
[0.8445427, 1.4667693]
testing MPNN_Water_pka.pt ...
[1.1806642, 1.8219391]
finetuning...
[0.8803673, 1.4534552]
testing RNN_with_attention_Water_pka.pt ...
[1.2777618, 1.8782059]
finetuning...
[1.15115, 1.768911]
testing DMPNN_with_attention_Water_pka.pt ...
[1.0912564, 1.6384128]
finetuning...
[1.0239928, 1.5814767]
                   Model Pretraining task       MAE      RMSE  Finetuning
0    MPNN with attention        Water pKa  0.944084  1.409215       False
1    MPNN with attention        Water pKa  0.874776  1.335492        True
2                    RNN        Water pKa  1.392796  2.016345       False
3                    RNN        Water pKa  1.119319  1.754145        True
4                  DMPNN        Water pKa  0.94080

In [9]:
exp_name = "Holdout test"
results = ddict(list)
for m, file, task in model_weights:
    if 'Water' not in file:
        print('testing '+file+' ...')
        data = datasets[m.data_type]

        transfer_weights(m, file)

        res = fit(m, data, holdout_ids, exp_name)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        results['Finetuning'].append(False)
        print(res)
        
        print('finetuning...')
        res = finetune(m, data, holdout_ids, exp_name, new_lr=0.0001)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        results['Finetuning'].append(True)
        print(res)

holdout_test = pd.DataFrame(results)
print(holdout_test)
holdout_test.to_csv('results/holdout_test.csv')

testing RNN_with_attention_QM9_g298_atom.pt ...
[1.340462, 2.1901085]
finetuning...
[1.2175503, 2.019849]
testing MPNN_QM9_alpha.pt ...
[1.2845355, 1.957642]
finetuning...
[1.1974685, 1.8678793]
testing RNN_with_attention_QM9_mu.pt ...
[1.3628873, 2.364774]
finetuning...
[1.1832656, 2.202396]
testing DMPNN_QM9_g298.pt ...
[1.5501249, 2.2572246]
finetuning...
[1.3986841, 2.0772169]
testing RNN_with_attention_QM9_cv.pt ...
[1.365908, 2.2674577]
finetuning...
[1.1906406, 2.1952927]
testing RNN_QM9_g298.pt ...
[1.2779596, 2.1584306]
finetuning...
[1.1570556, 1.9248806]
testing DMPNN_QM9_r2.pt ...
[1.0997479, 1.8459163]
finetuning...
[1.0194817, 1.801157]
testing RNN_with_attention_QM9_alpha.pt ...
[1.3594762, 2.2791085]
finetuning...
[1.272934, 2.234772]
testing DMPNN_Gsolv.pt ...
[1.7113993, 2.4304094]
finetuning...
[1.3351934, 2.0362833]
testing DMPNN_with_attention_QM9_alpha.pt ...
[1.3256223, 2.037689]
finetuning...
[1.1883332, 1.8974679]
testing MPNN_with_attention_Gsolv.pt ...
[1.618

In [8]:
f_model_weights = [(m,f,t) for m,f,t in model_weights if 'Water' in f]

In [9]:
#TODO: dataset size vs accuracy
results = ddict(list)
proportions = [0.1,0.2,0.5,0.75]

for prop in proportions:
    solvents = [solvent[i] for i in CV_ids]
    train_ids,_,_,_ = train_test_split(CV_ids, solvents, test_size=1-prop, random_state=1, stratify=solvents)
    train_size = len(train_ids)
    exp_name = "Training data size "+str(prop)
    print('testing prop '+str(prop)+' ...')
    for m, file, task in f_model_weights:
        data = datasets[m.data_type]

        res = transfer_finetune(m, file, data, holdout_ids, exp_name, 0.0001, train_ids=train_ids)

        results['Model'].append(m.name)
        results['Pretraining task'].append(task)
        results['Train size'].append(train_size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        
        pd.DataFrame(results).to_csv('results/train_prop_test.csv')

train_size = len(CV_ids)
for m, file, task in f_model_weights:
    data = datasets[m.data_type]

    res = transfer_finetune(m, file, data, holdout_ids, exp_name, 0.0001, train_ids=CV_ids)

    results['Model'].append(m.name)
    results['Pretraining task'].append(task)
    results['Train size'].append(train_size)
    results['MAE'].append(res[0])
    results['RMSE'].append(res[1])
    
    pd.DataFrame(results).to_csv('results/train_prop_test.csv')

train_prop_test = pd.DataFrame(results)
print(train_prop_test)
train_prop_test.to_csv('results/train_prop_test.csv')

testing prop 0.1 ...
testing prop 0.2 ...
testing prop 0.5 ...
testing prop 0.75 ...
                   Model Pretraining task  Train size       MAE      RMSE
0    MPNN with attention        Water pKa         257  1.945032  3.374745
1                    RNN        Water pKa         257  2.235872  3.603557
2                  DMPNN        Water pKa         257  1.966565  3.449821
3                   MPNN        Water pKa         257  1.885221  3.129176
4     RNN with attention        Water pKa         257  2.634878  4.512725
5   DMPNN with attention        Water pKa         257  2.519415  4.239733
6    MPNN with attention        Water pKa         515  1.525278  2.845181
7                    RNN        Water pKa         515  1.785536  2.807272
8                  DMPNN        Water pKa         515  1.372364  2.475326
9                   MPNN        Water pKa         515  1.334818  2.258871
10    RNN with attention        Water pKa         515  1.879017  3.010668
11  DMPNN with attention   

In [11]:
#LOSO
f_model_weights = [(m,f,t) for m,f,t in model_weights if 'Water' in f]+[(m,f,t) for m,f,t in model_weights if 'Gsolv' in f]

results = ddict(list)
solvent_set = list(set(solvent))

for solv in solvent_set:
    test_ids = [i for i, x in enumerate(solvent) if x == solv]
    size = len(test_ids)
    exp_name = "LOSO "+solv
    print('testing '+solv+' ...')
    for m, file, task in f_model_weights:
        data = datasets[m.data_type]

        res = transfer_finetune(m, file, data, test_ids, exp_name, 0.0001)

        results['Model'].append(m.name+' '+task)
        results['LOSO solvent'].append(solv)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])
        
        pd.DataFrame(results).to_csv('results/LOSO_test.csv')

LOSO_test = pd.DataFrame(results)
print(LOSO_test)
LOSO_test.to_csv('results/LOSO_test.csv')

testing CN(C)C=O ...
testing C1CCOC1 ...
testing O ...
testing CS(=O)C ...
testing CC#N ...
testing C(CCl)Cl ...
                            Model LOSO solvent  Test size        MAE  \
0   MPNN with attention Water pKa     CN(C)C=O         20   2.259517   
1                   RNN Water pKa     CN(C)C=O         20   1.403135   
2                 DMPNN Water pKa     CN(C)C=O         20   1.195458   
3                  MPNN Water pKa     CN(C)C=O         20   0.696446   
4    RNN with attention Water pKa     CN(C)C=O         20   1.479848   
..                            ...          ...        ...        ...   
67      MPNN with attention Gsolv     C(CCl)Cl         88  31.266466   
68                     MPNN Gsolv     C(CCl)Cl         88  38.768452   
69                      RNN Gsolv     C(CCl)Cl         88  33.307377   
70       RNN with attention Gsolv     C(CCl)Cl         88  36.919594   
71     DMPNN with attention Gsolv     C(CCl)Cl         88  38.622948   

         RMSE  
0    2

In [None]:
#LOFO
f_model_weights = [(m,f,t) for m,f,t in model_weights if 'Water' in f]

results = ddict(list)
acid_types = data["Acid type"].to_list()
acid_names = list(set(acid_types))

for name in acid_names:
    test_ids = [i for i, x in enumerate(acid_types) if x == name]
    size = len(test_ids)
    exp_name = "LOFO "+name
    print('testing '+name+' ...')
    for m, file, task in f_model_weights:
        data = datasets[m.data_type]

        res = transfer_finetune(m, file, data, test_ids, exp_name, 0.0001)

        results['Model'].append(m.name+' '+task)
        results['LOFO group'].append(name)
        results['Test size'].append(size)
        results['MAE'].append(res[0])
        results['RMSE'].append(res[1])

        pd.DataFrame(results).to_csv('results/LOFO_test.csv')

LOFO_test = pd.DataFrame(results)
print(LOFO_test)
LOFO_test.to_csv('results/LOFO_test.csv')