In [1]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
import imp



In [17]:
import sys
# Add the parent directory to the import search path so the `modules.*` imports
# below can be resolved. The path is relative, so it is interpreted against the
# kernel's working directory (presumably the notebook's own folder - confirm).
sys.path.append('../')
# Single-input and dual-input ("double_") network classes used by the model
# registries further down.
from modules.RNN import double_RNN, RNN
from modules.MPNN import double_MPNN, MPNN
# Aliased as `p`; the notebook uses p.Model, p.data_maker, p.data_maker_decon
# and p.fit_no_test from it.
import modules.pretraining as p

---
## Training + testing

### Single input models

In [19]:
# Registry of single-input (solute only) models to pretrain.
#
# Trainer settings shared by every model of a given input representation are
# factored out; only the name and the network architecture vary per entry.
# NOTE(review): the "with attention" entries below do not pass any
# attention-specific argument - they differ from the plain entries only in
# their hyperparameters.
s_graph_cfg = dict(lr=0.001, batch_size=64, data_type='graphs', inputs=1)
s_sen_cfg = dict(lr=0.001, batch_size=32, data_type='sentences', inputs=1)

# --- graph models: bond-message variants (atom_messages=False) ---
s_DMPNN = p.Model(
    name='DMPNN',
    model=MPNN(
        MP_depth=3,
        MP_hidden=256,
        NN_depth=2,
        NN_hidden=512,
        activation='ReLU',
        atom_messages=False,
        dropout=0,
        readout='sum',
    ),
    **s_graph_cfg,
)
s_DMPNN_att = p.Model(
    name='DMPNN with attention',
    model=MPNN(
        MP_depth=4,
        MP_hidden=128,
        NN_depth=4,
        NN_hidden=64,
        activation='ELU',
        atom_messages=False,
        dropout=0,
        readout='mean',
    ),
    **s_graph_cfg,
)

# --- graph models: atom-message variants (atom_messages=True) ---
s_MPNN = p.Model(
    name='MPNN',
    model=MPNN(
        MP_depth=3,
        MP_hidden=256,
        NN_depth=2,
        NN_hidden=512,
        activation='LeakyReLU',
        atom_messages=True,
        dropout=0,
        readout='sum',
    ),
    **s_graph_cfg,
)
s_MPNN_att = p.Model(
    name='MPNN with attention',
    model=MPNN(
        MP_depth=2,
        MP_hidden=64,
        NN_depth=4,
        NN_hidden=512,
        activation='ReLU',
        atom_messages=True,
        dropout=0,
        readout='max',
    ),
    **s_graph_cfg,
)

# --- sentence models ---
s_RNN = p.Model(
    name='RNN',
    model=RNN(
        NN_depth=3,
        NN_hidden=512,
        RNN_hidden=512,
        activation='ReLU',
        dropout=0.3,
        features=300,
        readout='max',
    ),
    **s_sen_cfg,
)
s_RNN_att = p.Model(
    name='RNN with attention',
    model=RNN(
        NN_depth=1,
        NN_hidden=1024,
        RNN_hidden=512,
        activation='PReLU',
        dropout=0.1,
        features=300,
        readout='max',
    ),
    **s_sen_cfg,
)

# Groupings used by the training cells: by representation, and all together.
s_graph_models = [s_DMPNN, s_DMPNN_att, s_MPNN, s_MPNN_att]
s_sen_models = [s_RNN, s_RNN_att]
s_models = [s_DMPNN, s_DMPNN_att, s_MPNN, s_MPNN_att, s_RNN, s_RNN_att]

### Water pKa

In [20]:
# Pretrain every single-input model on the water pKa table.
data = pd.read_csv('pretrain_data/water_pka.csv')
solute = data['Solute SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
# `datasets` is indexed by each model's `data_type` below (presumably one entry
# per input representation, e.g. 'graphs' / 'sentences' - confirm in
# modules.pretraining.data_maker).
datasets = p.data_maker(solute, pka)

exp_name = "Water pka"
for m in s_models:
    # Bind the per-model dataset to its own name. The loop previously rebound
    # `data`, which replaced the DataFrame loaded above with a dataset object
    # and left a differently-typed `data` behind for any later cell.
    dataset = datasets[m.data_type]
    print('testing '+m.name+' ...')
    p.fit_no_test(m, exp_name, dataset)

testing DMPNN with attention ...
testing MPNN ...
testing MPNN with attention ...
testing RNN ...
testing RNN with attention ...


### QM9

In [None]:
# Load the QM9 table; the SMILES column is the model input for every target.
data = pd.read_csv('pretrain_data/qm9.csv')
smiles = data['smiles'].tolist()

# Target columns to pretrain on, one experiment per entry.
# An earlier selection, kept for reference, was:
#   mu, alpha, homo, lumo, gap, r2, zpve, u0, u298, h298, g298, cv
properties = [
    'mu',
    'alpha',
    'gap',
    'r2',
    'g298',
    'cv',
    'g298_atom',
]

In [5]:
# Graph models: build one graph dataset per QM9 target, then fit every
# single-input graph model on it.
for prop in properties:
    exp_name = f'QM9_{prop}'
    print(f'---loading {prop}')
    prop_list = data[prop].tolist()
    dataset = p.data_maker_decon(smiles, prop_list, 'graphs')
    # Drop the reference to the raw target list once the dataset exists.
    prop_list = None
    for model_spec in s_graph_models:
        print(f'testing {model_spec.name} ...')
        p.fit_no_test(model_spec, exp_name, dataset)

---loading mu
testing D-MPNN ...
testing MPNN ...
---loading alpha
testing D-MPNN ...
testing MPNN ...
---loading homo
testing D-MPNN ...
testing MPNN ...


KeyboardInterrupt: 

In [None]:
# Sentence models: build one sentence dataset per QM9 target, then fit every
# single-input sentence model on it.
for prop in properties:
    exp_name = f'QM9_{prop}'
    print(f'---loading {prop}')
    prop_list = data[prop].tolist()
    dataset = p.data_maker_decon(smiles, prop_list, 'sentences')
    # Drop the reference to the raw target list once the dataset exists.
    prop_list = None
    for model_spec in s_sen_models:
        print(f'testing {model_spec.name} ...')
        p.fit_no_test(model_spec, exp_name, dataset)

---
### Dual input models

In [None]:
# Registry of dual-input models (two molecular inputs per sample, inputs=2).
#
# Trainer settings shared by every model of a given input representation are
# factored out; only the name and the network architecture vary per entry.
# In this registry the "with attention" entries are the ones that set a
# non-None `interaction` ('tanh' or 'exp'); the plain entries pass None.
d_graph_cfg = dict(lr=0.001, batch_size=64, data_type='graphs', inputs=2)
d_sen_cfg = dict(lr=0.001, batch_size=32, data_type='sentences', inputs=2)

# --- graph models: bond-message variants (atom_messages=False) ---
d_DMPNN = p.Model(
    name='DMPNN',
    model=double_MPNN(
        MP_depth=3,
        MP_hidden=256,
        NN_depth=2,
        NN_hidden=512,
        activation='ReLU',
        interaction=None,
        atom_messages=False,
        dropout=0,
        readout='sum',
    ),
    **d_graph_cfg,
)
d_DMPNN_att = p.Model(
    name='DMPNN with attention',
    model=double_MPNN(
        MP_depth=4,
        MP_hidden=128,
        NN_depth=4,
        NN_hidden=64,
        activation='ELU',
        interaction='tanh',
        atom_messages=False,
        dropout=0,
        readout='mean',
    ),
    **d_graph_cfg,
)

# --- graph models: atom-message variants (atom_messages=True) ---
d_MPNN = p.Model(
    name='MPNN',
    model=double_MPNN(
        MP_depth=3,
        MP_hidden=256,
        NN_depth=2,
        NN_hidden=512,
        activation='LeakyReLU',
        interaction=None,
        atom_messages=True,
        dropout=0,
        readout='sum',
    ),
    **d_graph_cfg,
)
d_MPNN_att = p.Model(
    name='MPNN with attention',
    model=double_MPNN(
        MP_depth=2,
        MP_hidden=64,
        NN_depth=4,
        NN_hidden=512,
        activation='ReLU',
        interaction='tanh',
        atom_messages=True,
        dropout=0,
        readout='max',
    ),
    **d_graph_cfg,
)

# --- sentence models ---
d_RNN = p.Model(
    name='RNN',
    model=double_RNN(
        NN_depth=3,
        NN_hidden=512,
        RNN_hidden=512,
        activation='ReLU',
        dropout=0.3,
        features=300,
        interaction=None,
        readout='max',
    ),
    **d_sen_cfg,
)
d_RNN_att = p.Model(
    name='RNN with attention',
    model=double_RNN(
        NN_depth=1,
        NN_hidden=1024,
        RNN_hidden=512,
        activation='PReLU',
        dropout=0.1,
        features=300,
        interaction='exp',
        readout='max',
    ),
    **d_sen_cfg,
)

# Groupings used by the training cells. Unlike the single-input registry, no
# combined list of all dual-input models is defined.
d_graph_models = [d_DMPNN, d_DMPNN_att, d_MPNN, d_MPNN_att]
d_sen_models = [d_RNN, d_RNN_att]

---
### Gsolv

In [None]:
# Load the solvation free-energy table used to pretrain the dual-input models.
data = pd.read_csv('pretrain_data/comp_solv.csv')
# Read each role from the column that carries its name. These two lookups were
# previously crossed (solute <- 'mol solvent', solvent <- 'mol solute'), which
# fed the models with the two molecular roles exchanged.
# NOTE(review): assumes the CSV column headers are labelled correctly - confirm
# against the file.
solute = data['mol solute'].tolist()
solvent = data['mol solvent'].tolist()
Gsolv = data['target Gsolv kcal'].tolist()
exp_name = "Gsolv"

In [None]:
# Build the graph dataset for the Gsolv table and fit every dual-input graph
# model on it.
dataset = p.data_maker_decon(solute, Gsolv, 'graphs', solvent=solvent)
for model_spec in d_graph_models:
    print(f'testing {model_spec.name} ...')
    p.fit_no_test(model_spec, exp_name, dataset)

In [None]:
# Build the sentence dataset for the Gsolv table and fit every dual-input
# sentence model on it.
dataset = p.data_maker_decon(solute, Gsolv, 'sentences', solvent=solvent)
for model_spec in d_sen_models:
    print(f'testing {model_spec.name} ...')
    p.fit_no_test(model_spec, exp_name, dataset)