In [2]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
import imp

In [3]:
import sys
sys.path.append('../')
from modules.RNN import double_RNN, RNN
from modules.MPNN import double_MPNN, MPNN
import modules.pretraining as p

---
## Training + testing

### Single input models

In [19]:
s_DMPNN = p.Model(name='DMPNN',
                model=MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='ReLU', 
                                atom_messages=False, dropout=0, readout='sum'),
                lr=0.001,
                batch_size=64,
                data_type='graphs',
                inputs=1)
s_DMPNN_att = p.Model(name='DMPNN with attention',
                      model=MPNN(MP_depth=4, MP_hidden=128, NN_depth=4, NN_hidden=64, activation='ELU', 
                                        atom_messages=False, dropout=0, readout='mean'),
                      lr=0.001,
                      batch_size=64,
                      data_type='graphs',
                      inputs=1)
s_MPNN = p.Model(name='MPNN',
               model=MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='LeakyReLU', 
                          atom_messages=True, dropout=0, readout='sum'),
               lr=0.001,
               batch_size=64,
               data_type='graphs',
               inputs=1)
s_MPNN_att = p.Model(name='MPNN with attention',
                   model=MPNN(MP_depth=2, MP_hidden=64, NN_depth=4, NN_hidden=512, activation='ReLU', 
                                     atom_messages=True, dropout=0, readout='max'),   
                   lr=0.001,
                   batch_size=64,
                   data_type='graphs',
                   inputs=1)
s_RNN = p.Model(name='RNN',
              model=RNN(NN_depth=3, NN_hidden=512, RNN_hidden=512, activation='ReLU', dropout=0.3,
                        features=300, readout='max'),
              lr=0.001,
              batch_size=32,
              data_type='sentences',
              inputs=1)
s_RNN_att = p.Model(name='RNN with attention',
                    model=RNN(NN_depth=1, NN_hidden=1024, RNN_hidden=512, activation='PReLU', dropout=0.1,
                                     features=300, readout='max'),
                    lr=0.001,
                    batch_size=32,
                    data_type='sentences',
                    inputs=1)

#list of all models for testing
s_models = [s_DMPNN, s_DMPNN_att, s_MPNN, s_MPNN_att, s_RNN, s_RNN_att]
s_graph_models = [s_DMPNN, s_DMPNN_att, s_MPNN, s_MPNN_att]
s_sen_models = [s_RNN, s_RNN_att]

### Water pka

In [None]:
data = pd.read_csv('pretrain_data/water_pka.csv')
solute = data['Solute SMILES'].tolist()
pka = data['pKa (avg)'].tolist()
datasets = p.data_maker(solute, pka)

exp_name = "Water pka"
for m in s_models:
    data = datasets[m.data_type]
    print('testing '+m.name+' ...')
    p.fit_no_test(m, exp_name, data)

testing DMPNN with attention ...
testing MPNN ...
testing MPNN with attention ...
testing RNN ...


### QM9

In [None]:
#data
data = pd.read_csv('pretrain_data/qm9.csv')
smiles = data['smiles'].tolist()
#properties = ['mu','alpha','homo','lumo','gap','r2','zpve','u0','u298','h298','g298','cv']
properties = ['mu','r2','g298','cv'] #'alpha','homo','lumo','gap','zpve'

In [5]:
#graph models
for prop in properties:
    exp_name = 'QM9_'+prop
    print('---loading '+prop)
    prop_list = data[prop].tolist()
    dataset = p.data_maker_decon(smiles, prop_list, 'graphs')
    prop_list = None
    for m in s_graph_models:
        print('testing '+m.name+' ...')
        p.fit_no_test(m, exp_name, dataset)

---loading mu
testing D-MPNN ...
testing MPNN ...
---loading alpha
testing D-MPNN ...
testing MPNN ...
---loading homo
testing D-MPNN ...
testing MPNN ...


KeyboardInterrupt: 

In [None]:
#sentence models
for prop in properties:
    exp_name = 'QM9_'+prop
    print('---loading '+prop)
    prop_list = data[prop].tolist()
    dataset = p.data_maker_decon(smiles, prop_list, 'sentences')
    prop_list = None
    for m in s_sen_models:
        print('testing '+m.name+' ...')
        p.fit_no_test(m, exp_name, dataset)

---
### Dual input models

In [4]:
d_DMPNN = p.Model(name='DMPNN',
                model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='ReLU', 
                                  interaction=None, atom_messages=False, dropout=0, readout='sum'),
                lr=0.001,
                batch_size=64,
                data_type='graphs',
                inputs=2)
d_DMPNN_att = p.Model(name='DMPNN with attention',
                      model=double_MPNN(MP_depth=4, MP_hidden=128, NN_depth=4, NN_hidden=64, activation='ELU', 
                                        atom_messages=False, dropout=0, interaction='tanh', readout='mean'),
                      lr=0.001,
                      batch_size=64,
                      data_type='graphs',
                      inputs=2)
d_MPNN = p.Model(name='MPNN',
                 model=double_MPNN(MP_depth=3, MP_hidden=256, NN_depth=2, NN_hidden=512, activation='LeakyReLU', 
                                   atom_messages=True, dropout=0, interaction=None, readout='sum'),
                 lr=0.001,
                 batch_size=64,
                 data_type='graphs',
                 inputs=2)
d_MPNN_att = p.Model(name='MPNN with attention',
                     model=double_MPNN(MP_depth=2, MP_hidden=64, NN_depth=4, NN_hidden=512, activation='ReLU', 
                                       atom_messages=True, dropout=0, interaction='tanh', readout='max'),                 
                     lr=0.001,
                     batch_size=64,
                     data_type='graphs',
                     inputs=2)
d_RNN = p.Model(name='RNN',
                model=double_RNN(NN_depth=3, NN_hidden=512, RNN_hidden=512, activation='ReLU', dropout=0.3,
                                 features=300, interaction=None, readout='max'),
                lr=0.001,
                batch_size=32,
                data_type='sentences',
                inputs=2)
d_RNN_att = p.Model(name='RNN with attention',
                    model=double_RNN(NN_depth=1, NN_hidden=1024, RNN_hidden=512, activation='PReLU', dropout=0.1,
                                     features=300, interaction='exp', readout='max'),
                    lr=0.001,
                    batch_size=32,
                    data_type='sentences',
                    inputs=2)

#list of all models for testing
#d_models = [d_DMPNN, d_DMPNN_att, d_MPNN, d_MPNN_att, d_RNN, d_RNN_att]
d_graph_models = [d_DMPNN, d_DMPNN_att, d_MPNN, d_MPNN_att]
d_sen_models = [d_RNN, d_RNN_att]

---
### Gsolv

In [118]:
# The very last Gsolv value is nan, which breaks the validation loss or training loss at a certain point whenever it arises. To avoid this, I have dropped it here.
data = pd.read_csv('pretrain_data/comp_solv.csv')
data.drop([342158],inplace=True) # This is the last index.

solute = data['mol solvent'].tolist()
solvent = data['mol solute'].tolist()
Gsolv = data['target Gsolv kcal'].tolist()
exp_name = "Gsolv"

In [5]:
dataset = p.data_maker_decon(solute, Gsolv, 'graphs', solvent=solvent)

In [106]:
import importlib 
importlib.reload(p)

<module 'modules.pretraining' from '../modules/pretraining.py'>

In [17]:
for m in d_graph_models:
    print('testing '+m.name+' ...')
    p.fit_no_test(m, exp_name, dataset)

testing DMPNN ...
tensor([[[ 0.1655,  0.0495, -0.1016,  ..., -0.2221, -0.1312,  0.1659],
         [-0.1367,  0.0673, -0.0521,  ..., -0.1420, -0.4025, -0.2129],
         [ 0.0373, -0.1383, -0.3160,  ..., -0.2490, -0.2950, -0.0161],
         ...,
         [-0.1367,  0.0673, -0.0521,  ..., -0.1420, -0.4025, -0.2129],
         [-0.1367,  0.0673, -0.0521,  ..., -0.1420, -0.4025, -0.2129],
         [ 0.1612,  0.1427, -0.0699,  ..., -0.3203, -0.2415,  0.0655]],

        [[ 0.0723,  0.2225,  0.0227,  ..., -0.1677, -0.2336,  0.1446],
         [ 0.0461,  0.1143, -0.0683,  ..., -0.1665, -0.1754, -0.5153],
         [-0.1259, -0.0966, -0.4584,  ..., -0.0063, -0.4361,  0.1118],
         ...,
         [-0.0845, -0.0193, -0.2386,  ..., -0.2852, -0.3526,  0.0738],
         [ 0.2763, -0.0210, -0.0667,  ...,  0.0091, -0.1528, -0.4056],
         [ 0.3596, -0.1918,  0.2411,  ..., -0.2958, -0.1595, -0.1761]],

        [[ 0.0514,  0.0482, -0.1205,  ...,  0.0075, -0.2370,  0.1635],
         [-0.0090,  0.0072,

AttributeError: 'Tensor' object has no attribute 'get_components'

In [120]:
dataset = p.data_maker_decon(solute, Gsolv, 'sentences', solvent=solvent)

In [121]:
import importlib 
importlib.reload(p)

<module 'modules.pretraining' from '../modules/pretraining.py'>

In [122]:
d_RNN = p.Model(name='RNN',
                model=double_RNN(NN_depth=3, NN_hidden=512, RNN_hidden=512, activation='ReLU', dropout=0.3,
                                 features=300, interaction=None, readout='max'),
                lr=0.0001,
                batch_size=512,
                data_type='sentences',
                inputs=2)
d_RNN_att = p.Model(name='RNN with attention',
                    model=double_RNN(NN_depth=1, NN_hidden=1024, RNN_hidden=512, activation='PReLU', dropout=0.1,
                                     features=300, interaction='exp', readout='max'),
                    lr=0.0001,
                    batch_size=512,
                    data_type='sentences',
                    inputs=2)

In [82]:
len(dataset[0][0])

2

In [110]:
# This will tell you what the index of the nan value is, if there is one.

index = data['target Gsolv kcal'].index[data['target Gsolv kcal'].apply(np.isnan)]
index

Int64Index([342158], dtype='int64')

In [95]:
# This cell is to check that none of the converted mol2vec sentence tensors do not contain nan values. If they do, then there would be an issue with that row. 
# Fortunately, none of them do have issues.

import tqdm 

problems = []

for i,lst in tqdm.tqdm(enumerate(dataset[0])):
    for j,tensor in enumerate(lst):
        if torch.any(torch.isnan(tensor)):
            problems.append((i,j))

342159it [00:06, 56232.69it/s]


[]

In [123]:
import warnings
warnings.filterwarnings("ignore")

for m in d_sen_models:
    print('testing '+m.name+' ...')
    p.fit_no_test(m, exp_name, dataset)

testing RNN ...
250
0.1031670793890953
500
0.17221900820732117
750
0.11610537767410278
1000
0.17618653178215027
1250
0.13632653653621674
1500
0.1271635740995407
1750
0.2680697739124298
2000
0.04769159108400345
2250
0.15518315136432648
2500
0.10135287046432495
2750
0.050395697355270386
3000
0.04251009225845337
3250
0.15185430645942688
3500
0.09637784957885742
3750
0.05741807073354721
4000
0.17533129453659058
4250
0.056278254836797714
4500
0.06993065774440765
4750
0.05617912858724594
5000
0.04596874117851257
5250
0.08783460408449173
5500
0.05978197976946831
5750
0.14979825913906097
6000
0.13611002266407013
6250
0.07647736370563507
6500
0.0676911473274231
6750
0.02257583662867546
7000
0.07329926639795303
7250
0.020645489916205406


KeyboardInterrupt: 