In [2]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split


ModuleNotFoundError: No module named 'pretraining'

In [None]:
import sys

# Make the parent directory importable so the local `modules` package resolves.
# Guarded so that re-running this cell does not keep appending duplicates.
if '../' not in sys.path:
    sys.path.append('../')

from modules.RNN import double_RNN
from modules.MPNN import double_MPNN
import modules.pretraining as p

# Later cells call c.MPNN / c.double_MPNN and d.snet / d.dnet, but no cell in
# the notebook binds `c` or `d`, so a fresh kernel stops with a NameError.
# NOTE(review): the aliases below are a best guess based on the imports above
# (double_MPNN lives in modules.MPNN) -- confirm that modules.MPNN also defines
# MPNN and that modules.RNN defines snet and dnet.
import modules.MPNN as c
import modules.RNN as d

---
## Data preparation

In [25]:
# Single-input data: water pKa values.
# Reads the solute SMILES and averaged pKa columns and hands them to
# p.data_maker as plain Python lists.
# NOTE(review): ids is hard-coded to 0..199 -- confirm what data_maker uses it
# for and whether it should depend on the dataset.
data = pd.read_csv('pretrain_data/water_pka.csv')
solute, pka = (data[column].tolist() for column in ('Solute SMILES', 'pKa (avg)'))
water_pka_datasets = p.data_maker(solute, pka, ids=list(range(200)))

In [43]:
# Double-input data: solvation free energies for (solute, solvent) pairs.
# The previous version read `solute` from the 'mol solvent' column and
# `solvent` from the 'mol solute' column, i.e. the two inputs were swapped
# relative to the CSV headers. Each variable now reads the column that matches
# its name.
# NOTE(review): if the swap was deliberate (e.g. the CSV headers themselves are
# mislabelled), revert this change and document why.
# NOTE(review): ids is hard-coded to 0..199 -- confirm what data_maker uses it
# for and whether it should depend on the dataset.
data = pd.read_csv('pretrain_data/comp_solv.csv')
solute = data['mol solute'].tolist()
solvent = data['mol solvent'].tolist()
Gsolv = data['target Gsolv kcal'].tolist()
Gsolv_datasets = p.data_maker(solute, Gsolv, solvent=solvent, ids=list(range(200)))

In [56]:
# Single-input data: QM9, one p.data_maker result per target property.
# All properties share the same SMILES list; only the target column changes.
data = pd.read_csv('pretrain_data/qm9.csv')
smiles = data['smiles'].tolist()
properties = ['mu','alpha','homo','lumo','gap','r2','zpve','u0','u298','h298','g298','cv']

# A plain dict replaces the former factory-less defaultdict: without a default
# factory it never supplied defaults, so the defaultdict type was misleading.
# NOTE(review): ids is hard-coded to 0..199 -- confirm what data_maker uses it
# for and whether it should depend on the dataset.
qm9_data = {}
for prop in properties:
    qm9_data[prop] = p.data_maker(smiles, data[prop].tolist(), ids=list(range(200)))

---
## Training + testing

### Single input

In [60]:
# Single-input model wrappers, constructed in the same order as before
# (DMPNN, MPNN, RNN) and collected in the list used by the experiment loops.
# The two message-passing wrappers differ only in the atom_messages flag.
s_models = [
    p.Model(name='DMPNN',
            model=c.MPNN(atom_messages=False),
            model_type='torch',
            data_type='SMILES',
            inputs=1),
    p.Model(name='MPNN',
            model=c.MPNN(atom_messages=True),
            model_type='torch',
            data_type='SMILES',
            inputs=1),
    p.Model(name='RNN',
            model=d.snet(),
            model_type='torch',
            data_type='sentences',
            inputs=1),
]

# Keep each wrapper reachable under its original top-level name.
DMPNN, MPNN, RNN = s_models

In [40]:
# Fit every single-input model on the water pKa datasets.
# NOTE(review): the objects in s_models are passed to fit_no_test as-is; whether
# it trains them in place or works on a copy is not visible from this notebook.
exp_name = "Water pka"
datasets = water_pka_datasets
for model in s_models:
    print(f'testing {model.name} ...')
    p.fit_no_test(model, exp_name, datasets)

testing RNN ...
testing DMPNN ...
testing MPNN ...
testing RNN ...


In [61]:
# QM9: fit every single-input model on each target property in turn.
# The experiment name is 'QM9_' followed by the property name.
# NOTE(review): the same objects in s_models are reused for every property;
# whether fit_no_test re-initialises or copies them is not visible here.
for prop in properties:
    exp_name = f'QM9_{prop}'
    datasets = qm9_data[prop]
    print(f'TESTING {prop}')
    for model in s_models:
        print(f'testing {model.name} ...')
        p.fit_no_test(model, exp_name, datasets)

TESTING mu
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING alpha
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING homo
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING lumo
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING gap
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING r2
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING zpve
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING u0
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING u298
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING h298
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING g298
testing DMPNN ...
testing MPNN ...
testing RNN ...
TESTING cv
testing DMPNN ...
testing MPNN ...
testing RNN ...


### Dual input

In [47]:
# Dual-input model wrappers, constructed in the same order as before and
# collected in the list used by the experiment loop. Each architecture appears
# twice: once with default settings and once with interaction='exp' (the
# variants named "... with attention").
# Note: DMPNN, MPNN and RNN are rebound here, replacing the single-input
# wrappers of the same names at top level.
d_models = [
    p.Model(name='D-MPNN',
            model=c.double_MPNN(atom_messages=False),
            model_type='torch',
            data_type='SMILES',
            inputs=2),
    p.Model(name='D-MPNN with attention',
            model=c.double_MPNN(atom_messages=False, interaction='exp'),
            model_type='torch',
            data_type='SMILES',
            inputs=2),
    p.Model(name='MPNN',
            model=c.double_MPNN(atom_messages=True),
            model_type='torch',
            data_type='SMILES',
            inputs=2),
    p.Model(name='MPNN with attention',
            model=c.double_MPNN(atom_messages=True, interaction='exp'),
            model_type='torch',
            data_type='SMILES',
            inputs=2),
    p.Model(name='RNN',
            model=d.dnet(interaction=None),
            model_type='torch',
            data_type='sentences',
            inputs=2),
    p.Model(name='RNN with attention',
            model=d.dnet(interaction='exp'),
            model_type='torch',
            data_type='sentences',
            inputs=2),
]

# Keep each wrapper reachable under its original top-level name.
DMPNN, DMPNN_att, MPNN, MPNN_att, RNN, RNN_att = d_models

In [48]:
# Fit every dual-input model on the solvation free energy datasets.
# NOTE(review): the objects in d_models are passed to fit_no_test as-is; whether
# it trains them in place or works on a copy is not visible from this notebook.
exp_name = "Gsolv"
datasets = Gsolv_datasets
for model in d_models:
    print(f'testing {model.name} ...')
    p.fit_no_test(model, exp_name, datasets)

testing D-MPNN ...
testing D-MPNN with attention ...
testing MPNN ...
testing MPNN with attention ...
testing RNN ...
testing RNN with attention ...
