In [1]:
%matplotlib inline

from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.Chem.Descriptors import MolWt
from sklearn.ensemble import RandomForestRegressor
import sklearn
from rdkit.Chem.rdmolops import GetFormalCharge
import torch
import deepchem as dc
import copy
from sklearn.model_selection import train_test_split
from hyperopt import hp
import imp

# Show floats with three fixed decimals in pandas output rather than scientific notation.
pd.options.display.float_format = lambda value: '%.3f' % value



In [2]:
from modules.data import data_maker, Dataset
from modules.RNN import double_RNN
from modules.fit import Model, fit
from modules.myhyperopt import hyperopt_func
from modules.MPNN import double_MPNN
from modules.MP_utils import mol2graph



In [29]:
import modules

In [3]:
# Read the pKa table and extract the three columns used below as plain Python lists.
data = pd.read_csv('data/full_pka_data.csv')
solute, solvent, pka = (
    data[column].tolist()
    for column in ('Solute SMILES', 'Solvent SMILES', 'pKa (avg)')
)
data_size = len(solute)

In [4]:
indices = list(range(data_size))
# Hold out 20% of the sample indices, stratified on the solvent of each sample,
# with a fixed seed so the partition is reproducible.
# Only `indices` is split: the partition is determined by `stratify`, the seed
# and the sample count, so also passing `solvent` as a second array (and
# throwing its halves away into `_`) added nothing and rebound `_`, which
# IPython otherwise uses for the last cell output.
CV_ids, holdout_ids = train_test_split(
    indices, test_size=0.2, random_state=1, stratify=solvent)
# Datasets limited to the cross-validation indices, and datasets built without
# an index argument (presumably covering every sample — confirm in modules.data).
CV_datasets = data_maker(solute, solvent, pka, CV_ids)
datasets = data_maker(solute, solvent, pka)

In [5]:
# Wrap a double_MPNN network (atom_messages=True) in a Model that is tagged as a
# torch model consuming SMILES data.
MPNN = Model(
    name='MPNN',
    model=double_MPNN(atom_messages=True),
    model_type='torch',
    data_type='SMILES',
)

In [6]:
def collate_double(batch):
    '''
    Collates double input batches for a torch loader.

    Parameters
    ----------
    batch : List = [(X, y)]
        Samples drawn from the dataset. Each ``X`` is a (solute, solvent) pair
        and ``y`` is its target value; ``y`` must provide ``.item()``.

    Returns
    -------
    [sol_batch, solv_batch, targets] : List
        If the solute of the first sample is a string, ``sol_batch`` and
        ``solv_batch`` are the results of ``mol2graph`` applied to the list of
        solutes and the list of solvents. Otherwise each solute / solvent is
        converted with ``torch.Tensor`` and the sequences are padded to a
        common length with ``pad_sequence`` (default arguments, so the
        sequence dimension comes first).
        ``targets`` is a 1-D ``torch.Tensor`` with one value per sample.
    '''
    solutes = [sample[0][0] for sample in batch]
    solvents = [sample[0][1] for sample in batch]

    # Only the first sample is inspected; a batch is assumed to be homogeneous.
    # isinstance (rather than an exact type comparison) also routes str
    # subclasses through the SMILES branch.
    if isinstance(solutes[0], str):
        sol_batch = mol2graph(solutes)
        solv_batch = mol2graph(solvents)
    else:
        # `nn` is never imported in this notebook, so the padding helper is
        # reached through the `torch` namespace to avoid a NameError.
        pad_sequence = torch.nn.utils.rnn.pad_sequence
        sol_batch = pad_sequence([torch.Tensor(seq) for seq in solutes])
        solv_batch = pad_sequence([torch.Tensor(seq) for seq in solvents])

    targets = torch.Tensor([sample[1].item() for sample in batch])

    return [sol_batch, solv_batch, targets]

def double_loader(data, indices, batch_size=64):
    '''
    Build a batched torch DataLoader for double (solute, solvent) inputs.

    Parameters
    ----------
    data : List = [(sol,solv),pka]
        (solute,solvent) pairs and their target values; ``data[0]`` and
        ``data[1]`` are handed to ``Dataset`` together with ``indices``.
    indices : list, np.array
        Indices of the samples to include.
    batch_size : int
        Number of samples per batch.

    Returns
    -------
    loader : torch.utils.data.DataLoader
        Unshuffled loader whose batches are assembled by ``collate_double``.
    '''
    pairs, values = data[0], data[1]
    return torch.utils.data.DataLoader(
        Dataset(indices, pairs, values),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_double,
    )

In [7]:
# Build a loader over every sample of the SMILES dataset, with the default batch size.
data = datasets['SMILES']
ids = [*range(len(data[0]))]
loader = double_loader(data, indices=ids)

In [8]:
# Call the model on every batch; the output is not kept.
# The batch targets are bound to `pka_batch` so the loop no longer overwrites
# the full `pka` list read from the CSV, which the data_maker calls depend on
# if their cell is re-run.
for (sol, solv, pka_batch) in loader:
    MPNN.model(sol, solv)