## C-SGEN

A PyTorch implementation of "Molecule Property Prediction based on Spatial Graph Embedding"

In [1]:
!git clone https://github.com/wxfsd/C-SGEN.git
%cd /content/C-SGEN

/content/C-SGEN


In [None]:
# Install dependencies / requirements
!pip install theano==1.0.3 numpy==1.16.4 scipy==1.3.0
!pip install sklearn==0.0 deepchem torch==1.4.0 torchvision==0.5.0 torchtext==0.5.0

!pip install torch-geometric \
  torch-sparse==latest+cu101 \
  torch-scatter==latest+cu101 \
  torch-cluster==latest+cu101 \
  torch-spline-conv==latest+cu101 \
  -f https://pytorch-geometric.com/whl/torch-1.4.0.html

# Install RDKit 
!pip install rdkit-pypi==2021.3.1.5

### Example: C-SGEN_train

Training C-SGEN model on the default dataset, the data is ready to be saved in a folder.

In [2]:
import numpy as np
import torch
import torch.nn as nn
import timeit
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader

import os
import pickle

In [3]:
def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f)

def rms_score(y_true, y_pred):
    """Computes RMS error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def load_tensor(file_name, dtype):
    return [dtype(d).to(device) for d in np.load(file_name + '.npy')]

In [4]:
class mydataset(torch.utils.data.Dataset):

    def __init__(self,dataset):

        self.Features = load_tensor('./FreeSolv/decrease/' + dataset +'/Features', torch.FloatTensor)
        self.Normed_adj = load_tensor('./FreeSolv/decrease/' + dataset +'/Normed_adj', torch.FloatTensor)
        self.Fringer = load_tensor('./FreeSolv/decrease/' + dataset +'/fingerprint_stand', torch.FloatTensor)
        self.interactions = load_tensor('./FreeSolv/decrease/' + dataset +'/Interactions', torch.FloatTensor)

        self.dataset = list(zip(np.array(self.Features), np.array(self.Normed_adj), np.array(self.Fringer),np.array(self.interactions)))


    def __getitem__(self, item):
        data_batch = self.dataset[item]

        return data_batch

    def __len__(self):
        return len(self.interactions)

In [5]:
class C_SGEN(nn.Module):
    
    def __init__(self):
        super(C_SGEN, self).__init__()
        self.layer1 = nn.Linear(75, 4 * ch_num)
        self.dropout_feature = nn.Dropout(p=0.5)
        self.dropout_adj = nn.Dropout(p=0.5)

        self.conv1d_1 = nn.Conv1d(in_channels=4 * ch_num, out_channels=5 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_2 = nn.Conv1d(in_channels=5 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)
        self.bn = nn.BatchNorm1d(ch_num)

        self.conv1d_3 = nn.Conv1d(in_channels=5 * ch_num, out_channels=6 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_4 = nn.Conv1d(in_channels=6 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)

        self.conv1d_5 = nn.Conv1d(in_channels=6 * ch_num, out_channels=7 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_6 = nn.Conv1d(in_channels=7 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)

        self.conv1d_7 = nn.Conv1d(in_channels=7 * ch_num, out_channels=8 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_8 = nn.Conv1d(in_channels=8 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)

        self.conv1d_9 = nn.Conv1d(in_channels=8 * ch_num, out_channels=9 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_10 = nn.Conv1d(in_channels=9 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)

        self.conv1d_11 = nn.Conv1d(in_channels=9 * ch_num, out_channels=10 * ch_num // 2, kernel_size=(k + 1) // 2 + 1)
        self.conv1d_12 = nn.Conv1d(in_channels=10 * ch_num // 2, out_channels=ch_num, kernel_size=k // 2 + 1)

        self.layer2 = nn.Linear(6 * ch_num, ch_num)

        self.predict_property = nn.Linear(ch_num+363, 1)
        self.conv1d = nn.Conv1d(batch * 16, k, 3, stride=1, padding=1)

        self.W_cnn = nn.ModuleList([nn.Conv2d(
            in_channels=1, out_channels=1, kernel_size=2 * window + 1,
            stride=1, padding=window) for _ in range(3)])

        self.cnn_line = nn.Linear(363, 363)

        self.dnn1 = nn.Linear(363, 512)
        self.dnn2 = nn.Linear(512, 1024)
        self.dnn3 = nn.Linear(1024, 363)

    def pad(self, matrices, value):
        """Pad adjacency matrices for batch processing."""
        sizes = [d.shape[0] for d in matrices]
        D = sum(sizes)
        pad_matrices = value + np.zeros((D, D))
        m = 0
        for i, d in enumerate(matrices):
            s_i = sizes[i]
            pad_matrices[m:m + s_i, m:m + s_i] = d.cpu()
            m += s_i
        return torch.FloatTensor(pad_matrices).to(device)

    def sum_axis(self, xs, axis):
        y = list(map(lambda x: torch.sum(x, 0), torch.split(xs, axis)))
        return torch.stack(y)

    def simple_conv1(self, Normed_adj, Features):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.dropout_feature(Features)
        outs = self.layer1(outs)
        outs = torch.matmul(adj_m, outs)
        return outs

    def DNN(self, x_words):

        x_words = F.relu(self.dnn1(x_words))
        x_words = F.relu(self.dnn2(x_words))
        x_words = self.dnn3(x_words)

        return x_words

    def cnn_process(self, x, layer):
        """Controlled experiments, CNN Processing Molecular Fingerprints."""
        for i in range(layer):
            hs = self.cnn(x, i)
            x = torch.relu(self.cnn_line(hs))
        return x

    def cnn(self, xs, i):
        xs = torch.unsqueeze(torch.unsqueeze(xs, 0), 0)
        hs = torch.relu(self.W_cnn[i](xs))
        return torch.squeeze(torch.squeeze(hs, 0), 0)

    def conv1d_spatial_graph_matrix(self, adj_m, fea_m):
        """
        After the 1-d convolution processes spatial graph matrix,
        it is concatenated with the initial atomic features.
        """
        adj_m_graph1 = torch.unsqueeze(adj_m, 1)
        fea_m_graph1 = torch.unsqueeze(fea_m, -1)
        feas = torch.mul(adj_m_graph1, fea_m_graph1)
        feas = feas.permute(2, 1, 0)
        features = feas.permute(0, 2, 1)

        spatial_feature = self.conv1d(features)
        spatial_feature = spatial_feature.permute(0, 2, 1)
        outs = torch.cat([fea_m_graph1, spatial_feature], 2)
        outs = outs.permute(0, 2, 1)
        return outs

    def C_SGEL1(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_1 = self.conv1d_1(outs)
        outs_conv1d_1_relu = F.relu(outs_conv1d_1)
        outs_conv1d_1_relu_drought = self.dropout_feature(outs_conv1d_1_relu)
        outs_conv1d_2 = self.conv1d_2(outs_conv1d_1_relu_drought)
        outs_conv1d_2_relu = F.relu(outs_conv1d_2)
        outs_conv1d_2_permute = outs_conv1d_2_relu.permute(0, 2, 1)
        outs_conv1d_2_permute_unbind = torch.unbind(outs_conv1d_2_permute, dim=1)
        outs_conv1d_2_batch_norm = self.bn(outs_conv1d_2_permute_unbind[0])
        return outs_conv1d_2_batch_norm

    def C_SGEL2(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_3 = self.conv1d_3(outs)
        outs_conv1d_3_relu = F.relu(outs_conv1d_3)
        outs_conv1d_3_relu_drought = self.dropout_feature(outs_conv1d_3_relu)

        outs_conv1d_4 = self.conv1d_4(outs_conv1d_3_relu_drought)
        outs_conv1d_4 = F.relu(outs_conv1d_4)
        outs_conv1d_4_permute = outs_conv1d_4.permute(0, 2, 1)
        outs_conv1d_4_permute_unbind = torch.unbind(outs_conv1d_4_permute, dim=1)
        outs_conv1d_4_batch_norm = self.bn(outs_conv1d_4_permute_unbind[0])
        return outs_conv1d_4_batch_norm

    def C_SGEL3(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_5 = self.conv1d_5(outs)
        outs_conv1d_5_relu = F.relu(outs_conv1d_5)
        outs_conv1d_5_relu_drought = self.dropout_feature(outs_conv1d_5_relu)
        outs_conv1d_6 = self.conv1d_6(outs_conv1d_5_relu_drought)
        outs_conv1d_6 = F.relu(outs_conv1d_6)
        outs_conv1d_6_permute = outs_conv1d_6.permute(0, 2, 1)
        outs_conv1d_6_permute_unbind = torch.unbind(outs_conv1d_6_permute, dim=1)
        outs_conv1d_6_batch_norm = self.bn(outs_conv1d_6_permute_unbind[0])
        return outs_conv1d_6_batch_norm

    def C_SGEL4(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_7 = self.conv1d_7(outs)
        outs_conv1d_7_relu = F.relu(outs_conv1d_7)
        outs_conv1d_7_relu_drought = self.dropout_feature(outs_conv1d_7_relu)
        outs_conv1d_8 = self.conv1d_8(outs_conv1d_7_relu_drought)
        outs_conv1d_8 = F.relu(outs_conv1d_8)
        outs_conv1d_8_permute = outs_conv1d_8.permute(0, 2, 1)
        outs_conv1d_8_permute_unbind = torch.unbind(outs_conv1d_8_permute, dim=1)
        outs_conv1d_8_batch_norm = self.bn(outs_conv1d_8_permute_unbind[0])
        return outs_conv1d_8_batch_norm

    def C_SGEL5(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_9 = self.conv1d_9(outs)
        outs_conv1d_9_relu = F.relu(outs_conv1d_9)
        outs_conv1d_9_relu_drought = self.dropout_feature(outs_conv1d_9_relu)
        outs_conv1d_10 = self.conv1d_10(outs_conv1d_9_relu_drought)
        outs_conv1d_10 = F.relu(outs_conv1d_10)
        outs_conv1d_10_permute = outs_conv1d_10.permute(0, 2, 1)
        outs_conv1d_10_permute_unbind = torch.unbind(outs_conv1d_10_permute, dim=1)
        outs_conv1d_10_batch_norm = self.bn(outs_conv1d_10_permute_unbind[0])
        return outs_conv1d_10_batch_norm

    def C_SGEL6(self, Normed_adj, input_feature):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.conv1d_spatial_graph_matrix(adj_m, input_feature)
        outs = self.dropout_feature(outs)
        outs = outs.permute(0, 2, 1)
        outs_conv1d_11 = self.conv1d_11(outs)
        outs_conv1d_11_relu = F.relu(outs_conv1d_11)
        outs_conv1d_11_relu_drought = self.dropout_feature(outs_conv1d_11_relu)

        outs_conv1d_12 = self.conv1d_12(outs_conv1d_11_relu_drought)

        outs_conv1d_12 = F.relu(outs_conv1d_12)
        outs_conv1d_12_permute = outs_conv1d_12.permute(0, 2, 1)
        outs_conv1d_12_permute_unbind = torch.unbind(outs_conv1d_12_permute, dim=1)
        outs_conv1d_12_batch_norm = self.bn(outs_conv1d_12_permute_unbind[0])
        return outs_conv1d_12_batch_norm

    def simple_conv2(self, Normed_adj, Features):
        adj_m = self.dropout_adj(Normed_adj)
        outs = self.dropout_feature(Features)
        outs = self.layer2(outs)
        outs = torch.matmul(adj_m, outs)
        return outs

    def forward(self, inputs):

        Features, Normed_adj, Fringer = list(inputs[0]), list(inputs[1]), list(inputs[2])

        axis = list(map(lambda x: len(x), Features))

        Features = torch.cat(Features)
        Normed_adj = self.pad(Normed_adj, 0)

        Fringer = list(Fringer)
        for i in range(len(Fringer)):
            Fringer[i] = torch.unsqueeze(Fringer[i], 0)
        Fringer = torch.cat(Fringer, 0)

        # Graph embedding layer
        outs1 = self.simple_conv1(Normed_adj, Features)

        # Layer 1 Convolution Spatial Graph Embedding layer
        cur_outs1 = self.C_SGEL1(Normed_adj, outs1)
        # Skip connection
        outs2 = torch.cat((outs1, cur_outs1), 1)

        # Layer 2 Convolution Spatial Graph Embedding layer
        cur_outs2 = self.C_SGEL2(Normed_adj, outs2)
        # Skip connection
        outs3 = torch.cat((outs2, cur_outs2), 1)

        # Graph-gather layer
        outs = self.simple_conv2(Normed_adj, outs3)
        y_molecules = self.sum_axis(outs, axis)

        # Deep neural network for molecular fingerprint
        Fringer = self.DNN(Fringer)

        # Concatenate molecule and fingerprint
        y_molecules = torch.cat((y_molecules, Fringer), 1)

        # Prediction of Molecular Properties by Fully Connected Layer
        z_molecules = self.predict_property(y_molecules)

        return z_molecules

    def __call__(self, data_batch, std, mean, train=True):

        inputs, t_interaction = data_batch[:-1], torch.squeeze(data_batch[-1])
        z_interaction = self.forward(inputs)

        if train:
            t_interaction = torch.unsqueeze(t_interaction, 1)
            loss = F.mse_loss(z_interaction, t_interaction)
            return loss

        else:
            t_interaction = torch.unsqueeze(t_interaction, 1)
            loss = F.mse_loss(z_interaction, t_interaction)
            z = z_interaction.to('cpu').data.numpy()
            t = t_interaction.to('cpu').data.numpy()
            z, t = std * z + mean, std * t + mean
            return loss, z, t

In [6]:
class Trainer(object):
    
    def __init__(self, model, std, mean):
        self.model = model
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=5e-4)
        self.std = std
        self.mean = mean

    def train(self, train_loader):
        loss_total = 0
        num = 0
        for data in train_loader:
            num += 1
            self.optimizer.zero_grad()
            loss = self.model(data, std=self.std, mean=self.mean, train=True)

            loss.backward()
            self.optimizer.step()
            loss_total += loss.to('cpu').data.numpy()
        loss_mean = loss_total/num
        return loss_mean

In [7]:
class T(object):
    
    def __init__(self, model, std, mean):
        self.model = model
        self.std = std
        self.mean = mean

    def test(self, test_loader):

        loss_total = 0
        all_p = []
        all_t = []

        num = 0
        for data in test_loader:
            num += 1
            loss, predicted, true = self.model(data, std=self.std, mean=self.mean, train=False)

            for i in predicted:
                all_p.append(float(i))
            for i in true:
                all_t.append(float(i))
            loss_total += loss.to('cpu').data.numpy()

        RMSE = rms_score(all_t, all_p)
        loss_mean = loss_total / num
        return loss_mean, RMSE, all_p, all_t

In [8]:
def metric(RMSE_k_test):
    RMSE_mean_test = np.mean(np.array(RMSE_k_test))
    RMSE_std_test = np.std(np.array(RMSE_k_test))

    return RMSE_mean_test, RMSE_std_test

In [9]:
device = torch.device('cuda')
std = 3.8448222046029543
mean = -3.8030062305295975
iteration = 33
window = 5
layer_cnn = 3
batch = 8
k = 4
ch_num = 4
decay_interval = 10
lr = 5e-4
lr_decay = 0.5
lr, lr_decay = map(float, [lr, lr_decay])

In [10]:
setting = 'FreeSolv--' \
          '--batch' + str(batch) + \
          '--k' + str(k) + \
          '--lr-' + str(lr) + \
          '--iteration-' + str(iteration)+\
            '--ch_num-' + str(4*ch_num)+\
            '--decay_interval-' + str(decay_interval)

print(setting)
print('batch:', batch)
print('k:', k)
print('ch_num:', str(4*ch_num))
print('decay_interval:', decay_interval)
print('lr:', lr)
print('lr_decay:', lr_decay)
print('iteration:', iteration)

FreeSolv----batch8--k4--lr-0.0005--iteration-33--ch_num-16--decay_interval-10
batch: 8
k: 4
ch_num: 16
decay_interval: 10
lr: 0.0005
lr_decay: 0.5
iteration: 33


In [11]:
import sys

print('Current python interpreter path：')
print(sys.executable)
print('Epoch Time(sec) Loss_train Loss_dev Loss_test RMSE_train RMSE_dev RMSE_test')

RMSE_k_valid = []
RMSE_k_test = []

seed_list = [256,512,1024]

train_dataset = mydataset('train_data')
valid_dataset = mydataset('valid_data')
test_dataset = mydataset('test_data')

train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True, drop_last=True)

Current python interpreter path：
/usr/bin/python3
Epoch Time(sec) Loss_train Loss_dev Loss_test RMSE_train RMSE_dev RMSE_test


In [12]:
for seed in seed_list:

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    model = C_SGEN().to(device)

    trainer = Trainer(model.train(), std, mean)
    tester = T(model.eval(), std, mean)

    Loss_train = []
    Loss_valid = []
    Loss_test = []

    for epoch in range(1, (iteration + 1)):

        if epoch  % decay_interval == 0:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay
        start = timeit.default_timer()

        train_loss = trainer.train(train_loader)
        valid_loss, RMSE_valid, predicted_valid, true_valid = tester.test(valid_loader)
        test_loss, RMSE_test, predicted_test, true_test = tester.test(test_loader)

        Loss_train.append(train_loss)
        Loss_valid.append(valid_loss)
        Loss_test.append(test_loss)

        end = timeit.default_timer()
        time = end - start

        print(
            'epoch:%d-train loss: %.3f,valid loss: %.3f,test loss: %.3f, valid rmse: %.3f, test rmse: %.3f, time: %.3f' %
            (epoch, train_loss, valid_loss, test_loss, RMSE_valid, RMSE_test, time))

        if epoch == iteration:

            RMSE_k_valid.append(RMSE_valid)
            RMSE_k_test.append(RMSE_test)

    print('RMSE_k_valid', RMSE_k_valid)
    print('RMSE_k_test', RMSE_k_test)

epoch:1-train loss: 0.398,valid loss: 0.252,test loss: 0.165, valid rmse: 1.929, test rmse: 1.563, time: 0.633
epoch:2-train loss: 0.175,valid loss: 0.181,test loss: 0.171, valid rmse: 1.635, test rmse: 1.591, time: 0.489
epoch:3-train loss: 0.120,valid loss: 0.105,test loss: 0.092, valid rmse: 1.248, test rmse: 1.166, time: 0.480
epoch:4-train loss: 0.052,valid loss: 0.145,test loss: 0.087, valid rmse: 1.464, test rmse: 1.132, time: 0.497
epoch:5-train loss: 0.062,valid loss: 0.108,test loss: 0.057, valid rmse: 1.261, test rmse: 0.919, time: 0.492
epoch:6-train loss: 0.044,valid loss: 0.128,test loss: 0.075, valid rmse: 1.374, test rmse: 1.056, time: 0.472
epoch:7-train loss: 0.026,valid loss: 0.113,test loss: 0.071, valid rmse: 1.295, test rmse: 1.022, time: 0.484
epoch:8-train loss: 0.036,valid loss: 0.116,test loss: 0.092, valid rmse: 1.309, test rmse: 1.167, time: 0.484
epoch:9-train loss: 0.078,valid loss: 0.152,test loss: 0.094, valid rmse: 1.498, test rmse: 1.177, time: 0.468
e

In [None]:
RMSE_mean_valid, RMSE_std_valid = metric(RMSE_k_valid)
RMSE_mean_test, RMSE_std_test = metric(RMSE_k_test)

print('result:, RMSE:%.3f, RMSE_std:%.3f' % (RMSE_mean_valid, RMSE_std_valid))
print('result:, RMSE:%.3f, RMSE_std:%.3f' % (RMSE_mean_test, RMSE_std_test))

### Example: pyg_train

Training a PyG model directly

In [5]:
import torch

from torch_geometric.data import Data, InMemoryDataset

import pickle
import numpy as np
import timeit
import os

from torch_geometric.data import DataLoader
from sklearn.metrics import mean_squared_error
from torch_geometric.transforms import AddSelfLoops
from utils import GAT, AGNN, SGC, ARMA

In [6]:
def rms_score(y_true, y_pred):
    """Computes RMS error."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def load_tensor(file_name, dtype):
    return [dtype(d).to(device) for d in np.load(file_name + '.npy')]

In [7]:
class TestDataset(InMemoryDataset):
    
    def __init__(self, data_list):
        super(TestDataset, self).__init__('/tmp/TestDataset')
        self.data, self.slices = self.collate(data_list)

    def _download(self):
        pass

    def _process(self):
        pass

In [8]:
def load_dataset(dataset):

    with open('./FreeSolv/decrease/' + dataset +'/full_feature','rb') as node_features:
        x_train = pickle.load(node_features)
    with open('./FreeSolv/decrease/' + dataset +'/edge','rb') as f:
        edge_index_train = pickle.load(f)
    y_train = load_tensor('./FreeSolv/decrease/' + dataset +'/Interactions', torch.FloatTensor)

    d = []
    for i in range(len(y_train)):
        data = Data(x=x_train[i], edge_index=edge_index_train[i], y=y_train[i])
        data = AddSelfLoops()(data)
        data.atom_num = x_train[i].shape[0]
        d.append(data)
    set = TestDataset(d)
    return set

In [9]:
# FreeSolv
std = 3.8448222046029543
mean = -3.8030062305295975

In [10]:
class Trainer(object):
    
    def __init__(self, model, std, mean):
        self.model = model
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
        self.std = std
        self.mean = mean

    def train(self, train_loader):

        loss_total = 0
        num = 0
        for data in train_loader:
            num += 1
            data = data.to(device)
            self.optimizer.zero_grad()
            loss = self.model(data, std=self.std, mean=self.mean, train=True)
            loss.backward()
            self.optimizer.step()
            loss_total += loss.to('cpu').data.numpy()

        loss_mean = loss_total / num
        return loss_mean

In [11]:
class tester(object):
    
    def __init__(self, model, std, mean):
        self.model = model
        self.std = std
        self.mean = mean

    def test(self, test_loader):

        loss_total = 0
        all_p = []
        all_t = []
        num = 0
        for data in test_loader:
            num += 1
            data = data.to(device)
            loss, predicted, true = self.model(data, std=self.std, mean=self.mean, train=False)

            for i in predicted:
                all_p.append(float(i))
            for i in true:
                all_t.append(float(i))
            loss_total += loss.to('cpu').data.numpy()

        RMSE = rms_score(all_t,all_p)
        loss_mean = loss_total / num
        return loss_mean, RMSE

In [12]:
def metric(RMSE_k_test):
    RMSE_mean_test = np.mean(np.array(RMSE_k_test))
    RMSE_std_test = np.std(np.array(RMSE_k_test))

    return RMSE_mean_test, RMSE_std_test

In [13]:
batch = 8
iteration = 50
lr = 0.01
device = torch.device('cuda')
decay_interval = 10
lr_decay = 0.5

print('decay_interval:', decay_interval)
print('lr:', lr)

decay_interval: 10
lr: 0.01


In [14]:
train_dataset = load_dataset('train_data')
valid_dataset = load_dataset('valid_data')
test_dataset = load_dataset('test_data')

train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch, shuffle=True)

In [15]:
seed_list = [256, 512, 1024]

RMSE_k_train = []
R2_k_train = []

RMSE_k_valid = []
R2_k_valid = []

RMSE_k_test = []
R2_k_test = []

In [16]:
for seed in seed_list:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    model = ARMA().to(device)
    trainer = Trainer(model.train(), std, mean)
    Tester = tester(model.eval(), std, mean)

    for epoch in range(1, (iteration + 1)):
        if epoch  % decay_interval == 0:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay

        start = timeit.default_timer()
        train_loss = trainer.train(train_loader)
        valid_loss, RMSE_valid = Tester.test(valid_loader)
        test_loss, RMSE_test = Tester.test(test_loader)
        end = timeit.default_timer()
        time = end - start

        print(
            'ARMA-epoch:%d,---train loss: %.3f,valid loss: %.3f,test loss: %.3f, valid rmse: %.3f, test rmse: %.3f, time: %.3f' %
            (epoch, train_loss, valid_loss, test_loss, RMSE_valid, RMSE_test, time))

        if epoch == iteration:
            RMSE_k_valid.append(RMSE_valid)
            RMSE_k_test.append(RMSE_test)

    print('RMSE_k_valid', RMSE_k_valid)
    print('RMSE_k_test', RMSE_k_test)

ARMA-epoch:1,---train loss: 3.035,valid loss: 0.748,test loss: 0.665, valid rmse: 3.325, test rmse: 3.112, time: 0.638
ARMA-epoch:2,---train loss: 0.674,valid loss: 0.581,test loss: 0.387, valid rmse: 2.931, test rmse: 2.516, time: 0.501
ARMA-epoch:3,---train loss: 0.424,valid loss: 0.389,test loss: 0.312, valid rmse: 2.398, test rmse: 1.936, time: 0.514
ARMA-epoch:4,---train loss: 0.363,valid loss: 0.399,test loss: 0.216, valid rmse: 2.428, test rmse: 1.881, time: 0.507
ARMA-epoch:5,---train loss: 0.267,valid loss: 0.239,test loss: 0.109, valid rmse: 1.878, test rmse: 1.302, time: 0.507
ARMA-epoch:6,---train loss: 0.198,valid loss: 0.290,test loss: 0.216, valid rmse: 2.069, test rmse: 1.608, time: 0.500
ARMA-epoch:7,---train loss: 0.228,valid loss: 0.260,test loss: 0.127, valid rmse: 1.960, test rmse: 1.368, time: 0.503
ARMA-epoch:8,---train loss: 0.242,valid loss: 0.427,test loss: 0.319, valid rmse: 2.512, test rmse: 2.164, time: 0.516
ARMA-epoch:9,---train loss: 0.193,valid loss: 0.

In [17]:
RMSE_mean_valid, RMSE_std_valid = metric(RMSE_k_valid)
RMSE_mean_test, RMSE_std_test = metric(RMSE_k_test)

print('result:, valid_RMSE:%.3f, valid_RMSE_std:%.3f' % (RMSE_mean_valid, RMSE_std_valid))
print('result:, test_RMSE:%.3f, test_RMSE_std:%.3f' % (RMSE_mean_test, RMSE_std_test))

result:, valid_RMSE:1.462, valid_RMSE_std:0.132
result:, test_RMSE:1.013, test_RMSE_std:0.058


### Example: load_FreeSolv

Load data from DeepChem

In [18]:
import numpy as np
import deepchem as dc
import os
import torch

from utils import preprocess_adj
from rdkit import Chem

import pickle

from numpy import flip

In [19]:
def create_adjacency(mol):
    adjacency = Chem.GetAdjacencyMatrix(mol)
    return np.array(adjacency,dtype=float)

def save_feature(dir, Features, Normed_adj, Interactions, smiles, edge, full_feature, dataset=None):
    dir_input = (dir + dataset + '/')
    os.makedirs(dir_input, exist_ok=True)
    np.save(dir_input + 'Features', Features)
    np.save(dir_input + 'Normed_adj', Normed_adj)
    np.save(dir_input + 'Interactions', Interactions)
    np.save(dir_input + 'smiles', smiles)

    with open(dir_input + 'edge', 'wb') as f:
        pickle.dump(edge, f)

    with open(dir_input + 'full_feature', 'wb') as a:
        pickle.dump(full_feature, a)

In [20]:
# Only for debug!
np.random.seed(123)

# Load Delaney dataset
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_sampl(featurizer='GraphConv', split='random')
train_dataset, valid_dataset, test_dataset = delaney_datasets

maxNumAtoms = 16

'split' is deprecated.  Use 'splitter' instead.


In [21]:
def fix_input(feature_array, iAdjTmp):
    "Fix number of input molecular atoms"

    iFeature = np.zeros((maxNumAtoms, 75))
    if len(feature_array) <= maxNumAtoms:
        iFeature[0:len(feature_array), 0:75] = feature_array
    else:
        iFeature = feature_array[0:maxNumAtoms]

    adjacency = np.zeros((maxNumAtoms, maxNumAtoms))

    if len(feature_array) <= maxNumAtoms:
        adjacency[0:len(feature_array), 0:len(feature_array)] = iAdjTmp
    else:
        adjacency = iAdjTmp[0:maxNumAtoms, 0:maxNumAtoms]

    return iFeature, adjacency

In [22]:
def get_feature(dataset):

    Features_decrease, adj_decrease, edge_decrease, full_feature_decrease = [], [], [], []
    Interactions, smiles = [], []

    for x, label, w, smile in dataset.itersamples():

        # The smile is used to extract molecular fingerprints
        smiles.append(smile)

        interaction = label
        Interactions.append(interaction)

        mol = Chem.MolFromSmiles(smile)

        if not mol:
            raise ValueError("Could not parse SMILES string:", smile)

        # increased order
        feature_increase = x.get_atom_features()
        iAdjTmp_increase = create_adjacency(mol)

        # decreased order
        # Turn the data upside down
        feature_decrease = flip(feature_increase, 0)
        iAdjTmp_decrease = flip(iAdjTmp_increase, 0)

        # Obtaining fixed-size molecular input data
        iFeature_decrease, adjacency_decrease = fix_input(feature_decrease, iAdjTmp_decrease)

        Features_decrease.append(np.array(iFeature_decrease))
        normed_adj_decrease = preprocess_adj(adjacency_decrease)
        adj_decrease.append(normed_adj_decrease)

        #Transforms data into PyTorch Geometrics specific data format.
        index = np.array(np.where(iAdjTmp_decrease == 1))
        edge_index = torch.from_numpy(index).long()
        edge_decrease.append(edge_index)

        feature = torch.from_numpy(feature_decrease.copy()).float()
        full_feature_decrease.append(feature)

    return  Features_decrease, adj_decrease, edge_decrease, full_feature_decrease, Interactions, smiles

In [23]:
Features_decrease1, adj_decrease1, edge_decrease1, full_feature_decrease1, Interactions1, smiles1 = get_feature(train_dataset)
Features_decrease2, adj_decrease2, edge_decrease2, full_feature_decrease2, Interactions2, smiles2 = get_feature(valid_dataset)
Features_decrease3, adj_decrease3, edge_decrease3, full_feature_decrease3, Interactions3, smiles3 = get_feature(test_dataset)

In [25]:
dir = './FreeSolv/decrease/'

In [24]:
save_feature(dir, Features_decrease1, adj_decrease1, Interactions1, smiles1, edge_decrease1, full_feature_decrease1, dataset='train_data')
save_feature(dir, Features_decrease2, adj_decrease1, Interactions2, smiles2, edge_decrease2, full_feature_decrease2, dataset='valid_data')
save_feature(dir, Features_decrease3, adj_decrease1, Interactions3, smiles3, edge_decrease3, full_feature_decrease3, dataset='test_data')