## 1. Dependencies

## Using APT

In [None]:
# Refresh the APT package index, list the packages matching "rdkit",
# then install the RDKit libraries, Python bindings, data and docs.
!sudo apt update
!apt search rdkit
!sudo apt install librdkit-dev librdkit1 python-rdkit rdkit-data rdkit-doc

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [40.7 kB]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:11 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:12 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
I

## Installing RDKit


In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger used by install(). StreamHandler() with no arguments
# writes to stderr, and INFO is the lowest level that is emitted.
logger = getLogger(__name__)
# getLogger returns the same object every time this cell is executed, so
# attach a handler only once; otherwise each re-run adds another handler
# and every message is printed multiple times.
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False,
        timeout=60):
    """Install RDKit through a freshly downloaded Miniconda.

    Usage::

        import rdkit_installer
        rdkit_installer.install()

    Args:
        chunk_size: Number of bytes requested per chunk while streaming the
            installer to disk.
        file_name: Installer script name. It is appended to ``url_base`` and
            is also the local file name (relative to the working directory).
        url_base: URL prefix the installer is downloaded from.
        conda_path: Directory Miniconda is installed into. Whatever already
            exists at this path (directory or file) is deleted first.
        rdkit_version: Exact RDKit version to request from conda; ``None``
            lets conda pick one.
        add_python_path: When true, append the Miniconda ``site-packages``
            directory to ``sys.path`` so ``import rdkit`` works in-process.
        force: Re-install even when an ``rdkit`` directory is already present
            in the Miniconda ``site-packages``.
        timeout: Timeout in seconds handed to ``requests.get``. It bounds the
            connection attempt and each wait for data, not the whole
            download.

    Raises:
        requests.exceptions.RequestException: The download failed, timed out
            or returned an HTTP error status.
        subprocess.CalledProcessError: The Miniconda installer or
            ``conda install`` exited with a non-zero status.
    """

    # site-packages of the Miniconda tree for the *running* interpreter's
    # major.minor version (format() ignores the extra version_info fields).
    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # Start from a clean slate: remove whatever occupies conda_path.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    # A timeout keeps a stalled server from hanging the notebook forever, and
    # the finally-block releases the streamed connection even on failure.
    res = requests.get(url, stream=True, timeout=timeout)
    try:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    finally:
        res.close()
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    # -b: batch (non-interactive) mode, -p: installation prefix.
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    # Pin conda's python to the running interpreter's exact version so the
    # compiled rdkit package can be imported from this process.
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    # Imported here because the package only exists after the steps above.
    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# Run the installer when this code is executed as the main module.
if __name__ == "__main__":
    install()

add /root/miniconda/lib/python3.6/site-packages to PYTHONPATH
python version: 3.6.9
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
installing miniconda to /root/miniconda
done
installing rdkit
done
rdkit-2020.09.1 installation finished!


## Original GNN Model


In [None]:
# Clone the molecularGNN_smiles repository into the current directory.
!git clone https://github.com/navanchauhan/molecularGNN_smiles

Cloning into 'molecularGNN_smiles'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 523 (delta 7), reused 33 (delta 2), pack-reused 482[K
Receiving objects: 100% (523/523), 9.01 MiB | 28.57 MiB/s, done.
Resolving deltas: 100% (242/242), done.


In [None]:
# Work from the repository's main/ directory; later cells use paths
# relative to it ('../dataset/...', '../output/...').
%cd molecularGNN_smiles/main
# Edit train.sh in place: replace "regression" with "classification" and
# "covid" with "drugs" (first occurrence on each line).
!sed -i 's/regression/classification/' train.sh
!sed -i 's/covid/drugs/' train.sh

/content/molecularGNN_smiles/main


In [None]:
# Make train.sh executable so it can be launched as ./train.sh.
!chmod +x train.sh

In [None]:
# Fetch any upstream changes made since the clone.
!git pull

Already up to date.


In [None]:
# Run the repository's training script with the Miniconda site-packages
# directory on PYTHONPATH.
# NOTE(review): the path hard-codes /root and python3.6; it has to match the
# directory reported by install() ("add ... to PYTHONPATH").
!PYTHONPATH="/root/miniconda/lib/python3.6/site-packages" ./train.sh

The code uses a GPU!
----------------------------------------------------------------------------------------------------
Preprocessing the drugs dataset.
Just a moment......
data_train.txt
[07:31:20] Explicit valence for atom # 5 C, 5, is greater than permitted
[07:31:20] Explicit valence for atom # 1 O, 3, is greater than permitted
[07:31:20] Explicit valence for atom # 1 O, 3, is greater than permitted
[07:31:20] Explicit valence for atom # 9 O, 3, is greater than permitted
[07:31:20] SMILES Parse Error: syntax error while parsing: c12c([nH]cn1)ncnc2SC#LD10
[07:31:20] SMILES Parse Error: Failed parsing SMILES 'c12c([nH]cn1)ncnc2SC#LD10' for input: 'c12c([nH]cn1)ncnc2SC#LD10'
[07:31:20] SMILES Parse Error: syntax error while parsing: [Re](Cl)(Cl)Cl#LDLo
[07:31:20] SMILES Parse Error: Failed parsing SMILES '[Re](Cl)(Cl)Cl#LDLo' for input: '[Re](Cl)(Cl)Cl#LDLo'
[07:31:20] Explicit valence for atom # 0 Sn, 5, is greater than permitted
[07:31:20] SMILES Parse Error: syntax error while pa

## GNN Model

In [None]:
import sys
import timeit

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score

class MolecularGraphNeuralNetwork(nn.Module):
    """Graph neural network over atom fingerprints with an MLP head.

    Fingerprint IDs are embedded, refined by ``layer_hidden`` update layers
    that mix each atom with its neighbours through the adjacency matrix,
    summed into one vector per molecule and passed through ``layer_output``
    dense layers followed by a final property layer.

    The width of the property layer is taken from the notebook-level
    ``task`` setting at construction time: two logits for
    ``'classification'``, a single value for ``'regression'``.
    """

    def __init__(self, N_fingerprints, dim, layer_hidden, layer_output):
        """Create the layers.

        Args:
            N_fingerprints: Number of distinct fingerprint IDs (embedding rows).
            dim: Width of every embedding and hidden vector.
            layer_hidden: Number of fingerprint update layers.
            layer_output: Number of dense layers before the property layer.

        Raises:
            ValueError: The notebook-level ``task`` is neither
                ``'classification'`` nor ``'regression'``.
        """
        super(MolecularGraphNeuralNetwork, self).__init__()
        self.embed_fingerprint = nn.Embedding(N_fingerprints, dim)
        self.W_fingerprint = nn.ModuleList([nn.Linear(dim, dim)
                                            for _ in range(layer_hidden)])
        self.W_output = nn.ModuleList([nn.Linear(dim, dim)
                                       for _ in range(layer_output)])
        if task == 'classification':
            self.W_property = nn.Linear(dim, 2)
        elif task == 'regression':
            self.W_property = nn.Linear(dim, 1)
        else:
            # Fail here instead of leaving W_property undefined and crashing
            # with an AttributeError on the first forward pass.
            raise ValueError(
                "task must be 'classification' or 'regression', "
                "got {!r}".format(task))

    def pad(self, matrices, pad_value):
        """Pad the list of matrices
        with a pad_value (e.g., 0) for batch processing.
        For example, given a list of matrices [A, B, C],
        we obtain a new matrix [A00, 0B0, 00C],
        where 0 is the zero (i.e., pad value) matrix.

        The result is a float tensor created on the device of the first
        input matrix, so no notebook-level ``device`` variable is needed.
        """
        shapes = [m.shape for m in matrices]
        M, N = sum(s[0] for s in shapes), sum(s[1] for s in shapes)
        target_device = matrices[0].device if len(matrices) > 0 else None
        pad_matrices = torch.full((M, N), float(pad_value),
                                  dtype=torch.float, device=target_device)
        # Copy each matrix onto the diagonal, moving down and right.
        i, j = 0, 0
        for (m, n), matrix in zip(shapes, matrices):
            pad_matrices[i:i+m, j:j+n] = matrix
            i += m
            j += n
        return pad_matrices

    def update(self, matrix, vectors, layer):
        """Apply update layer number ``layer``.

        Each row of ``vectors`` goes through the layer's linear map and a
        ReLU; the result is then added to ``matrix @ result``, i.e. to the
        sum of the transformed vectors of the rows selected by ``matrix``.
        """
        hidden_vectors = torch.relu(self.W_fingerprint[layer](vectors))
        return hidden_vectors + torch.matmul(matrix, hidden_vectors)

    def sum(self, vectors, axis):
        """Split ``vectors`` row-wise into chunks of the sizes given by
        ``axis`` and return the stacked per-chunk sums."""
        sum_vectors = [torch.sum(v, 0) for v in torch.split(vectors, axis)]
        return torch.stack(sum_vectors)

    def mean(self, vectors, axis):
        """Split ``vectors`` row-wise into chunks of the sizes given by
        ``axis`` and return the stacked per-chunk means."""
        mean_vectors = [torch.mean(v, 0) for v in torch.split(vectors, axis)]
        return torch.stack(mean_vectors)

    def gnn(self, inputs):
        """Return one vector per molecule for a batch.

        ``inputs`` is a sequence of per-field tuples. Only the first three
        fields are used: fingerprint ID tensors, adjacency matrices and
        molecular sizes. Any further fields (the notebook's preprocessing
        appends nine molecular descriptors) are accepted and ignored.
        """
        fingerprints, adjacencies, molecular_sizes = inputs[:3]

        # Cat or pad each input data for batch processing.
        fingerprints = torch.cat(fingerprints)
        adjacencies = self.pad(adjacencies, 0)

        # GNN layer (update the fingerprint vectors). The layer count comes
        # from the model itself, not from a notebook-level variable.
        fingerprint_vectors = self.embed_fingerprint(fingerprints)
        for layer in range(len(self.W_fingerprint)):
            hs = self.update(adjacencies, fingerprint_vectors, layer)
            fingerprint_vectors = F.normalize(hs, 2, 1)  # L2-normalize rows.

        # Molecular vector by sum of the fingerprint vectors
        # (self.mean is the drop-in alternative).
        molecular_vectors = self.sum(fingerprint_vectors, molecular_sizes)

        return molecular_vectors

    def mlp(self, vectors):
        """Classifier or regressor based on multilayer perceptron."""
        for dense in self.W_output:
            vectors = torch.relu(dense(vectors))
        outputs = self.W_property(vectors)
        return outputs

    def forward_classifier(self, data_batch, train, predict=False):
        """Run the classifier on a batch.

        Args:
            data_batch: Sequence of per-field tuples. The last field holds
                the label tensors, except in predict mode where the batch
                carries no labels.
            train: When true, return the cross-entropy loss (with gradients).
            predict: Only consulted when ``train`` is false. When true,
                return just the scores for an unlabeled batch.

        Returns:
            ``train=True``: the scalar loss tensor.
            ``predict=True``: a list with one score per molecule.
            otherwise: ``(scores, labels)`` where ``labels`` is a numpy array.

            A score is the softmax probability of class 1. The raw class-1
            logit is not used because it ignores the class-0 logit and is
            therefore not a valid ranking score for ROC-AUC.
        """
        if train:
            inputs = data_batch[:-1]
            correct_labels = torch.cat(data_batch[-1])
            predicted_scores = self.mlp(self.gnn(inputs))
            return F.cross_entropy(predicted_scores, correct_labels)

        inputs = data_batch if predict else data_batch[:-1]
        with torch.no_grad():
            logits = self.mlp(self.gnn(inputs))
            positive = F.softmax(logits, dim=1)[:, 1]
        predicted_scores = list(positive.detach().to('cpu').numpy())
        if predict:
            return predicted_scores

        correct_labels = torch.cat(data_batch[-1]).to('cpu').numpy()
        return predicted_scores, correct_labels

    def forward_regressor(self, data_batch, train):
        """Run the regressor on a labeled batch.

        Returns the MSE loss tensor when ``train`` is true; otherwise a pair
        of flat numpy arrays ``(predicted_values, correct_values)``.
        """
        inputs = data_batch[:-1]
        correct_values = torch.cat(data_batch[-1])

        if train:
            predicted_values = self.mlp(self.gnn(inputs))
            return F.mse_loss(predicted_values, correct_values)

        with torch.no_grad():
            predicted_values = self.mlp(self.gnn(inputs))
        predicted_values = predicted_values.to('cpu').numpy()
        correct_values = correct_values.to('cpu').numpy()
        # Flatten by joining the rows of the 2-D prediction/target arrays.
        predicted_values = np.concatenate(predicted_values)
        correct_values = np.concatenate(correct_values)
        return predicted_values, correct_values


class Trainer(object):
    """Optimises a model with Adam, one epoch per ``train`` call.

    Relies on the notebook-level settings ``lr`` (learning rate),
    ``batch_train`` (mini-batch size) and ``task``.
    """

    def __init__(self, model):
        self.model = model
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    def train(self, dataset):
        """Shuffle ``dataset`` in place, run one pass over it in
        mini-batches and return the sum of the batch losses."""
        np.random.shuffle(dataset)
        running_loss = 0
        for start in range(0, len(dataset), batch_train):
            samples = dataset[start:start + batch_train]
            # Transpose: list of samples -> list of per-field tuples.
            data_batch = list(zip(*samples))
            if task == 'classification':
                loss = self.model.forward_classifier(data_batch, train=True)
            elif task == 'regression':
                loss = self.model.forward_regressor(data_batch, train=True)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()
        return running_loss


class Tester(object):
    """Evaluates a model and appends result lines to a text file.

    The evaluation methods read the notebook-level ``batch_test`` setting.
    """

    def __init__(self, model):
        self.model = model

    def test_classifier(self, dataset):
        """Return the ROC-AUC of the model's scores over ``dataset``."""
        all_scores = []
        all_labels = []
        for start in range(0, len(dataset), batch_test):
            # Transpose: list of samples -> list of per-field tuples.
            data_batch = list(zip(*dataset[start:start + batch_test]))
            batch_scores, batch_labels = self.model.forward_classifier(
                data_batch, train=False)
            all_scores.append(batch_scores)
            all_labels.append(batch_labels)
        return roc_auc_score(np.concatenate(all_labels),
                             np.concatenate(all_scores))

    def test_regressor(self, dataset):
        """Return the mean absolute error of the model over ``dataset``."""
        n_samples = len(dataset)
        abs_error_total = 0
        for start in range(0, n_samples, batch_test):
            data_batch = list(zip(*dataset[start:start + batch_test]))
            predicted, correct = self.model.forward_regressor(
                data_batch, train=False)
            abs_error_total += sum(np.abs(predicted - correct))
        return abs_error_total / n_samples

    def save_result(self, result, filename):
        """Append ``result`` plus a newline to ``filename``."""
        with open(filename, 'a') as out:
            out.write(result + '\n')

### Data Preprocessing

In [None]:
from collections import defaultdict

import numpy as np

from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, Crippen

import torch


def create_atoms(mol, atom_dict):
    """Map every atom of ``mol`` to an integer ID taken from ``atom_dict``.

    The lookup key is the element symbol, or the pair
    ``(symbol, 'aromatic')`` for atoms reported by ``GetAromaticAtoms``,
    so aromatic and non-aromatic atoms of one element get different IDs.
    Returns the IDs as a numpy array in atom order.
    """
    aromatic_indices = {atom.GetIdx() for atom in mol.GetAromaticAtoms()}
    keys = [
        (atom.GetSymbol(), 'aromatic') if position in aromatic_indices
        else atom.GetSymbol()
        for position, atom in enumerate(mol.GetAtoms())
    ]
    return np.array([atom_dict[key] for key in keys])


def create_ijbonddict(mol, bond_dict):
    """Build the neighbour table of ``mol``.

    The result maps each atom index to a list of
    ``(neighbour index, bond ID)`` pairs, where the bond ID is looked up in
    ``bond_dict`` under the string form of the bond type. Every bond is
    recorded in both directions.
    """
    neighbours = defaultdict(list)
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        bond_id = bond_dict[str(bond.GetBondType())]
        for source, target in ((begin, end), (end, begin)):
            neighbours[source].append((target, bond_id))
    return neighbours


def extract_fingerprints(radius, atoms, i_jbond_dict,
                         fingerprint_dict, edge_dict):
    """Extract the fingerprints from a molecular graph
    based on Weisfeiler-Lehman algorithm.

    Args:
        radius: Number of refinement rounds; 0 uses the atom IDs directly.
        atoms: Sequence of atom IDs, indexed by atom index.
        i_jbond_dict: Mapping from atom index to a list of
            ``(neighbour index, bond ID)`` pairs.
        fingerprint_dict: Mapping that assigns an integer ID to every
            fingerprint key on first lookup (updated in place).
        edge_dict: Same, for refined edge keys (updated in place).

    Returns:
        A numpy array with one fingerprint ID per atom; entry ``k`` belongs
        to atom ``k``.
    """

    if (len(atoms) == 1) or (radius == 0):
        nodes = [fingerprint_dict[a] for a in atoms]

    else:
        n_atoms = len(atoms)
        nodes = atoms
        i_jedge_dict = i_jbond_dict

        for _ in range(radius):

            # Update each node ID considering its neighboring nodes and
            # edges. The updated node IDs are the fingerprint IDs.
            # Walk the atoms by index rather than by dict order, so that
            # entry k always describes atom k (whatever order the bonds were
            # inserted in) and an atom without bonds still gets an entry.
            nodes_ = []
            for i in range(n_atoms):
                neighbors = [(nodes[j], edge)
                             for j, edge in i_jedge_dict.get(i, ())]
                fingerprint = (nodes[i], tuple(sorted(neighbors)))
                nodes_.append(fingerprint_dict[fingerprint])

            # Also update each edge ID considering
            # its two nodes on both sides.
            i_jedge_dict_ = {}
            for i in range(n_atoms):
                refined = []
                for j, edge in i_jedge_dict.get(i, ()):
                    both_side = tuple(sorted((nodes[i], nodes[j])))
                    refined.append((j, edge_dict[(both_side, edge)]))
                i_jedge_dict_[i] = refined

            nodes = nodes_
            i_jedge_dict = i_jedge_dict_

    return np.array(nodes)


def split_dataset(dataset, ratio):
    """Shuffle ``dataset`` in place with a fixed seed and cut it in two.

    The first ``int(ratio * len(dataset))`` shuffled items form the first
    part, the rest the second. Seeding happens on numpy's global generator,
    so later ``np.random`` calls continue from that seeded state.
    """
    np.random.seed(1234)  # fixed seed: the same split on every run.
    np.random.shuffle(dataset)
    cut = int(ratio * len(dataset))
    head = dataset[:cut]
    tail = dataset[cut:]
    return head, tail


def create_datasets(task, dataset, radius, device):
    """Load and preprocess the train/dev/test data of one dataset.

    Reads ``data_train.txt`` and ``data_test.txt`` from
    ``../dataset/<task>/<dataset>/`` (relative to the working directory).
    The first line of each file is treated as a header; every further line
    holds a SMILES string and its property value separated by whitespace.

    Args:
        task: ``'classification'`` (property becomes a LongTensor label) or
            ``'regression'`` (property becomes a 1x1 FloatTensor).
        dataset: Name of the dataset directory.
        radius: Number of fingerprint refinement rounds, forwarded to
            ``extract_fingerprints``.
        device: Torch device the tensors are moved to.

    Returns:
        ``(dataset_train, dataset_dev, dataset_test, N_fingerprints)``. The
        training file is split 90/10 into train and dev. Every sample is a
        13-tuple: fingerprints, adjacency, molecular size, nine RDKit
        descriptors and, last, the property.
    """

    dir_dataset = '../dataset/' + task + '/' + dataset + '/'

    # Each dict hands out the next unused integer as the ID of a key on its
    # first lookup. They are shared by both files so IDs stay consistent.
    atom_dict = defaultdict(lambda: len(atom_dict))
    bond_dict = defaultdict(lambda: len(bond_dict))
    fingerprint_dict = defaultdict(lambda: len(fingerprint_dict))
    edge_dict = defaultdict(lambda: len(edge_dict))

    def create_dataset(filename):
        """Turn one data file into a list of sample tuples."""

        print(filename)

        # Load a dataset; the header line is read and discarded.
        with open(dir_dataset + filename, 'r') as f:
            f.readline()
            data_original = f.read().strip().split('\n')

        # Exclude blank lines and the data that contains '.' in its smiles.
        data_original = [data for data in data_original
                         if data.strip() and '.' not in data.split()[0]]

        samples = []
        errored = 0

        for data in data_original:

            smiles, target = data.strip().split()

            # MolFromSmiles returns None for a SMILES it cannot parse. Check
            # that explicitly, and catch only Exception so Ctrl-C is not
            # swallowed the way a bare ``except:`` would.
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is not None:
                    mol = Chem.AddHs(mol)
            except Exception:
                mol = None
            if mol is None:
                errored += 1
                continue

            atoms = create_atoms(mol, atom_dict)
            molecular_size = len(atoms)
            molecular_weight = Descriptors.ExactMolWt(mol)
            HeavyAtomMolWt = Descriptors.HeavyAtomMolWt(mol)
            NumRadicalElectrons = Descriptors.NumRadicalElectrons(mol)
            NumValenceElectrons = Descriptors.NumValenceElectrons(mol)
            RotableBonds = Lipinski.NumRotatableBonds(mol)
            FractionCSP3 = Lipinski.FractionCSP3(mol)
            NumHeteroatoms = Lipinski.NumHeteroatoms(mol)
            MolMR = Crippen.MolMR(mol)
            logP = Crippen.MolLogP(mol)

            i_jbond_dict = create_ijbonddict(mol, bond_dict)
            fingerprints = extract_fingerprints(radius, atoms, i_jbond_dict,
                                                fingerprint_dict, edge_dict)
            adjacency = Chem.GetAdjacencyMatrix(mol)

            # Transform the above each data of numpy
            # to pytorch tensor on a device (i.e., CPU or GPU).
            fingerprints = torch.LongTensor(fingerprints).to(device)
            adjacency = torch.FloatTensor(adjacency).to(device)
            if task == 'classification':
                target = torch.LongTensor([int(target)]).to(device)
            if task == 'regression':
                target = torch.FloatTensor([[float(target)]]).to(device)

            samples.append((fingerprints, adjacency,
                            molecular_size, molecular_weight, HeavyAtomMolWt,
                            NumRadicalElectrons, NumValenceElectrons,
                            RotableBonds, FractionCSP3,
                            NumHeteroatoms, MolMR, logP, target))
        # "Total" counts the lines kept after filtering, including the ones
        # that then failed to parse.
        print("Total: %s" % str(len(data_original)))
        print("Error: %s" % str(errored))
        return samples

    dataset_train = create_dataset('data_train.txt')
    dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
    dataset_test = create_dataset('data_test.txt')

    N_fingerprints = len(fingerprint_dict)

    return dataset_train, dataset_dev, dataset_test, N_fingerprints


In [None]:
!cat train.sh

#!/bin/bash

# task=classification  # target is a binary value (e.g., drug or not).
# dataset=hiv

task=classification  # target is a real value (e.g., energy eV).
dataset=drugs

radius=1
dim=50
layer_hidden=6
layer_output=6

batch_train=32
batch_test=32
lr=1e-4
lr_decay=0.99
decay_interval=10
iteration=1000

setting=$dataset--radius$radius--dim$dim--layer_hidden$layer_hidden--layer_output$layer_output--batch_train$batch_train--batch_test$batch_test--lr$lr--lr_decay$lr_decay--decay_interval$decay_interval--iteration$iteration
python train.py $task $dataset $radius $dim $layer_hidden $layer_output $batch_train $batch_test $lr $lr_decay $decay_interval $weight_decay $iteration $setting


## Configuration

In [None]:
# Which model head/loss to use and which dataset directory to read.
task = "classification" # regression/classification
dataset = "drugs"

# radius: fingerprint refinement rounds during preprocessing.
# dim: width of the embedding and of every hidden layer.
# layer_hidden / layer_output: number of fingerprint update layers and of
# dense layers before the property layer.
radius=1
dim=50
layer_hidden=6
layer_output=6

# Mini-batch sizes used by Trainer and Tester.
batch_train=32
batch_test=32

# lr: Adam learning rate. Every `decay_interval` epochs it is multiplied by
# `lr_decay`. iteration: number of training epochs.
lr = 1e-4
lr_decay=0.99
decay_interval=10
iteration=5

In [None]:
# Human-readable tag listing every hyperparameter; it becomes part of the
# result file name.
setting = "{}--radius{}--dim{}--layer_hidden{}--layer_output{}--batch_train{}--batch_test{}--lr{}--lr_decay{}--decay_interval{}--iteration{}".format(
    dataset,radius,dim,layer_hidden,layer_output,batch_train,batch_test,lr,lr_decay,decay_interval,iteration
)

In [None]:

# Coerce the hyperparameters to int / float. With the literals from the
# configuration cell this changes nothing; it only matters if the values
# were supplied as strings.
(radius, dim, layer_hidden, layer_output,
  batch_train, batch_test, decay_interval,
  iteration) = map(int, [radius, dim, layer_hidden, layer_output,
                          batch_train, batch_test,
                          decay_interval, iteration])

lr, lr_decay = map(float, [lr, lr_decay])

### Check CUDA Support

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using GPU to train' if device.type == 'cuda' else 'Using CPU to train')

Using GPU to train


## Process Dataset

In [None]:
# Build the train/dev/test sample lists. create_datasets reads from
# '../dataset/<task>/<dataset>/', so the working directory must be the
# repository's main/ directory.
print('Preprocessing the', dataset, 'dataset.')
print('Just a moment......')
(dataset_train, dataset_dev, dataset_test,
N_fingerprints) = create_datasets(task, dataset, radius, device)

Preprocessing the drugs dataset.
Just a moment......
data_train.txt
Total: 19789
Error: 547
data_test.txt
Total: 4940
Error: 141


In [None]:
# Report how many samples ended up in each split.
print('-'*100)
print('The preprocess has finished!')
print('# of training data samples:', len(dataset_train))
print('# of development data samples:', len(dataset_dev))
print('# of test data samples:', len(dataset_test))
print('-'*100)

----------------------------------------------------------------------------------------------------
The preprocess has finished!
# of training data samples: 17317
# of development data samples: 1925
# of test data samples: 4799
----------------------------------------------------------------------------------------------------


## Setting up Training

In [None]:
# Instantiate the network and move its parameters to the selected device.
# The constructor reads the notebook-level `task`, so the configuration
# cell has to be executed before this one.
model = MolecularGraphNeuralNetwork(
            N_fingerprints, dim, layer_hidden, layer_output).to(device)

NameError: ignored

Setting up trainer and testers

In [None]:
# Trainer owns the Adam optimizer; Tester evaluates and logs results.
trainer = Trainer(model)
tester = Tester(model)
# Total number of scalar parameters: product of each tensor's dimensions,
# summed over all parameter tensors.
print('# of model parameters:',
sum([np.prod(p.size()) for p in model.parameters()]))
print('-'*100)

# of model parameters: 96052
----------------------------------------------------------------------------------------------------


In [None]:
# Result file path (relative to the working directory) and the
# tab-separated header line matching the metric of the selected task.
file_result = '../output/result--' + setting + '.txt'
if task == 'classification':
  result = 'Epoch\tTime(sec)\tLoss_train\tAUC_dev\tAUC_test'
if task == 'regression':
  result = 'Epoch\tTime(sec)\tLoss_train\tMAE_dev\tMAE_test'

In [None]:
# Start a fresh result file containing only the header line.
with open(file_result, 'w') as f:
  f.write(result + '\n')

print('Start training.')
print('The result is saved in the output directory every epoch!')


start = timeit.default_timer()

for epoch in range(iteration):

  # Count epochs from 1 for the decay schedule and for reporting.
  epoch += 1
  # Step decay: shrink the learning rate every `decay_interval` epochs.
  if epoch % decay_interval == 0:
    trainer.optimizer.param_groups[0]['lr'] *= lr_decay

  loss_train = trainer.train(dataset_train)

  # Evaluate on dev and test after every epoch (AUC or MAE by task).
  if task == 'classification':
      prediction_dev = tester.test_classifier(dataset_dev)
      prediction_test = tester.test_classifier(dataset_test)
  elif task == 'regression':
      prediction_dev = tester.test_regressor(dataset_dev)
      prediction_test = tester.test_regressor(dataset_test)

  # Seconds elapsed since training started.
  time = timeit.default_timer() - start

  if epoch == 1:
    # Extrapolate the total run time from the duration of the first epoch.
    minutes = time * iteration / 60
    hours = int(minutes / 60)
    minutes = int(minutes - 60 * hours)
    print('The training will finish in about',
      hours, 'hours', minutes, 'minutes.')
    print('-'*100)
    # `result` still holds the header line at this point.
    print(result)

  # One tab-separated row per epoch, appended to the file and echoed.
  result = '\t'.join(map(str, [epoch, time, loss_train,
    prediction_dev, prediction_test]))
  tester.save_result(result, file_result)
  print(result)

Start training.
The result is saved in the output directory every epoch!
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13


KeyboardInterrupt: ignored

In [None]:
nn.Conv2d(N_fingerprints, dim,5)

Conv2d(1307, 50, kernel_size=(5, 5), stride=(1, 1))

In [None]:
model

MolecularGraphNeuralNetwork(
  (embed_fingerprint): Embedding(1307, 50)
  (W_fingerprint): ModuleList(
    (0): Linear(in_features=50, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=50, bias=True)
    (3): Linear(in_features=50, out_features=50, bias=True)
    (4): Linear(in_features=50, out_features=50, bias=True)
    (5): Linear(in_features=50, out_features=50, bias=True)
  )
  (W_output): ModuleList(
    (0): Linear(in_features=50, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=50, bias=True)
    (2): Linear(in_features=50, out_features=50, bias=True)
    (3): Linear(in_features=50, out_features=50, bias=True)
    (4): Linear(in_features=50, out_features=50, bias=True)
    (5): Linear(in_features=50, out_features=50, bias=True)
  )
  (W_property): Linear(in_features=50, out_features=2, bias=True)
)

In [None]:
# Serialise the whole model object (not just its state_dict) to ./uwu.pt.
# NOTE(review): loading a fully pickled model needs the
# MolecularGraphNeuralNetwork class to be defined in the loading session.
torch.save(model,"./uwu.pt")

In [None]:
# Score a single test molecule: drop its label (last field) and let zip wrap
# every remaining field in a 1-tuple so the sample looks like a batch of one.
model.forward_classifier(list(zip(dataset_test[10][:-1])),train=False,predict=True)

[0.14232707]

In [None]:
len(dataset_test[0])

13

In [None]:
# Same molecule through gnn + mlp by hand: the second output of the property
# layer for the only batch entry (a raw logit, not a probability).
model.mlp(model.gnn(list(zip(dataset_test[10][:-1]))))[0][1].item()

-0.045002736151218414

In [None]:
# Ground-truth label of that molecule: the last field, as a plain number.
torch.cat(list(zip(dataset_test[10]))[-1]).cpu().data.numpy()[0]

0

In [None]:
# Compare the model output with the ground truth for the first ten test
# molecules. Each line prints: predicted class, class-1 logit, true label.
for x in range(10):
  sample = list(zip(dataset_test[x]))  # wrap every field as a batch of one
  with torch.no_grad():  # inference only, no autograd graph needed
    scores = model.mlp(model.gnn(sample[:-1]))[0]  # the two class logits
  predicted = scores[1].item()
  actual = torch.cat(sample[-1]).cpu().data.numpy()[0]
  # np.argmax over the scalar `predicted` always returned 0; the predicted
  # class is the argmax over both logits.
  print(torch.argmax(scores).item(), predicted, actual)

0 0.1425742506980896 1
0 0.14243972301483154 1
0 0.1423126608133316 1
0 0.14225676655769348 0
0 0.14258266985416412 1
0 0.14268891513347626 0
0 0.1423802375793457 0
0 0.14231427013874054 1
0 0.1426069289445877 1
0 0.14233356714248657 1


In [None]:
dataset_train[0]

(tensor([ 14,   9,  12,  33,  20,  47,  50,  35,  33,  20,  86,  39,   4,   4,
          22,  13,  14,  39, 252,  87,  66,  66,  67,   4,   4, 153, 154,   4,
           4,   4,  17,  17,  17,  17,  17,  16,  17,  17,  17,  17,  16,  15,
          15,  17,  17,  17,  16,  15,  15,  15,  15,  15], device='cuda:0'),
 tensor([[0., 1., 0.,  ..., 0., 0., 0.],
         [1., 0., 1.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 52,
 tensor([0], device='cuda:0'))