In [1]:
# --- Standard libraries
import pickle as pkl
import numpy as np
import heapq as hq
# --- PyTorch
import torch
# --- PyG
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
from torch_geometric.utils import to_networkx, degree
# --- NetworkX
import networkx as nx
# --- RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
IPythonConsole.molSize = 300,300
# --- Matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import imageio
# --- Modules from local files
from GNN.GNN_atom import GNN, nnconv, PNA, CGC
from Dataset import XASDataset
from utils.train import train_atom
from utils.test import test_atom
from utils.utils import plot_learning_curve, count_funct_group
rdkit.__version__

'2023.09.5'

In [2]:
# --- Path raw directory containing the data
path = "./"
# --- Load and create the dataset
dataset = XASDataset(path)

In [3]:
# --- Show detail of the dataset
print(dataset)
print('------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')
print('')

# --- Show details of the first molecule/graph in dataset
data = dataset[31]

print(data)
print('------------')
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

XASDataset(7895)
------------
Number of graphs: 7895
Number of features: 5
Number of classes: 2

Data(x=[29, 5], edge_index=[2, 74], edge_attr=[74, 3], spectrum=[200], y=[6], pos=[29, 3], z=[29], idx=[1], smiles='C12=[C:20]3[C:16]45[C:12]6([C:8]7=[C:10]8[CH2:9][CH:7]=[C:6]([C:4]17[OH:28])[CH2:5][CH:3]=[C:2]2[CH:1]1[CH:23]([CH:22]3[C:21](=[O:25])[CH2:19][C:18]4=[CH:17][CH:15]=[C:14]6[C:13]([OH:24])=[CH:11]8)[O:26]1)[O:27]5', atom_num=[1])
------------
Number of nodes: 29
Number of edges: 74
Average node degree: 2.55
Has isolated nodes: False
Has self loops: False
Is undirected: True


In [13]:
data.num_classes

AttributeError: 'GlobalStorage' object has no attribute 'num_classes'

In [4]:
# --- Split into test, validation and test
train_dataset = dataset[0:750]
val_dataset = dataset[6001:6999]
test_dataset = dataset[7000:7895]

# --- Pass into dataloader
train_loader = DataLoader(train_dataset, batch_size=750, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f'Length of training data loader: {len(train_loader.dataset)}, with a total of {len(train_loader)} datasets')
print(f'Length of validation data loader: {len(val_loader.dataset)}, with a total of {len(val_loader)} datasets')
print(f'Length of testing data loader: {len(test_loader.dataset)}, with a total of {len(test_loader)} datasets')

Length of training data loader: 750, with a total of 1 datasets
Length of validation data loader: 998, with a total of 16 datasets
Length of testing data loader: 895, with a total of 14 datasets


In [5]:
def train_node_classifier(model, graph, optimizer, criterion, n_epochs=200):

    for epoch in range(1, n_epochs+1):
        model.train()
        optimizer.zero_grad()
        out = model(graph)
        loss = criterion(out[graph.train_mask], graph.y[graph.train_mask])
        loss.backward()
        optimizer.step()

        pred = out.argmax(dim=1)
        acc = eval_node_classifier(model, graph, graph.val_mask)

        if epoch % 25 == 0:
            print(f'Epoch: {epoch:03d}, Train Loss: {loss:.5f}, Val Acc: {acc:.5f}')

    return model

def eval_node_classifier(model, graph, mask):

    model.eval()
    pred = model(graph).argmax(dim=1)
    correct = (pred[mask] == graph.y[mask]).sum()
    acc = int(correct) / int(mask.sum())

    return acc

In [7]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import torch

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, 6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        output = self.conv2(x, edge_index)

        return output

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gcn = GCN().to(device)
optimizer_gcn = torch.optim.Adam(gcn.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
gcn = train_node_classifier(gcn, )