# Graph classification

*The same tutorial also exists for GraphNeuralNetworks.jl.*

In [None]:
import torch 
import torch_geometric as torchg 
from torch_geometric.datasets import TUDataset
import pandas as pd
import os 

In [None]:
# Echo the installed PyTorch version and publish the same string through the
# TORCH environment variable.
print(torch.__version__)
os.environ["TORCH"] = torch.__version__

In [None]:
# Load the MUTAG dataset through TUDataset, using data/TUDataset as its root
# directory.
dataset = TUDataset(
    root="data/TUDataset",
    name="MUTAG",
)

In [None]:
# Overview of the dataset: a leading blank line, its repr, then the number of
# graphs, node features and classes it reports.
print(
    '',
    f'Dataset: {dataset}:',
    '====================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

In [None]:
# Pick the first graph of the dataset for closer inspection.
data = dataset[0]
# Bare expression: as the last line of a notebook cell it displays the object.
data

In [None]:
# Blank line, the graph object itself, then a separator rule.
print(
    '',
    data,
    '=============================================================',
    sep='\n',
)

# Basic statistics of the first graph: size, average degree (edges per node)
# and the structural checks the data object exposes.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)

In [None]:
# Seed PyTorch's global RNG before shuffling the dataset.
torch.manual_seed(124342)

# Shuffle, then split by position: the first 150 graphs are used for training
# and everything after them for testing.
dataset = dataset.shuffle()
train_dataset, test_dataset = dataset[:150], dataset[150:]

In [None]:
# Report how many graphs ended up in each split.
print(
    f'Number of training graphs: {len(train_dataset)}',
    f'Number of test graphs: {len(test_dataset)}',
    sep='\n',
)

In [None]:
from torch_geometric.loader import DataLoader

# Mini-batch loaders with 64 graphs per batch. Only the training loader
# shuffles; the test loader keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Preview the training batches. The loop variable is named `batch` (not `data`)
# so that iterating here does not overwrite a notebook-level `data` variable
# holding a single graph.
for step, batch in enumerate(train_loader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {batch.num_graphs}')
    print(batch)
    print()

We want to do graph classification, which normally follows a simple recipe: 
1. Embed each node by performing multiple rounds of message passing 
2. Aggregate node embeddings into a unified graph embedding (readout layer) 
3. Train a final classifier on the graph embedding 

This more or less follows what image classification does. 

There exist a bunch of readout layers in the literature, the most common one being the average of the node embeddings.

PyTorch Geometric provides this functionality via `torch_geometric.nn.global_mean_pool`, which takes the node embeddings of all nodes in the mini-batch together with the assignment vector `batch` and computes one embedding per graph, giving an output of size `[batch_size, hidden_channels]`.

The final architecture for applying GNNs to the task of graph classification then looks as follows and allows for complete end-to-end training:

In [None]:
from torch.nn import Linear 
import torch.nn.functional as F 
from torch_geometric.nn import GCNConv 
from torch_geometric.nn import global_mean_pool 

In [None]:
class GCN(torch.nn.Module):
    """Graph classifier: three GCNConv layers, mean-pool readout, linear head.

    Args:
        input: Number of input features per node.
        hidden_channels: Output width of each graph-convolution layer.
        output: Number of values the final linear layer produces per graph.
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
        p_dropout: Dropout probability applied to the pooled graph embedding.
    """

    def __init__(self, input, hidden_channels, output, seed=12345, p_dropout=0.5):
        # Zero-argument super() resolves the parent class (torch.nn.Module);
        # this is ordinary cooperative inheritance, not a CRTP-like pattern.
        super().__init__()
        # Seeds PyTorch's global RNG ahead of layer construction.
        torch.manual_seed(seed)

        self.p_dropout = p_dropout

        # Message-passing stack followed by the classification head.
        self.conv1 = GCNConv(input, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, output)

    def forward(self, x, edge_index, batch):
        """Return the classifier output for every graph in the mini-batch."""
        # 1. Node embeddings: ReLU follows the first two convolutions only.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # 2. Readout: mean over the nodes of each graph.
        graph_emb = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # 3. Classifier: dropout (active only in training mode), then linear.
        graph_emb = F.dropout(graph_emb, p=self.p_dropout, training=self.training)
        return self.lin(graph_emb)


In [None]:
# Instantiate the GCN classifier with input/output sizes taken from the
# dataset and 64 hidden channels.
model = GCN(
    input=dataset.num_features,
    hidden_channels=64,
    output=dataset.num_classes,
)

In [None]:
# Print the model to inspect the layers it contains.
print(model)

Train the model for a while and track its accuracy on the training and test sets.

In [None]:
# Loss function: cross-entropy on the model's per-graph outputs.
criterion = torch.nn.CrossEntropyLoss()
# Optimizer: Adam over all model parameters with learning rate 0.01.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
def train(model, loader, optimizer, criterion):
    """Run one training epoch over ``loader``.

    Args:
        model: Module called as ``model(data.x, data.edge_index, data.batch)``.
        loader: Iterable of mini-batches exposing ``x``, ``edge_index``,
            ``batch`` and ``y``.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss callable invoked as ``criterion(out, data.y)``.
    """
    model.train()  # Put the model into training mode.

    for data in loader:  # Iterate in batches over the training dataset.
        # Clear gradients BEFORE the backward pass so that anything left over
        # from an earlier (interrupted or external) backward call cannot leak
        # into this update.
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.

def test(model, loader):
    model.eval()
    correct = 0
    for data in loader: # Iterate in batches over the test dataset.
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim =1)
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)
     

In [None]:
# 170 epochs: one optimisation pass per epoch, then accuracy on the training
# and the test loader.
for epoch in range(1, 171):
    train(model, train_loader, optimizer, criterion)
    train_acc, test_acc = test(model, train_loader), test(model, test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Doing better with a different GNN layer - 'GraphConv'

In [None]:
from torch_geometric.nn import GraphConv 

class GNN(torch.nn.Module):
    """Graph classifier: three GraphConv layers, mean-pool readout, linear head.

    Args:
        input: Number of input features per node.
        hidden_channels: Output width of each graph-convolution layer.
        output: Number of values the final linear layer produces per graph.
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
        p_dropout: Dropout probability applied to the pooled graph embedding.
    """

    def __init__(self, input, hidden_channels, output, seed=12345, p_dropout=0.5):
        super().__init__()
        # Seeds PyTorch's global RNG ahead of layer construction.
        torch.manual_seed(seed)

        self.p_dropout = p_dropout

        # Message-passing stack followed by the classification head.
        self.conv1 = GraphConv(input, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, output)

    def forward(self, x, edge_index, batch):
        """Return the classifier output for every graph in the mini-batch."""
        # Node embeddings: ReLU follows the first two convolutions only.
        h = self.conv1(x, edge_index).relu()
        h = self.conv2(h, edge_index).relu()
        h = self.conv3(h, edge_index)

        # Readout: mean over the nodes of each graph.
        pooled = global_mean_pool(h, batch)  # [batch_size, hidden_channels]

        # Dropout (active only in training mode), then the linear classifier.
        pooled = F.dropout(pooled, p=self.p_dropout, training=self.training)
        return self.lin(pooled)

In [None]:
# Instantiate the GraphConv-based classifier with input/output sizes taken
# from the dataset and 64 hidden channels.
model = GNN(
    input=dataset.num_features,
    hidden_channels=64,
    output=dataset.num_classes,
)

In [None]:
# Print the model to inspect the layers it contains.
print(model)

In [None]:
# Loss function: cross-entropy on the model's per-graph outputs.
criterion = torch.nn.CrossEntropyLoss()
# A new Adam optimizer bound to the current model's parameters (lr 0.01).
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [None]:
# 170 epochs: one optimisation pass per epoch, then accuracy on the training
# and the test loader.
for epoch in range(1, 171):
    train(model, train_loader, optimizer, criterion)
    train_acc, test_acc = test(model, train_loader), test(model, test_loader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')