## Import necessary libraries

In [1]:
import torch
from torch_geometric.datasets import TUDataset
import matplotlib.pyplot as plt

## Util Functions

In [2]:
def data_details(dataset):
    print()
    print(f'Dataset: {dataset}:')
    print('====================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Number of features: {dataset.num_features}')
    print(f'Number of classes: {dataset.num_classes}')

def graph_details(data):
    print()
    print(data)
    print('=============================================================')
    
    # Gather some statistics about the first graph.
    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')   

In [3]:
dataset = TUDataset(root='data/TUDataset', name='MUTAG')
data_details(dataset)
graph_details(dataset[0])

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...



Dataset: MUTAG(188):
Number of graphs: 188
Number of features: 7
Number of classes: 2

Data(edge_index=[2, 38], x=[17, 7], edge_attr=[38, 4], y=[1])
Number of nodes: 17
Number of edges: 38
Average node degree: 2.24
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Done!


## Create Train and Test Dataset

In [5]:
torch.manual_seed(12345)
dataset = dataset.shuffle()

train = dataset[:150]
test = dataset[150:]

print(f'Number of Train Graphs:{len(train)}')
print(f'Number of Test Graphs:{len(test)}')

Number of Train Graphs:150
Number of Test Graphs:38


### Create Batches of Dataset

In [10]:
from torch_geometric.loader import DataLoader

train_dataloader = DataLoader(train, batch_size = 64, shuffle = True )
test_dataloader = DataLoader(test, batch_size = 64, shuffle = True )

for step, data in enumerate(train_dataloader):
    print(f'Step {step + 1}:')
    print('=======')
    print(f'Number of graphs in the current batch: {data.num_graphs}')
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2528], x=[1149, 7], edge_attr=[2528, 4], y=[64], batch=[1149], ptr=[65])

Step 2:
Number of graphs in the current batch: 64
DataBatch(edge_index=[2, 2544], x=[1147, 7], edge_attr=[2544, 4], y=[64], batch=[1147], ptr=[65])

Step 3:
Number of graphs in the current batch: 22
DataBatch(edge_index=[2, 854], x=[386, 7], edge_attr=[854, 4], y=[22], batch=[386], ptr=[23])



## Training Graph Neural Network for Graph Classification

Training a GNN for graph classification usually follows a simple recipe:

1. Embed each node by performing multiple rounds of message passing
2. Aggregate node embeddings into a unified graph embedding (**readout layer**)
3. Train a final classifier on the graph embedding

There exists multiple **readout layers** in literature, but the most common one is to simply take the average of node embeddings:

$$
\mathbf{x}_{\mathcal{G}} = \frac{1}{|\mathcal{V}|} \sum_{v \in \mathcal{V}} \mathcal{x}^{(L)}_v
$$

In [22]:
from torch.nn import Linear
from torch.functional import F
from torch_geometric.nn import GCNConv, GraphConv
from torch_geometric.nn import global_mean_pool,global_max_pool, global_add_pool

In [13]:
class GCN(torch.nn.Module):

    def __init__(self, hidden_channels):
        super(GCN,self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self,x,edge_index,batch):
        
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x


model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


In [23]:
class GNN(torch.nn.Module):

    def __init__(self, hidden_channels):
        super(GNN,self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GraphConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, dataset.num_classes)

    def forward(self,x,edge_index,batch):
        
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)

        return x


model = GNN(hidden_channels=64)
print(model)

GNN(
  (conv1): GraphConv(7, 64)
  (conv2): GraphConv(64, 64)
  (conv3): GraphConv(64, 64)
  (lin): Linear(in_features=64, out_features=2, bias=True)
)


## Train the Model

### Graph Convolution Network

In [16]:
model = GCN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    for data in train_dataloader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 171):
    train()
    train_acc = test(train_dataloader)
    test_acc = test(test_dataloader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 002, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 003, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 004, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 005, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 006, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 007, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 008, Train Acc: 0.6933, Test Acc: 0.5789
Epoch: 009, Train Acc: 0.6933, Test Acc: 0.5789
Epoch: 010, Train Acc: 0.7533, Test Acc: 0.6842
Epoch: 011, Train Acc: 0.7600, Test Acc: 0.7368
Epoch: 012, Train Acc: 0.7467, Test Acc: 0.6842
Epoch: 013, Train Acc: 0.7267, Test Acc: 0.7368
Epoch: 014, Train Acc: 0.7200, Test Acc: 0.7632
Epoch: 015, Train Acc: 0.7533, Test Acc: 0.6842
Epoch: 016, Train Acc: 0.7533, Test Acc: 0.6842
Epoch: 017, Train Acc: 0.7600, Test Acc: 0.7368
Epoch: 018, Train Acc: 0.7600, Test Acc: 0.7368
Epoch: 019, Train Acc: 0.7600, Test Acc: 0.7368
Epoch: 020, Train Acc: 0.7533, Test Acc: 0.7105
Epoch: 021, Train Acc: 0.7467, Test Acc:

In [None]:
model = GNN(hidden_channels=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    for data in train_dataloader:  # Iterate in batches over the training dataset.
         out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
         loss = criterion(out, data.y)  # Compute the loss.
         loss.backward()  # Derive gradients.
         optimizer.step()  # Update parameters based on gradients.
         optimizer.zero_grad()  # Clear gradients.

def test(loader):
     model.eval()

     correct = 0
     for data in loader:  # Iterate in batches over the training/test dataset.
         out = model(data.x, data.edge_index, data.batch)  
         pred = out.argmax(dim=1)  # Use the class with highest probability.
         correct += int((pred == data.y).sum())  # Check against ground-truth labels.
     return correct / len(loader.dataset)  # Derive ratio of correct predictions.


for epoch in range(1, 171):
    train()
    train_acc = test(train_dataloader)
    test_acc = test(test_dataloader)
    print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Train Acc: 0.7267, Test Acc: 0.7895
Epoch: 002, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 003, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 004, Train Acc: 0.6867, Test Acc: 0.5789
Epoch: 005, Train Acc: 0.6933, Test Acc: 0.5789
Epoch: 006, Train Acc: 0.7533, Test Acc: 0.6842
Epoch: 007, Train Acc: 0.7533, Test Acc: 0.7368
Epoch: 008, Train Acc: 0.7467, Test Acc: 0.7632
Epoch: 009, Train Acc: 0.8000, Test Acc: 0.7895
Epoch: 010, Train Acc: 0.7467, Test Acc: 0.7632
Epoch: 011, Train Acc: 0.8000, Test Acc: 0.8158
Epoch: 012, Train Acc: 0.8133, Test Acc: 0.8421
Epoch: 013, Train Acc: 0.8200, Test Acc: 0.7895
Epoch: 014, Train Acc: 0.7933, Test Acc: 0.8158
Epoch: 015, Train Acc: 0.8267, Test Acc: 0.7895
Epoch: 016, Train Acc: 0.7867, Test Acc: 0.8684
Epoch: 017, Train Acc: 0.7867, Test Acc: 0.7105
Epoch: 018, Train Acc: 0.7867, Test Acc: 0.8684
Epoch: 019, Train Acc: 0.7267, Test Acc: 0.6579
Epoch: 020, Train Acc: 0.7600, Test Acc: 0.8684
Epoch: 021, Train Acc: 0.8200, Test Acc: