<a href="https://colab.research.google.com/github/salujajustin/GNN-for-CO/blob/main/experimental_code/Graph_Attention_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages, set backend
# NOTE(review): both install lines below run on every execution, so the CPU
# build and the CUDA 10.1 build are installed side by side. Presumably only
# the one matching the runtime is needed — TODO confirm and drop the other.
# NOTE(review): neither package is version-pinned — consider pinning.
!pip install -q dgl         # For CPU Build
!pip install -q dgl-cu101   # For CUDA 10.1 Build

[K     |████████████████████████████████| 4.4MB 18.2MB/s 
[K     |████████████████████████████████| 36.2MB 85kB/s 
[?25h

In [None]:
# Import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F

import time
import pdb
import networkx as nx
import numpy as np

import dgl
from dgl.nn.pytorch import GATConv

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


Using backend: pytorch


\begin{equation}
z_i^{(l)}=W^{(l)}h_i^{(l)}
\tag{1}\end{equation}

\begin{equation}
e_{ij}^{(l)}=\text{LeakyReLU}(\vec a^{(l)^T}(z_i^{(l)}\,\|\,z_j^{(l)}))
\tag{2}\end{equation}

\begin{equation}
\alpha_{ij}^{(l)}=\frac{\exp(e_{ij}^{(l)})}{\sum_{k\in \mathcal{N}(i)}^{}\exp(e_{ik}^{(l)})} 
\tag{3}\end{equation}

\begin{equation}
h_i^{(l+1)}=\sigma \left(\sum_{j \in \mathcal{N}(i)}  {\alpha^{(l)}_{ij} z^{(l)}_j} \right)
\tag{4}\end{equation}

In [None]:
class GATLayer(nn.Module):
    """Single-head graph attention layer written with graph user-defined functions.

    Implements equations (1)-(3) and the weighted sum of equation (4). The
    non-linearity sigma of equation (4) is NOT applied here; callers apply it.

    Args:
        g: graph the layer operates on. It is stored and reused by every
            ``forward`` call, and must expose ``ndata``, ``apply_edges`` and
            ``update_all``.
        in_dim: size of each input node feature vector.
        out_dim: size of each output node feature vector.
        dropout: drop probability applied, in training mode only, to the
            projected features ``z`` and to the normalised attention weights.
    """

    def __init__(self, g, in_dim, out_dim, dropout=0.6):
        super(GATLayer, self).__init__()
        self.g = g                                                  # graph
        self.attn_dropout = dropout                                 # dropout on attention weights
        self.feat_dropout = dropout                                 # dropout on projected features
        self.fc = nn.Linear(in_dim, out_dim, bias=False)            # equation (1)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)        # equation (2)

        # Initialize learnable parameters
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_normal_(self.fc.weight, gain=gain)
        nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)

    def edge_attention(self, edges):                                # Edge UDF: equation (2)
        """Return the un-normalised attention score ``e`` of every edge."""
        # Concatenate source and destination projections along the feature axis.
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e': F.leaky_relu(a)}

    def message_func(self, edges):                                  # Message UDF: equation (3),(4)
        """Send the source projection and the edge score to the destination."""
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):                                   # Reduce UDF: equation (3),(4)
        """Softmax-normalise the received scores and aggregate the messages."""
        # dim=1 is taken to be the per-node message axis of the mailbox.
        alpha = F.softmax(nodes.mailbox['e'], dim=1)                # equation (3)
        # Attention dropout: a no-op in eval mode, so inference is unchanged.
        alpha = F.dropout(alpha, self.attn_dropout, self.training)
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)            # equation (4)
        return {'h': h}

    def forward(self, h):
        """Compute the attention-aggregated node features for input ``h``.

        Writes ``'z'`` into ``self.g.ndata`` (and ``'e'`` via ``apply_edges``),
        then removes and returns the aggregated ``'h'`` field.
        """
        z = self.fc(h)                                              # equation (1)
        # Feature dropout: a no-op in eval mode.
        z = F.dropout(z, self.feat_dropout, self.training)
        self.g.ndata['z'] = z
        self.g.apply_edges(self.edge_attention)                     # equation (2)
        self.g.update_all(self.message_func, self.reduce_func)      # equation (3),(4)
        return self.g.ndata.pop('h')


In [None]:
class MultiHeadGATLayer(nn.Module):
    """Run several independent ``GATLayer`` heads and merge their outputs.

    Args:
        g: graph handed to every head.
        in_dim: input feature size of each head.
        out_dim: output feature size of each head.
        num_heads: number of attention heads to create.
        merge: ``'cat'`` concatenates head outputs along the feature axis;
            any other value averages them across heads.
    """

    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(GATLayer(g, in_dim, out_dim))
        self.merge = merge

    def forward(self, h):
        """Apply every head to ``h`` and merge the per-head results."""
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        # Average across heads only: stack() puts the heads on dim 0, so
        # reducing that axis keeps the per-node feature shape. Reducing over
        # every axis would collapse the whole output to a single scalar.
        return torch.mean(torch.stack(head_outs), dim=0)

In [None]:
class GAT(nn.Module):
    """Two-layer attention network assembled from ``MultiHeadGATLayer`` blocks.

    The first block runs ``num_heads`` heads; the second block runs a single
    head and produces ``out_dim`` features per node.
    """

    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super().__init__()
        # The first block's head outputs are concatenated, so the second
        # block receives hidden_dim * num_heads features per node.
        concat_dim = hidden_dim * num_heads
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # Only one attention head in the output block.
        self.layer2 = MultiHeadGATLayer(g, concat_dim, out_dim, 1)

    def forward(self, inputs):
        """Return the second block's output for node features ``inputs``."""
        hidden = F.elu(self.layer1(inputs))
        return self.layer2(hidden)

In [None]:
class GAT(nn.Module):
    """Multi-layer graph attention network built from ``GATConv`` layers.

    The stack holds ``num_layers`` multi-head layers followed by one output
    layer, so ``heads`` is indexed from 0 up to ``num_layers`` and ``heads[-1]``
    is the head count of the output layer.
    """

    def __init__(
        self, g, num_layers, in_dim, num_hidden, num_classes, heads,
        activation, feat_drop, attn_drop, negative_slope, residual,
    ):
        super().__init__()
        self.g = g
        self.num_layers = num_layers
        self.gat_layers = nn.ModuleList()
        self.activation = activation

        # Positional arguments that every GATConv below shares.
        shared = (feat_drop, attn_drop, negative_slope)

        # Input projection: residual is always switched off here.
        self.gat_layers.append(
            GATConv(in_dim, num_hidden, heads[0], *shared, False, self.activation)
        )

        # Hidden layers: the previous layer's heads are flattened together,
        # hence the input width num_hidden * heads[idx - 1].
        for idx in range(1, num_layers):
            fan_in = num_hidden * heads[idx - 1]
            self.gat_layers.append(
                GATConv(fan_in, num_hidden, heads[idx], *shared, residual, self.activation)
            )

        # Output projection: no activation.
        self.gat_layers.append(
            GATConv(num_hidden * heads[-2], num_classes, heads[-1], *shared, residual, None)
        )

    def forward(self, inputs):
        """Return per-node logits for node features ``inputs``."""
        hidden = inputs
        for depth in range(self.num_layers):
            conv = self.gat_layers[depth]
            # flatten(1) merges every axis after the node axis (assumed to be
            # heads x features) into a single feature axis.
            hidden = conv(self.g, hidden).flatten(1)
        # Output projection: average over axis 1 (assumed to be the head axis).
        return self.gat_layers[-1](self.g, hidden).mean(1)

### Utility Functions

In [None]:
def accuracy(logits, labels):
    """Return the fraction of rows of ``logits`` whose arg-max equals the label."""
    predicted = logits.argmax(dim=1)                    # class with the highest score per row
    hits = (predicted == labels).sum().item()           # number of matching predictions
    return float(hits) / len(labels)                    # ratio in [0, 1]

def evaluate(model, features, labels, mask):
    """Return the accuracy of ``model`` on the nodes selected by ``mask``.

    Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    with torch.no_grad():                               # no autograd bookkeeping during eval
        selected_logits = model(features)[mask]
        selected_labels = labels[mask]
        return accuracy(selected_logits, selected_labels)

def train(model, features, labels, mask, optimizer=None):
    """Run one full-batch training step and return the loss as a float.

    Args:
        model: module called as ``model(features)`` to produce per-node logits.
        features: input passed to ``model``.
        labels: class indices, indexed by ``mask``.
        mask: selects the rows that contribute to the loss.
        optimizer: optimizer to step. When omitted, the module-level variable
            named ``optimizer`` is used, which keeps existing four-argument
            calls working.

    Returns:
        The cross-entropy loss over the masked rows, as a Python float.

    Raises:
        NameError: if no optimizer is passed and no global ``optimizer`` exists.
    """
    if optimizer is None:
        # Backward-compatible fallback for callers that rely on the global.
        try:
            optimizer = globals()['optimizer']
        except KeyError:
            raise NameError(
                "train() needs an optimizer: pass optimizer=... or define a "
                "module-level `optimizer`"
            ) from None

    model.train()
    logits = model(features)
    # cross_entropy combines log_softmax over the class axis with nll_loss.
    loss = F.cross_entropy(logits[mask], labels[mask])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

### Run

In [None]:
# Cora dataset consists of 2708 publications classified into one of seven classes
# Each publication is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary
dataset = dgl.data.CoraGraphDataset()

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and attributes
graph = dataset[0]                                  # Only 1 graph in this dataset
graph = graph.int().to(device)                      # Move to GPU if available, else cpu
node_features = graph.ndata['feat']                 # [2708, 1433]: each node has a word vector of 1433 unique words
node_labels = graph.ndata['label']                  # [2708]: each node has one label of range [0-6]
train_mask = graph.ndata['train_mask']
valid_mask = graph.ndata['val_mask']
test_mask = graph.ndata['test_mask']
num_feats = node_features.size()[1]
num_classes = dataset.num_classes

# GAT Hyperparameters
num_heads = 8
num_layers = 1
num_out_heads = 1

# Seed PyTorch before the model is built so that weight initialisation and
# dropout masks repeat from run to run (Restart & Run All gives the same log).
torch.manual_seed(0)

# One head count per layer: num_layers multi-head layers plus the output layer.
heads = ([num_heads] * num_layers) + [num_out_heads]
model = GAT(g=graph, num_layers=num_layers, in_dim=num_feats, num_hidden=8, num_classes=num_classes, heads=heads, activation=F.elu, feat_drop=0.6, attn_drop=0.6, negative_slope=0.2, residual=False)

# Old model
# model = GAT(graph, in_dim=num_feats, hidden_dim=8, out_dim=num_classes, num_heads=8)



model = model.to(device)

# create optimizer
# NOTE: train() reads this module-level `optimizer` variable.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=5e-4)

# Main
for epoch in range(300):

    loss = train(model, node_features, node_labels, train_mask)
    val_acc = evaluate(model, node_features, node_labels, valid_mask)

    # if epoch % 10 == 9:
    print("Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f}".format(epoch+1, loss, val_acc))

# Testing
test_acc = evaluate(model, node_features, node_labels, test_mask)
print("Test Accuracy {:.4f}".format(test_acc))

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00001 | Loss 1.9457 | Accuracy 0.2420
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00002 | Loss 1.9351 | Accuracy 0.2620
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00003 | Loss 1.9296 | Accuracy 0.2340
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00004 | Loss 1.9210 | Accuracy 0.3200
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00005 | Loss 1.9140 | Accuracy 0.3800
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00006 | Loss 1.9095 | Accuracy 0.3160
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00007 | Loss 1.8954 | Accuracy 0.3460
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00008 | Loss 1.8773 | Accuracy 0.3520
torch.Size([2708]) torch.Size([2708, 1433])
Epoch 00009 | Loss 1.8887 | Accuracy 0.4200
torch.Size([2708]) torch.S

#### Misc testing below (ignore)

In [None]:
# Show the graph summary; relies on `graph` defined in the Run cell.
print(graph)

Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'label': Scheme(shape=(), dtype=torch.int64), 'feat': Scheme(shape=(1433,), dtype=torch.float32)}
      edata_schemes={})


In [None]:
# Inspect the label tensor. The Run cell names it `node_labels`; there is no
# module-level `labels`, which is why this cell used to raise NameError.
print(node_labels.size())
print(node_labels.max())
print(node_labels.min())
print(node_labels[0])

NameError: ignored

In [None]:
import torch
import torch.nn as nn
import dgl.function as fn
from dgl.nn import GATConv


class GAT(nn.Module):
    """Multi-layer graph attention network built from ``GATConv`` layers.

    The stack holds ``num_layers`` multi-head layers followed by one output
    layer, so ``heads`` is indexed from 0 up to ``num_layers`` and ``heads[-1]``
    is the head count of the output layer.
    """

    def __init__(
        self, g, num_layers, in_dim, num_hidden, num_classes, heads,
        activation, feat_drop, attn_drop, negative_slope, residual,
    ):
        super().__init__()
        self.g = g
        self.num_layers = num_layers
        self.gat_layers = nn.ModuleList()
        self.activation = activation

        # Positional arguments that every GATConv below shares.
        shared = (feat_drop, attn_drop, negative_slope)

        # Input projection: residual is always switched off here.
        self.gat_layers.append(
            GATConv(in_dim, num_hidden, heads[0], *shared, False, self.activation)
        )

        # Hidden layers: the previous layer's heads are flattened together,
        # hence the input width num_hidden * heads[idx - 1].
        for idx in range(1, num_layers):
            fan_in = num_hidden * heads[idx - 1]
            self.gat_layers.append(
                GATConv(fan_in, num_hidden, heads[idx], *shared, residual, self.activation)
            )

        # Output projection: no activation.
        self.gat_layers.append(
            GATConv(num_hidden * heads[-2], num_classes, heads[-1], *shared, residual, None)
        )

    def forward(self, inputs):
        """Return per-node logits for node features ``inputs``."""
        hidden = inputs
        for depth in range(self.num_layers):
            conv = self.gat_layers[depth]
            # flatten(1) merges every axis after the node axis (assumed to be
            # heads x features) into a single feature axis.
            hidden = conv(self.g, hidden).flatten(1)
        # Output projection: average over axis 1 (assumed to be the head axis).
        return self.gat_layers[-1](self.g, hidden).mean(1)

In [None]:
class EarlyStopping:
    """Signal when a higher-is-better score has stopped improving.

    Every call to ``step`` whose score is at least the best seen so far saves
    a checkpoint and resets the patience counter; a strictly lower score
    increments the counter.

    Args:
        patience: number of non-improving ``step`` calls, counted since the
            last reset, after which ``early_stop`` becomes True.
        checkpoint_path: file the model's ``state_dict`` is written to. The
            default matches the previously hard-coded location.
    """

    def __init__(self, patience=10, checkpoint_path='es_checkpoint.pt'):
        self.patience = patience
        self.checkpoint_path = checkpoint_path
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def step(self, acc, model):
        """Record a new score and return whether training should stop.

        Args:
            acc: the monitored score (higher is better).
            model: module whose ``state_dict`` is saved on a new best score.

        Returns:
            The current value of ``early_stop``; once True it is never reset.
        """
        score = acc
        if self.best_score is None:
            # First observation: it is the best by definition.
            self.best_score = score
            self.save_checkpoint(model)
        elif score < self.best_score:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Ties count as an improvement: save again and reset patience.
            self.best_score = score
            self.save_checkpoint(model)
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        '''Save the model's state_dict when the score matches or beats the best.'''
        torch.save(model.state_dict(), self.checkpoint_path)