# Tutorial3: GAT implementation

## Outline

- Implementation of GAT

Official resources:
* [Code](https://dsgiitr.com/blogs/gat/)

In [1]:
import os
import torch
# Export the installed torch version as the TORCH environment variable and show it.
# NOTE(review): TORCH is not read by any visible cell; presumably it is meant for
# shell/install commands run from the notebook — confirm before removing.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

1.12.1


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

## Structure

In [3]:
class GATLayer(nn.Module):
    """Bare outline of the Graph Attention layer: a constructor and a forward stub."""

    def __init__(self):
        super().__init__()

    def forward(self, input, adj):
        """Placeholder forward pass: prints an empty line and returns ``None``."""
        print("")

## Let's start from the forward method

### Linear Transformation : 1. apply a parameterized linear transformation to every node

$$
\bar{h'}_i = \textbf{W}\cdot \bar{h}_i
$$
with $\textbf{W}\in\mathbb R^{F'\times F}$ and $\bar{h}_i\in\mathbb R^{F}$.

$$
\bar{h'}_i \in \mathbb{R}^{F'}
$$

In [4]:
# Toy dimensions used throughout the walkthrough
in_features = 5   # F : number of input features per node
out_features = 2  # F': number of output features per node
nb_nodes = 3      # number of nodes

# Weight matrix W, shape (F, F'), filled by Xavier-uniform initialisation
W = nn.Parameter(torch.zeros(in_features, out_features))
nn.init.xavier_uniform_(W.data, gain=1.414)

# Random node features, one row per node: (3, 5)
input = torch.rand(nb_nodes, in_features)

# Linear transformation of every node: (3, 5) @ (5, 2) -> (3, 2)
h = input @ W
N = h.shape[0]  # number of nodes

print(W.shape)
print(h.shape)
print(N)

torch.Size([5, 2])
torch.Size([3, 2])
3


### Attention Mechanism : 2. Self attention

![title](AttentionMechanism.png)

In [5]:
# Attention vector a, shape (2*F', 1), filled by Xavier-uniform initialisation.
# It is applied to the concatenation of two transformed node vectors: each row of
# the (3, 2) matrix h has F' = 2 features, so a has shape (4, 1).
a = nn.Parameter(torch.zeros(2 * out_features, 1))
nn.init.xavier_uniform_(a.data, gain=1.414)
print(a.shape)

# LeakyReLU with negative slope 0.2
leakyrelu = nn.LeakyReLU(negative_slope=0.2)

torch.Size([4, 1])


In [6]:
h # node embeddings after multiplication by the weight matrix W

tensor([[-0.4319,  1.0813],
        [-0.4736, -0.3457],
        [-0.3334, -0.0062]], grad_fn=<MmBackward0>)

In [7]:
h.repeat(1,N) # tile N=3 times along the column dimension => (3,2) -> (3,6)

tensor([[-0.4319,  1.0813, -0.4319,  1.0813, -0.4319,  1.0813],
        [-0.4736, -0.3457, -0.4736, -0.3457, -0.4736, -0.3457],
        [-0.3334, -0.0062, -0.3334, -0.0062, -0.3334, -0.0062]],
       grad_fn=<RepeatBackward0>)

In [8]:
h.repeat(1,N).view(N * N, -1) # (3,6) -> (9,2): each row of h now appears N times in a row

tensor([[-0.4319,  1.0813],
        [-0.4319,  1.0813],
        [-0.4319,  1.0813],
        [-0.4736, -0.3457],
        [-0.4736, -0.3457],
        [-0.4736, -0.3457],
        [-0.3334, -0.0062],
        [-0.3334, -0.0062],
        [-0.3334, -0.0062]], grad_fn=<ViewBackward0>)

### Attention : 4. attention mechanism $a$ : a single-layer feed forward neural network

$\alpha_{i,j} = \frac{\exp(\mathrm{LeakyReLU}(a^{T}[\mathbf{W}h_i||\mathbf{W}h_j]))}{\sum_{k\in N(i)}\exp(\mathrm{LeakyReLU}(a^{T}[\mathbf{W}h_i||\mathbf{W}h_k]))}$

- a_input = $[\mathbf{W}h_i||\mathbf{W}h_j]$

In [9]:
# Every ordered pair (i, j): entry [i, j] holds the concatenation [W h_i || W h_j]
a_input = torch.cat(
    (
        h.repeat(1, N).view(N * N, -1),  # left half: each row of h repeated N times in a row
        h.repeat(N, 1),                  # right half: the whole of h stacked N times
    ),
    dim=1,
).view(N, -1, 2 * out_features)
print(a_input)
print(a_input.shape)

tensor([[[-0.4319,  1.0813, -0.4319,  1.0813],
         [-0.4319,  1.0813, -0.4736, -0.3457],
         [-0.4319,  1.0813, -0.3334, -0.0062]],

        [[-0.4736, -0.3457, -0.4319,  1.0813],
         [-0.4736, -0.3457, -0.4736, -0.3457],
         [-0.4736, -0.3457, -0.3334, -0.0062]],

        [[-0.3334, -0.0062, -0.4319,  1.0813],
         [-0.3334, -0.0062, -0.4736, -0.3457],
         [-0.3334, -0.0062, -0.3334, -0.0062]]], grad_fn=<ViewBackward0>)
torch.Size([3, 3, 4])


![title](a_input.png)

In [10]:
# a        : attention vector, shape (4, 1)
# a_input  : every pair of transformed node features, shape (3, 3, 4);
#            the first dimension is the number of nodes
# a_input @ a       -> (3, 3, 1)
# .squeeze(dim=2)   -> removes the size-1 dimension at index 2, giving (3, 3)
e = leakyrelu((a_input @ a).squeeze(dim=2))

In [11]:
# Shape check for each step of the score computation
print(a_input.shape,a.shape)  # operands: (3, 3, 4) and (4, 1)
print("")
print(torch.matmul(a_input,a).shape)  # matmul over the last dimension -> (3, 3, 1)
print("")
print(torch.matmul(a_input,a).squeeze(2).shape)  # drop the trailing size-1 dimension -> (3, 3)

torch.Size([3, 3, 4]) torch.Size([4, 1])

torch.Size([3, 3, 1])

torch.Size([3, 3])


In [12]:
e  # LeakyReLU-activated attention scores for every (i, j) node pair, shape (3, 3)

tensor([[ 1.1574, -0.0503,  0.0642],
        [ 0.5667, -0.1684, -0.1053],
        [ 0.8273, -0.1163, -0.0532]], grad_fn=<LeakyReluBackward0>)

### Masked Attention

In [13]:
# Masked Attention
# Random 0/1 adjacency matrix. Its size is taken from the score matrix e so the
# mask always lines up with e, whatever number of nodes was chosen above.
adj = torch.randint(2, tuple(e.shape))
print(adj)

# torch.ones_like(e): tensor of ones with the same shape as e.
# Scaling it by -9e15 gives a very large *negative* filler (not a value close to
# zero): positions that receive it end up with ~0 weight after the softmax.
zero_vec  = -9e15*torch.ones_like(e)
print(zero_vec)

tensor([[1, 1, 0],
        [0, 0, 1],
        [1, 1, 1]])
tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15]])


In [14]:
# Keep e[i, j] wherever the adjacency matrix entry is greater than 0;
# everywhere else take the corresponding value from zero_vec.
attention = torch.where(adj.gt(0), e, zero_vec)
print(adj,"\n",e,"\n",zero_vec)
attention

tensor([[1, 1, 0],
        [0, 0, 1],
        [1, 1, 1]]) 
 tensor([[ 1.1574, -0.0503,  0.0642],
        [ 0.5667, -0.1684, -0.1053],
        [ 0.8273, -0.1163, -0.0532]], grad_fn=<LeakyReluBackward0>) 
 tensor([[-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -9.0000e+15]])


tensor([[ 1.1574e+00, -5.0288e-02, -9.0000e+15],
        [-9.0000e+15, -9.0000e+15, -1.0530e-01],
        [ 8.2729e-01, -1.1631e-01, -5.3176e-02]], grad_fn=<WhereBackward0>)

### Normalization : 3. Softmax Normalization

In [15]:
# Row-wise softmax: each node's masked scores become weights that sum to 1.
# The masked scores are rebuilt from adj / e / zero_vec instead of reading the
# previous value of `attention`, so re-running this cell cannot apply the
# softmax a second time to already-normalised weights.
attention = F.softmax(torch.where(adj > 0, e, zero_vec), dim=1)

### 학습한 attention 사용하기 : 5. Node i의 이웃의 중요도를 결정하여 Input 데이터를 재정의

In [16]:

h_prime   = torch.matmul(attention, h) # update the node embeddings using the attention weights

In [17]:
attention  # attention weights after the row-wise softmax (each row sums to 1)

tensor([[0.7699, 0.2301, 0.0000],
        [0.0000, 0.0000, 1.0000],
        [0.5544, 0.2158, 0.2298]], grad_fn=<SoftmaxBackward0>)

In [18]:
h_prime # attention (3,3) @ weight-transformed node embeddings h (3,2) => (3,2)

tensor([[-0.4415,  0.7529],
        [-0.3334, -0.0062],
        [-0.4182,  0.5234]], grad_fn=<MmBackward0>)

#### h_prime vs h

In [19]:
# Side by side: attention-aggregated embeddings (h_prime) vs. the embeddings before aggregation (h)
print(h_prime,"\n",h)

tensor([[-0.4415,  0.7529],
        [-0.3334, -0.0062],
        [-0.4182,  0.5234]], grad_fn=<MmBackward0>) 
 tensor([[-0.4319,  1.0813],
        [-0.4736, -0.3457],
        [-0.3334, -0.0062]], grad_fn=<MmBackward0>)


# Build the layer

In [20]:
class GATLayer(nn.Module):
    """Graph attention layer with the constructor left as an exercise.

    ``forward`` reads ``self.W``, ``self.a``, ``self.leakyrelu``,
    ``self.out_features``, ``self.dropout`` and ``self.concat``; none of them is
    created by ``__init__`` yet, so calling ``forward`` on this version raises
    ``AttributeError`` until the TODO is filled in.
    """
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        """Only initialises ``nn.Module``; the arguments are not stored yet (see TODO)."""
        super(GATLayer, self).__init__()
        
        '''
        TODO
        '''
        
    def forward(self, input, adj):
        """Compute attention-weighted node embeddings.

        ``input`` is multiplied by ``self.W`` with ``torch.mm`` (so it must be 2-D),
        and ``adj`` is tested with ``adj > 0`` to select which scores are kept.
        Returns ``F.elu(h_prime)`` when ``self.concat`` is truthy, else ``h_prime``.
        """
        # Linear Transformation
        h = torch.mm(input, self.W) # matrix multiplication
        N = h.size()[0]  # number of rows of h (one per node)

        # Attention Mechanism
        # Build every (i, j) pair [h_i || h_j], then score each pair with self.a
        a_input = torch.cat([h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1).view(N, -1, 2 * self.out_features)
        e       = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))

        # Masked Attention
        # Scores where adj <= 0 are replaced by a very large negative value
        zero_vec  = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        
        # Normalise each row, apply dropout to the weights, then aggregate h
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime   = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

In [21]:
class GATLayer(nn.Module):
    """Single-head graph attention layer working on a dense adjacency matrix.

    Args:
        in_features: number of input features per node (rows of ``W``).
        out_features: number of output features per node (columns of ``W``).
        dropout: dropout probability applied to the attention weights
            (only active in training mode), e.g. 0.6.
        alpha: negative slope of the LeakyReLU applied to the raw scores, e.g. 0.2.
        concat: if true, ELU is applied to the output; pass ``False`` for an
            output layer to get the raw aggregated embeddings.
    """

    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super().__init__()
        self.dropout = dropout
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat

        # Xavier-uniform initialisation of the learnable parameters.
        # W: (in_features, out_features) linear map applied to every node.
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # a: (2 * out_features, 1) vector scoring a concatenated pair [h_i || h_j].
        self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):
        """Return the attention-aggregated node embeddings.

        Args:
            input: 2-D tensor of node features, shape ``(N, in_features)``.
            adj: tensor broadcastable to ``(N, N)``; node ``i`` attends to
                node ``j`` only where ``adj[i, j] > 0``.

        Returns:
            Tensor of shape ``(N, out_features)``: ``F.elu(h_prime)`` when
            ``self.concat`` is true, otherwise ``h_prime``.
        """
        # Linear transformation of every node: (N, F) @ (F, F') -> (N, F')
        h = torch.mm(input, self.W)
        N = h.size()[0]

        # Attention mechanism: build all (i, j) pairs [h_i || h_j] as an
        # (N, N, 2F') tensor, then score each pair with a and LeakyReLU -> (N, N)
        a_input = torch.cat(
            [h.repeat(1, N).view(N * N, -1), h.repeat(N, 1)], dim=1
        ).view(N, -1, 2 * self.out_features)
        e = self.leakyrelu(torch.matmul(a_input, self.a).squeeze(2))

        # Masked attention: scores of non-adjacent pairs are replaced by a very
        # large negative value so they get ~0 weight after the softmax
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)

        # Normalise per row, drop out some weights while training, aggregate
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        return h_prime

# Use it

In [22]:
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T

import matplotlib.pyplot as plt

# Load the dataset named by name_data through Planetoid, using /tmp/<name> as its root directory
name_data = 'Cora'
dataset = Planetoid(root= '/tmp/' + name_data, name = name_data)
# Attach the NormalizeFeatures transform to the dataset after construction
dataset.transform = T.NormalizeFeatures()

# Report the sizes that the model below reads from `dataset`
print(f"Number of Classes in {name_data}:", dataset.num_classes)
print(f"Number of Node Features in {name_data}:", dataset.num_node_features)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index


Number of Classes in Cora: 7
Number of Node Features in Cora: 1433


Processing...
Done!


In [23]:
class GAT(torch.nn.Module):
    """Two-layer classifier built from ``GATConv`` layers.

    The input and output sizes are read from the notebook-level ``dataset``.
    """

    def __init__(self):
        super().__init__()
        self.hid = 8       # hidden features per attention head
        self.in_head = 8   # attention heads in the first layer
        self.out_head = 1  # attention heads in the output layer

        self.conv1 = GATConv(
            dataset.num_features,
            self.hid,
            heads=self.in_head,
            dropout=0.6,
        )
        # concat=False: the multi-head outputs are averaged instead of concatenated
        self.conv2 = GATConv(
            self.hid * self.in_head,
            dataset.num_classes,
            concat=False,
            heads=self.out_head,
            dropout=0.6,
        )

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        feats = data.x
        edges = data.edge_index

        # F.dropout: https://pytorch.org/docs/stable/generated/torch.nn.functional.dropout.html
        # p is the probability of an element being zeroed; dropout is applied
        # only while the module is in training mode.
        feats = F.dropout(feats, p=0.6, training=self.training)
        feats = F.elu(self.conv1(feats, edges))  # ELU non-linearity (an activation, not a normalisation)
        feats = F.dropout(feats, p=0.6, training=self.training)
        logits = self.conv2(feats, edges)

        # log-softmax output, to be paired with F.nll_loss
        return F.log_softmax(logits, dim=1)
    
    
    
# Use the GPU when one is available, otherwise fall back to the CPU.
# (To force CPU execution, replace this with: device = torch.device('cpu').)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GAT().to(device)
data = dataset[0].to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# Nothing inside the loop leaves training mode, so one call before it is enough.
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed on the training nodes only
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])

    # Log the loss every 200 epochs
    if epoch%200 == 0:
        print(loss)

    loss.backward()
    optimizer.step()
    
    

tensor(1.9438, grad_fn=<NllLossBackward0>)
tensor(0.7586, grad_fn=<NllLossBackward0>)
tensor(0.6620, grad_fn=<NllLossBackward0>)
tensor(0.6435, grad_fn=<NllLossBackward0>)
tensor(0.6162, grad_fn=<NllLossBackward0>)


In [24]:
# Evaluation: switch off dropout and skip autograd bookkeeping during inference
model.eval()
with torch.no_grad():
    # Predicted class = index of the largest log-probability for each node
    pred = model(data).argmax(dim=1)

# Accuracy over the test nodes only
correct = float(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / data.test_mask.sum().item()
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.8180
