# GCN on CORA in PyTorch

In [None]:
# Install DGL, which provides the CoraGraphDataset imported in the next cell.
!pip install -q dgl

[K     |████████████████████████████████| 4.4 MB 8.0 MB/s 
[?25h

In [None]:
import numpy as np
import pandas as pd
import random
import os, sys, pickle
import random, math, gc

from collections import defaultdict

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

from datetime import datetime
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

import dgl
from dgl.data import CoraGraphDataset

DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), and configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up;
    # exporting the variable only affects child processes started later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one (a no-op when CUDA is absent).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may choose different algorithms from run to run,
    # so it has to be switched off for repeatable results.
    torch.backends.cudnn.benchmark = False

In [None]:
class GCNDataset(Dataset):
    """Per-node dataset view over a graph.

    Every item is a dict with the node id (``'node'``), its label (``'y'``),
    the split flag for that node (``'mask'``) and its float feature vector
    (``'x'``).

    Args:
        graph: Object exposing ``ndata`` (with the keys ``'train_mask'``,
            ``'test_mask'``, ``'label'`` and ``'feat'``), ``nodes()`` and
            ``num_nodes()``.
        is_train: Truthy to take the flags from ``ndata['train_mask']``,
            falsy to take them from ``ndata['test_mask']``.
    """

    def __init__(self, graph, is_train):
        super().__init__()
        split_key = 'train_mask' if is_train else 'test_mask'
        self.graph = graph
        self.mask = graph.ndata[split_key]
        self.label = graph.ndata['label']
        self.node = graph.nodes()
        # Features are cast to float once, up front.
        self.feat = graph.ndata['feat'].float()

    def __len__(self):
        # One item per node, regardless of the selected split.
        return self.graph.num_nodes()

    def __getitem__(self, idx):
        sample = dict(
            node=self.node[idx],
            y=self.label[idx],
            mask=self.mask[idx],
            x=self.feat[idx],
        )
        return sample

def get_A_mat(graph, config):
    """Build the dense, symmetrically normalised adjacency matrix of ``graph``.

    Computes ``D^{-1/2} (A + I) D^{-1/2}``, where ``A[src, dst]`` counts the
    edges from ``src`` to ``dst``, ``I`` adds a self-loop to every node and
    ``D`` is the diagonal matrix holding the row sums of ``A + I``.

    Args:
        graph: Object exposing ``num_nodes()`` and ``edges()``; the first two
            elements returned by ``edges()`` must be integer tensors with the
            source and destination node ids.
        config: Object whose ``device`` attribute names the target device.

    Returns:
        A float32 tensor of shape ``(num_nodes, num_nodes)`` on
        ``config.device``.
    """
    num_nodes = graph.num_nodes()
    edges = graph.edges()
    src = edges[0].cpu().numpy()
    dst = edges[1].cpu().numpy()

    A = np.zeros((num_nodes, num_nodes))
    # np.add.at accumulates repeated (src, dst) pairs, exactly like a
    # per-edge "+= 1"; a plain fancy-indexed "+=" would count them once.
    np.add.at(A, (src, dst), 1)
    A = A + np.identity(num_nodes)

    # Self-loops guarantee every row sum is at least 1, so the power is finite.
    deg_inv_sqrt = np.power(np.sum(A, axis=1), -0.5)
    # Scaling rows, then columns, equals D @ A @ D for a diagonal D but
    # avoids two dense N x N matrix products.
    Ahat = (deg_inv_sqrt[:, None] * A) * deg_inv_sqrt[None, :]
    return torch.tensor(Ahat).float().to(config.device)

In [None]:
class GCNLayer(nn.Module):
    """Single graph-convolution layer: aggregate, apply dropout, project.

    ``forward`` computes ``W(dropout(adj @ x))`` where ``W`` is an
    ``nn.Linear`` (with bias).

    Args:
        input: Number of input features per node.
        output: Number of output features per node.
        dropout: Dropout probability applied to the aggregated features.
    """

    def __init__(self, input, output, dropout):
        super(GCNLayer, self).__init__()
        self.input = input
        self.output = output
        self.W = nn.Linear(input, output)
        self.dropout = nn.Dropout(dropout)
        # Glorot/Xavier initialisation: zero-centred and scaled by fan-in and
        # fan-out. An all-positive U(0, 1) draw would start every hidden unit
        # nearly identical and give activations that grow with input width.
        nn.init.xavier_uniform_(self.W.weight)

    def forward(self, x, adj):
        """Propagate node features through the layer.

        Args:
            x: Node feature matrix; its rows must line up with the columns
                of ``adj``.
            adj: Matrix used as the left operand of ``torch.spmm``.

        Returns:
            The projected features, one row per row of ``adj``.
        """
        output = torch.spmm(adj, x)
        output = self.dropout(output)
        output = self.W(output)
        return output

class GCN(nn.Module):
    """Two stacked GCN layers with a ReLU in between.

    Args:
        config: Object providing ``input_dim``, ``hidden_dim`` and
            ``output_dim``.
    """

    def __init__(self, config):
        super().__init__()
        # Both layers share the same fixed dropout probability.
        layer_dropout = 0.1
        self.gcn1 = GCNLayer(config.input_dim, config.hidden_dim, dropout=layer_dropout)
        self.gcn2 = GCNLayer(config.hidden_dim, config.output_dim, dropout=layer_dropout)

    def forward(self, batch_data, A):
        """Run both layers and keep only the rows selected by the mask.

        Args:
            batch_data: Mapping with the entries ``'y'`` (labels), ``'x'``
                (features) and ``'mask'`` (row selector).
            A: Matrix handed to both layers as their ``adj`` argument.

        Returns:
            A ``(outputs, labels)`` pair, each indexed by ``batch_data['mask']``.
        """
        targets = batch_data['y']
        features = batch_data['x']
        keep = batch_data['mask']

        hidden = F.relu(self.gcn1(features, A))
        logits = self.gcn2(hidden, A)
        return logits[keep], targets[keep]

In [None]:
class Config:
    """Hyper-parameters and runtime settings for the GCN experiment."""

    # --- reproducibility / hardware ---
    seed = 1995
    # Use the first GPU when one is available, otherwise the CPU.
    device = 'cpu' if not torch.cuda.is_available() else 'cuda:0'

    # --- model ---
    hidden_dim = 16

    # --- optimisation ---
    epochs = 200
    learning_rate = 0.01
    weight_decay = 5e-4
    # None disables early stopping in the training loop.
    early_stopping_round = None

config = Config()

# Load Cora and derive the data-dependent settings from the graph itself.
dataset = CoraGraphDataset()
graph = dataset[0]
config.batch_size = graph.num_nodes()
config.input_dim = graph.ndata['feat'].shape[1]
config.output_dim = graph.ndata['label'].unique().shape[0]

seed_everything(config.seed)
train_set = GCNDataset(graph, True)
# NOTE(review): is_train=False selects graph.ndata['test_mask'], so the
# "valid" metrics and the best-epoch selection below use the test split.
# Confirm this is intended.
valid_set = GCNDataset(graph, False)
# Full-batch training: each batch has to contain every node in its original
# order because the model multiplies the full adjacency matrix with the
# batch features. Hence batch_size == num_nodes and shuffle=False.
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=False)
valid_loader = DataLoader(valid_set, batch_size=config.batch_size, shuffle=False)

A = get_A_mat(graph, config)
model = GCN(config).to(config.device)
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
loss_fn = nn.CrossEntropyLoss()
history = defaultdict(list)

start = datetime.now()
# early_step counts consecutive epochs without a new best validation accuracy.
best_acc, early_step, best_epoch = 0, 0, 0
for epoch in range(config.epochs):
    model.train()
    for batch_data in train_loader:
        optimizer.zero_grad()
        batch_data = {k: v.to(config.device) for k, v in batch_data.items()}
        output, true = model(batch_data, A)
        # Stored as a plain float so the history holds no device tensors.
        acc_tr = (torch.argmax(output, dim=1) == true).float().mean().item()
        loss = loss_fn(output, true)
        loss.backward()
        optimizer.step()

    history['train_loss'].append(loss.item())
    history['train_acc'].append(acc_tr)

    model.eval()
    with torch.no_grad():
        for batch_data in valid_loader:
            batch_data = {k: v.to(config.device) for k, v in batch_data.items()}
            output, true = model(batch_data, A)
            acc = (torch.argmax(output, dim=1) == true).float().mean().item()
            loss = loss_fn(output, true)

    history['valid_loss'].append(loss.item())
    history['valid_acc'].append(acc)

    if epoch == 0 or epoch == config.epochs-1 or (epoch+1)%10 == 0:
        print(f'EPOCH {epoch+1} : TRAINING loss {history["train_loss"][-1]:.3f}, TRAINING ACC {history["train_acc"][-1]:.3f}, VALID loss {history["valid_loss"][-1]:.3f}, VALID ACC {history["valid_acc"][-1]:.3f}')

    if history['valid_acc'][-1] > best_acc:
        best_acc = history['valid_acc'][-1]
        best_epoch = epoch
        # An improvement restarts the patience window; without the reset the
        # counter would track total, not consecutive, stale epochs.
        early_step = 0
    elif config.early_stopping_round is not None:
        early_step += 1
        if early_step >= config.early_stopping_round:
            break
end = datetime.now()
print(end-start)
print(f'At EPOCH {best_epoch + 1}, We have Best Acc {best_acc}')

Downloading /root/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /root/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.
EPOCH 1 : TRAINING loss 2.322, TRAINING ACC 0.143, VALID loss 2.266, VALID ACC 0.091
EPOCH 10 : TRAINING loss 1.946, TRAINING ACC 0.193, VALID loss 1.947, VALID ACC 0.148
EPOCH 20 : TRAINING loss 1.872, TRAINING ACC 0.229, VALID loss 1.923, VALID ACC 0.146
EPOCH 30 : TRAINING loss 1.829, TRAINING ACC 0.486, VALID loss 1.884, VALID ACC 0.350
EPOCH 40 : TRAINING loss 1.765, TRAINING ACC 0.736, VALID loss 1.846, VALID ACC 0.684
EPOCH 50 : TRAINING loss 1.687, TRAINING ACC 0.757, VALID loss 1.801, VALID ACC 0.663
EPOCH 60 : TRAINING loss 1.589, TRAINING ACC 0.793, VALID loss 1.732, VALID ACC 0.734
EPOCH 70 : TRAINING loss 1.433, TRAINING ACC 0.864, VALI

---

In [None]:
# Record author, timestamp, machine details and the versions of the imported
# packages so the run can be reproduced later.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-11-28 13:39:52

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy     : 1.19.5
sys       : 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
IPython   : 5.5.0
torch     : 1.10.0+cu111
networkx  : 2.6.3
dgl       : 0.6.1
pandas    : 1.1.5
matplotlib: 3.2.2



---

**END**