## Install prerequisites

In [1]:
# `time` supplies the wall-clock reference; `humanize` is used by later cells
# to print elapsed durations in readable form.
import time
import humanize
# Timestamp taken when this first cell runs; later cells report the time
# elapsed relative to this value.
start_time = time.time()

In [2]:
# install Open Graph Benchmark
! pip install ogb

# install PyTorch Geometric
!pip install torch-scatter==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.5.0.html
!pip install torch-sparse==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.5.0.html
!pip install torch-cluster==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.5.0.html
!pip install torch-spline-conv==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.5.0.html
!pip install torch-geometric

Collecting ogb
  Downloading ogb-1.2.0-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 916 kB/s 
Installing collected packages: ogb
Successfully installed ogb-1.2.0
Looking in links: https://pytorch-geometric.com/whl/torch-1.5.0.html
Collecting torch-scatter==latest+cu101
  Downloading https://pytorch-geometric.com/whl/torch-1.5.0/torch_scatter-latest%2Bcu101-cp37-cp37m-linux_x86_64.whl (12.3 MB)
[K     |████████████████████████████████| 12.3 MB 583 kB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.5
Looking in links: https://pytorch-geometric.com/whl/torch-1.5.0.html
Collecting torch-sparse==latest+cu101
  Downloading https://pytorch-geometric.com/whl/torch-1.5.0/torch_sparse-latest%2Bcu101-cp37-cp37m-linux_x86_64.whl (21.6 MB)
[K     |████████████████████████████████| 21.6 MB 60.4 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.5
Looking in links: htt

## Import libraries

In [3]:
import torch
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR

from torch_sparse import SparseTensor
from torch_scatter import scatter
from torch_geometric.nn.inits import glorot, zeros
from torch_geometric.nn import Node2Vec

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

## Logger

In [4]:
class Logger(object):
    """Collect per-evaluation (train, valid, test) scores for several runs.

    Each run owns a list of 3-tuples appended through ``add_result``.
    ``print_statistics`` reports the scores multiplied by 100. "Final" values
    are the train/test scores taken at the evaluation with the highest
    validation score.

    Args:
        runs: Number of independent runs to reserve result lists for.
        info: Arbitrary object stored on ``self.info``; not used by the
            methods of this class.
    """

    def __init__(self, runs, info=None):
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one (train, valid, test) triple to the given run."""
        assert len(result) == 3
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    @staticmethod
    def _mean_std(values):
        """Return (mean, std) of a 1-D tensor; std is 0.0 for a single value.

        ``Tensor.std()`` is NaN for a single element, which previously made
        the summary print ``± nan`` whenever only one run was recorded.
        """
        mean = values.mean().item()
        std = values.std().item() if values.numel() > 1 else 0.0
        return mean, std

    def print_statistics(self, run=None):
        """Print the best scores of one run, or mean ± std over all runs.

        Args:
            run: Index of the run to summarise. When ``None``, every run that
                has at least one result is summarised and aggregated.
        """
        if run is not None:
            if not self.results[run]:
                # Nothing was evaluated for this run; avoid an opaque
                # IndexError from indexing an empty tensor.
                print(f'Run {run + 1:02d}:')
                print('No results recorded.')
                return

            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f'  Final Train: {result[argmax, 0]:.2f}')
            print(f'   Final Test: {result[argmax, 2]:.2f}')
        else:
            best_results = []
            # Build one tensor per run so that runs with different numbers of
            # evaluations (or none at all) do not break the summary.
            for run_results in self.results:
                if not run_results:
                    continue
                r = 100 * torch.tensor(run_results)
                best_epoch = r[:, 1].argmax()
                best_results.append((
                    r[:, 0].max().item(),
                    r[:, 1].max().item(),
                    r[best_epoch, 0].item(),
                    r[best_epoch, 2].item(),
                ))

            print('All runs:')
            if not best_results:
                print('No results recorded.')
                return

            best_result = torch.tensor(best_results)
            labels = ('Highest Train', 'Highest Valid',
                      '  Final Train', '   Final Test')
            for column, label in enumerate(labels):
                mean, std = self._mean_std(best_result[:, column])
                print(f'{label}: {mean:.2f} ± {std:.2f}')

## Config

In [5]:
class args:
    """Notebook-wide settings, read as plain class attributes (``args.lr``)."""

    # --- runtime -----------------------------------------------------------
    # Use the first GPU when CUDA is available, otherwise the CPU.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    # --- Node2Vec embedding (passed to Node2Vec and its loader) -------------
    embedding_dim = 128
    walk_length = 80
    context_size = 20
    walks_per_node = 10
    batch_size = 256

    # --- MLP classifier -----------------------------------------------------
    hidden_channels = 256
    num_layers = 3
    # Not referenced by any cell visible in this notebook.
    skip_layers = 3
    dropout = 0.05
    # Concatenate the saved Node2Vec embedding to the node features.
    use_node_embedding = True

    # --- optimisation / bookkeeping ----------------------------------------
    # Learning rate and epoch count are shared by the Node2Vec and MLP loops.
    lr = 0.01
    epochs = 100
    runs = 1
    # Evaluate / print every N epochs of the MLP loop.
    eval_steps = 1
    log_steps = 1

In [6]:
# Compute device selected in the config cell.
device = torch.device(args.device)

# Load the ogbn-proteins dataset and take its first graph.
dataset = PygNodePropPredDataset(name='ogbn-proteins')
data = dataset[0]

# Node2Vec embedding model built on the graph's edge list, using sparse
# gradients (hence the SparseAdam optimizer below).
model = Node2Vec(
    data.edge_index,
    embedding_dim=args.embedding_dim,
    walk_length=args.walk_length,
    context_size=args.context_size,
    walks_per_node=args.walks_per_node,
    sparse=True,
).to(device)

# Batches of (positive, negative) random walks for the training loop.
loader = model.loader(
    batch_size=args.batch_size,
    shuffle=True,
    num_workers=4,
)
optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

Downloading https://snap.stanford.edu/ogb/data/nodeproppred/proteinfunc.zip


Downloaded 0.21 GB: 100%|██████████| 216/216 [00:06<00:00, 31.65it/s]


Extracting dataset/proteinfunc.zip
Processing...
Loading necessary files...
This might take a while.


  0%|          | 0/1 [00:00<?, ?it/s]

Processing graphs...


100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
100%|██████████| 1/1 [00:00<00:00, 352.46it/s]


Converting graphs into PyG objects...
Saving...
Done!


In [7]:
# Elapsed wall-clock time since the first cell. The operands are ordered
# "now - start" so the duration is positive instead of relying on humanize
# to take the absolute value of a negative delta.
print(humanize.naturaldelta(time.time() - start_time))

3 minutes


## Node2Vec

In [8]:
def save_embedding(model, path='embedding.pt'):
    """Write the model's embedding matrix to disk as a CPU tensor.

    Args:
        model: Object exposing ``embedding.weight`` (a tensor or parameter).
        path: Destination file. Defaults to ``'embedding.pt'``, the file the
            downstream cell loads, so existing calls behave as before.
    """
    # Detach from the autograd graph and move to CPU so the file can be
    # loaded on a machine without a GPU.
    weights = model.embedding.weight.detach().cpu()
    torch.save(weights, path)

In [9]:
# Train the Node2Vec embeddings: one optimizer step per batch of walks.
model.train()
steps_per_epoch = len(loader)
for epoch in range(1, args.epochs + 1):
    # Count steps from 1 so the periodic checks read naturally.
    for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        # Progress line every 100 steps.
        if step % 100 == 0:
            print(f'Epoch: {epoch:02d}, Step: {step:03d}/{steps_per_epoch}, '
                  f'Loss: {loss:.4f}')

        # Checkpoint the embedding every 10 steps.
        if step % 10 == 0:
            save_embedding(model)

    # Always checkpoint at the end of an epoch as well.
    save_embedding(model)

Epoch: 01, Step: 100/518, Loss: 4.3751
Epoch: 01, Step: 200/518, Loss: 2.0068
Epoch: 01, Step: 300/518, Loss: 1.4070
Epoch: 01, Step: 400/518, Loss: 1.2177
Epoch: 01, Step: 500/518, Loss: 1.1533
Epoch: 02, Step: 100/518, Loss: 1.1252
Epoch: 02, Step: 200/518, Loss: 1.1169
Epoch: 02, Step: 300/518, Loss: 1.1068
Epoch: 02, Step: 400/518, Loss: 1.1041
Epoch: 02, Step: 500/518, Loss: 1.1045
Epoch: 03, Step: 100/518, Loss: 1.1021
Epoch: 03, Step: 200/518, Loss: 1.1017
Epoch: 03, Step: 300/518, Loss: 1.1011
Epoch: 03, Step: 400/518, Loss: 1.1022
Epoch: 03, Step: 500/518, Loss: 1.0981
Epoch: 04, Step: 100/518, Loss: 1.1010
Epoch: 04, Step: 200/518, Loss: 1.0977
Epoch: 04, Step: 300/518, Loss: 1.0987
Epoch: 04, Step: 400/518, Loss: 1.0982
Epoch: 04, Step: 500/518, Loss: 1.0977
Epoch: 05, Step: 100/518, Loss: 1.0987
Epoch: 05, Step: 200/518, Loss: 1.0984
Epoch: 05, Step: 300/518, Loss: 1.1009
Epoch: 05, Step: 400/518, Loss: 1.0988
Epoch: 05, Step: 500/518, Loss: 1.0997
Epoch: 06, Step: 100/518,

In [10]:
# Elapsed wall-clock time since the first cell. The operands are ordered
# "now - start" so the duration is positive instead of relying on humanize
# to take the absolute value of a negative delta.
print(humanize.naturaldelta(time.time() - start_time))

3 hours


## Model

In [11]:
class MLP(torch.nn.Module):
    """Stack of linear layers with ReLU and dropout between them.

    The stack always has an input layer and an output layer, plus
    ``num_layers - 2`` hidden-to-hidden layers in between (none when
    ``num_layers`` is 2 or smaller).

    Args:
        in_channels: Width of the input features.
        hidden_channels: Width of every intermediate layer.
        out_channels: Width of the output.
        num_layers: Requested number of linear layers.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super().__init__()

        # Layer widths from input to output; consecutive pairs define one
        # Linear layer each, created in input-to-output order.
        extra_hidden = max(num_layers - 2, 0)
        widths = ([in_channels]
                  + [hidden_channels] * (extra_hidden + 1)
                  + [out_channels])
        self.lins = torch.nn.ModuleList([
            torch.nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every linear layer."""
        for layer in self.lins:
            layer.reset_parameters()

    def forward(self, x):
        """Return the raw outputs of the last layer (no final activation)."""
        *hidden_layers, output_layer = self.lins
        for layer in hidden_layers:
            activated = F.relu(layer(x))
            x = F.dropout(activated, p=self.dropout, training=self.training)
        return output_layer(x)


In [12]:
def train(model, x, y_true, train_idx, optimizer):
    """Run one full-batch optimisation step and return the loss as a float.

    The model is evaluated on all of ``x``; only the rows selected by
    ``train_idx`` contribute to the binary cross-entropy (with logits) loss.
    """
    model.train()
    optimizer.zero_grad()

    logits = model(x)[train_idx]
    targets = y_true[train_idx].to(torch.float)
    loss = F.binary_cross_entropy_with_logits(logits, targets)

    loss.backward()
    optimizer.step()

    return loss.item()


@torch.no_grad()
def test(model, x, y_true, split_idx, evaluator):
    model.eval()

    y_pred = model(x)

    train_rocauc = evaluator.eval({
        'y_true': y_true[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['rocauc']
    valid_rocauc = evaluator.eval({
        'y_true': y_true[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['rocauc']
    test_rocauc = evaluator.eval({
        'y_true': y_true[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['rocauc']

    return train_rocauc, valid_rocauc, test_rocauc

In [13]:
# Use the configured device, falling back to the CPU when CUDA is missing.
device = torch.device(args.device if torch.cuda.is_available() else 'cpu')

dataset = PygNodePropPredDataset(name='ogbn-proteins')
split_idx = dataset.get_idx_split()
data = dataset[0]

# Node features: mean of the edge attributes over the edges grouped by their
# first endpoint (edge_index[0]); one row per node.
x = scatter(data.edge_attr, data.edge_index[0], dim=0,
            dim_size=data.num_nodes, reduce='mean').to('cpu')

# Optionally append the Node2Vec embedding saved by the earlier training cell.
if args.use_node_embedding:
    embedding = torch.load('embedding.pt', map_location='cpu')
    x = torch.cat([x, embedding], dim=-1)

x = x.to(device)
y_true = data.y.to(device)
train_idx = split_idx['train'].to(device)

# The loss compares model outputs with the label matrix element-wise, so the
# output width must equal the label width. Derive it from the labels instead
# of hard-coding 112.
num_tasks = y_true.size(-1)
model = MLP(x.size(-1), args.hidden_channels, num_tasks, args.num_layers,
            args.dropout).to(device)

evaluator = Evaluator(name='ogbn-proteins')
logger = Logger(args.runs, args)

In [14]:
# Train and evaluate the MLP from scratch for each run, then summarise.
for run in range(args.runs):
    # Fresh weights and a fresh optimizer for every run.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    for epoch in range(1, args.epochs + 1):
        loss = train(model, x, y_true, train_idx, optimizer)

        # Only evaluate on the configured epochs.
        if epoch % args.eval_steps != 0:
            continue
        result = test(model, x, y_true, split_idx, evaluator)
        logger.add_result(run, result)

        # Only print on the configured epochs (a subset of evaluated ones).
        if epoch % args.log_steps != 0:
            continue
        train_rocauc, valid_rocauc, test_rocauc = result
        print(f'Run: {run + 1:02d}, '
              f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_rocauc:.2f}%, '
              f'Valid: {100 * valid_rocauc:.2f}% '
              f'Test: {100 * test_rocauc:.2f}%')

    logger.print_statistics(run)
logger.print_statistics()

Run: 01, Epoch: 01, Loss: 0.6916, Train: 42.99%, Valid: 44.82% Test: 44.55%
Run: 01, Epoch: 02, Loss: 0.6201, Train: 42.78%, Valid: 45.31% Test: 45.95%
Run: 01, Epoch: 03, Loss: 0.4820, Train: 46.30%, Valid: 48.87% Test: 49.84%
Run: 01, Epoch: 04, Loss: 0.3747, Train: 58.75%, Valid: 57.45% Test: 58.17%
Run: 01, Epoch: 05, Loss: 0.4017, Train: 67.66%, Valid: 64.69% Test: 62.91%
Run: 01, Epoch: 06, Loss: 0.3927, Train: 71.53%, Valid: 68.02% Test: 64.31%
Run: 01, Epoch: 07, Loss: 0.3519, Train: 73.32%, Valid: 69.36% Test: 64.76%
Run: 01, Epoch: 08, Loss: 0.3271, Train: 74.19%, Valid: 69.94% Test: 64.86%
Run: 01, Epoch: 09, Loss: 0.3267, Train: 74.68%, Valid: 70.29% Test: 64.93%
Run: 01, Epoch: 10, Loss: 0.3347, Train: 74.99%, Valid: 70.54% Test: 65.00%
Run: 01, Epoch: 11, Loss: 0.3379, Train: 75.19%, Valid: 70.76% Test: 65.09%
Run: 01, Epoch: 12, Loss: 0.3337, Train: 75.22%, Valid: 70.88% Test: 65.19%
Run: 01, Epoch: 13, Loss: 0.3260, Train: 75.10%, Valid: 70.86% Test: 65.28%
Run: 01, Epo