In [1]:
#%%
! pip install dgl-cu101 ogb
ROOT = "/kaggle/input/obgnproteins-dgl-data/"

Collecting dgl-cu101
  Downloading dgl_cu101-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (25.5 MB)
[K     |████████████████████████████████| 25.5 MB 18.7 MB/s 
[?25hCollecting ogb
  Downloading ogb-1.2.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.5 MB/s 
Collecting outdated>=0.2.0
  Downloading outdated-0.2.0.tar.gz (4.0 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
Building wheels for collected packages: outdated, littleutils
  Building wheel for outdated (setup.py) ... [?25l- \ done
[?25h  Created wheel for outdated: filename=outdated-0.2.0-py3-none-any.whl size=4960 sha256=e27d224c62a7fbe616f2904662309657d72318f9a0cb76023416794b96cc3089
  Stored in directory: /root/.cache/pip/wheels/6f/cd/a2/e49170b2cf59e88b952f3414f25a54d9f16f033bded4aaab26
  Building wheel for littleutils (setup.py) ... [?25l- \ done
[?25h  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7048 sha25

In [2]:
from dgl.data.utils import load_graphs
import numpy as np
import random
import os
import math
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
from dgl import DGLGraph
from dgl.nn import GraphConv, SAGEConv
from ogb.nodeproppred import Evaluator
from torch.optim import Adam
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import ReduceLROnPlateau

x,_ = load_graphs(ROOT + "data.bin")
graph = x[0]

train_idx = torch.load(ROOT + 'train.pt')
test_idx = torch.load(ROOT + 'test.pt')
valid_idx = torch.load(ROOT + 'valid.pt')

splitted_idx = {'train':train_idx, 'test':test_idx, 'valid':valid_idx}

device = torch.device('cuda')

Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


In [3]:
# copy mean of edge features to node
# split node feature for every channel of the edge
graph.update_all(fn.copy_e('feat','m'),fn.sum('m','edata_mean'))

n_feat = graph.ndata['edata_mean'].shape[1]

for channel in range(n_feat):
        graph.ndata['feat_' + str(channel)] = graph.ndata['edata_mean'][:, channel:channel+1]


In [4]:
def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    print ('random seed set to be ' + str(seed))

def run_a_train_epoch(graph, node_idx, model, criterion, optimizer, evaluator):
    model.train()
    logits = model(graph)[node_idx]
    labels = graph.ndata['labels'][node_idx]
    loss = criterion(logits, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss = loss.data.item()
    labels = labels.cpu().numpy()
    preds = logits.cpu().detach().numpy()

    return loss, evaluator.eval({"y_true": labels, "y_pred": preds})['rocauc']

def run_an_eval_epoch(graph, splitted_idx, model, evaluator):
    model.eval()
    with torch.no_grad():
        logits = model(graph)
    labels = graph.ndata['labels'].cpu().numpy()
    preds = logits.cpu().detach().numpy()

    train_score = evaluator.eval({
        "y_true": labels[splitted_idx["train"]],
        "y_pred": preds[splitted_idx["train"]]
    })
    val_score = evaluator.eval({
        "y_true": labels[splitted_idx["valid"]],
        "y_pred": preds[splitted_idx["valid"]]
    })
    test_score = evaluator.eval({
        "y_true": labels[splitted_idx["test"]],
        "y_pred": preds[splitted_idx["test"]]
    })

    return train_score['rocauc'], val_score['rocauc'], test_score['rocauc']

In [5]:
set_random_seed(0)

class Block(nn.Module):
    def __init__(self,in_feats,out_feats,num_channels=8):
        super(Block, self).__init__()
        self.gc = GraphConv(in_feats, out_feats)
        
    def forward(self, g, node_state):
        node_states = self.gc(g, node_state)
        node_states = F.relu(node_states)
        return node_states

class Net(nn.Module):
    def __init__(self, in_feats, n_hidden, out_feats,num_paths=2):
        super(Net, self).__init__()
        self.num_paths = num_paths
        self.init = Block(in_feats, n_hidden)
        self.lyrs = nn.ModuleList()
        for _ in range(6):
            self.lyrs.append(Block(n_hidden, n_hidden))
        self.lin = nn.Linear(n_hidden, out_feats)
        
    def forward(self, g):
        node_state = g.ndata['edata_mean']
        out = self.init(g, node_state)
        
        for i in range(6):
            _out = out
            out = self.lyrs[i](g, out)
            out += _out
        # for i in range(4,6):
        #     _out = self.lyrs[i](g, _out)
        # out = torch.stack([out, _out],dim=1).sum(1)
        out = self.lin(out)
        return out

random seed set to be 0


In [6]:
warmup_epochs = 10
num_epochs = 500
patience = 50
log_every = 10
lr = 1e-2
weight_decay = 0

model = Net(in_feats=8, n_hidden=256, out_feats=112).to(device)
graph = graph.to(device)

optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
min_lr = 1e-3
scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.7, patience=200,  verbose=True, cooldown=20, min_lr=min_lr)
print ('scheduler min_lr', min_lr)
criterion = nn.BCEWithLogitsLoss()
evaluator = Evaluator('ogbn-proteins')


scheduler min_lr 0.001


In [7]:
dur = []
best_score = 0.
num_patient_epochs = 0
model_folder = './saved_models/'
model_path = model_folder + "model.pt"
log_path = "log.txt"

def printw(line):
    with open(log_path,'a') as f:
        f.write(line+"\n")
    print(line)

if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    
print("Warming up for {:d} epochs...".format(warmup_epochs))
t0 = time.time()
for _ in range(warmup_epochs):
    loss, train_score = run_a_train_epoch(graph, splitted_idx["train"], model, criterion, optimizer, evaluator)
    scheduler.step(loss)
dur = time.time() - t0 
print("Done in {:.2f} sec".format(dur))

for epoch in range(1, num_epochs + 1):
    t0 = time.time()
    loss, train_score = run_a_train_epoch(graph, splitted_idx["train"], model, criterion, optimizer, evaluator)
    train_score, val_score, test_score = run_an_eval_epoch(graph, splitted_idx, model, evaluator)
    scheduler.step(val_score)
    dur = time.time() - t0   

    # Early stop
    if test_score > best_score:
        torch.save(model.state_dict(), model_path)
        best_score = test_score
        num_patient_epochs = 0
    else:
        num_patient_epochs += 1

    if (epoch%log_every) == 0:
        printw('Epoch {:d}\t loss {:.4f}\t train score {:.4f}\t val score {:.4f}\t test score {:.4f}\t dur {:.2f}s\t patience {:d}'.format(epoch, loss, train_score, val_score,test_score, dur, num_patient_epochs))

    if num_patient_epochs == patience:
        break

Warming up for 10 epochs...
Done in 56.68 sec
Epoch 10	 loss 0.5154	 train score 0.4649	 val score 0.4017	 test score 0.4204	 dur 11.30s	 patience 5
Epoch 20	 loss 0.4200	 train score 0.5424	 val score 0.4780	 test score 0.4805	 dur 11.52s	 patience 0
Epoch 30	 loss 0.3701	 train score 0.5834	 val score 0.5480	 test score 0.5538	 dur 11.64s	 patience 0
Epoch 40	 loss 0.3517	 train score 0.6326	 val score 0.6128	 test score 0.5538	 dur 12.09s	 patience 0
Epoch 50	 loss 0.3386	 train score 0.6639	 val score 0.6660	 test score 0.5609	 dur 11.69s	 patience 1
Epoch 60	 loss 0.3278	 train score 0.6998	 val score 0.7005	 test score 0.5758	 dur 11.82s	 patience 0
Epoch 70	 loss 0.3195	 train score 0.7221	 val score 0.7217	 test score 0.5936	 dur 11.37s	 patience 0
Epoch 80	 loss 0.3128	 train score 0.7412	 val score 0.7351	 test score 0.5995	 dur 11.53s	 patience 1
Epoch 90	 loss 0.3079	 train score 0.7559	 val score 0.7512	 test score 0.6342	 dur 11.63s	 patience 0
Epoch 100	 loss 0.3036	 tra

In [8]:
printw("Best test score: " +  str(best_score))

Best test score: 0.6807677590571385
