# GCN in a transductive setting for Node betweenness
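This notebook shows how a GCN/GraphSAGE model is trained to compute node betweenness centrality on different graphs. The cells visible below run a preliminary semi-supervised node-classification benchmark (GCN, SGConv, ChebConv, APPNP and GAT) on the Cora dataset.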

# 1. Codebase

In [38]:
%load_ext autoreload
%autoreload 2
import torch
from torch.nn import *
from TFM_edge_betweenness_model import META1
from TFM_node_betweenness_training import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 2. Selected graphs for HP search

Graphs whose nodes have higher betweenness centrality values.

In [39]:
import os.path as osp

from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T


def get_planetoid_dataset(root, name, normalize_features=False, transform=None):
    """Load a Planetoid dataset and attach the requested transform(s).

    Args:
        root: Directory handed to ``Planetoid`` as its root folder.
        name: Dataset name handed to ``Planetoid``.
        normalize_features: When true, ``T.NormalizeFeatures()`` is attached.
        transform: Optional extra transform. When combined with feature
            normalisation it is composed after it.

    Returns:
        The ``Planetoid`` dataset. ``dataset.transform`` is only overwritten
        when at least one of the two options is requested.
    """
    dataset = Planetoid(root, name)

    if normalize_features:
        normalizer = T.NormalizeFeatures()
        if transform is None:
            dataset.transform = normalizer
        else:
            # Normalisation first, then the caller-supplied transform.
            dataset.transform = T.Compose([normalizer, transform])
    elif transform is not None:
        dataset.transform = transform

    return dataset

#dataset1 = TUDataset(root='temp/'+thename, name='REDDIT-BINARY')
# Load Cora with feature normalisation enabled, using 'temp/planetoid' as the dataset root.
dataset = get_planetoid_dataset(root='temp/planetoid',name='Cora', normalize_features=True)
#inspectGraphDataset(dataset, thename)

# 3. Benchmark training

In [62]:
from __future__ import division

import time

import torch
import torch.nn.functional as F
from torch import tensor
from torch.optim import Adam

from torch_geometric.nn import SGConv, APPNP, ChebConv, GATConv, GCNConv

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def index_to_mask(index, size):
    """Turn a tensor of positions into a boolean membership mask.

    Args:
        index: Integer tensor holding the positions to mark.
        size: Size passed to ``torch.zeros`` for the mask.

    Returns:
        A ``torch.bool`` tensor created on ``index.device`` that is ``True``
        at the given positions and ``False`` everywhere else.
    """
    # torch.bool is the supported mask dtype; indexing tensors with uint8
    # masks is deprecated in PyTorch and only emits warnings.
    mask = torch.zeros(size, dtype=torch.bool, device=index.device)
    mask[index] = True
    return mask


def random_planetoid_splits(data, num_classes):
    """Replace the train/val/test masks of ``data`` with freshly drawn random ones.

    The split follows the Planetoid recipe:
      * 20 randomly chosen nodes per class for training,
      * 500 of the remaining nodes for validation,
      * the next 1000 of the remaining nodes for testing.

    Args:
        data: Graph object exposing ``y`` and ``num_nodes``; its
            ``train_mask``, ``val_mask`` and ``test_mask`` are overwritten.
        num_classes: Number of distinct labels; classes are ``0..num_classes-1``.

    Returns:
        The same ``data`` object, with the three masks replaced.
    """
    # Shuffle the node indices of every class independently.
    per_class = []
    for label in range(num_classes):
        members = (data.y == label).nonzero().view(-1)
        shuffled = members[torch.randperm(members.size(0))]
        per_class.append(shuffled)

    train_index = torch.cat([nodes[:20] for nodes in per_class], dim=0)

    # Everything not used for training is pooled and shuffled once more.
    remaining = torch.cat([nodes[20:] for nodes in per_class], dim=0)
    remaining = remaining[torch.randperm(remaining.size(0))]

    assignments = (
        ('train_mask', train_index),
        ('val_mask', remaining[:500]),
        ('test_mask', remaining[500:1500]),
    )
    for attribute, node_index in assignments:
        setattr(data, attribute, index_to_mask(node_index, size=data.num_nodes))

    return data


def run(dataset,
        model,
        runs,
        epochs,
        lr,
        weight_decay,
        early_stopping,
        permute_masks=None,
        logger=None):
    """Train and evaluate ``model`` on ``dataset[0]`` repeatedly and report the results.

    Each run resets the model parameters, trains with Adam for at most
    ``epochs`` epochs and keeps the test accuracy observed at the epoch with
    the lowest validation loss.

    Args:
        dataset: Dataset whose first element is the graph to train on; must
            expose ``num_classes`` when ``permute_masks`` is given.
        model: Module providing ``reset_parameters()``; it is moved to the
            module-level ``device``.
        runs: Number of independent repetitions.
        epochs: Maximum number of epochs per repetition.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        early_stopping: Window size for early stopping; ``0`` disables it.
        permute_masks: Optional callable ``(data, num_classes) -> data`` used
            to draw new split masks for every run.
        logger: Optional callable that receives the per-epoch metrics dict.

    Returns:
        None. A summary line is printed and one row is appended to
        ``preliminary_semisuperv.csv`` in the working directory.
    """
    val_losses, accs, durations = [], [], []
    for _ in range(runs):
        data = dataset[0]
        if permute_masks is not None:
            data = permute_masks(data, dataset.num_classes)
        data = data.to(device)

        model.to(device).reset_parameters()
        optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        # Make sure pending GPU work does not leak into the timed section.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        t_start = time.perf_counter()

        best_val_loss = float('inf')
        test_acc = 0
        val_loss_history = []

        for epoch in range(1, epochs + 1):
            train(model, optimizer, data)
            eval_info = evaluate(model, data)
            eval_info['epoch'] = epoch

            if logger is not None:
                logger(eval_info)

            # Model selection: report the test accuracy at the best validation loss.
            if eval_info['val_loss'] < best_val_loss:
                best_val_loss = eval_info['val_loss']
                test_acc = eval_info['test_acc']

            val_loss_history.append(eval_info['val_loss'])
            # Early stopping only kicks in during the second half of training:
            # stop once the current validation loss exceeds the mean of the
            # previous `early_stopping` validation losses.
            if early_stopping > 0 and epoch > epochs // 2:
                tmp = tensor(val_loss_history[-(early_stopping + 1):-1])
                if eval_info['val_loss'] > tmp.mean().item():
                    break

        if torch.cuda.is_available():
            torch.cuda.synchronize()

        t_end = time.perf_counter()

        val_losses.append(best_val_loss)
        accs.append(test_acc)
        durations.append(t_end - t_start)

    loss, acc, duration = tensor(val_losses), tensor(accs), tensor(durations)
    summary = (loss.mean().item(),
               acc.mean().item(),
               acc.std().item(),
               duration.mean().item())

    print('Val Loss: {:.4f}, Test Accuracy: {:.3f} ± {:.3f}, Duration: {:.3f}'.
          format(*summary))

    # Label the row with the first convolution's class when the model has one,
    # otherwise with the model's own class.
    labelled = model.conv1 if hasattr(model, 'conv1') else model

    # The row contains the non-ASCII '±', so the encoding is pinned instead of
    # relying on the locale default; pandas reads CSV files as UTF-8 by default.
    with open('preliminary_semisuperv.csv', 'a', encoding='utf-8') as f:
        f.write('{}, {}, {:.4f}, {:.3f} ± {:.3f}, {:.3f}\n'.
                format(labelled.__class__.__name__,
                       str(runs) + '_epochs=' + str(epochs),
                       *summary))
            

def train(model, optimizer, data):
    """Perform one full-batch optimisation step on the training nodes.

    Args:
        model: Module called as ``model(data)``; its output is fed to
            ``F.nll_loss``.
        optimizer: Optimiser whose gradients are cleared before, and which is
            stepped after, the backward pass.
        data: Object exposing ``y`` and ``train_mask``.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data)
    train_nodes = data.train_mask
    objective = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    objective.backward()
    optimizer.step()


def evaluate(model, data):
    """Compute loss and accuracy on the train, validation and test masks.

    Args:
        model: Module called as ``model(data)``; it is switched to eval mode.
        data: Object exposing ``y`` and supporting item access for the keys
            ``'train_mask'``, ``'val_mask'`` and ``'test_mask'``.

    Returns:
        Dict with the keys ``'<split>_loss'`` and ``'<split>_acc'`` for each
        of the splits ``train``, ``val`` and ``test``.
    """
    model.eval()

    # Inference only: no gradients are needed for the metrics.
    with torch.no_grad():
        log_probs = model(data)

    metrics = {}
    for split in ('train', 'val', 'test'):
        mask = data['{}_mask'.format(split)]
        selected = log_probs[mask]
        targets = data.y[mask]

        metrics['{}_loss'.format(split)] = F.nll_loss(selected, targets).item()

        predicted = selected.max(1)[1]
        n_correct = predicted.eq(targets).sum().item()
        metrics['{}_acc'.format(split)] = n_correct / mask.sum().item()

    return metrics

In [63]:
import warnings
# Suppress all warnings raised from here on (no category or module filter is given).
warnings.filterwarnings('ignore')

In [69]:
import argparse 

#parser = argparse.ArgumentParser()
#parser.add_argument('--random_splits', type=bool, default=False)
#parser.add_argument('--runs', type=int, default=100)
#parser.add_argument('--epochs', type=int, default=200)
#parser.add_argument('--lr', type=float, default=0.01)
#parser.add_argument('--weight_decay', type=float, default=0.0005)
#parser.add_argument('--early_stopping', type=int, default=10)
#parser.add_argument('--hidden', type=int, default=16)
#parser.add_argument('--dropout', type=float, default=0.5)
#parser.add_argument('--normalize_features', type=bool, default=True)
#args = parser.parse_args()    
    
class Net(torch.nn.Module):
    """Two-layer GCN node classifier.

    Args:
        dataset: Object exposing ``num_features`` and ``num_classes``; they
            size the input and output layers.
        hidden: Width of the hidden representation (default 16).
        dropout: Dropout probability applied after the first layer while the
            module is in training mode (default 0.5).
    """

    def __init__(self, dataset, hidden=16, dropout=0.5):
        # Zero-argument super(): the name `Net` is rebound by later cells, so
        # looking it up by name at call time would be fragile.
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(dataset.num_features, hidden)
        self.conv2 = GCNConv(hidden, dataset.num_classes)

    def reset_parameters(self):
        """Re-initialise both convolutions so one instance can be reused across runs."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Benchmark this model: fresh random splits are drawn for every run.
permute_masks = random_planetoid_splits
run(dataset, Net(dataset),
    runs=100, epochs=200, lr=0.01, weight_decay=0.0005,
    early_stopping=10, permute_masks=permute_masks)

Val Loss: 0.8864, Test Accuracy: 0.789 ± 0.018, Duration: 1.091


In [65]:
# SGConv
class Net(torch.nn.Module):
    """Single-layer SGConv node classifier.

    Args:
        dataset: Object exposing ``num_features`` and ``num_classes``; they
            size the single convolution.
        K: Value forwarded to ``SGConv`` as ``K`` (default 3).
        cached: Value forwarded to ``SGConv`` as ``cached`` (default True).
    """

    def __init__(self, dataset, K=3, cached=True):
        # Zero-argument super(): the name `Net` is rebound by other cells, so
        # looking it up by name at call time would be fragile.
        super().__init__()
        self.conv1 = SGConv(
            dataset.num_features, dataset.num_classes, K=K, cached=cached)

    def reset_parameters(self):
        """Re-initialise the convolution so one instance can be reused across runs."""
        self.conv1.reset_parameters()

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        return F.log_softmax(x, dim=1)
    

# Benchmark this model: fresh random splits are drawn for every run.
permute_masks = random_planetoid_splits
run(dataset, Net(dataset),
    runs=100, epochs=200, lr=0.01, weight_decay=0.0005,
    early_stopping=10, permute_masks=permute_masks)

Val Loss: 1.7587, Test Accuracy: 0.790 ± 0.018, Duration: 0.585


In [66]:
# ChebConv
class Net(torch.nn.Module):
    """Two-layer ChebConv node classifier.

    Args:
        dataset: Object exposing ``num_features`` and ``num_classes``; they
            size the input and output layers.
        hidden: Width of the hidden representation (default 16).
        K: Third positional argument forwarded to both ``ChebConv`` layers
            (default 3).
        dropout: Dropout probability applied after the first layer while the
            module is in training mode (default 0.5).
    """

    def __init__(self, dataset, hidden=16, K=3, dropout=0.5):
        # Zero-argument super(): the name `Net` is rebound by other cells, so
        # looking it up by name at call time would be fragile.
        super().__init__()
        self.dropout = dropout
        self.conv1 = ChebConv(dataset.num_features, hidden, K)
        self.conv2 = ChebConv(hidden, dataset.num_classes, K)

    def reset_parameters(self):
        """Re-initialise both convolutions so one instance can be reused across runs."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
    

# Benchmark this model: fresh random splits are drawn for every run.
permute_masks = random_planetoid_splits
run(dataset, Net(dataset),
    runs=100, epochs=200, lr=0.01, weight_decay=0.0005,
    early_stopping=10, permute_masks=permute_masks)

Val Loss: 0.8133, Test Accuracy: 0.769 ± 0.027, Duration: 5.049


In [67]:
# APPNP
class Net(torch.nn.Module):
    """Two-layer MLP followed by an APPNP propagation step.

    Args:
        dataset: Object exposing ``num_features`` and ``num_classes``; they
            size the two linear layers.
        hidden: Width of the hidden representation (default 16).
        K: First argument forwarded to ``APPNP`` (default 3).
        alpha: Second argument forwarded to ``APPNP`` (default 0.1).
        dropout: Dropout probability applied before each linear layer while
            the module is in training mode (default 0.5).
    """

    def __init__(self, dataset, hidden=16, K=3, alpha=0.1, dropout=0.5):
        # Zero-argument super(): the name `Net` is rebound by other cells, so
        # looking it up by name at call time would be fragile.
        super().__init__()
        self.dropout = dropout
        self.lin1 = Linear(dataset.num_features, hidden)
        self.lin2 = Linear(hidden, dataset.num_classes)
        self.prop1 = APPNP(K, alpha)
        # For reporting only: code that labels a model by `model.conv1` then
        # sees the APPNP layer. An alias avoids building a second, unused layer.
        self.conv1 = self.prop1

    def reset_parameters(self):
        """Re-initialise both linear layers so one instance can be reused across runs."""
        self.lin1.reset_parameters()
        self.lin2.reset_parameters()

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(self.lin1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin2(x)
        x = self.prop1(x, edge_index)
        return F.log_softmax(x, dim=1)
    
# Benchmark this model: fresh random splits are drawn for every run.
permute_masks = random_planetoid_splits
run(dataset, Net(dataset),
    runs=100, epochs=200, lr=0.01, weight_decay=0.0005,
    early_stopping=10, permute_masks=permute_masks)

Val Loss: 0.8894, Test Accuracy: 0.808 ± 0.017, Duration: 0.984


In [68]:
class Net(torch.nn.Module):
    """Two-layer GAT node classifier.

    Args:
        dataset: Object exposing ``num_features`` and ``num_classes``; they
            size the input and output layers.
        hidden: Output channels of the first ``GATConv`` (default 16); the
            second layer takes ``hidden * heads`` input channels.
        heads: ``heads`` argument of the first layer (default 8).
        output_heads: ``heads`` argument of the second layer, which is built
            with ``concat=False`` (default 8).
        dropout: Probability used both for the feature dropout in
            ``forward`` and as the ``dropout`` argument of both ``GATConv``
            layers (default 0.5).
    """

    def __init__(self, dataset, hidden=16, heads=8, output_heads=8, dropout=0.5):
        # Zero-argument super(): the name `Net` is rebound by other cells, so
        # looking it up by name at call time would be fragile.
        super().__init__()
        self.dropout = dropout
        self.conv1 = GATConv(
            dataset.num_features,
            hidden,
            heads=heads,
            dropout=dropout)
        self.conv2 = GATConv(
            hidden * heads,
            dataset.num_classes,
            heads=output_heads,
            concat=False,
            dropout=dropout)

    def reset_parameters(self):
        """Re-initialise both attention layers so one instance can be reused across runs."""
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x`` and ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

    
# Benchmark this model: fresh random splits are drawn for every run.
permute_masks = random_planetoid_splits
run(dataset, Net(dataset),
    runs=100, epochs=200, lr=0.01, weight_decay=0.0005,
    early_stopping=10, permute_masks=permute_masks)

Val Loss: 0.8111, Test Accuracy: 0.803 ± 0.016, Duration: 1.967


In [70]:
# pandas is imported here because no earlier cell imports it explicitly.
import pandas as pd

# Load the rows that run() appended to the benchmark CSV.
report = pd.read_csv('preliminary_semisuperv.csv')

In [73]:
# Display the collected benchmark table.
report

Unnamed: 0,Model,params,Val Loss,Test Accuracy,Duration
0,SGConv,100_epochs=200,1.7587,0.790 ± 0.018,0.585
1,ChebConv,100_epochs=200,0.8133,0.769 ± 0.027,5.049
2,APPNP,100_epochs=200,0.8894,0.808 ± 0.017,0.984
3,GATConv,100_epochs=200,0.8111,0.803 ± 0.016,1.967
4,GCNConv,100_epochs=200,0.8864,0.789 ± 0.018,1.091


In [76]:
# Print the LaTeX source so line breaks are rendered instead of shown as
# escaped '\n' in the string repr, which makes the table copy-pasteable.
print(report.to_latex(index=False))

'\\begin{tabular}{llrlr}\n\\toprule\n    Model &           params &   Val Loss &   Test Accuracy &   Duration \\\\\n\\midrule\n   SGConv &   100\\_epochs=200 &     1.7587 &   0.790 ± 0.018 &      0.585 \\\\\n ChebConv &   100\\_epochs=200 &     0.8133 &   0.769 ± 0.027 &      5.049 \\\\\n    APPNP &   100\\_epochs=200 &     0.8894 &   0.808 ± 0.017 &      0.984 \\\\\n  GATConv &   100\\_epochs=200 &     0.8111 &   0.803 ± 0.016 &      1.967 \\\\\n  GCNConv &   100\\_epochs=200 &     0.8864 &   0.789 ± 0.018 &      1.091 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [75]:
b  # NOTE(review): `b` is not defined anywhere in this notebook, so this cell raises NameError; looks like leftover scratch input, confirm and delete.

NameError: name 'b' is not defined