# **Edge Sampling on GNNs**
### **CIS 700 - Machine Learning With Graphs**
### **Sahil Gupta**


Initialise the environment, import packages and enable training on GPU

In [None]:
import os
import torch
os.environ['TORCH'] = torch.__version__
os.environ['PYTHONWARNINGS'] = "ignore"
!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

Download graph datasets to storage. In our case, we are using Cora, CiteSeer and PubMed datasets

In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

datasets={}
datasets['Cora'] = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())
datasets['CiteSeer'] = Planetoid(root='data/CiteSeer', name='CiteSeer', transform=NormalizeFeatures())
datasets['PubMed'] = Planetoid(root='data/PubMed', name='PubMed', transform=NormalizeFeatures())

print(f'Dataset: {datasets}:')
print('======================')
# print(f'Number of features: {datasets['Cora'].num_features}')
# print(f'Number of classes: {datasets['Cora'].num_classes}')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Dow

Dataset: {'Cora': Cora(), 'CiteSeer': CiteSeer(), 'PubMed': PubMed()}:


Done!


Creating 3 GNNs: GCN, GAT and TAGCN[3], and defining train and test functions

In [3]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.nn import TAGConv
import torch.nn.functional as F

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, dataset):
        super().__init__()
        # torch.manual_seed(1234567)
        self.conv1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x


class GAT(torch.nn.Module):
    def __init__(self, hidden_channels, dataset):
        heads = 8
        super().__init__()
        self.conv1 = GATConv(dataset.num_features, hidden_channels, heads, dropout=0.6)
        # On the Pubmed dataset, use `heads` output heads in `conv2`.
        self.conv2 = GATConv(hidden_channels * heads, dataset.num_classes, heads=1,
                             concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.6, training=self.training)
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return x

class TAGCN(torch.nn.Module):
    def __init__(self, hidden_channels, dataset):
        super().__init__()
        self.conv1 = TAGConv(dataset.num_features, hidden_channels)
        self.conv2 = TAGConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        # x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

def train(data):
      model.train()
      optimizer.zero_grad()
      out = model(data.x, data.edge_index)
      loss = criterion(out[data.train_mask], data.y[data.train_mask])
      loss.backward()
      optimizer.step()
      return loss

def test(data):
      model.eval()
      out = model(data.x, data.edge_index)
      pred = out.argmax(dim=1)
      test_correct = pred[data.test_mask] == data.y[data.test_mask]
      test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
      return test_acc


Specifying experiment parameters and running experiments (Training + Testing)

In [None]:
from torch_geometric.utils import dropout_edge
from torch_geometric.utils import dropout_path
import copy
import numpy as np

# Experiment Parameters
models = {
    "GCN" : GCN,
    "GAT" : GAT,
    "TAGCN": TAGCN
}
sampling_ratio_range = np.arange (0.1, 1.1, 0.1)
epoch_range = range(1, 101)
sampling_methods = {
    "RE" : dropout_edge,
    "RW" : dropout_path
}
experiment_count = 5
acc_values = [None]*experiment_count

for i in range(experiment_count):
  acc_values[i]={}
  for modelKey in models:
    acc_values[i][modelKey] = {}

    for datasetKey in datasets:
      dataset = datasets[datasetKey]
      model = models[modelKey](hidden_channels=16, dataset = dataset).to(device)
      optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
      criterion = torch.nn.CrossEntropyLoss()

      acc_values[i][modelKey][datasetKey] = {}


      for sampling_method in sampling_methods:
        acc_values[i][modelKey][datasetKey][sampling_method] = {}
        acc_values[i][modelKey][datasetKey][sampling_method]['sampling_ratio'] = []
        acc_values[i][modelKey][datasetKey][sampling_method]['test_acc'] = []
        for sampling_ratio in sampling_ratio_range:
          dataset = copy.deepcopy(datasets[datasetKey])


          data = dataset[0].to(device)
          # print (sampling_ratio)

          totalEdges = data.edge_index.size(dim=1)
          data.edge_index , data.edge_mask = sampling_methods[sampling_method](data.edge_index , p = 1-sampling_ratio)
          true_sampling_ratio = float(data.edge_index.size(dim=1))/totalEdges
          # print(data.edge_index.size(dim=1))
          print(
               "Training + Testing: " +
               "Experiment: " + str(i) +
               ", Model: "  + modelKey +
               ", Dataset: "  + datasetKey +
               ", Sampling Method: " + sampling_method +
               ", Sampling Ratio: " + str(true_sampling_ratio)
               )

          for epoch in epoch_range:
            loss = train(data)
            # print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
          test_acc = test(data)
          print(f'Test Accuracy: {test_acc:.4f}')

          # print(true_sampling_ratio)
          acc_values[i][modelKey][datasetKey][sampling_method]['sampling_ratio'].append(true_sampling_ratio)
          acc_values[i][modelKey][datasetKey][sampling_method]['test_acc'].append(test_acc)

Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.10145888594164457
Test Accuracy: 0.5910
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.2044334975369458
Test Accuracy: 0.6530
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.3006820765441455
Test Accuracy: 0.6800
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.39759378552482
Test Accuracy: 0.7170
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.5035998484274347
Test Accuracy: 0.7440
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.5942591890867753
Test Accuracy: 0.7720
Training + Testing: Experiment: 0, Model: GCN, Dataset: Cora, Sampling Method: RE, Sampling Ratio: 0.7068965517241379
Test Accuracy: 0.7730
Training + Testing: E

Plotting results: Comparing Different edge sampling methods - Percentage drop in test accuracy

In [None]:
import matplotlib.pyplot as plt
import numpy as np

legend = {
    "RE" : {'color':'r', 'label':'Random Edge'},
    "RW" : {'color':'b', 'label':'Random Walk'},
}

# print (acc_values)

fig = plt.figure(constrained_layout=True, figsize=(13,13))

subfigs = fig.subfigures(len(acc_values[0]), 1)

for subfig, modelKey in zip(subfigs,acc_values[0]):

    subfig.suptitle(modelKey)
    axs = subfig.subplots(1, len(acc_values[0][modelKey]))
    for a,datasetKey in zip(axs,acc_values[0][modelKey].keys()):
      for sampling_method in acc_values[0][modelKey][datasetKey]:

        y_values=[]
        for i in range(experiment_count):
          curr_exp_acc = acc_values[i][modelKey][datasetKey][sampling_method]['test_acc']
          max_accuracy = curr_exp_acc[len(curr_exp_acc)-1]
          curr_exp_acc = [(max_accuracy - k)*100/max_accuracy for k in curr_exp_acc]
          y_values.append(np.array(curr_exp_acc))

        y = [np.mean(k) for k in zip(*y_values)]
        # print(y)
        yerr = [np.std(k) for k in zip(*y_values)]
        # print(yerr)

        x = acc_values[i][modelKey][datasetKey][sampling_method]['sampling_ratio']
        a.errorbar(x,y,yerr = yerr, color = legend[sampling_method]['color'], label = legend[sampling_method]['label'])

        a.legend()
        a.set(xlabel="Edge Sampling Ratio", ylabel="Percentage drop in test accuracy")
        a.set_title(datasetKey)
        # a.set_xticks(np.arange(min(x)-0.1, max(x)+0.1, 0.10))
        # a.set_yticks(np.arange(0, max(y)+0.01, 0.02))

plt.show()

Plotting results: Comparing Different edge sampling methods - Absolute test accuracy

In [None]:
import matplotlib.pyplot as plt
import numpy as np

legend = {
    "RE" : {'color':'r', 'label':'Random Edge'},
    "RW" : {'color':'b', 'label':'Random Walk'},
}

# print (acc_values)

fig = plt.figure(constrained_layout=True, figsize=(13,13))

subfigs = fig.subfigures(len(acc_values[0]), 1)

for subfig, modelKey in zip(subfigs,acc_values[0]):

    subfig.suptitle(modelKey)
    axs = subfig.subplots(1, len(acc_values[0][modelKey]))
    for a,datasetKey in zip(axs,acc_values[0][modelKey].keys()):
      for sampling_method in acc_values[0][modelKey][datasetKey]:

        y_values=[]
        for i in range(experiment_count):
          curr_exp_acc = acc_values[i][modelKey][datasetKey][sampling_method]['test_acc']
          y_values.append(np.array(curr_exp_acc))


        y = [np.mean(k) for k in zip(*y_values)]
        # print(y)
        yerr = [np.std(k) for k in zip(*y_values)]
        # print(yerr)
        x = acc_values[i][modelKey][datasetKey][sampling_method]['sampling_ratio']
        a.errorbar(x,y,yerr = yerr, color = legend[sampling_method]['color'], label = legend[sampling_method]['label'])

        a.legend()
        a.set(xlabel="Edge Sampling Ratio", ylabel="Test Accuracy")
        a.set_title(datasetKey)
        # a.set_xticks(np.arange(min(x)-0.1, max(x)+0.1, 0.10))
        # a.set_yticks(np.arange(0, max(y)+0.01, 0.02))

plt.show()

Plotting results: Comparing Different edge sampling methods - Comparing accuracies of various GNNs

In [None]:
legend_models = {
    "GCN" : {'color':'r', 'label':'GCN'},
    "GAT" : {'color':'b', 'label':'GAT'},
    "TAGCN" : {'color':'g', 'label':'TAGCN'},
}

fig = plt.figure(constrained_layout=True, figsize=(13,13))

subfigs = fig.subfigures(len(acc_values[0]), 1)

# print(list(acc_values[0].keys())[0])
for subfig, datasetKey in zip(subfigs,acc_values[0][list(acc_values[0].keys())[0]]):

    subfig.suptitle(datasetKey)
    axs = subfig.subplots(1, len(acc_values[0][list(acc_values[0].keys())[0]][datasetKey]))
    # print(acc_values[0][list(acc_values[0].keys())[0]][datasetKey])
    for a,sampling_method in zip(axs,acc_values[0][list(acc_values[0].keys())[0]][datasetKey]):
      for modelKey in acc_values[0].keys():

        y_values=[]
        for i in range(experiment_count):
          curr_exp_acc = acc_values[i][modelKey][datasetKey][sampling_method]['test_acc']
          y_values.append(np.array(curr_exp_acc))

        # print("y_values")
        # print(y_values)
        y = [np.mean(k) for k in zip(*y_values)]
        # print(y)
        yerr = [np.std(k) for k in zip(*y_values)]
        x = acc_values[i][modelKey][datasetKey][sampling_method]['sampling_ratio']
        a.errorbar(x,y,yerr = yerr, color = legend_models[modelKey]['color'], label = legend_models[modelKey]['label'])

        a.legend()
        a.set(xlabel="Edge Sampling Ratio", ylabel="Test Accuracy")
        a.set_title(legend[sampling_method]['label'])
        # a.set_xticks(np.arange(min(x)-0.1, max(x)+0.1, 0.10))
        # a.set_yticks(np.arange(0, max(y)+0.01, 0.02))

plt.show()

References:

1. Initial code taken from: https://www.datacamp.com/tutorial/comprehensive-introduction-graph-neural-networks-gnns-tutorial

2. Jure Leskovec and Christos Faloutsos. 2006. Sampling from large graphs. In Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining (KDD '06). Association for Computing Machinery, New York, NY, USA, 631–636. https://doi.org/10.1145/1150402.1150479

3. Jian Du, Shanghang Zhang, Guanhang Wu, José M. F. Moura, & Soummya Kar. (2018). Topology Adaptive Graph Convolutional Networks.


