In [1]:
# Print the PyTorch version installed on this system. The install cell below
# asks for its TORCH variable to be set to the value printed here.
!python -c "import torch; print(torch.__version__)"

2.1.0+cu121


In [2]:
%%capture
# Install PyTorch Geometric for the PyTorch build reported by the cell above.
# TORCH must mirror the version string printed there (e.g. 1.12.1+cu113);
# update it whenever the runtime's PyTorch build changes.
# NOTE(review): the install command below does not reference TORCH, so the
# variable is informational only - confirm whether a wheel index URL was intended.
# TODO: pin torch-geometric to the version the saved models were created with.
%env TORCH=2.1.0+cu121
%pip install torch-geometric

In [3]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.utils import from_networkx
import torch_geometric.utils as geom_utils
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch_geometric.nn as geom_nn
import networkx as nx
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [4]:
class ClippedReLU(torch.autograd.Function):
  """Element-wise activation that clamps its input to the interval [-1, 1].

  The forward pass returns ``clamp(input, -1, 1)``. The backward pass lets the
  incoming gradient through wherever the input lay inside [-1, 1] and zeroes it
  wherever the input was clipped.

  Both methods are static, so call the activation as ``ClippedReLU.apply(x)``.
  """

  @staticmethod
  def forward(ctx, input):
    """Return ``input`` clamped to [-1, 1]; keep ``input`` for the backward pass."""
    ctx.save_for_backward(input)
    # Tensor.clamp is out-of-place: its result must be the value returned.
    # Calling it only for a side effect leaves the data unclipped.
    return input.clamp(min=-1, max=1)

  @staticmethod
  def backward(ctx, grad_output):
    """Return ``grad_output`` with entries zeroed where the input was clipped."""
    input, = ctx.saved_tensors
    grad_input = grad_output.clone()
    grad_input[(input < -1) | (input > 1)] = 0
    return grad_input

  # dtype = torch.float
  # device = torch.device("cpu")
  # relu = ClippedReLU().apply

In [5]:
class GCN(torch.nn.Module):
  """Graph-level classifier: a stack of GCN layers, a pooling step, then an MLP.

  Args:
    gcn_channels: sequence of layer widths; index 0 is the input feature width,
      index 1 the hidden width and index -1 the output width of the GCN stack.
    num_gcn_layers: number of GCN layers (1 gives a single input->output layer).
    gcn_act: callable applied after every GCN layer except the last.
    pooling: callable invoked as ``pooling(x, batch)`` on the last GCN output.
    mlp_channels: sequence of widths; index 0 is the hidden width and index -1
      the output width of the MLP.
    num_mlp_layers: number of linear layers in the MLP.
    mlp_act: callable applied after every MLP layer except the last.
    final_act: callable applied to the output of the last MLP layer.
    uniform_init: when true, every weight and bias is re-drawn from U(-1, 1).

  Notes:
    * Self-loops are added explicitly in ``forward``. ``GCNConv`` only inserts
      them as part of its normalisation step, so requesting
      ``add_self_loops=True`` together with ``normalize=False`` never added
      them (and recent PyTorch Geometric releases reject that combination).
    * Biases are initialised per output channel on the layers' own ``bias``
      parameters. Models pickled before this change keep whatever parameters
      they were saved with.
  """

  def __init__(self, gcn_channels, num_gcn_layers, gcn_act, pooling,
               mlp_channels, num_mlp_layers, mlp_act,
               final_act, uniform_init=True):
    super(GCN, self).__init__()

    def conv(in_channels, out_channels):
      # Un-normalised convolution; self-loops are supplied by forward().
      return geom_nn.GCNConv(in_channels, out_channels, add_self_loops=False, normalize=False)

    if num_gcn_layers > 1:
      gcn_layers = (
          [conv(gcn_channels[0], gcn_channels[1])] +
          [conv(gcn_channels[1], gcn_channels[1]) for _ in range(num_gcn_layers - 2)] +
          [conv(gcn_channels[1], gcn_channels[-1])]
      )
    else:
      gcn_layers = [conv(gcn_channels[0], gcn_channels[-1])]
    self.gcn = nn.ModuleList(gcn_layers)
    self.gcn_act = gcn_act
    self.pooling = pooling

    if num_mlp_layers > 1:
      mlp_layers = (
          [nn.Linear(gcn_channels[-1], mlp_channels[0])] +
          [nn.Linear(mlp_channels[0], mlp_channels[0]) for _ in range(num_mlp_layers - 2)] +
          [nn.Linear(mlp_channels[0], mlp_channels[-1])]
      )
    else:
      # A single-layer MLP consumes the pooled GCN output directly, so its
      # input width is the GCN output width, not mlp_channels[0].
      mlp_layers = [nn.Linear(gcn_channels[-1], mlp_channels[-1])]
    self.mlp = nn.ModuleList(mlp_layers)
    self.mlp_act = mlp_act
    self.final_act = final_act

    if uniform_init:
      for layer in self.gcn:
        nn.init.uniform_(layer.lin.weight, -1.0, 1.0)
        # GCNConv keeps its bias on the layer itself; its inner `lin` has none.
        nn.init.uniform_(layer.bias, -1.0, 1.0)
      for layer in self.mlp:
        nn.init.uniform_(layer.weight, -1.0, 1.0)
        nn.init.uniform_(layer.bias, -1.0, 1.0)

  def forward(self, x, edge_index, batch=None):
    """Classify the graph(s) described by ``x`` and ``edge_index``.

    Args:
      x: node feature matrix; assumed to be (num_nodes, gcn_channels[0]).
      edge_index: edge list in the format expected by ``GCNConv``.
      batch: optional node-to-graph assignment passed to the pooling callable;
        ``None`` (the default) treats all nodes as one graph.

    Returns:
      The output of ``final_act`` applied to the last MLP layer.
    """
    # Add each node's missing self-loop once, shared by all GCN layers.
    edge_index, _ = geom_utils.add_remaining_self_loops(edge_index, num_nodes=x.size(0))

    for layer in self.gcn[:-1]:
      x = self.gcn_act(layer(x, edge_index))
    x = self.pooling(self.gcn[-1](x, edge_index), batch)

    for layer in self.mlp[:-1]:
      x = self.mlp_act(layer(x))
    x = self.final_act(self.mlp[-1](x))

    return x

In [6]:
from enum import Enum
import os
from math import log

# Root folder holding the pre-generated graph files.
GRAPHS_PATH = "drive/MyDrive/GRL_Mini_Project/data"
# Erdos-Renyi file names encode the edge probability as int(p * MIN_P);
# probabilities below 1 / MIN_P are rejected by graph_file_name.
MIN_P = 100000

# Edge probability of each Erdos-Renyi subtype as a function of the node count n.
subtype_probs = {
    "regular": lambda n: 0.5,
    "sparse": lambda n: log(n) / (10 * n),
    "very_sparse": lambda n: 1 / n,
}

class GenerationMode(Enum):
  """Random-graph families; each value is the family's folder under GRAPHS_PATH."""
  ER = "erdos_renyi"
  SBM = "stochastic_block_model"
  BA = "barabasi_albert"

def graph_file_name(mode: GenerationMode, **kwargs):
  """Build the path of a stored graph file.

  Args:
    mode: graph family the file belongs to.
    **kwargs: family-specific fields.
      ER: ``num_nodes``, ``p_edge``, ``num_features`` (required), ``idx``
        (default ``"0"``) and ``subtype`` (default ``"regular"``).
      SBM / BA: ``num_nodes``, ``subtype`` and ``idx`` (all required).

  Returns:
    The file path, with components joined by ``os.sep``.

  Raises:
    KeyError: a required field is missing.
    ValueError: ``p_edge`` is below ``1 / MIN_P`` (ER only), or ``mode`` is not
      a known generation mode.
  """
  if mode == GenerationMode.ER:
    num_nodes = kwargs["num_nodes"]
    p_edge = kwargs["p_edge"]
    num_features = kwargs["num_features"]
    idx = kwargs.get("idx", "0")
    subtype = kwargs.get("subtype", "regular")

    if p_edge < 1 / MIN_P:
      raise ValueError(f"Cannot assign file name, probability {p_edge} too small.")

    # int() truncation is part of the on-disk naming scheme; changing it would
    # stop existing files from being found.
    stem = "_".join([
        "graph", str(num_nodes), str(int(p_edge * MIN_P)), str(num_features), str(idx)
    ])
    return os.sep.join([GRAPHS_PATH, mode.value, subtype, stem + ".pt"])

  if mode in (GenerationMode.SBM, GenerationMode.BA):
    # Both families share one layout: <family>/<subtype>/<num_nodes>/<idx>.pt
    num_nodes = kwargs["num_nodes"]
    subtype = kwargs["subtype"]
    idx = kwargs["idx"]

    return os.sep.join([
        GRAPHS_PATH, mode.value, subtype, str(num_nodes), str(idx) + ".pt"
    ])

  raise ValueError("Cannot identify generation mode.")

# Show one example path per generation mode.
print(
    graph_file_name(mode=GenerationMode.ER, num_nodes=10, p_edge=0.5, num_features=10),
    graph_file_name(mode=GenerationMode.SBM, num_nodes=10, subtype="few_dense_out", idx=1),
    graph_file_name(mode=GenerationMode.BA, num_nodes=5, subtype="very sparse", idx=2),
    sep="\n",
)

drive/MyDrive/GRL_Mini_Project/data/erdos_renyi/regular/graph_10_50000_10_0.pt
drive/MyDrive/GRL_Mini_Project/data/stochastic_block_model/few_dense_out/10/1.pt
drive/MyDrive/GRL_Mini_Project/data/barabasi_albert/very sparse/5/2.pt


In [7]:
# Root folder for the pickled models.
MODELS_PATH = "drive/MyDrive/GRL_Mini_Project/models/uniform_init" # TODO os.sep

def gnn_file_name(model_type, params, idx):
  """Build the path of the pickled model number ``idx`` for a parameter set.

  The folder name concatenates, with underscores: the feature count, the GCN
  channel widths, the GCN layer count, the GCN activation name, the pooling
  name, the MLP channel widths, the MLP layer count, the MLP activation name
  and the final activation name.

  Raises:
    ValueError: ``model_type`` is anything other than ``"GCN"``.
  """
  if model_type != "GCN":
    raise ValueError(f"Cannot recognise GNN type {model_type}")

  def widths(channels):
    return "_".join(map(str, channels))

  gcn_part = widths(params["gcn_channels"])
  gcn_depth = str(params["num_gcn_layers"])
  gcn_act_name = params["gcn_name"]
  pooling_name = params["pooling_name"]
  mlp_part = widths(params["mlp_channels"])
  mlp_depth = str(params["num_mlp_layers"])
  mlp_act_name = params["mlp_name"]
  final_act_name = params["final_name"]
  feature_count = str(params["num_features"])

  folder = "_".join([
      feature_count, gcn_part, gcn_depth, gcn_act_name,
      pooling_name, mlp_part, mlp_depth, mlp_act_name, final_act_name,
  ])
  return os.sep.join([MODELS_PATH, folder, str(idx) + ".pt"])

# Show an example model path.
print(gnn_file_name(
    "GCN",
    dict(
        gcn_channels=(10, 10, 2), num_gcn_layers=3, pooling_name="mean",
        gcn_name="clipped_relu", mlp_channels=(3, 1), num_mlp_layers=2,
        mlp_name="tanh", final_name="sigmoid", num_features=10,
    ),
    idx=1,
))

drive/MyDrive/GRL_Mini_Project/models/uniform_init/10_10_10_2_3_clipped_relu_mean_3_1_2_tanh_sigmoid/1.pt


In [8]:
import pathlib

def generate_models(params):
  """Return ``params["num_models"]`` GCN models, loading cached ones from disk.

  For each index the model file named by ``gnn_file_name`` is loaded when it
  exists; otherwise a fresh ``GCN`` is built from ``params``, moved to
  ``device`` and pickled to that path. The parent folder is created on demand.

  Args:
    params: parameter dict; the model-building keys (``gcn_channels``,
      ``num_gcn_layers``, ``gcn_act``, ``pooling``, ``mlp_channels``,
      ``num_mlp_layers``, ``mlp_act``, ``final_act``) and ``num_models`` are
      read here, the naming keys by ``gnn_file_name``.

  Returns:
    List of models in index order.
  """
  models_dir = os.path.dirname(gnn_file_name("GCN", params, idx=0))
  if not os.path.isdir(models_dir):
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
    print("Generated parent folder", models_dir)

  models = []
  for i in range(params["num_models"]):
    file_name = gnn_file_name("GCN", params, idx=i)
    if os.path.isfile(file_name):
      # map_location lets models pickled on a GPU runtime load on a CPU-only
      # one (and vice versa). weights_only=False is required because the file
      # holds a whole pickled module, not a state dict.
      # SECURITY: this unpickles arbitrary objects - only load files that this
      # notebook wrote itself.
      model = torch.load(file_name, map_location=device, weights_only=False)
    else:
      model = GCN(
          gcn_channels=params["gcn_channels"], num_gcn_layers=params["num_gcn_layers"], gcn_act=params["gcn_act"],
          pooling=params["pooling"], mlp_channels=params["mlp_channels"],
          num_mlp_layers=params["num_mlp_layers"], mlp_act=params["mlp_act"], final_act=params["final_act"]
      ).to(device)
      torch.save(model, file_name)
      print(f"Generated model {i+1}/{params['num_models']}")
    models.append(model)
  print("Initialised all models.")
  return models

In [9]:
def check_valid_files(params, data_params):
  """Check that every graph file needed by a test run exists on disk.

  One file is expected per (node count, subtype, graph index) combination in
  ``data_params``. Each missing path is printed, followed by a summary line.

  Returns:
    True when all files were found, False otherwise.
  """
  # Lazy generator: paths are built one at a time, in the same nested order
  # (node count, then subtype, then graph index).
  expected_files = (
      graph_file_name(
          mode=data_params["mode"],
          num_nodes=num_nodes,
          p_edge=subtype_probs.get(subtype, lambda _: None)(num_nodes),
          num_features=params["num_features"],
          subtype=subtype,
          idx=idx,
      )
      for num_nodes in data_params["nums_nodes"]
      for subtype in data_params["subtypes"]
      for idx in range(data_params["num_graphs"])
  )

  missing = 0
  total = 0
  for path in expected_files:
    total += 1
    if os.path.isfile(path):
      continue
    print("Failed to locate", path)
    missing += 1

  if missing:
    print(f"Failed to locate {missing} out of {total} necessary files.")
    return False

  print(f"Found all {total} necessary files.")
  return True

In [10]:
from random import shuffle
import gc
from itertools import product

def test(models, params, data_params):
  """Classify every stored graph with every model.

  Args:
    models: models to evaluate; index i fills entry i of the result.
    params: parameter dict; ``num_models`` and ``num_features`` are read.
    data_params: dict with ``mode``, ``nums_nodes``, ``subtypes`` and
      ``num_graphs`` describing which graph files to load.

  Returns:
    A list with one dict per model, mapping ``(num_nodes, subtype)`` to the
    list of 0/1 predictions (1 when the model output is >= 0.5) for that cell.
  """
  keys = list(product(data_params["nums_nodes"], data_params["subtypes"], range(data_params["num_graphs"])))
  shuffle(keys) # to decrease GPU RAM load; less chance of a crash due to garbage collector falling behind

  results = [{} for _ in range(params["num_models"])]
  # Inference only: without no_grad every forward pass would record an
  # autograd graph, which is pure memory overhead on the large graphs.
  with torch.no_grad():
    for count, (num_nodes, subtype, idx) in enumerate(keys, start=1):
      file_name = graph_file_name(
          mode=data_params["mode"],
          num_nodes=num_nodes,
          p_edge=subtype_probs.get(subtype, lambda _: None)(num_nodes),
          num_features=params["num_features"],
          subtype=subtype,
          idx=idx,
      )

      # Load straight onto the models' device so graph tensors and weights
      # always live in the same place. weights_only=False because the file is
      # a pickled graph object, not a plain tensor/state dict.
      g = torch.load(file_name, map_location=device, weights_only=False)
      for i, model in enumerate(models):
        prediction = 1 if model(g.x, g.edge_index) >= 0.5 else 0
        results[i].setdefault((num_nodes, subtype), []).append(prediction)
      if num_nodes >= 1000:
        # Release large graphs eagerly before the next one is loaded.
        del g
        gc.collect()

      if count % 10 == 0:
        print(f"Iteration {count}/{len(keys)} passed")
  return results

In [11]:
import traceback
from datetime import datetime

def run_full_test(params, data_params):
  """Run one experiment end to end and append its summary to the results file.

  Models are generated/loaded, the required graph files are checked, and the
  models are evaluated. A keyboard interrupt aborts silently; any other
  exception is printed and appended to the error log. When results were
  produced, the fraction of graphs classified as 1 is computed per model and
  per (node count, subtype) cell and appended to the results file.
  """
  results = None

  try:
    models = generate_models(params)
    files_ok = check_valid_files(params, data_params)
    if files_ok:
      results = test(models, params, data_params)
  except KeyboardInterrupt:
    return
  except Exception:
    print("Something went wrong!")
    trace = traceback.format_exc()
    print(trace)
    report_lines = [
        "Test with parameters:",
        "\t" + str(params),
        "\t" + str(data_params),
        "raised exception:",
        trace,
        "\n\n",
    ]
    with open("drive/MyDrive/GRL_Mini_Project/error.txt", "a") as f:
      f.write("\n".join(report_lines))

  if not results:
    return

  cells = list(product(data_params["nums_nodes"], data_params["subtypes"]))
  mean_results = [
      {cell: sum(results[i][cell]) / data_params["num_graphs"] for cell in cells}
      for i in range(params["num_models"])
  ]

  summary_lines = [str(datetime.now()), str(params), str(data_params), str(mean_results), "\n\n"]
  with open("drive/MyDrive/GRL_Mini_Project/results_uniform_final.txt", "a") as f:
    f.write("\n".join(summary_lines))
  print("Finished test.")

In [12]:
def _make_gcn_params(num_gcn_layers, gcn_act, gcn_name, pooling, pooling_name):
  """Build one experiment parameter dict; only the five arguments vary.

  Every call creates fresh activation modules, so the returned dicts do not
  share mutable objects. Key order is kept stable because the dict's string
  form is written to the results log.
  """
  return {
      "model_name": "GCN",
      "num_models": 10,
      "num_features": 10, # deprecated
      "gcn_channels": (10, 10), # first value must equal num_features
      "num_gcn_layers": num_gcn_layers,
      "gcn_act": gcn_act,
      "gcn_name": gcn_name,
      "pooling": pooling,
      "pooling_name": pooling_name,
      "mlp_channels": (10, 1), # do not change
      "num_mlp_layers": 2, # do not change
      "mlp_act": nn.Tanh(), # do not change
      "mlp_name": "tanh", # do not change
      "final_act": nn.Sigmoid(), # do not change
      "final_name": "sigmoid", # do not change
  }

# ClippedReLU.apply is taken from the class: autograd Functions are static and
# instantiating them (ClippedReLU()) is deprecated in PyTorch.

# Clipped ReLU, mean pooling.
params_one_clipped_mean = _make_gcn_params(
    1, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_mean_pool, "mean")
params_two_clipped_mean = _make_gcn_params(
    2, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_mean_pool, "mean")
params_three_clipped_mean = _make_gcn_params(
    3, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_mean_pool, "mean")

# ReLU, mean pooling.
params_one_relu_mean = _make_gcn_params(
    1, nn.ReLU(), "relu", geom_nn.pool.global_mean_pool, "mean")
params_two_relu_mean = _make_gcn_params(
    2, nn.ReLU(), "relu", geom_nn.pool.global_mean_pool, "mean")
params_three_relu_mean = _make_gcn_params(
    3, nn.ReLU(), "relu", geom_nn.pool.global_mean_pool, "mean")

# Clipped ReLU, sum pooling.
params_one_clipped_sum = _make_gcn_params(
    1, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_add_pool, "sum")
params_two_clipped_sum = _make_gcn_params(
    2, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_add_pool, "sum")
params_three_clipped_sum = _make_gcn_params(
    3, ClippedReLU.apply, "clipped_relu", geom_nn.pool.global_add_pool, "sum")

In [13]:
# Dataset descriptions: graph family, node counts, subtypes and the number of
# graphs stored per (node count, subtype) combination.

# Erdos-Renyi, all three density subtypes.
data_params_er = dict(
    mode=GenerationMode.ER,
    nums_nodes=[10, 50, 100, 500, 1000, 5000],
    subtypes=["regular", "sparse", "very_sparse"],
    num_graphs=32,
)

# Erdos-Renyi without the "regular" subtype.
data_params_er_no_regular = dict(
    mode=GenerationMode.ER,
    nums_nodes=[10, 50, 100, 500, 1000, 5000],
    subtypes=["sparse", "very_sparse"],
    num_graphs=32,
)

# Stochastic block model.
data_params_sbm = dict(
    mode=GenerationMode.SBM,
    nums_nodes=[10, 50, 100, 500, 1000, 5000],
    subtypes=["few_dense_in", "few_dense_out", "many_dense_in", "many_dense_out"],
    num_graphs=32,
)

# Barabasi-Albert.
data_params_ba = dict(
    mode=GenerationMode.BA,
    nums_nodes=[10, 50, 100, 500, 1000, 5000],
    subtypes=["dense", "sparse", "very sparse"],
    num_graphs=32,
)

In [14]:
# Every model configuration, grouped by activation / pooling.
all_params = (
    [params_one_clipped_mean, params_two_clipped_mean, params_three_clipped_mean] +
    [params_one_relu_mean, params_two_relu_mean, params_three_relu_mean] +
    [params_one_clipped_sum, params_two_clipped_sum, params_three_clipped_sum]
)

# One dataset description per graph family.
all_data = [
    data_params_er,
    data_params_sbm,
    data_params_ba,
]

In [18]:
# Evaluate the one-layer ReLU / mean-pooling models on the Erdos-Renyi graphs;
# run_full_test appends the summary to the results file.
# NOTE(review): this cell's execution count (18) is out of sequence with the
# cells that follow (16, 17) - re-run the notebook top to bottom before sharing.
run_full_test(params_one_relu_mean, data_params_er)
# run_full_test(params_three_clipped_mean, data_params_er)
# run_full_test(params_three_clipped_mean, data_params_sbm)
# run_full_test(params_three_clipped_mean, data_params_ba)

Initialised all models.
Found all 576 necessary files.
Iteration 10/576 passed
Iteration 20/576 passed
Iteration 30/576 passed
Iteration 40/576 passed
Iteration 50/576 passed
Iteration 60/576 passed
Iteration 70/576 passed
Iteration 80/576 passed
Iteration 90/576 passed
Iteration 100/576 passed
Iteration 110/576 passed
Iteration 120/576 passed
Iteration 130/576 passed
Iteration 140/576 passed
Iteration 150/576 passed
Iteration 160/576 passed
Iteration 170/576 passed
Iteration 180/576 passed
Iteration 190/576 passed
Iteration 200/576 passed
Iteration 210/576 passed
Iteration 220/576 passed
Iteration 230/576 passed
Iteration 240/576 passed
Iteration 250/576 passed
Iteration 260/576 passed
Iteration 270/576 passed
Iteration 280/576 passed
Iteration 290/576 passed
Iteration 300/576 passed
Iteration 310/576 passed
Iteration 320/576 passed
Iteration 330/576 passed
Iteration 340/576 passed
Iteration 350/576 passed
Iteration 360/576 passed
Iteration 370/576 passed
Iteration 380/576 passed
Iter

In [16]:
from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Reddit
from torch_geometric.datasets import PCQM4Mv2

def test_real_dataset(params, dataset):
  """Classify every graph of a real dataset with the models for ``params``.

  Node features are zero-padded on the right up to ``params["num_features"]``
  columns. Prints, per model, the percentage of graphs classified as 1.

  Args:
    params: parameter dict passed to ``generate_models``; ``num_features`` and
      ``num_models`` are also read here.
    dataset: indexable, iterable collection of graphs exposing ``x``,
      ``edge_index`` and a ``name`` attribute on the collection.

  Raises:
    NotImplementedError: the dataset has more node features than the models
      accept (dropping features is not supported).
  """
  models = generate_models(params)

  old_num_features = dataset[0].x.shape[1]
  padding = params["num_features"] - old_num_features
  if padding < 0:
    raise NotImplementedError(
        f"Dataset has {old_num_features} node features but the models expect "
        f"{params['num_features']}; reducing the feature count is not implemented."
    )

  res = [[] for _ in range(params["num_models"])]
  # Inference only, so skip autograd bookkeeping.
  with torch.no_grad():
    for data in dataset:
      x = nn.functional.pad(data.x, (0, padding, 0, 0), mode="constant", value=0).to(device)
      edge_index = data.edge_index.to(device)

      for idx, model in enumerate(models):
        res[idx].append(1 if model(x, edge_index) >= 0.5 else 0)

  mean_res = [sum(r) / len(r) for r in res]
  # round() rather than int(): int() truncates float error (0.57 * 100 -> 56).
  formatted_results = ", ".join([str(round(r * 100)) + "%" for r in mean_res])
  # padding is >= 0 here, so features were either padded or left untouched.
  if padding > 0:
    resize = f"lifted to {params['num_features']}"
  else:
    resize = f"kept at {params['num_features']}"
  ans = (
      f"{dataset.name} dataset with initial embedding size {old_num_features} "
      f"{resize} classifies as: [{formatted_results}]"
  )
  print(ans)


# test_real_dataset(params_three_relu_mean, TUDataset(root="/tmp/ENZYMES", name="ENZYMES"))

In [17]:
# from torch_geometric.datasets import GNNBenchmarkDataset

# dataset = TUDataset(root="/tmp/MUTAG", name="MUTAG")
# print(len(dataset))
# print(dataset[0])
# num_features = 7
# model = GCN((num_features, num_features), 2, ClippedReLU().apply, geom_nn.pool.global_mean_pool,
#             (num_features, 1), 2, nn.Tanh(), nn.Sigmoid())
# for layer in model.mlp:
#   layer.requires_grad = False