In [1]:
!pip install --pre dgl -f https://data.dgl.ai/wheels-test/cu118/repo.html
!pip install --pre dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in links: https://data.dgl.ai/wheels-test/cu118/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels-test/cu118/dgl-1.2a231216%2Bcu118-cp310-cp310-manylinux1_x86_64.whl (312.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: dgl
Successfully installed dgl-1.2a231216+cu118
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting numpydoc>=1.1.0 (from dglgo)
  Obtaining dependency information for numpydoc>=1.1.0 from https://files.pythonhosted.org/packages/9c/94/09c437fd4a5fb5adf0468c0865c781dbc11d399544b55f1163d5d4414afb/numpydoc-1.6.0-py3-none-any.whl.metadata
  Downloading numpydoc-1.6.0-py3-none-any.whl.metadata (4.2 kB)
Collecting ogb>=1.3.3

In [2]:
import torch
from torch.cuda.amp import autocast, GradScaler
from torch import nn
from dgl import ops
from dgl.nn.functional import edge_softmax
import dgl
import tqdm
import os
import numpy as np
from torch.nn import functional as F
from sklearn.metrics import roc_auc_score

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [3]:
# Show the installed PyTorch version so the run can be reproduced.
torch.__version__

'2.0.0'

In [4]:
# Print the CUDA compiler (nvcc) version available in this environment.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [5]:
class ResModule(nn.Module):
    """Residual wrapper that normalises its input before applying a graph-aware sub-module.

    Args:
        module: Callable invoked as ``module(dim=dim, **kwargs)`` to build the wrapped layer.
        normalization: Callable invoked as ``normalization(dim)`` to build the normalisation layer.
        dim: Feature width handed to both factories.
        **kwargs: Extra keyword arguments forwarded unchanged to ``module``.
    """

    def __init__(self, module, normalization, dim, **kwargs):
        super().__init__()
        self.normalization = normalization(dim)
        self.module = module(dim=dim, **kwargs)

    def forward(self, graph, x):
        """Return ``x + module(graph, normalization(x))``."""
        residual = self.module(graph, self.normalization(x))
        return x + residual


class FeedForwardModule(nn.Module):
    """Two-layer feed-forward block: Linear -> Dropout -> GELU -> Linear -> Dropout.

    Args:
        dim: Output width of the block (and base width for the multipliers below).
        dropout: Dropout probability used after each linear layer.
        input_dim_multiplier: The first linear layer expects ``int(dim * input_dim_multiplier)``
            input features.
        hidden_dim_multiplier: The hidden layer has ``int(dim * hidden_dim_multiplier)`` units.
            Defaults to 1, which reproduces the previous fixed hidden width of ``dim``.
        **kwargs: Ignored; accepted so the block can be built with the same keyword set as
            other modules (e.g. ``num_heads``).
    """

    def __init__(self, dim, dropout, input_dim_multiplier=1, hidden_dim_multiplier=1, **kwargs):
        super().__init__()
        input_dim = int(dim * input_dim_multiplier)
        hidden_dim = int(dim * hidden_dim_multiplier)
        self.linear_1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(in_features=hidden_dim, out_features=dim)
        self.dropout_2 = nn.Dropout(p=dropout)

    def forward(self, graph, x):
        """Apply the MLP to ``x``; ``graph`` is unused and only kept for a uniform call signature."""
        x = self.linear_1(x)
        x = self.dropout_1(x)
        x = self.act(x)
        x = self.linear_2(x)
        x = self.dropout_2(x)

        return x

In [6]:
class TransformerAttentionModule(nn.Module):
    """Multi-head dot-product attention restricted to the edges of a DGL graph.

    Args:
        dim: Feature width of the input and output; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the projected output.
        **kwargs: Ignored.
    """

    def __init__(self, dim, num_heads, dropout, **kwargs):
        super().__init__()

        _check_dim_and_num_heads_consistency(dim, num_heads)
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        # Layers are created in the same order as before (query, key, value, output)
        # so that seeded weight initialisation is unchanged.
        self.attn_query = nn.Linear(dim, dim)
        self.attn_key = nn.Linear(dim, dim)
        self.attn_value = nn.Linear(dim, dim)

        self.output_linear = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected):
        """Reshape ``(..., dim)`` features to ``(-1, num_heads, head_dim)``."""
        return projected.reshape(-1, self.num_heads, self.head_dim)

    def forward(self, graph, x):
        """Attend over graph edges and return features of width ``dim``."""
        q = self._split_heads(self.attn_query(x))
        k = self._split_heads(self.attn_key(x))
        v = self._split_heads(self.attn_value(x))

        # Per-edge score from the two endpoints' projections, scaled by sqrt(head_dim).
        scale = self.head_dim ** 0.5
        edge_scores = ops.u_dot_v(graph, q, k) / scale
        edge_weights = edge_softmax(graph, edge_scores)

        # Weight the value vectors by the edge weights and sum them per node.
        aggregated = ops.u_mul_e_sum(graph, v, edge_weights)
        aggregated = aggregated.reshape(-1, self.dim)

        return self.dropout(self.output_linear(aggregated))

In [7]:
class TransformerAttentionSepModule(nn.Module):
    """Graph attention whose output projection sees the node's own features and the
    aggregated message side by side (concatenated) instead of the message alone.

    Args:
        dim: Feature width of the input and output; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the projected output.
        **kwargs: Ignored.
    """

    def __init__(self, dim, num_heads, dropout, **kwargs):
        super().__init__()

        _check_dim_and_num_heads_consistency(dim, num_heads)
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        # Same creation order as before so seeded weight initialisation is unchanged.
        self.attn_query = nn.Linear(dim, dim)
        self.attn_key = nn.Linear(dim, dim)
        self.attn_value = nn.Linear(dim, dim)

        # The projection consumes [x, message], hence twice the input width.
        self.output_linear = nn.Linear(2 * dim, dim)
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected):
        """Reshape ``(..., dim)`` features to ``(-1, num_heads, head_dim)``."""
        return projected.reshape(-1, self.num_heads, self.head_dim)

    def forward(self, graph, x):
        """Return ``dropout(output_linear(concat(x, attention_message)))``."""
        q = self._split_heads(self.attn_query(x))
        k = self._split_heads(self.attn_key(x))
        v = self._split_heads(self.attn_value(x))

        scale = self.head_dim ** 0.5
        edge_scores = ops.u_dot_v(graph, q, k) / scale
        edge_weights = edge_softmax(graph, edge_scores)

        neighbour_message = ops.u_mul_e_sum(graph, v, edge_weights).reshape(-1, self.dim)
        combined = torch.cat([x, neighbour_message], dim=1)

        return self.dropout(self.output_linear(combined))

In [8]:
class GTModel(nn.Module):
    """Graph transformer: input projection, ``num_layers`` pairs of residual
    (attention, feed-forward) blocks with BatchNorm, then a normalised linear head.

    Args:
        num_layers: Number of (attention, feed-forward) block pairs.
        input_dim: Width of the raw node features.
        hidden_dim: Width used throughout the residual stack.
        output_dim: Width of the output head.
        hidden_dim_multiplier: Passed through as a keyword argument to every FeedForwardModule.
        num_heads: Number of attention heads in every attention block.
        dropout: Dropout probability used after the input projection and inside every block.
            Defaults to 0.2, the value that used to be hard-coded.
    """

    def __init__(self, num_layers, input_dim, hidden_dim, output_dim, hidden_dim_multiplier, num_heads,
                 dropout=0.2):
        super().__init__()

        self.input_linear = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout_layer = nn.Dropout(p=dropout)
        self.activation_layer = nn.GELU()
        self.residual_modules = nn.ModuleList()
        for _ in range(num_layers):
            self.residual_modules.append(ResModule(module=TransformerAttentionModule,
                                                   normalization=nn.BatchNorm1d,
                                                   dim=hidden_dim,
                                                   num_heads=num_heads,
                                                   dropout=dropout))
            # num_heads is not passed here: the feed-forward block has no attention heads.
            self.residual_modules.append(ResModule(module=FeedForwardModule,
                                                   normalization=nn.BatchNorm1d,
                                                   dim=hidden_dim,
                                                   hidden_dim_multiplier=hidden_dim_multiplier,
                                                   dropout=dropout))
        self.output_normalization = nn.BatchNorm1d(hidden_dim)
        self.output_linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, graph, x):
        """Return per-node outputs; axis 1 is squeezed away when ``output_dim`` is 1."""
        x = self.input_linear(x)
        x = self.dropout_layer(x)
        x = self.activation_layer(x)
        for module in self.residual_modules:
            x = module(graph, x)
        x = self.output_normalization(x)
        # squeeze(1) is a no-op unless the head has a single output column.
        x = self.output_linear(x).squeeze(1)
        return x

In [22]:
class GTSepModel(nn.Module):
    """Graph transformer built from TransformerAttentionSepModule blocks: input projection,
    ``num_layers`` pairs of residual (attention, feed-forward) blocks with BatchNorm,
    then a normalised linear head.

    Args:
        num_layers: Number of (attention, feed-forward) block pairs.
        input_dim: Width of the raw node features.
        hidden_dim: Width used throughout the residual stack.
        output_dim: Width of the output head.
        hidden_dim_multiplier: Passed through as a keyword argument to every FeedForwardModule.
        num_heads: Number of attention heads in every attention block.
        dropout: Dropout probability used after the input projection and inside every block.
            Defaults to 0.2, the value that used to be hard-coded.
    """

    def __init__(self, num_layers, input_dim, hidden_dim, output_dim, hidden_dim_multiplier, num_heads,
                 dropout=0.2):
        super().__init__()

        self.input_linear = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout_layer = nn.Dropout(p=dropout)
        self.activation_layer = nn.GELU()
        self.residual_modules = nn.ModuleList()
        for _ in range(num_layers):
            self.residual_modules.append(ResModule(module=TransformerAttentionSepModule,
                                                   normalization=nn.BatchNorm1d,
                                                   dim=hidden_dim,
                                                   num_heads=num_heads,
                                                   dropout=dropout))
            # num_heads is not passed here: the feed-forward block has no attention heads.
            self.residual_modules.append(ResModule(module=FeedForwardModule,
                                                   normalization=nn.BatchNorm1d,
                                                   dim=hidden_dim,
                                                   hidden_dim_multiplier=hidden_dim_multiplier,
                                                   dropout=dropout))
        self.output_normalization = nn.BatchNorm1d(hidden_dim)
        self.output_linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, graph, x):
        """Return per-node outputs; axis 1 is squeezed away when ``output_dim`` is 1."""
        x = self.input_linear(x)
        x = self.dropout_layer(x)
        x = self.activation_layer(x)
        for module in self.residual_modules:
            x = module(graph, x)
        x = self.output_normalization(x)
        # squeeze(1) is a no-op unless the head has a single output column.
        x = self.output_linear(x).squeeze(1)
        return x

In [10]:
def _check_dim_and_num_heads_consistency(dim, num_heads):
    if dim % num_heads != 0:
        raise ValueError('Dimension mismatch: hidden_dim should be a multiple of num_heads.')

In [11]:
class Dataset:
    """Node-classification dataset read from a ``.npz`` archive and moved to a device.

    The archive must provide the arrays ``node_features``, ``node_labels``, ``edges``,
    ``train_masks``, ``val_masks`` and ``test_masks``. Every row of a ``*_masks`` array is
    treated as one data split; ``cur_data_split`` selects the active one.

    Args:
        name: Dataset name. Hyphens are replaced by underscores to form the file name, and the
            graph is made bidirected unless the name contains ``'directed'``.
        add_self_loops: If true, a self-loop is added to the graph.
        device: Target device for the graph and all tensors. The string ``'gpu'`` (the default)
            is accepted as an alias for ``'cuda'``, because ``'gpu'`` itself is not a valid
            torch device string.
        data_dir: Directory containing ``<name>.npz``.
    """

    def __init__(self, name, add_self_loops=False, device='gpu', data_dir='/kaggle/input/heterophily/data'):
        device = self._resolve_device(device)

        # np.load on an .npz returns a lazy archive that keeps the file open; the context
        # manager closes it. torch.tensor copies, so the tensors outlive the archive.
        with np.load(os.path.join(data_dir, f'{name.replace("-", "_")}.npz')) as data:
            node_features = torch.tensor(data['node_features'])
            labels = torch.tensor(data['node_labels'])
            edges = torch.tensor(data['edges'])
            train_masks = torch.tensor(data['train_masks'])
            val_masks = torch.tensor(data['val_masks'])
            test_masks = torch.tensor(data['test_masks'])

        graph = dgl.graph((edges[:, 0], edges[:, 1]), num_nodes=len(node_features), idtype=torch.int)

        if 'directed' not in name:
            graph = dgl.to_bidirected(graph)

        if add_self_loops:
            graph = dgl.add_self_loop(graph)

        # Binary tasks use a single logit with float labels; otherwise one logit per class.
        num_classes = len(labels.unique())
        num_targets = 1 if num_classes == 2 else num_classes
        if num_targets == 1:
            labels = labels.float()

        train_idx_list = [torch.where(train_mask)[0] for train_mask in train_masks]
        val_idx_list = [torch.where(val_mask)[0] for val_mask in val_masks]
        test_idx_list = [torch.where(test_mask)[0] for test_mask in test_masks]

        self.name = name
        self.device = device

        self.graph = graph.to(device)
        self.node_features = node_features.to(device)
        self.labels = labels.to(device)

        self.train_idx_list = [train_idx.to(device) for train_idx in train_idx_list]
        self.val_idx_list = [val_idx.to(device) for val_idx in val_idx_list]
        self.test_idx_list = [test_idx.to(device) for test_idx in test_idx_list]
        self.num_data_splits = len(train_idx_list)
        self.cur_data_split = 0

        self.num_node_features = node_features.shape[1]
        self.num_targets = num_targets

        self.loss_fn = F.binary_cross_entropy_with_logits if num_targets == 1 else F.cross_entropy
        # Label the metric after what compute_metrics actually computes for this task.
        self.metric = self._metric_name(num_targets)

    @staticmethod
    def _resolve_device(device):
        """Map the legacy ``'gpu'`` alias to ``'cuda'``; return anything else unchanged."""
        if isinstance(device, str) and device == 'gpu':
            return 'cuda'
        return device

    @staticmethod
    def _metric_name(num_targets):
        """Name of the metric reported by ``compute_metrics`` for the given target count."""
        return 'ROC AUC' if num_targets == 1 else 'accuracy'

    @property
    def train_idx(self):
        """Training node indices of the active data split."""
        return self.train_idx_list[self.cur_data_split]

    @property
    def val_idx(self):
        """Validation node indices of the active data split."""
        return self.val_idx_list[self.cur_data_split]

    @property
    def test_idx(self):
        """Test node indices of the active data split."""
        return self.test_idx_list[self.cur_data_split]

    def next_data_split(self):
        """Advance to the next data split, wrapping around after the last one."""
        self.cur_data_split = (self.cur_data_split + 1) % self.num_data_splits

    def compute_metrics(self, logits):
        """Score ``logits`` on the train/val/test nodes of the active split.

        Uses ROC AUC when there is a single target (binary task) and accuracy of the
        arg-max prediction otherwise.

        Returns:
            Dict with keys ``'train <metric>'``, ``'val <metric>'`` and ``'test <metric>'``.
        """
        if self.num_targets == 1:
            def score(idx):
                # float() works whether roc_auc_score returns a NumPy scalar or a Python float.
                return float(roc_auc_score(y_true=self.labels[idx].cpu().numpy(),
                                           y_score=logits[idx].detach().cpu().numpy()))
        else:
            preds = logits.argmax(axis=1)

            def score(idx):
                return (preds[idx] == self.labels[idx]).float().mean().item()

        metrics = {
            f'train {self.metric}': score(self.train_idx),
            f'val {self.metric}': score(self.val_idx),
            f'test {self.metric}': score(self.test_idx)
        }

        return metrics


In [12]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU so the
# notebook does not fail at the first .to(device) call on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [13]:
# Load the "roman-empire" graph with self-loops added and place it on `device`.
dataset = Dataset("roman-empire", add_self_loops=True, device=device)

In [14]:
# model = Model(model_name="GT-sep",num_layers=5,input_dim=dataset.num_node_features,
#              hidden_dim=512,output_dim=dataset.num_targets,
#              hidden_dim_multiplier=1,num_heads=8,normalization='BatchNorm',
#              dropout=0.2)

In [15]:
# Graph transformer sized from the loaded dataset: 5 layer pairs, width 512, 8 heads.
model = GTModel(
    num_layers=5,
    input_dim=dataset.num_node_features,
    hidden_dim=512,
    output_dim=dataset.num_targets,
    hidden_dim_multiplier=1,
    num_heads=8,
)

In [16]:
def train_step(model, dataset, optimizer, scheduler, scaler, amp=False):
    """Run one full-graph optimisation step on the training nodes.

    Args:
        model: Module called as ``model(graph=..., x=...)``.
        dataset: Object exposing ``graph``, ``node_features``, ``labels``, ``train_idx``
            and ``loss_fn``.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler, stepped once at the end.
        scaler: Gradient scaler used to scale the loss and step the optimizer.
        amp: Whether the forward pass and loss run under autocast.
    """
    model.train()
    train_nodes = dataset.train_idx

    with autocast(enabled=amp):
        predictions = model(graph=dataset.graph, x=dataset.node_features)
        train_loss = dataset.loss_fn(input=predictions[train_nodes], target=dataset.labels[train_nodes])

    # Backward on the scaled loss, then let the scaler drive the optimizer step.
    scaled_loss = scaler.scale(train_loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()

    # Clear gradients for the next step before advancing the schedule.
    optimizer.zero_grad()
    scheduler.step()


@torch.no_grad()
def evaluate(model, dataset, amp=False):
    """Run a gradient-free forward pass in eval mode and score it.

    Args:
        model: Module called as ``model(graph=..., x=...)``.
        dataset: Object exposing ``graph``, ``node_features`` and ``compute_metrics``.
        amp: Whether the forward pass runs under autocast.

    Returns:
        Whatever ``dataset.compute_metrics`` returns for the computed logits.
    """
    model.eval()
    with autocast(enabled=amp):
        predictions = model(graph=dataset.graph, x=dataset.node_features)
    return dataset.compute_metrics(predictions)

In [17]:
def get_parameter_groups(model):
    """Split the model's parameters into two optimizer groups.

    Parameters whose name contains ``'bias'``, ``'normalization'`` or ``'label_embeddings'``
    go into the second group, which sets ``weight_decay`` to 0; all others go into the first
    group, which inherits the optimizer's defaults.

    Returns:
        A two-element list of parameter-group dicts suitable for a torch optimizer.
    """
    exempt_markers = ('bias', 'normalization', 'label_embeddings')

    decayed, exempt = [], []
    for param_name, param in model.named_parameters():
        is_exempt = False
        for marker in exempt_markers:
            if marker in param_name:
                is_exempt = True
                break
        (exempt if is_exempt else decayed).append(param)

    return [
        {'params': decayed},
        {'params': exempt, 'weight_decay': 0},
    ]
def get_lr_scheduler_with_warmup(optimizer, num_warmup_steps=None, num_steps=None, warmup_proportion=None,
                                 last_step=-1):
    """Build a LambdaLR schedule with linear warmup followed by a constant learning rate.

    During warmup the multiplier at step ``s`` is ``(s + 1) / (num_warmup_steps + 1)``;
    from step ``num_warmup_steps`` onwards it is 1.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of warmup steps. If None, it is derived as
            ``int(num_steps * warmup_proportion)``.
        num_steps: Total number of steps; only used to derive ``num_warmup_steps``.
        warmup_proportion: Fraction of ``num_steps`` spent warming up; only used to derive
            ``num_warmup_steps``.
        last_step: Passed to LambdaLR as ``last_epoch``.

    Returns:
        The configured ``torch.optim.lr_scheduler.LambdaLR``.

    Raises:
        ValueError: If ``num_warmup_steps`` is None and ``num_steps`` or ``warmup_proportion``
            is missing.
    """
    if num_warmup_steps is None:
        # Fail with a clear message instead of a TypeError from multiplying by None.
        if num_steps is None or warmup_proportion is None:
            raise ValueError('Either num_warmup_steps or both num_steps and warmup_proportion must be provided.')
        num_warmup_steps = int(num_steps * warmup_proportion)

    def get_lr_multiplier(step):
        if step < num_warmup_steps:
            return (step + 1) / (num_warmup_steps + 1)
        return 1

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=get_lr_multiplier, last_epoch=last_step)

    return lr_scheduler

In [18]:
# Move the model to the same device as the dataset instead of a hard-coded "cuda:0".
# Left as the cell's last expression so the module summary is displayed.
model.to(device)

GTModel(
  (input_linear): Linear(in_features=300, out_features=512, bias=True)
  (dropout_layer): Dropout(p=0.2, inplace=False)
  (activation_layer): GELU(approximate='none')
  (residual_modules): ModuleList(
    (0): ResModule(
      (normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (module): TransformerAttentionModule(
        (attn_query): Linear(in_features=512, out_features=512, bias=True)
        (attn_key): Linear(in_features=512, out_features=512, bias=True)
        (attn_value): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
    )
    (1): ResModule(
      (normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (module): FeedForwardModule(
        (linear_1): Linear(in_features=512, out_features=512, bias=True)
        (dropout_1): Dropo

In [19]:
# AdamW over the two parameter groups; weight decay is 0 for both groups here.
parameter_groups = get_parameter_groups(model)
optimizer = torch.optim.AdamW(parameter_groups, lr=3e-5, weight_decay=0)

# warmup_proportion=0 yields zero warmup steps, i.e. a constant learning rate.
scheduler = get_lr_scheduler_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=None,
    num_steps=500,
    warmup_proportion=0,
)

In [20]:
# Number of optimisation steps per run (matches the num_steps given to the scheduler).
NUM_STEPS = 500

for run in range(1):
    # One GradScaler per run: it carries the dynamic loss-scale state between steps.
    # Building a new one on every step (as before) reset that state each iteration,
    # so the scale could never back off after an overflow.
    scaler = GradScaler(enabled=True)
    with tqdm.tqdm(total=NUM_STEPS, desc=f'Run {run}', disable=False) as progress_bar:
        for step in range(1, NUM_STEPS + 1):
            # train_step runs with its default amp=False; evaluation is done after every step.
            train_step(model=model, dataset=dataset, optimizer=optimizer, scheduler=scheduler, scaler=scaler)
            metrics = evaluate(model=model, dataset=dataset, amp=False)
            progress_bar.update()
            progress_bar.set_postfix({metric: f'{value:.2f}' for metric, value in metrics.items()})

Run 0: 100%|██████████| 500/500 [02:27<00:00,  3.39it/s, train accuracy=0.96, val accuracy=0.82, test accuracy=0.80]


In [25]:
# Train a fresh GTSepModel on every dataset with the same hyper-parameters.
# Note: this cell rebinds the globals `dataset`, `model`, `optimizer` and `scheduler`.
datasets = [
    "squirrel",
    "squirrel-filtered",
    "chameleon",
    "chameleon_filtered",
    "roman-empire",
    "minesweeper",
    "amazon-ratings",
    "questions",
    "tolokers",
]
# Number of optimisation steps per run (also given to the scheduler below).
NUM_STEPS = 500

for dataset_name in datasets:
    dataset = Dataset(name=dataset_name,
                      add_self_loops=True,
                      device=device)
    model = GTSepModel(num_layers=5, input_dim=dataset.num_node_features,
                       hidden_dim=512, output_dim=dataset.num_targets,
                       hidden_dim_multiplier=1, num_heads=8)
    # Keep the model on the same device as the dataset instead of a hard-coded "cuda:0".
    model.to(device)
    parameter_groups = get_parameter_groups(model)
    optimizer = torch.optim.AdamW(parameter_groups, lr=3e-5, weight_decay=0)
    scheduler = get_lr_scheduler_with_warmup(optimizer=optimizer, num_warmup_steps=None,
                                             num_steps=NUM_STEPS, warmup_proportion=0)
    print("Running for:", dataset_name)
    for run in range(1):
        # One GradScaler per run: it carries the dynamic loss-scale state between steps.
        # Building a new one on every step (as before) reset that state each iteration.
        scaler = GradScaler(enabled=True)
        with tqdm.tqdm(total=NUM_STEPS, desc=f'Run {run}', disable=False) as progress_bar:
            for step in range(1, NUM_STEPS + 1):
                train_step(model=model, dataset=dataset, optimizer=optimizer, scheduler=scheduler, scaler=scaler)
                metrics = evaluate(model=model, dataset=dataset, amp=False)
                progress_bar.update()
                progress_bar.set_postfix({metric: f'{value:.2f}' for metric, value in metrics.items()})

Running for: squirrel


Run 0: 100%|██████████| 500/500 [01:46<00:00,  4.71it/s, train accuracy=1.00, val accuracy=0.39, test accuracy=0.40]


Running for: squirrel-filtered


Run 0: 100%|██████████| 500/500 [00:36<00:00, 13.62it/s, train accuracy=1.00, val accuracy=0.32, test accuracy=0.33]


Running for: chameleon


Run 0: 100%|██████████| 500/500 [00:33<00:00, 14.81it/s, train accuracy=1.00, val accuracy=0.50, test accuracy=0.49]


Running for: chameleon_filtered


Run 0: 100%|██████████| 500/500 [00:21<00:00, 23.07it/s, train accuracy=1.00, val accuracy=0.37, test accuracy=0.34]


Running for: roman-empire


Run 0: 100%|██████████| 500/500 [02:43<00:00,  3.07it/s, train accuracy=0.99, val accuracy=0.84, test accuracy=0.83]


Running for: minesweeper


Run 0: 100%|██████████| 500/500 [01:25<00:00,  5.82it/s, train accuracy=0.96, val accuracy=0.91, test accuracy=0.92]


Running for: amazon-ratings


Run 0: 100%|██████████| 500/500 [03:12<00:00,  2.60it/s, train accuracy=0.82, val accuracy=0.50, test accuracy=0.51]


Running for: questions


Run 0: 100%|██████████| 500/500 [06:21<00:00,  1.31it/s, train accuracy=0.79, val accuracy=0.71, test accuracy=0.71]


Running for: tolokers


Run 0: 100%|██████████| 500/500 [04:01<00:00,  2.07it/s, train accuracy=0.91, val accuracy=0.82, test accuracy=0.84]
