In [1]:
# Install the pre-release DGL build (CUDA 11.8 wheel index) and dglgo.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on the shell PATH.
%pip install --pre dgl -f https://data.dgl.ai/wheels-test/cu118/repo.html
%pip install --pre dglgo -f https://data.dgl.ai/wheels-test/repo.html

Looking in links: https://data.dgl.ai/wheels-test/cu118/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels-test/cu118/dgl-1.2a231216%2Bcu118-cp310-cp310-manylinux1_x86_64.whl (312.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.9/312.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: dgl
Successfully installed dgl-1.2a231216+cu118
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Collecting dglgo
  Downloading dglgo-0.0.2-py3-none-any.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.5/63.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting numpydoc>=1.1.0 (from dglgo)
  Obtaining dependency information for numpydoc>=1.1.0 from https://files.pythonhosted.org/packages/9c/94/09c437fd4a5fb5adf0468c0865c781dbc11d399544b55f1163d5d4414afb/numpydoc-1.6.0-py3-none-any.whl.metadata
  Downloading numpydoc-1.6.0-py3-none-any.whl.metadata (4.2 kB)
Collecting ogb>=1.3.3

In [2]:
import torch
from torch.cuda.amp import autocast, GradScaler
from torch import nn
from dgl import ops
from dgl.nn.functional import edge_softmax
import dgl
import tqdm
import os
import numpy as np
from torch.nn import functional as F

from dgl import ops
from sklearn.metrics import roc_auc_score

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
# Display the installed PyTorch version string as the cell output.
torch.__version__

In [None]:
# Run `nvcc --version` in a shell to show which CUDA compiler is on the PATH.
!nvcc --version

In [5]:
class GATSepModule(nn.Module):
    """Multi-head graph attention layer that keeps a node's own features separate from its message.

    The projected node features and the attention-weighted aggregate produced by
    ``ops.u_mul_e_sum`` are concatenated (which is why the feed-forward block is
    built with ``input_dim_multiplier=2``) and mapped back to ``dim`` features.

    Args:
        dim: Width of the node features entering and leaving the layer.
        hidden_dim_multiplier: Forwarded to ``FeedForwardModule``.
        num_heads: Number of attention heads; ``dim`` is checked against it.
        dropout: Forwarded to ``FeedForwardModule``.
        **kwargs: Accepted for interface compatibility and ignored.
    """

    def __init__(self, dim, hidden_dim_multiplier, num_heads, dropout, **kwargs):
        super().__init__()

        _check_dim_and_num_heads_consistency(dim, num_heads)
        self.dim, self.num_heads = dim, num_heads
        self.head_dim = dim // num_heads

        # Layers are created in a fixed order so seeded initialisation stays reproducible.
        self.input_linear = nn.Linear(in_features=dim, out_features=dim)

        # One attention logit per head from each endpoint of an edge.
        self.attn_linear_u = nn.Linear(in_features=dim, out_features=num_heads)
        self.attn_linear_v = nn.Linear(in_features=dim, out_features=num_heads, bias=False)
        self.attn_act = nn.LeakyReLU(negative_slope=0.2)

        self.feed_forward_module = FeedForwardModule(
            dim=dim,
            input_dim_multiplier=2,
            hidden_dim_multiplier=hidden_dim_multiplier,
            dropout=dropout,
        )

    def forward(self, graph, x):
        """Apply the attention layer to node features ``x`` on ``graph`` and return the new features."""
        projected = self.input_linear(x)

        # Per-edge, per-head score: u-side term plus v-side term, then LeakyReLU.
        edge_scores = self.attn_act(
            ops.u_add_v(graph, self.attn_linear_u(projected), self.attn_linear_v(projected))
        )
        edge_weights = edge_softmax(graph, edge_scores)

        # Heads live on the trailing axis so they line up with the per-head edge weights.
        per_head = projected.reshape(-1, self.head_dim, self.num_heads)
        aggregated = ops.u_mul_e_sum(graph, per_head, edge_weights)

        own_features = per_head.reshape(-1, self.dim)
        neighbor_message = aggregated.reshape(-1, self.dim)
        combined = torch.cat([own_features, neighbor_message], dim=1)

        return self.feed_forward_module(graph, combined)


# class AttentionGATModule(nn.Module):
#     def __init__(self, dimensions, multiplier_hidden, heads_count, drop_rate, **extra_args):
#         super(AttentionGATModule, self).__init__()

#         _validate_dimensions_and_heads(dimensions, heads_count)
#         self.dimensions = dimensions
#         self.heads_count = heads_count
#         self.dimension_per_head = dimensions // heads_count

#         self.linear_transform_input = nn.Linear(in_features=dimensions, out_features=dimensions)

#         self.linear_attention_a = nn.Linear(in_features=dimensions, out_features=heads_count)
#         self.linear_attention_b = nn.Linear(in_features=dimensions, out_features=heads_count, bias=False)
#         self.activation_attention = nn.LeakyReLU(negative_slope=0.2)

#         self.module_feedforward = FeedForwardModule(dim=dimensions,
#                                                     multiplier_input=2,
#                                                     multiplier_hidden=multiplier_hidden,
#                                                     dropout_rate=drop_rate)

#     def forward(self, input_graph, input_features):
#         transformed_input = self.linear_transform_input(input_features)

#         attention_scores_a = self.linear_attention_a(transformed_input)
#         attention_scores_b = self.linear_attention_b(transformed_input)
#         combined_attention_scores = ops.add_u_v(input_graph, attention_scores_a, attention_scores_b)
#         combined_attention_scores = self.activation_attention(combined_attention_scores)
#         normalized_attention = edge_softmax(input_graph, combined_attention_scores)

#         transformed_input = transformed_input.view(-1, self.dimension_per_head, self.heads_count)
#         attention_message = ops.sum_u_mul_e(input_graph, transformed_input, normalized_attention)
#         transformed_input = transformed_input.view(-1, self.dimensions)
#         attention_message = attention_message.view(-1, self.dimensions)
#         concatenated_features = torch.cat([transformed_input, attention_message], axis=1)

#         output_features = self.module_feedforward(input_graph, concatenated_features)

#         return output_features


In [6]:
from torch import nn

# Model name -> list of module classes that together form one layer of the network.
MODULES = {'GAT-sep': [GATSepModule]}

# Normalization name -> layer class; each class is later called with the feature width.
NORMALIZATION = {'None': nn.Identity, 'LayerNorm': nn.LayerNorm, 'BatchNorm': nn.BatchNorm1d}
class Model(nn.Module):
    """Input projection, a stack of residual graph modules, and a linear output head.

    Args:
        model_name: Key into ``MODULES`` selecting which module classes form one layer.
        num_layers: How many times the module list for ``model_name`` is repeated.
        input_dim: Width of the raw node features.
        hidden_dim: Width used by every residual module.
        output_dim: Width of the output head.
        hidden_dim_multiplier: Forwarded to every wrapped module.
        num_heads: Forwarded to every wrapped module.
        normalization: Key into ``NORMALIZATION``.
        dropout: Dropout probability for the input dropout, also forwarded to every wrapped module.
    """

    def __init__(self, model_name, num_layers, input_dim, hidden_dim, output_dim, hidden_dim_multiplier, num_heads,
                 normalization, dropout):

        super().__init__()

        norm_cls = NORMALIZATION[normalization]

        self.input_linear = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.act = nn.GELU()

        # Arguments shared by every residual block in the stack.
        block_kwargs = dict(normalization=norm_cls,
                            dim=hidden_dim,
                            hidden_dim_multiplier=hidden_dim_multiplier,
                            num_heads=num_heads,
                            dropout=dropout)

        # Built layer by layer, in order, so parameter creation order matches the stack order.
        self.residual_modules = nn.ModuleList([
            ResidualModuleWrapper(module=module_cls, **block_kwargs)
            for _ in range(num_layers)
            for module_cls in MODULES[model_name]
        ])

        self.output_normalization = norm_cls(hidden_dim)
        self.output_linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, graph, x):
        """Return the output of the head for every node, with axis 1 squeezed when it has size 1."""
        hidden = self.act(self.dropout(self.input_linear(x)))

        for block in self.residual_modules:
            hidden = block(graph, hidden)

        return self.output_linear(self.output_normalization(hidden)).squeeze(1)

In [7]:
def _check_dim_and_num_heads_consistency(dim, num_heads):
    if dim % num_heads != 0:
        raise ValueError('Dimension mismatch: hidden_dim should be a multiple of num_heads.')

In [8]:
class Dataset:
    """Node-classification dataset loaded from an ``.npz`` archive.

    The archive is read for the keys ``node_features``, ``node_labels``, ``edges``,
    ``train_masks``, ``val_masks`` and ``test_masks``. Each row of a mask array is
    treated as one data split; ``cur_data_split`` selects the active one.

    With exactly two distinct labels the task is treated as binary: labels are cast
    to float, ``num_targets`` is 1, the loss is binary cross-entropy with logits and
    the metric is ROC AUC. Otherwise cross-entropy and accuracy are used.

    Args:
        name: Dataset name. Dashes are replaced by underscores to form the file name
            ``<name>.npz``. The graph is made bidirected unless ``'directed'`` occurs
            in the name.
        add_self_loops: If true, self-loops are added to the graph.
        device: Device the graph, features, labels and split indices are moved to.
        use_sgc_features: Append the features produced by ``compute_sgc_features``.
        use_identity_features: Append an identity matrix (one column per node).
        use_adjacency_features: Append the dense adjacency matrix without self-loops.
        do_not_use_original_features: Drop the original features; requires at least
            one of the three ``use_*`` options.
        data_dir: Directory containing the ``.npz`` file. Defaults to the location
            that was previously hard-coded.

    Raises:
        ValueError: If the original features are dropped and no replacement is requested.
    """

    def __init__(self, name, add_self_loops=False, device='cpu', use_sgc_features=False, use_identity_features=False,
                 use_adjacency_features=False, do_not_use_original_features=False,
                 data_dir='/kaggle/input/squirrel'):

        if do_not_use_original_features and not any([use_sgc_features, use_identity_features, use_adjacency_features]):
            raise ValueError('If original node features are not used, at least one of the arguments '
                             'use_sgc_features, use_identity_features, use_adjacency_features should be used.')

        print('Preparing data...')
        # np.load returns an NpzFile that keeps the archive open; the context manager
        # closes it as soon as every array has been copied into a tensor.
        with np.load(os.path.join(data_dir, f'{name.replace("-", "_")}.npz')) as data:
            node_features = torch.tensor(data['node_features'])
            labels = torch.tensor(data['node_labels'])
            edges = torch.tensor(data['edges'])

            train_masks = torch.tensor(data['train_masks'])
            val_masks = torch.tensor(data['val_masks'])
            test_masks = torch.tensor(data['test_masks'])

        graph = dgl.graph((edges[:, 0], edges[:, 1]), num_nodes=len(node_features), idtype=torch.int)

        if 'directed' not in name:
            graph = dgl.to_bidirected(graph)

        if add_self_loops:
            graph = dgl.add_self_loop(graph)

        # Two classes are modelled with a single logit; more classes get one logit each.
        num_classes = len(labels.unique())
        num_targets = 1 if num_classes == 2 else num_classes
        if num_targets == 1:
            labels = labels.float()

        # Each mask row is one split; convert boolean masks to index tensors.
        train_idx_list = [torch.where(train_mask)[0] for train_mask in train_masks]
        val_idx_list = [torch.where(val_mask)[0] for val_mask in val_masks]
        test_idx_list = [torch.where(test_mask)[0] for test_mask in test_masks]

        node_features = self.augment_node_features(graph=graph,
                                                   node_features=node_features,
                                                   use_sgc_features=use_sgc_features,
                                                   use_identity_features=use_identity_features,
                                                   use_adjacency_features=use_adjacency_features,
                                                   do_not_use_original_features=do_not_use_original_features)

        self.name = name
        self.device = device

        self.graph = graph.to(device)
        self.node_features = node_features.to(device)
        self.labels = labels.to(device)

        self.train_idx_list = [train_idx.to(device) for train_idx in train_idx_list]
        self.val_idx_list = [val_idx.to(device) for val_idx in val_idx_list]
        self.test_idx_list = [test_idx.to(device) for test_idx in test_idx_list]
        self.num_data_splits = len(train_idx_list)
        self.cur_data_split = 0

        self.num_node_features = node_features.shape[1]
        self.num_targets = num_targets

        self.loss_fn = F.binary_cross_entropy_with_logits if num_targets == 1 else F.cross_entropy
        self.metric = 'ROC AUC' if num_targets == 1 else 'accuracy'

    @property
    def train_idx(self):
        """Training node indices of the active split."""
        return self.train_idx_list[self.cur_data_split]

    @property
    def val_idx(self):
        """Validation node indices of the active split."""
        return self.val_idx_list[self.cur_data_split]

    @property
    def test_idx(self):
        """Test node indices of the active split."""
        return self.test_idx_list[self.cur_data_split]

    def next_data_split(self):
        """Advance to the next split, wrapping around after the last one."""
        self.cur_data_split = (self.cur_data_split + 1) % self.num_data_splits

    def compute_metrics(self, logits):
        """Score ``logits`` on the train, validation and test indices of the active split.

        Args:
            logits: Model outputs indexed by node; for the multi-class case the
                class axis is axis 1.

        Returns:
            Dict with keys ``'train <metric>'``, ``'val <metric>'`` and
            ``'test <metric>'`` mapping to Python floats.
        """
        if self.num_targets == 1:
            def split_metric(idx):
                # float() works whether scikit-learn hands back a NumPy scalar or a
                # plain Python float, whereas `.item()` only exists on the former.
                return float(roc_auc_score(y_true=self.labels[idx].cpu().numpy(),
                                           y_score=logits[idx].cpu().numpy()))
        else:
            preds = logits.argmax(axis=1)

            def split_metric(idx):
                return (preds[idx] == self.labels[idx]).float().mean().item()

        metrics = {
            f'train {self.metric}': split_metric(self.train_idx),
            f'val {self.metric}': split_metric(self.val_idx),
            f'test {self.metric}': split_metric(self.test_idx)
        }

        return metrics

    @staticmethod
    def augment_node_features(graph, node_features, use_sgc_features, use_identity_features, use_adjacency_features,
                              do_not_use_original_features):
        """Build the final feature matrix by concatenating the requested feature blocks along axis 1.

        Args:
            graph: Graph providing ``num_nodes()`` (and, for the SGC and adjacency
                options, the structure to propagate over or densify).
            node_features: Original node features.
            use_sgc_features: Append ``compute_sgc_features`` of the original features.
            use_identity_features: Append ``torch.eye(num_nodes)``.
            use_adjacency_features: Append the dense adjacency matrix with self-loops removed.
            do_not_use_original_features: Start from an empty block instead of ``node_features``.

        Returns:
            The concatenated feature tensor.
        """
        n = graph.num_nodes()
        original_node_features = node_features

        if do_not_use_original_features:
            # Zero-width placeholder with one row per node, so the concatenations below still work.
            node_features = torch.empty((n, 0))

        if use_sgc_features:
            sgc_features = Dataset.compute_sgc_features(graph, original_node_features)
            node_features = torch.cat([node_features, sgc_features], axis=1)

        if use_identity_features:
            node_features = torch.cat([node_features, torch.eye(n)], axis=1)

        if use_adjacency_features:
            graph_without_self_loops = dgl.remove_self_loop(graph)
            adj_matrix = graph_without_self_loops.adjacency_matrix().to_dense()
            node_features = torch.cat([node_features, adj_matrix], axis=1)

        return node_features

    @staticmethod
    def compute_sgc_features(graph, node_features, num_props=5):
        """Propagate ``node_features`` over the graph ``num_props`` times with degree normalisation.

        Existing self-loops are removed and then re-added before propagation. Each
        edge is weighted by ``1 / sqrt(deg(u) * deg(v))`` using out-degrees of the
        resulting graph.

        Args:
            graph: Graph to propagate over.
            node_features: Features to propagate.
            num_props: Number of propagation rounds.

        Returns:
            The propagated feature tensor.
        """
        graph = dgl.remove_self_loop(graph)
        graph = dgl.add_self_loop(graph)

        degrees = graph.out_degrees().float()
        degree_edge_products = ops.u_mul_v(graph, degrees, degrees)
        norm_coefs = 1 / degree_edge_products ** 0.5

        for _ in range(num_props):
            node_features = ops.u_mul_e_sum(graph, node_features, norm_coefs)

        return node_features

In [9]:
# Use the first CUDA device when one is available; otherwise fall back to the CPU
# so the notebook can still run end to end on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# Load the "squirrel" dataset onto `device` with self-loops added and only the
# original node features (no SGC, identity or adjacency features).
dataset = Dataset(
    name="squirrel",
    add_self_loops=True,
    device=device,
    use_sgc_features=False,
    use_identity_features=False,
    use_adjacency_features=False,
    do_not_use_original_features=False,
)


Preparing data...


In [11]:
class ResidualModuleWrapper(nn.Module):
    """Pre-normalised residual block: ``x + module(graph, normalization(x))``.

    Args:
        module: Class of the wrapped module; instantiated as ``module(dim=dim, **kwargs)``.
        normalization: Class (or factory) of the normalization layer; called as ``normalization(dim)``.
        dim: Feature width passed to both the normalization and the wrapped module.
        **kwargs: Extra keyword arguments forwarded to ``module``.
    """

    def __init__(self, module, normalization, dim, **kwargs):
        super().__init__()
        # The normalization layer is built before the wrapped module.
        self.normalization = normalization(dim)
        self.module = module(dim=dim, **kwargs)

    def forward(self, graph, x):
        """Return ``x`` plus the wrapped module's output on the normalised ``x``."""
        residual = self.module(graph, self.normalization(x))
        return x + residual


class FeedForwardModule(nn.Module):
    """Two-layer feed-forward block: linear, dropout, GELU, linear, dropout.

    Args:
        dim: Base feature width; also the output width.
        hidden_dim_multiplier: The hidden width is ``int(dim * hidden_dim_multiplier)``.
        dropout: Dropout probability used after each linear layer.
        input_dim_multiplier: The input width is ``int(dim * input_dim_multiplier)``.
        **kwargs: Accepted for interface compatibility and ignored.
    """

    def __init__(self, dim, hidden_dim_multiplier, dropout, input_dim_multiplier=1, **kwargs):
        super().__init__()
        in_width = int(dim * input_dim_multiplier)
        mid_width = int(dim * hidden_dim_multiplier)

        self.linear_1 = nn.Linear(in_features=in_width, out_features=mid_width)
        self.dropout_1 = nn.Dropout(p=dropout)
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(in_features=mid_width, out_features=dim)
        self.dropout_2 = nn.Dropout(p=dropout)

    def forward(self, graph, x):
        """Transform ``x``; ``graph`` is accepted only to match the other modules' call signature."""
        hidden = self.act(self.dropout_1(self.linear_1(x)))
        return self.dropout_2(self.linear_2(hidden))

In [12]:
# Five GAT-sep layers of width 512 with 8 heads, BatchNorm and dropout 0.2;
# input and output widths are taken from the loaded dataset.
model = Model(
    model_name="GAT-sep",
    num_layers=5,
    input_dim=dataset.num_node_features,
    hidden_dim=512,
    output_dim=dataset.num_targets,
    hidden_dim_multiplier=1,
    num_heads=8,
    normalization='BatchNorm',
    dropout=0.2,
)

In [13]:
def train_step(model, dataset, optimizer, scheduler, scaler, amp=False):
    """Run one full-graph optimisation step on the dataset's current training indices.

    Args:
        model: Module called as ``model(graph=..., x=...)``.
        dataset: Object providing ``graph``, ``node_features``, ``labels``,
            ``train_idx`` and ``loss_fn``.
        optimizer: Optimizer stepped through ``scaler``.
        scheduler: Learning-rate scheduler stepped once at the end.
        scaler: Gradient scaler used for the backward pass and the optimizer step.
        amp: Whether the forward pass and loss run under autocast.
    """
    model.train()

    with autocast(enabled=amp):
        all_logits = model(graph=dataset.graph, x=dataset.node_features)
        train_idx = dataset.train_idx
        loss = dataset.loss_fn(input=all_logits[train_idx], target=dataset.labels[train_idx])

    # Backward on the scaled loss, then let the scaler drive the optimizer step.
    scaled_loss = scaler.scale(loss)
    scaled_loss.backward()
    scaler.step(optimizer)
    scaler.update()

    # Gradients are cleared after the update so the next step starts clean.
    optimizer.zero_grad()
    scheduler.step()


def evaluate(model, dataset, amp=False):
    """Compute the dataset's metrics for ``model`` in eval mode with gradients disabled.

    Args:
        model: Module called as ``model(graph=..., x=...)``.
        dataset: Object providing ``graph``, ``node_features`` and ``compute_metrics``.
        amp: Whether the forward pass runs under autocast.

    Returns:
        Whatever ``dataset.compute_metrics`` returns for the produced logits.
    """
    with torch.no_grad():
        model.eval()

        # Only the forward pass is autocast; metrics are computed outside of it.
        with autocast(enabled=amp):
            logits = model(graph=dataset.graph, x=dataset.node_features)

        return dataset.compute_metrics(logits)

In [14]:
def get_parameter_groups(model):
    """Split the model's parameters into a default group and a no-weight-decay group.

    A parameter goes into the second group (``weight_decay`` fixed to 0) when its
    name contains ``'bias'``, ``'normalization'`` or ``'label_embeddings'``.

    Args:
        model: Module whose ``named_parameters()`` are partitioned.

    Returns:
        A two-element list of optimizer parameter-group dicts: the default group
        first, the no-weight-decay group second.
    """
    exempt_markers = ('bias', 'normalization', 'label_embeddings')

    decayed, exempt = [], []
    for param_name, param in model.named_parameters():
        is_exempt = any(marker in param_name for marker in exempt_markers)
        (exempt if is_exempt else decayed).append(param)

    return [
        {'params': decayed},
        {'params': exempt, 'weight_decay': 0},
    ]
def get_lr_scheduler_with_warmup(optimizer, num_warmup_steps=None, num_steps=None, warmup_proportion=None,
                                 last_step=-1):
    """Create a ``LambdaLR`` scheduler with linear warmup followed by a constant rate.

    During warmup the multiplier at step ``s`` is ``(s + 1) / (num_warmup_steps + 1)``;
    afterwards it is 1.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of warmup steps. If ``None`` it is computed as
            ``int(num_steps * warmup_proportion)``.
        num_steps: Total number of steps; used only to derive ``num_warmup_steps``.
        warmup_proportion: Fraction of ``num_steps`` spent warming up.
        last_step: Passed to ``LambdaLR`` as ``last_epoch``.

    Returns:
        The configured ``torch.optim.lr_scheduler.LambdaLR``.

    Raises:
        ValueError: If neither ``num_warmup_steps`` nor both ``num_steps`` and
            ``warmup_proportion`` are given.
    """
    if num_warmup_steps is None:
        can_derive = num_steps is not None and warmup_proportion is not None
        if not can_derive:
            raise ValueError('Either num_warmup_steps or num_steps and warmup_proportion should be provided.')

        num_warmup_steps = int(num_steps * warmup_proportion)

    def get_lr_multiplier(step):
        # Linear ramp that never starts at zero, then a flat multiplier of 1.
        return (step + 1) / (num_warmup_steps + 1) if step < num_warmup_steps else 1

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=get_lr_multiplier, last_epoch=last_step)

In [15]:
# torch.onnx.export(model, (dataset.graph.adj_tensors('csc'),dataset.node_features), 'gt-sep.onnx', input_names=["features"], output_names=["logits"])

In [16]:
# Move the model to the same `device` the dataset was placed on, rather than
# constructing a second, separately hard-coded device. The expression is left
# as the cell's last line so the module summary is still displayed.
model.to(device)

Model(
  (input_linear): Linear(in_features=2089, out_features=512, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (act): GELU(approximate='none')
  (residual_modules): ModuleList(
    (0-4): 5 x ResidualModuleWrapper(
      (normalization): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (module): GATSepModule(
        (input_linear): Linear(in_features=512, out_features=512, bias=True)
        (attn_linear_u): Linear(in_features=512, out_features=8, bias=True)
        (attn_linear_v): Linear(in_features=512, out_features=8, bias=False)
        (attn_act): LeakyReLU(negative_slope=0.2)
        (feed_forward_module): FeedForwardModule(
          (linear_1): Linear(in_features=1024, out_features=512, bias=True)
          (dropout_1): Dropout(p=0.2, inplace=False)
          (act): GELU(approximate='none')
          (linear_2): Linear(in_features=512, out_features=512, bias=True)
          (dropout_2): Dropout(p=0.2, inplace=False)
        )

In [17]:
# AdamW over the two parameter groups; with warmup_proportion=0 the derived
# number of warmup steps is 0, so the learning rate multiplier is 1 from the start.
parameter_groups = get_parameter_groups(model)
optimizer = torch.optim.AdamW(params=parameter_groups, lr=3e-5, weight_decay=0)
scheduler = get_lr_scheduler_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=None,
    num_steps=500,
    warmup_proportion=0,
)

In [18]:
# Number of optimisation steps per run (the LR scheduler above is configured with the same count).
num_steps = 500

for run in range(1):
    # One scaler per run: GradScaler adjusts its loss scale from step to step, so it
    # must persist across steps instead of being rebuilt on every iteration.
    scaler = GradScaler(enabled=True)

    with tqdm.tqdm(total=num_steps, desc=f'Run {run}', disable=False) as progress_bar:
        for step in range(1, num_steps + 1):
            train_step(model=model, dataset=dataset, optimizer=optimizer, scheduler=scheduler, scaler=scaler)
            # Metrics are recomputed after every step and shown next to the progress bar.
            metrics = evaluate(model=model, dataset=dataset, amp=False)
            progress_bar.update()
            progress_bar.set_postfix({metric: f'{value:.2f}' for metric, value in metrics.items()})

Run 0:  21%|██▏       | 107/500 [00:18<01:09,  5.64it/s, train accuracy=0.92, val accuracy=0.41, test accuracy=0.41]


KeyboardInterrupt: 