In [None]:
!pip install torch_geometric
!pip install torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cu121.html

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu121.html
Collecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_scatter-2.1.2%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcu121/torch_sparse-0.6.18%2Bpt21cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_cluster
  Downloading https://data.pyg.org/whl/torch-2.1.0

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (Colab only; triggers an interactive auth prompt).
# NOTE(review): the data-loading cell further down reads from the relative path
# './WWW 2024/', and no change of working directory is visible in this notebook --
# confirm how the kernel's cwd ends up inside the mounted Drive folder.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import torch
from torch import Tensor
from torch_geometric.loader.neighbor_sampler import NeighborSampler, Adj, EdgeIndex

class NeighborSamplerbyNFT(NeighborSampler):
    """Neighbor sampler that filters deeper hops by first-hop edge attributes.

    Behaves like the parent ``NeighborSampler`` with two additions:

    * if ``prob_vector`` is given, the seed nodes handed to :meth:`sample` are
      replaced by nodes drawn from ``prob_vector`` (only the batch length is
      kept);
    * if ``edge_attr`` is given, every hop after the first keeps only the
      sampled edges whose attribute row equals the attribute row of at least
      one edge sampled in the first hop.

    Args:
        edge_index: Graph connectivity, forwarded to ``NeighborSampler``.
        sizes: Number of neighbors to sample per hop, forwarded to the parent.
        edge_attr: Optional per-edge attribute tensor; a CPU copy is stored.
        transform: Optional callable applied to ``(batch_size, n_id, adjs)``.
        prob_vector: Optional weight vector passed to ``torch.multinomial``
            to draw the seed nodes.
        **kwargs: Forwarded unchanged to ``NeighborSampler``.
    """

    def __init__(self, edge_index, sizes, edge_attr=None, transform=None, prob_vector=None, **kwargs):
        super().__init__(edge_index, sizes=sizes, transform=transform, **kwargs)
        # Attribute lookups in sample() index this tensor, so it is kept on CPU.
        self.edge_attr = None if edge_attr is None else edge_attr.to('cpu')
        # Attribute rows of the first-hop edges from the most recent sample() call.
        self.first_layer_edge_attr = None
        self.prob_vector = prob_vector

    def sample(self, batch):
        """Sample a multi-hop neighborhood and return ``(batch_size, n_id, adjs)``.

        Args:
            batch: Seed node indices (tensor, or anything ``torch.tensor`` accepts).

        Returns:
            The tuple ``(batch_size, n_id, adjs)``, or whatever ``self.transform``
            returns for it when a transform is set. ``adjs`` is a single entry
            for one hop, otherwise the per-hop entries in reverse sampling order.
        """
        seeds = batch if isinstance(batch, Tensor) else torch.tensor(batch, dtype=torch.long)
        batch_size: int = len(seeds)

        n_id = seeds
        if self.prob_vector is not None:
            # The given seeds are discarded; only their count is reused.
            n_id = torch.multinomial(self.prob_vector, num_samples=len(seeds), replacement=True)

        adjs = []
        for hop, fanout in enumerate(self.sizes):
            adj_t, n_id = self.adj_t.sample_adj(n_id, fanout, replace=False)

            if self.edge_attr is not None:
                # Attribute rows of the edges just sampled (already on CPU; .to is a safeguard).
                hop_attr = self.edge_attr[adj_t.storage.value()].to('cpu')
                if hop == 0:
                    # Remember the first hop; later hops are filtered against it.
                    self.first_layer_edge_attr = hop_attr
                else:
                    keep = self.compute_mask(hop_attr, self.first_layer_edge_attr)
                    adj_t = adj_t.masked_select_nnz(keep, layout='coo')

            # Read ids and shape after the optional filtering so they describe the kept edges.
            e_id = adj_t.storage.value()
            shape = adj_t.sparse_sizes()[::-1]

            if self.is_sparse_tensor:
                adjs.append(Adj(adj_t, e_id, shape))
            else:
                row, col, _ = adj_t.coo()
                adjs.append(EdgeIndex(torch.stack([col, row], dim=0), e_id, shape))

        adjs = adjs[0] if len(adjs) == 1 else adjs[::-1]
        result = (batch_size, n_id, adjs)
        if self.transform is not None:
            result = self.transform(*result)
        return result

    def compute_mask(self, edge_attr, first_layer_edge_attr):
        """Flag rows of ``edge_attr`` that equal some row of ``first_layer_edge_attr``.

        Assumes both tensors are 2-D ``(num_edges, num_features)`` -- TODO confirm.
        NOTE(review): under that assumption the comparison broadcasts to every
        pair of rows from the two inputs, so memory grows with the product of
        the two edge counts.
        """
        pairwise_equal = edge_attr[:, None] == first_layer_edge_attr
        # A pair matches only if every feature is equal.
        row_matches = pairwise_equal.all(dim=-1)
        # An edge is kept if it matches at least one first-layer edge.
        return row_matches.any(dim=1)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, BatchNorm
from torch_geometric.nn import MessagePassing

class ArtemisFirstLayerConv(MessagePassing):
    """Message-passing layer whose messages combine neighbor and edge features.

    Each message is a linear projection of the source-node features
    concatenated with the edge attributes; aggregated messages go through ReLU.

    Args:
        in_node_channels: Width of the node feature vectors.
        in_edge_channels: Width of the edge attribute vectors.
        out_channels: Width of the produced node embeddings.
        aggr: Aggregation scheme handed to ``MessagePassing`` (default ``'mean'``).
    """

    def __init__(self, in_node_channels, in_edge_channels, out_channels, aggr='mean'):
        super().__init__(aggr=aggr)
        message_width = in_node_channels + in_edge_channels
        self.lin = nn.Linear(message_width, out_channels)
        self.aggr = aggr

    def forward(self, x, edge_index, edge_attr):
        """Propagate over ``edge_index``, treating the graph as square in ``x.size(0)``."""
        num_nodes = x.size(0)
        return self.propagate(
            edge_index,
            size=(num_nodes, num_nodes),
            x=x,
            edge_attr=edge_attr,
        )

    def message(self, x_j, edge_attr):
        """Project the concatenation of source-node features and edge attributes."""
        joint = torch.cat([x_j, edge_attr], dim=-1)
        return self.lin(joint)

    def update(self, aggr_out):
        """Apply ReLU to the aggregated messages."""
        activated = F.relu(aggr_out)
        return activated

class ArtemisNet(nn.Module):
    """Three-layer GNN followed by an MLP head that emits one logit per node.

    Layer 1 is an ``ArtemisFirstLayerConv`` (uses edge attributes); layers 2 and
    3 are ``SAGEConv``. Each layer is followed by batch norm, ReLU and dropout.
    The raw input features are concatenated to the final embedding (a skip
    connection from the input) before the MLP head.

    Args:
        in_node_channels: Width of the input node features.
        in_edge_channels: Width of the edge attributes consumed by layer 1.
        hidden_channels: Width of every hidden node embedding.
    """

    def __init__(self, in_node_channels, in_edge_channels, hidden_channels):
        super().__init__()
        self.conv1 = ArtemisFirstLayerConv(in_node_channels, in_edge_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)
        # The head sees the final embedding concatenated with the raw input features.
        self.mlp = nn.Sequential(
            nn.Linear(hidden_channels + in_node_channels, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    def forward(self, x, edge_index_tuple, edge_attr_tuple):
        """Compute one logit per row of ``x``.

        Args:
            x: Node feature matrix.
            edge_index_tuple: Exactly three edge-index tensors, one per layer.
            edge_attr_tuple: Exactly three entries; only the first (edge
                attributes for layer 1) is used.

        Returns:
            Tensor of shape ``(x.size(0),)`` holding raw (pre-sigmoid) logits.
        """
        edge_index_0, edge_index_1, edge_index_2 = edge_index_tuple
        edge_attr_0, _, _ = edge_attr_tuple

        # Keep the raw features for the input skip connection into the MLP head.
        initial_embedding = x

        # Layer 1: edge-aware convolution (no residual connection).
        x = self.conv1(x, edge_index_0, edge_attr_0)
        x = F.relu(self.bn1(x))
        x = F.dropout(x, p=0.5, training=self.training)

        # Layer 2: GraphSAGE convolution (no residual connection).
        x = self.conv2(x, edge_index_1)
        x = F.relu(self.bn2(x))
        x = F.dropout(x, p=0.5, training=self.training)

        # Layer 3: GraphSAGE convolution (no residual connection).
        x = self.conv3(x, edge_index_2)
        x = F.relu(self.bn3(x))
        x = F.dropout(x, p=0.5, training=self.training)

        # Skip connection from the input, then the MLP head.
        x = torch.cat([x, initial_embedding], dim=1)
        x = self.mlp(x)
        # Drop only the trailing logit dimension: a bare squeeze() would also
        # collapse a single-node batch to a 0-d tensor that no longer matches
        # the shape of its target.
        return x.squeeze(-1)

In [None]:
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.data import Data
import numpy as np
import random
import sys
from torch_geometric.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Deserialize on CPU first and then move to `device`: without map_location,
# torch.load tries to restore tensors onto the device they were saved from,
# which fails on a CPU-only runtime for tensors saved from CUDA.
node_basic_features = torch.load('./WWW 2024/node_basic_features.pt', map_location='cpu').to(device)
node_advanced_features = torch.load('./WWW 2024/node_advanced_features.pt', map_location='cpu').to(device)
edge_index = torch.load('./WWW 2024/edge_index.pt', map_location='cpu').to(device)
base_edge_features = torch.load('./WWW 2024/base_edge_features.pt', map_location='cpu').to(device)
nft_multimodal_bmbedding_features = torch.load('./WWW 2024/nft_multimodal_bmbedding_features.pt', map_location='cpu').to(device)
y = torch.load('./WWW 2024/y.pt', map_location='cpu').to(device)

# The sampling weights stay on CPU: NeighborSamplerbyNFT stores prob_vector
# without moving it to any device. Normalize so the weights sum to 1.
node_sample_prob = torch.load('./WWW 2024/node_sample_prob.pt', map_location='cpu')
node_sample_prob = node_sample_prob / node_sample_prob.sum()

# Column-wise concatenation of the feature groups.
# NOTE(review): the Data object built in the next cell uses node_basic_features
# as x, not node_features -- confirm that is intended.
node_features = torch.cat([node_basic_features, node_advanced_features], dim=1)
edge_features = torch.cat([base_edge_features, nft_multimodal_bmbedding_features], dim=1)

In [None]:
# Fixed seed so the 90/10 split (and every metric derived from it) is the same
# after a kernel restart. A dedicated Random instance is used so the global
# `random` state seen by other cells is left untouched.
SPLIT_SEED = 42

num_nodes = y.shape[0]
train_test_split_num = int(num_nodes * 0.9)
train_index = random.Random(SPLIT_SEED).sample(range(num_nodes), train_test_split_num)

# Boolean numpy masks; the test set is exactly the complement of the train set.
train_mask = np.zeros(num_nodes, dtype=np.bool_)
train_mask[train_index] = True
test_mask = ~train_mask
test_index = np.where(test_mask)[0].tolist()

# Sanity check: every node belongs to exactly one of the two splits.
assert train_mask.sum() + test_mask.sum() == num_nodes

print("train node num: ", train_mask.sum())
print("test node num: ", test_mask.sum())
print("true data percentage in train data: ", y[train_mask].sum() / len(y[train_mask]))
print("true data percentage in test data: ", y[test_mask].sum() / len(y[test_mask]))

# x holds only the basic node features; edge_attr holds the concatenated edge features.
data = Data(x=node_basic_features, y=y,
            edge_index=edge_index, edge_attr=edge_features,
            train_mask=train_mask, test_mask=test_mask).to(device)

train node num:  182790
test node num:  20311
true data percentage in train data:  tensor(0.0237, device='cuda:0')
true data percentage in test data:  tensor(0.0234, device='cuda:0')


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.utils.data import DataLoader, WeightedRandomSampler

import os

import copy

log_path = "./WWW 2024/artemis_model_log.txt"

# Start every execution of this cell with a fresh log file.
if os.path.exists(log_path):
    os.remove(log_path)

with open(log_path, "a") as log_file:
    log_file.write("Epoch, Average Loss, Average Accuracy, Average Precision, Average Recall, Average F1 Score\n")


def _forward_on_sample(model, edge_sampler, seed_nodes):
    """Sample a 3-hop subgraph for ``seed_nodes`` and run ``model`` on it.

    Returns ``(logits, targets)`` where both cover every node id returned by
    the sampler (``n_id``), not only the seed nodes.
    """
    _, n_id, adjs = edge_sampler.sample(seed_nodes)
    n_id = n_id.to(device)

    # Only the first entry of adjs contributes edge attributes to the model.
    edge_attr_0 = data.edge_attr[adjs[0].e_id].to(device)
    edge_indices = (
        adjs[0].edge_index.to(device),
        adjs[1].edge_index.to(device),
        adjs[2].edge_index.to(device),
    )

    logits = model(data.x[n_id], edge_indices, (edge_attr_0, None, None))
    return logits, data.y[n_id]


def _binary_metrics(logits, targets):
    """Return ``(accuracy, precision, recall, f1)`` at a 0.5 probability threshold."""
    pred = (torch.sigmoid(logits) >= 0.5).int().cpu().numpy()
    true = targets.int().cpu().numpy()
    return (
        accuracy_score(true, pred),
        precision_score(true, pred, zero_division=1),
        recall_score(true, pred),
        f1_score(true, pred),
    )


for run in range(5):
    print(f"Starting run {run+1}...\n")
    with open(log_path, "a") as log_file:

        model = ArtemisNet(data.x.shape[1], data.edge_attr.shape[1], 32).to(device)

        # Class counts are computed but currently unused: pos_weight is fixed at 1.
        num_pos = data.y[data.train_mask].sum().item()
        num_neg = data.train_mask.sum().item() - num_pos
        class_weights = torch.tensor([1], dtype=torch.float32).to(device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)

        sizes = [8, 1, 1]
        edge_sampler = NeighborSamplerbyNFT(edge_index=data.edge_index, sizes=sizes, edge_attr=data.edge_attr, prob_vector=node_sample_prob)

        # Node id tensors stay on CPU because the sampler consumes them there.
        train_nodes = torch.where(torch.from_numpy(data.train_mask))[0]
        test_nodes = torch.where(torch.from_numpy(data.test_mask))[0]

        # Inverse-class-frequency weight per training node.
        labels = data.y[data.train_mask].cpu().numpy()
        class_counts = np.bincount(labels)
        weights = 1. / torch.tensor(class_counts, dtype=torch.float32)
        sample_weights = weights[labels]

        # WeightedRandomSampler
        sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=False)

        patience = 10
        best_loss = float('inf')
        patience_counter = 0

        best_model = None
        best_f1 = 0.0

        for epoch in range(100):
            model.train()
            total_loss = 0
            metric_sums = [0, 0, 0, 0]  # accuracy, precision, recall, f1
            batch_count = 0

            for subset_nodes in DataLoader(train_nodes, batch_size=256, sampler=sampler):
                optimizer.zero_grad()

                out, targets = _forward_on_sample(model, edge_sampler, subset_nodes)

                loss = criterion(out, targets.float())
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                batch_metrics = _binary_metrics(out, targets)
                metric_sums = [running + value for running, value in zip(metric_sums, batch_metrics)]

                batch_count += 1

            avg_loss = total_loss / batch_count
            avg_accuracy, avg_precision, avg_recall, avg_f1 = (total / batch_count for total in metric_sums)

            print(f"Epoch {epoch} | Average Loss: {avg_loss:.5f} | Average Accuracy: {avg_accuracy:.3f} | "
                f"Average Precision: {avg_precision:.3f} | Average Recall: {avg_recall:.3f} | "
                f"Average F1 Score: {avg_f1:.3f}")
            log_file.write(f"{epoch}, {avg_loss:.5f}, {avg_accuracy:.3f}, {avg_precision:.3f}, {avg_recall:.3f}, {avg_f1:.3f}\n")

            # NOTE(review): the early-stopping criterion is the average *training*
            # loss of the epoch, although the message below mentions a validation set.
            if avg_loss < best_loss:
                best_loss = avg_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter == patience:
                    print("Stopping early due to lack of improvement on the validation set.")
                    break

            # NOTE(review): edge_sampler was built with prob_vector, so sample()
            # redraws its seed nodes from that distribution and test_nodes only
            # determines how many are drawn. The checkpoint is also selected with
            # the same procedure that produces the final report. Confirm both are
            # intended.
            model.eval()
            with torch.no_grad():
                out, targets = _forward_on_sample(model, edge_sampler, test_nodes)
            accuracy, precision, recall, f1 = _binary_metrics(out, targets)

            if f1 > best_f1:
                best_f1 = f1
                # Deep copy: state_dict() values alias the live parameters, so a
                # shallow dict copy would keep changing as training continues.
                best_model = copy.deepcopy(model.state_dict())

        # If F1 never exceeded 0 there is no snapshot; keep the last weights.
        if best_model is not None:
            model.load_state_dict(best_model)
        model.eval()
        with torch.no_grad():
            out, targets = _forward_on_sample(model, edge_sampler, test_nodes)
        accuracy, precision, recall, f1 = _binary_metrics(out, targets)

        print(f"Test - Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1:.3f}")
        log_file.write(f"Test - Run {run+1}, Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1:.3f}\n\n")

    print(f"Run {run+1} completed.\n")

Starting run 1...

Epoch 0 | Average Loss: 5.59800 | Average Accuracy: 0.606 | Average Precision: 0.471 | Average Recall: 0.842 | Average F1 Score: 0.598
Epoch 1 | Average Loss: 1.11412 | Average Accuracy: 0.813 | Average Precision: 0.669 | Average Recall: 0.871 | Average F1 Score: 0.756
Epoch 2 | Average Loss: 0.51414 | Average Accuracy: 0.853 | Average Precision: 0.726 | Average Recall: 0.895 | Average F1 Score: 0.802
Epoch 3 | Average Loss: 0.37783 | Average Accuracy: 0.873 | Average Precision: 0.756 | Average Recall: 0.912 | Average F1 Score: 0.826
Epoch 4 | Average Loss: 0.31241 | Average Accuracy: 0.890 | Average Precision: 0.791 | Average Recall: 0.912 | Average F1 Score: 0.847
Epoch 5 | Average Loss: 0.28223 | Average Accuracy: 0.900 | Average Precision: 0.811 | Average Recall: 0.912 | Average F1 Score: 0.858
Epoch 6 | Average Loss: 0.26440 | Average Accuracy: 0.906 | Average Precision: 0.825 | Average Recall: 0.910 | Average F1 Score: 0.866
Epoch 7 | Average Loss: 0.25026 | Av