In [1]:
# Install the DGL wheel built for torch 2.4 / CUDA 12.4.
# Pinned to the release the recorded run resolved (dgl-2.4.0+cu124) so a
# fresh "Restart & Run All" gets the same library; %pip (not !pip) installs
# into the environment of the running kernel.
%pip install dgl==2.4.0 -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html

Looking in links: https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/torch-2.4/cu124/dgl-2.4.0%2Bcu124-cp310-cp310-manylinux1_x86_64.whl (347.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.8/347.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting torch<=2.4.0 (from dgl)
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch<=2.4.0->dgl)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch<=2.4.0->dgl)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch<=2.4.0->dgl)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<=2.4.0->dgl)


In [2]:
# Download the visual-feature archive from Google Drive by file id.
# The recorded run saved it as train_visual_features.zip (~7.94 GB).
!gdown 1szBOuGckL7gYw5tDPeFjNUpE79DoBF0r

Downloading...
From (original): https://drive.google.com/uc?id=1szBOuGckL7gYw5tDPeFjNUpE79DoBF0r
From (redirected): https://drive.google.com/uc?id=1szBOuGckL7gYw5tDPeFjNUpE79DoBF0r&confirm=t&uuid=bb69b141-6185-4eb0-9b3a-0d64ae5af977
To: /content/train_visual_features.zip
100% 7.94G/7.94G [01:44<00:00, 76.2MB/s]


In [3]:
# Download the training graph data from Google Drive by file id.
# NOTE(review): the recorded output of this cell shows a different Drive id
# (1Ikh3QRI1WzUC9VbkxtDRuqapIvWV5MGO) saved as train_data.pkl, so that output
# is stale relative to this command. Confirm this id yields the
# train_data_graphs.bin file that GraphDataset is constructed with later.
!gdown 1SFgOGGB8a0JxTaa7phzYnq6K6JyWDGbf

Downloading...
From: https://drive.google.com/uc?id=1Ikh3QRI1WzUC9VbkxtDRuqapIvWV5MGO
To: /content/train_data.pkl
100% 87.2M/87.2M [00:00<00:00, 108MB/s]


In [4]:
# Extract the downloaded feature archive into the working directory (-q: quiet).
# Presumably this creates the train_visual_features/ directory read later by
# GraphDataset -- TODO confirm the archive's top-level folder name.
!unzip  -q train_visual_features.zip

In [8]:
from torch.utils.data import Dataset, DataLoader
import pickle
import dgl
import os
import torch
import numpy as np
class GraphDataset(Dataset):
    """Dataset of DGL graphs paired with per-node feature tensors loaded from disk.

    Each item is one graph from ``graph_file``. Node features are read from
    ``<feature_dir>/<obj_id>.pt`` (one file per node); a node whose file cannot
    be loaded gets an all-zero feature row instead.

    Parameters
    ----------
    graph_file :
        Path passed to ``dgl.load_graphs``.
    feature_dir :
        Directory holding one ``<obj_id>.pt`` tensor file per node.
    test : bool
        When True, ``__getitem__`` omits the edge labels.
    feat_dim : int
        Width of the zero row used when a node's feature file is unavailable.
        Defaults to 1536, the value this class previously hard-coded.
    """

    def __init__(self, graph_file, feature_dir, test=False, feat_dim=1536):
        super().__init__()
        # dgl.load_graphs returns (graphs, labels_dict); only the graphs are used.
        self.graphs, _ = dgl.load_graphs(graph_file)
        self.feature_dir = feature_dir
        self.test = test
        self.feat_dim = feat_dim

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self.graphs)

    def load_feat(self, nodes):
        """Load and stack the feature rows for ``nodes``.

        Parameters
        ----------
        nodes :
            Iterable of node ids; each id is formatted into the file name
            ``<id>.pt`` inside ``feature_dir``.

        Returns
        -------
        torch.Tensor
            One row per node, concatenated along dim 0. An empty ``nodes``
            yields a ``(0, feat_dim)`` tensor.
        """
        # getattr keeps instances created before ``feat_dim`` existed working.
        feat_dim = getattr(self, "feat_dim", 1536)
        rows = []
        for node in nodes:
            path = os.path.join(self.feature_dir, f"{node}.pt")
            try:
                # SECURITY NOTE: weights_only=False unpickles arbitrary objects;
                # only safe because the feature files are trusted. Consider
                # weights_only=True if the files are plain tensors.
                feat = torch.load(path, map_location=torch.device("cpu"), weights_only=False)
                rows.append(feat.unsqueeze(0))
            except Exception:
                # Best-effort: a missing/unreadable feature file becomes a zero
                # row. ``Exception`` (not a bare except) so Ctrl-C still works.
                rows.append(torch.zeros((1, feat_dim)))
        if not rows:
            # torch.cat rejects an empty list; return an explicitly empty batch.
            return torch.zeros((0, feat_dim))
        return torch.cat(rows, dim=0)

    def __getitem__(self, index):
        """Return ``(graph, feats, nodes)`` in test mode, else also the edge labels."""
        g = self.graphs[index]
        nodes = g.ndata['obj_id']
        feats = self.load_feat(nodes)
        if self.test:
            return g, feats, nodes
        labels = g.edata['labels']
        return g, feats, nodes, labels


In [9]:
import torch
import torch.nn as nn
import dgl
import torch.nn.functional as F
class MLPPredictor(nn.Module):
    """Two-layer MLP that scores each edge from its two endpoint embeddings."""

    def __init__(self, h_feats):
        super().__init__()
        # Input is the source and destination embeddings concatenated.
        self.W1 = nn.Linear(2 * h_feats, h_feats)
        self.W2 = nn.Linear(h_feats, 1)

    def apply_edges(self, edges):
        """
        Edge UDF: produce one scalar score per edge.

        Parameters
        ----------
        edges :
            Edge batch exposing ``src``, ``dst`` and ``data`` dictionaries
            (features of the source nodes, the destination nodes, and the
            edges themselves). Only ``src['h']`` and ``dst['h']`` are read.

        Returns
        -------
        dict
            ``{'score': tensor}`` with the trailing size-1 dimension removed.
        """
        endpoints = torch.cat((edges.src['h'], edges.dst['h']), dim=1)
        hidden = F.relu(self.W1(endpoints))
        logits = self.W2(hidden)
        return {'score': logits.squeeze(1)}

    def forward(self, g, h):
        """Score every edge of ``g`` given node features ``h``."""
        # local_scope keeps the temporary 'h' / 'score' fields off the caller's graph.
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            edge_scores = g.edata['score']
            return edge_scores

In [10]:
# Build the training dataset: graphs from the .bin file, node features from
# the per-node .pt files in the feature directory.
dataset = GraphDataset(
    graph_file='train_data_graphs.bin',
    feature_dir='train_visual_features',
)

In [11]:
from torch.utils.data import random_split

# Seed the CPU generator and every CUDA generator so the random 90/10
# train/validation split below is reproducible.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

train_dataset, val_dataset = random_split(dataset, lengths=[0.9, 0.1])

In [12]:
def _unwrap_single(batch):
    """Collate for batch_size=1: return the lone sample instead of a 1-item list."""
    return batch[0]


# One graph per step. The training loader reshuffles every epoch so the
# optimizer does not see the graphs in the same fixed order each time;
# validation order is left deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=_unwrap_single)
val_dataloader = DataLoader(val_dataset, batch_size=1, collate_fn=_unwrap_single)

In [13]:
from torch import optim

# Edge scorer over 1536-wide node features, kept on the CPU.
device = torch.device('cpu')
model = MLPPredictor(1536).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

# Train for 10 epochs; after each epoch evaluate on the validation split and
# checkpoint the model whenever the mean per-graph ROC AUC improves.
best_auc = 0.0
for epoch in tqdm(range(10)):
    # ---- training pass ----
    total_loss = 0.0
    model.train()
    for g, feats, nodes, labels in tqdm(train_dataloader):
        scores = model(g.to(device), feats.to(device))
        # BCE-with-logits needs floating-point targets of the same dtype as the logits.
        loss = F.binary_cross_entropy_with_logits(scores, labels.to(device).float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    total_loss /= len(train_dataloader)
    print(f'Epoch: {epoch} Train Loss: {total_loss}')

    # ---- validation pass ----
    model.eval()
    total_loss = 0.0
    auc = 0.0
    scored_graphs = 0  # graphs for which ROC AUC is defined
    with torch.no_grad():
        for g, feats, nodes, labels in val_dataloader:
            scores = model(g.to(device), feats.to(device))
            loss = F.binary_cross_entropy_with_logits(scores, labels.to(device).float())
            total_loss += loss.item()
            # ROC AUC is undefined when a graph's edges all share one label;
            # skip such graphs rather than aborting the whole epoch.
            if labels.unique().numel() < 2:
                continue
            # Move to CPU before converting so this also works on a GPU device.
            auc += roc_auc_score(labels.detach().cpu().numpy(), scores.detach().cpu().numpy())
            scored_graphs += 1
        total_loss /= len(val_dataloader)
        auc /= max(scored_graphs, 1)
    if auc > best_auc:
        # Record the new best (the original assigned the other way round, so
        # best_auc never changed and the reported AUC was overwritten).
        best_auc = auc
        torch.save(model.state_dict(), "predictor.pth")
    print(f'Epoch: {epoch} Val Loss: {total_loss} AUC: {auc}')


  0%|          | 0/10 [00:00<?, ?it/s]

: 