In [1]:
from ogb.nodeproppred import PygNodePropPredDataset
import os
import os.path as osp
import sys
from torch_geometric.data import Data
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import InMemoryDataset, download_url

# Make the parent directory importable so the project packages imported below
# (`utils`, `utils_data`) resolve. The membership check keeps a re-run of this
# cell from appending the same entry to sys.path a second time.
parent_dir = osp.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
print(sys.path)

from utils.encoder import SentenceEncoder
from utils_data.custom_pyg import CustomPygDataset
from utils.dataloader import GetDataloader
from utils_data.cora import CoraPyGDataset

import warnings
# Installs a blanket "ignore" filter: every warning category is suppressed
# from this point on, including deprecation warnings raised by the libraries
# used in the cells below.
warnings.filterwarnings("ignore")

['/home/prateek/graphs-with-llms-experiments/utils_data', '/home/prateek/miniconda3/envs/torch_pyg/lib/python310.zip', '/home/prateek/miniconda3/envs/torch_pyg/lib/python3.10', '/home/prateek/miniconda3/envs/torch_pyg/lib/python3.10/lib-dynload', '', '/home/prateek/miniconda3/envs/torch_pyg/lib/python3.10/site-packages', '/home/prateek/graphs-with-llms-experiments']


In [2]:
# Dataset root directory (relative path); passed as `dataRoot` when the Cora
# dataset is constructed in the next cell.
data_root = "../data"

In [3]:
# Sentence encoder handed to the dataset wrapper.
# NOTE(review): device=1 is presumably a CUDA device index — confirm against
# SentenceEncoder before running on a single-GPU machine.
LMencoder = SentenceEncoder(
    "ST",
    root="../lang_models",
    device=1,
)

# Build the Cora dataset wrapper and take the graph object it holds.
custom_cora = CoraPyGDataset(
    dataRoot=data_root,
    custom_dataRoot="../custom_data",
    sentence_encoder=LMencoder,
)
cora = custom_cora._data

# Move the graph's tensors to the CPU, then display a summary of its fields.
cora.to("cpu")
cora

Data(x=[2708, 384], edge_index=[2, 10858], y=[2708], label_names=[7], num_nodes=2708, x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [4]:
import yaml
from datetime import date

# Read the experiment configuration from the YAML file one directory up.
with open("../config.yaml", "r") as config_file:
    args = yaml.safe_load(config_file)

# A configured device value of 123 selects the CPU; any other value is
# formatted into a "cuda:<value>" device string.
if args["device"] == 123:
    args["device"] = 'cpu'
else:
    args["device"] = f"cuda:{args['device']}"

# Prefix the configured experiment name with today's date and the
# sentence-encoder tag.
args["exp_name"] = f"Date -> {date.today()}. Experiment_{args['sentence_encoder']}_{args['exp_name']}"

# Notebook-specific overrides: paths relative to this notebook, the dataset
# to use, and the number of batches.
args.update({
    "encoder_path": '../lang_models',
    "dataRoot": '../data',
    "custom_dataRoot": '../custom_data',
    "dataset": "cora",
    "batch_count": 5,
})

args

{'exp_name': 'Date -> 2024-02-03. Experiment_ST_evaluation-mode',
 'dataRoot': '../data',
 'custom_dataRoot': '../custom_data',
 'dataset': 'cora',
 'model_type': 'MLP',
 'sentence_encoder': 'ST',
 'encoder_path': '../lang_models',
 'state_dict_path': './state_dicts',
 'lr': 0.001,
 'epochs': 200,
 'batch_count': 5,
 'batch_size': 2,
 'weight_decay': 0.001,
 'dropout': 0.3,
 'seed': None,
 'num_workers': 10,
 'device': 'cuda:0',
 'eval_only': False,
 'n_way': 3,
 'n_shot': 1,
 'n_query': 1,
 'num_neighbors': [-1],
 'subgraph_type': 'directional'}

In [5]:
# Build the project's dataloader helper; every entry of `args` is forwarded
# to it as a keyword argument.
dl = GetDataloader(**args)
dl

<utils.dataloader.GetDataloader at 0x7f581084feb0>

In [6]:
# Draw the first batch from `dl.trn_smplr` (presumably the training sampler).
# In the recorded run a batch is a list of tasks, each a dict that maps a
# class label to a list of node indices.
batch1 = next(iter(dl.trn_smplr))
batch1

[{3: [505, 2033], 6: [1971, 772], 0: [1878, 1373]},
 {3: [2033, 2555], 2: [2150, 182], 0: [677, 322]}]

In [7]:
from torch_geometric.loader import NeighborLoader


def getitem(index):
    """Fetch the sampled neighbourhood subgraph(s) for ``index``.

    ``index`` is either a single integer node id or a nested structure of
    lists and dictionaries whose leaves are integers. Containers are rebuilt
    with the same layout (and the same dictionary keys), with each integer
    leaf replaced by the subgraph sampled around that node.

    Reads the notebook globals ``cora`` (the graph to sample from) and
    ``args`` (its ``num_neighbors`` and ``subgraph_type`` entries).

    Raises:
        IndexError: if ``index`` or any nested leaf is not an int, list or
            dict.
    """
    # Containers are handled by recursing into their elements.
    if isinstance(index, dict):
        return {key: getitem(value) for key, value in index.items()}
    if isinstance(index, list):
        return [getitem(item) for item in index]
    if not isinstance(index, int):
        raise IndexError("Only integers, lists and dictionaries can be used as indices")

    # Leaf case: a loader seeded with this single node; its first batch is
    # the subgraph returned for that node.
    seed_node = torch.LongTensor([index])
    loader = NeighborLoader(
        data=cora,
        num_neighbors=args["num_neighbors"],
        input_nodes=seed_node,
        subgraph_type=args["subgraph_type"],
    )
    subgraph = next(iter(loader))

    # Reset the batch_size attribute on the sampled subgraph before
    # handing it back.
    subgraph.batch_size = None
    return subgraph

# Replace every node index in the sampled batch with its neighbourhood
# subgraph; the nested list/dict layout of `batch1` is preserved.
batch = getitem(batch1)
batch

[{3: [Data(x=[3, 384], edge_index=[2, 2], y=[3], label_names=[7], num_nodes=3, x_text_feat=[3, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[3], val_mask=[3], test_mask=[3], n_id=[3], e_id=[2], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1]),
   Data(x=[4, 384], edge_index=[2, 3], y=[4], label_names=[7], num_nodes=4, x_text_feat=[4, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[4], val_mask=[4], test_mask=[4], n_id=[4], e_id=[3], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1])],
  6: [Data(x=[5, 384], edge_index=[2, 4], y=[5], label_names=[7], num_nodes=5, x_text_feat=[5, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge

In [8]:
def process_one_task(task, n_shot=None, n_query=None):
    """Flatten one few-shot task into parallel graph / label / mask sequences.

    Args:
        task: dict mapping a class label to the graphs sampled for that
            class. Each value must hold ``n_shot`` support graphs followed by
            ``n_query`` query graphs, because the query mask is laid out in
            that order.
        n_shot: number of support graphs per class. Defaults to
            ``args["n_shot"]`` when omitted.
        n_query: number of query graphs per class. Defaults to
            ``args["n_query"]`` when omitted.

    Returns:
        A 4-tuple ``(all_graphs, labels, query_mask, label_map)``:
        * ``all_graphs`` - list of every graph, grouped class by class in the
          iteration order of ``task``.
        * ``labels`` - long tensor; entry i is the task-local class index
          (position of the class in ``label_map``) of ``all_graphs[i]``.
        * ``query_mask`` - bool tensor; True where ``all_graphs[i]`` is a
          query graph, False where it is a support graph.
        * ``label_map`` - list of the original class labels (the keys of
          ``task``); ``label_map[k]`` is the label behind local index ``k``.

    Raises:
        ValueError: if a class does not have exactly ``n_shot + n_query``
            graphs, which would leave the labels and the mask misaligned.
    """
    if n_shot is None:
        n_shot = args["n_shot"]
    if n_query is None:
        n_query = args["n_query"]

    # Original class labels in task order, e.g. [3, 6, 0] ...
    label_map = list(task)
    # ... and the inverse lookup, e.g. {3: 0, 6: 1, 0: 2}.
    label_to_index = {label: position for position, label in enumerate(label_map)}

    all_graphs = []
    labels = []
    query_mask = []
    for label, graphs in task.items():
        graphs = list(graphs)
        if len(graphs) != n_shot + n_query:
            raise ValueError(
                f"Class {label!r} has {len(graphs)} graphs, expected "
                f"n_shot + n_query = {n_shot + n_query}"
            )
        all_graphs.extend(graphs)
        labels.extend([label_to_index[label]] * len(graphs))
        # Support graphs come first, query graphs after them.
        query_mask.extend([False] * n_shot + [True] * n_query)

    # Explicit dtypes keep the result long/bool even for an empty task, where
    # dtype inference would otherwise fall back to float.
    return (
        all_graphs,
        torch.tensor(labels, dtype=torch.long),
        torch.tensor(query_mask, dtype=torch.bool),
        label_map,
    )


In [9]:
# Process every task of the batch, then regroup the per-task 4-tuples into
# four lists that each hold one entry per task.
per_task_results = [process_one_task(task) for task in batch]
graphs, labels, query_mask, label_map = (
    list(column) for column in zip(*per_task_results)
)

print("graphs = ", graphs)
print("labels = ", labels)
print("query_mask = ", query_mask)
print("label_map = ", label_map)

graphs =  [[Data(x=[3, 384], edge_index=[2, 2], y=[3], label_names=[7], num_nodes=3, x_text_feat=[3, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[3], val_mask=[3], test_mask=[3], n_id=[3], e_id=[2], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1]), Data(x=[4, 384], edge_index=[2, 3], y=[4], label_names=[7], num_nodes=4, x_text_feat=[4, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[4], val_mask=[4], test_mask=[4], n_id=[4], e_id=[3], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1]), Data(x=[5, 384], edge_index=[2, 4], y=[5], label_names=[7], num_nodes=5, x_text_feat=[5, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_fea

In [10]:
# Take the first two subgraphs of the first task as a small example for the
# batching experiment in the next cell.
g1, g2 = graphs[0][0], graphs[0][1]
g1, g2

(Data(x=[3, 384], edge_index=[2, 2], y=[3], label_names=[7], num_nodes=3, x_text_feat=[3, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[3], val_mask=[3], test_mask=[3], n_id=[3], e_id=[2], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1]),
 Data(x=[4, 384], edge_index=[2, 3], y=[4], label_names=[7], num_nodes=4, x_text_feat=[4, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[4], val_mask=[4], test_mask=[4], n_id=[4], e_id=[3], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1]))

In [11]:
from torch_geometric.data import Batch

# Merge the two example subgraphs into one batched graph. In the recorded
# output the per-node attributes are concatenated (x has 3 + 4 = 7 rows) and
# the per-graph tensors such as label_text_feat are stacked as well
# (7 + 7 = 14 rows), i.e. they are repeated once per graph.
gg = Batch.from_data_list([g1, g2])
gg

DataBatch(x=[7, 384], edge_index=[2, 5], y=[7], label_names=[2], num_nodes=7, x_text_feat=[7, 768], label_text_feat=[14, 768], edge_text_feat=[2, 768], prompt_text_edge_feat=[2, 768], prompt_text_feat=[2, 768], prompt_edge_feat=[2, 768], edge_label_feat=[4, 768], train_mask=[7], val_mask=[7], test_mask=[7], n_id=[7], e_id=[5], num_sampled_nodes=[2], num_sampled_edges=[2], input_id=[2], batch=[7], ptr=[3])

In [12]:
from torch_geometric.data import Batch
from itertools import chain

# NOTE: this cell rebinds `graphs`, `labels`, `query_mask` and `label_map`
# from per-task lists to flattened objects, so it can only be run once after
# the cell that creates those lists.

# Sizes are taken from the per-task lists before they are flattened.
num_task = len(graphs)          # tasks in the batch
task_len = len(graphs[0])       # graphs in the first task
num_labels = len(label_map[0])  # classes in the first task

print("num_task = ", num_task)
print("task_len = ", task_len)
print("num_labels = ", num_labels)

# One batched graph holding every subgraph of every task, in task order.
graphs = Batch.from_data_list(list(chain.from_iterable(graphs)))
# Task-local class indices of all graphs, concatenated task after task.
labels = torch.cat(labels)
# Query mask in two layouts: one row per task, and flat over all graphs.
b_mask = torch.stack(query_mask)
query_mask = torch.cat(query_mask)
# Original class labels of every task, concatenated task after task.
label_map = list(chain.from_iterable(label_map))

print("graphs = ", graphs)
print("labels = ", labels)
print("b_mask = ", b_mask)
print("query_mask = ", query_mask)
print("label_map = ", label_map)

num_task =  2
task_len =  6
num_labels =  3
graphs =  DataBatch(x=[41, 384], edge_index=[2, 29], y=[41], label_names=[12], num_nodes=41, x_text_feat=[41, 768], label_text_feat=[84, 768], edge_text_feat=[12, 768], prompt_text_edge_feat=[12, 768], prompt_text_feat=[12, 768], prompt_edge_feat=[12, 768], edge_label_feat=[24, 768], train_mask=[41], val_mask=[41], test_mask=[41], n_id=[41], e_id=[29], num_sampled_nodes=[12], num_sampled_edges=[12], input_id=[12], batch=[41], ptr=[13])
labels =  tensor([0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2])
b_mask =  tensor([[False,  True, False,  True, False,  True],
        [False,  True, False,  True, False,  True]])
query_mask =  tensor([False,  True, False,  True, False,  True, False,  True, False,  True,
        False,  True])
label_map =  [3, 6, 0, 3, 2, 0]


In [13]:
# Metagraph edges: every graph is connected to each of the `num_labels` label
# nodes of its own task.
num_graphs = labels.size(0)

# Source ids: graph i repeated once per label -> 0,0,0,1,1,1,...
metagraph_edge_source = torch.arange(num_graphs).repeat_interleave(num_labels)

# Target ids: label nodes are numbered after the graphs (offset `num_graphs`),
# with a separate block of `num_labels` ids for each task.
task_offset = (torch.arange(num_task) * num_labels).repeat_interleave(task_len * num_labels)
metagraph_edge_target = torch.arange(num_labels).repeat(num_graphs) + task_offset + num_graphs

metagraph_edge_index = torch.stack([metagraph_edge_source, metagraph_edge_target], dim=0)

# An edge is masked when its source graph is a query graph.
metagraph_edge_mask = query_mask.repeat_interleave(num_labels)

# Edge value: +1 on the edge to the graph's own label, -1 on the edges to the
# other labels, and 0 on every edge of a query graph so its label stays hidden.
metagraph_edge_attr = torch.nn.functional.one_hot(labels, num_labels).float()
metagraph_edge_attr = metagraph_edge_attr.reshape(-1)
metagraph_edge_attr = (metagraph_edge_attr * 2 - 1) * (~metagraph_edge_mask)

# Two features per edge: [is_query_edge, signed label indicator]. The bool
# mask is cast explicitly so both stacked columns share one dtype.
metagraph_edge_attr = torch.stack([metagraph_edge_mask.float(), metagraph_edge_attr], dim=1)

# Stand-in for a [7, 768] label-embedding table (the shape of
# cora.label_text_feat shown earlier); row r holds the values
# r*768 .. r*768+767, which makes the looked-up rows easy to identify.
label_meta = torch.arange(7 * 768).reshape(7, 768)

# as_tensor accepts both the list built earlier and an existing tensor, so
# re-running this cell does not re-wrap a tensor.
label_map = torch.as_tensor(label_map)
label_embeddings = label_meta[label_map]
print(label_embeddings)

# Pass the class count explicitly (as for the edge attributes above) so the
# one-hot width does not depend on which labels happen to occur in `labels`.
labels_onehot = torch.nn.functional.one_hot(labels, num_labels).float()
print(labels_onehot)

tensor([[2304, 2305, 2306,  ..., 3069, 3070, 3071],
        [4608, 4609, 4610,  ..., 5373, 5374, 5375],
        [   0,    1,    2,  ...,  765,  766,  767],
        [2304, 2305, 2306,  ..., 3069, 3070, 3071],
        [1536, 1537, 1538,  ..., 2301, 2302, 2303],
        [   0,    1,    2,  ...,  765,  766,  767]])
tensor([[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.]])


In [14]:
from torch.utils.data import DataLoader
from utils_data.custom_pyg import SubgraphPygDataset

# NOTE(review): the traceback recorded below this cell names a keyword
# 'graphnum_neighbors', which does not occur in the call as written here, so
# that output appears to come from an earlier version of this line — re-run
# the cell to confirm whether it still fails.
# num_neighbors / subgraph_type repeat the values shown for
# args["num_neighbors"] / args["subgraph_type"] in the config cell.
custom_subg_cora = SubgraphPygDataset(graph=cora, num_neighbors=[-1], subgraph_type="directional")
# The sampler from `dl` is used as the batch sampler, so each of its batches
# is what the loader uses to index the dataset.
# NOTE(review): no collate_fn is passed; confirm that the default collation
# can handle whatever SubgraphPygDataset returns.
d_ldr = DataLoader(custom_subg_cora, batch_sampler=dl.trn_smplr)

TypeError: SubgraphPygDataset.__init__() got an unexpected keyword argument 'graphnum_neighbors'

In [None]:
# Display the summary of the full Cora graph again for reference.
cora

Data(x=[2708, 384], edge_index=[2, 10858], y=[2708], label_names=[7], num_nodes=2708, x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])