In [1]:
from ogb.nodeproppred import PygNodePropPredDataset
import os
import os.path as osp
import sys
import torch_geometric
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import InMemoryDataset, download_url

# Put the parent of the current working directory on the import path so that the
# project-local `utils` and `utils_data` packages imported below can be resolved.
sys.path.append(osp.abspath(".."))
# Debug aid: show the resulting search path.
# NOTE(review): the saved output of this line exposes absolute local paths.
print(sys.path)

from utils.encoder import SentenceEncoder
from utils_data.custom_pyg import CustomPygDataset
from utils.dataloader import GetDataloader

['/home/prateek/graphs-with-llms-experiments/utils_data', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python310.zip', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/lib-dynload', '', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/site-packages', '/home/prateek/graphs-with-llms-experiments']


In [2]:
# Root directory of the raw datasets; CoraPyGDataset reads its inputs from <data_root>/cora/.
data_root = "../data"

In [3]:
class CoraPyGDataset(InMemoryDataset, CustomPygDataset):
    """Cora as a PyG in-memory dataset with sentence-encoder embeddings of text prompts.

    The raw graph is read from ``<dataRoot>/cora/cora.pt`` and the category
    descriptions from ``<dataRoot>/cora/categories.csv``. Processed files are
    cached in ``<custom_dataRoot>/cora_<sentence_encoder.name>``, so every
    encoder gets its own cache directory.
    """

    def __init__(self, dataRoot="../data", custom_dataRoot="../custom_data", sentence_encoder=None, transform=None, pre_transform=None, pre_filter=None):
        """Create the cache directory if needed, then process or load the dataset.

        Args:
            dataRoot: Directory that contains the raw ``cora`` folder.
            custom_dataRoot: Directory under which the processed cache is stored.
            sentence_encoder: Encoder object exposing ``name`` and ``encode(texts)``.
                Required, because its name is part of the cache directory.
            transform, pre_transform, pre_filter: Forwarded unchanged to ``InMemoryDataset``.

        Raises:
            ValueError: If ``sentence_encoder`` is None.
        """
        if sentence_encoder is None:
            # Fail early with a clear message instead of an AttributeError on `.name` below.
            raise ValueError("CoraPyGDataset requires a sentence_encoder; its name selects the cache directory.")

        self.data_root = dataRoot
        self.custom_data_root = custom_dataRoot
        self.sentence_encoder = sentence_encoder
        self.custom_data_dir = osp.join(self.custom_data_root, f"cora_{self.sentence_encoder.name}")

        # exist_ok avoids the check-then-create race of a separate osp.exists() test.
        os.makedirs(self.custom_data_dir, exist_ok=True)

        # The base-class constructor triggers process() when the processed files are missing.
        super().__init__(self.custom_data_dir, transform, pre_transform, pre_filter)

        # NOTE(review): torch.load unpickles arbitrary objects; only load caches you trust.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are tracked by PyG; the raw data is read manually in generate_custom_data()."""
        return []

    @property
    def processed_file_names(self):
        """Cache files: the collated graph (index 0) and the raw prompt texts (index 1)."""
        return ["data.pt", "texts.pkl"]

    def text_to_embed(self, texts):
        """Encode one group of texts with the sentence encoder.

        Returns None when ``texts`` is None.

        Raises:
            NotImplementedError: If no sentence encoder is set on the instance.
        """
        if self.sentence_encoder is None:
            raise NotImplementedError("Sentence Encoder is not passed")
        if texts is None:
            return None
        # Per the original note the result lives on the encoder's device — TODO confirm.
        return self.sentence_encoder.encode(texts)

    def encode_texts(self, texts):
        """Encode a flat list of strings, or each group of a list of text groups.

        A list whose first element is a string is encoded in one call; otherwise
        every element is treated as its own group and a list of encodings is returned.
        """
        if isinstance(texts[0], str):
            return self.text_to_embed(texts)
        return [self.text_to_embed(group) for group in texts]

    def generate_custom_data(self):
        """Load the raw Cora object and build every text prompt.

        Returns:
            A tuple ``(raw_cora_data, text_groups)`` where ``text_groups`` is, in order:
            node texts, label texts, edge text, link-prediction prompt text,
            node-classification prompt text, prompt-edge text, edge-label texts.

        Raises:
            ValueError: If a label name has no row in ``categories.csv``.
        """
        # Load the raw cora dataset
        # NOTE(review): torch.load unpickles arbitrary objects; only load trusted files.
        data_path = osp.join(self.data_root, "cora", "cora.pt")
        raw_cora_data = torch.load(data_path)

        texts = raw_cora_data.raw_text
        label_names = raw_cora_data.label_names

        # Label and label description (first column: label name, second column: description)
        category_desc = pd.read_csv(osp.join(self.data_root, "cora", "categories.csv"), sep=",").values

        # Sort the label descriptions by the order of label_names
        ordered_desc = []
        for label in label_names:
            matches = category_desc[category_desc[:, 0] == label, 1]
            if len(matches) == 0:
                raise ValueError(f"No description found for label {label!r} in categories.csv")
            ordered_desc.append((label, matches[0]))

        # Prompts for nodes/edges in original graph (can be changed accordingly)
        node_texts = ["Feature Node.\n Paper Title and abstract: " + t for t in texts]
        edge_text = ["Feature Edge.\n Connected papers are cited together by other papers."]

        # Node classification : Prompts for prompt node and label node (can be changed accordingly)
        prompt_node_text = ["Prompt Node.\n Node Classification on the paper's category"]
        label_texts = ["Prompt Node.\n Literature Category and Description: " + desc[0] + " + " + desc[1] for desc in ordered_desc]

        # Link prediction : Prompts for prompt node and edge labels (can be changed accordingly)
        prompt_node_edge_text = ["Prompt Node.\n Link Prediction on the papers that are cited together"]
        edge_label_text = ["Prompt Node.\n Two papers have co-citation",
                           "Prompt Node.\n Two papers do not have co-citation"]

        # Prompt for edge b/w prompt node and labels (can be changed accordingly)
        prompt_edge_text = ["Prompt Edge."]

        return raw_cora_data, [node_texts, label_texts, edge_text, prompt_node_edge_text, prompt_node_text, prompt_edge_text, edge_label_text]

    def process(self):
        """Embed all prompts, attach them to the graph and write both cache files."""
        # raw cora dataset is not in any library, so we process and load it manually in self.generate_custom_data()
        cora_data, texts = self.generate_custom_data()
        texts_embed = self.encode_texts(texts)

        torch.save(texts, self.processed_paths[1])

        # Indices follow the group order returned by generate_custom_data().
        cora_data.x_text_feat = texts_embed[0]
        cora_data.label_text_feat = texts_embed[1]
        cora_data.edge_text_feat = texts_embed[2]
        cora_data.prompt_text_edge_feat = texts_embed[3]
        cora_data.prompt_text_feat = texts_embed[4]
        cora_data.prompt_edge_feat = texts_embed[5]
        cora_data.edge_label_feat = texts_embed[6]

        # Keep only the first entry of each *_masks collection as the active split,
        # then clear the collections so they are not stored in the cache.
        cora_data.train_mask = cora_data.train_masks[0]
        cora_data.val_mask = cora_data.val_masks[0]
        cora_data.test_mask = cora_data.test_masks[0]

        cora_data.train_masks = None
        cora_data.val_masks = None
        cora_data.test_masks = None

        # collate() expects a list of data objects, so wrap the single graph.
        data, slices = self.collate([cora_data])

        torch.save((data, slices), self.processed_paths[0])
        print("Cora is processed. Saved.")

In [4]:
# Encoder used to embed all text prompts ("ST" presumably selects a sentence-transformers model — confirm).
# NOTE(review): device=2 looks like a hard-coded GPU index — confirm it exists on the target machine.
LMencoder = SentenceEncoder("ST", root="../lang_models", device=2)
# Builds the dataset, or loads it from the on-disk cache if it was already processed for this encoder.
custom_cora = CoraPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder)

In [5]:
# Second encoder/dataset pair, built with the "roberta" language model.
LMencoder2 = SentenceEncoder("roberta", root="../lang_models", device=2)
# Pass LMencoder2 (not LMencoder) so this dataset is actually encoded with the second model.
custom_cora2 = CoraPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder2)

In [6]:
# Grab the underlying graph object stored inside the dataset for direct inspection.
cora = custom_cora._data

In [7]:
from torch_geometric.loader import DataLoader

# The dataset holds a single collated graph, so this loop yields exactly one batch.
# A dedicated name keeps this PyG loader distinct from the project `dl` object created
# later, so re-running this cell cannot break the cells that use `dl`.
pyg_loader = DataLoader(custom_cora, batch_size=12)

for batch in pyg_loader:
    print(batch)

DataBatch(raw_text=[1], y=[2708], label_names=[1], edge_index=[2, 10858], x=[2708, 384], category_names=[1], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708], batch=[2708], ptr=[2])


In [8]:
# Sanity check: the dataset also satisfies the project's CustomPygDataset base type.
isinstance(custom_cora, CustomPygDataset)

True

In [9]:
def f(var : CustomPygDataset):
    """Print `var`; used to check that a dataset is accepted where a CustomPygDataset is annotated."""
    print(var)

# The annotation is not enforced at runtime; this only shows the dataset's repr.
f(custom_cora)

CoraPyGDataset()


In [10]:
# Class labels with any singleton dimensions removed.
label = cora.y.squeeze()
print(label)
# Show which device the label tensor lives on.
label.device

tensor([0, 1, 2,  ..., 1, 1, 1])


device(type='cpu')

In [11]:
# Number of target classes reported by the dataset.
custom_cora.num_classes

7

In [12]:
import yaml
from datetime import date

# Use a descriptive handle name: binding the file to `f` would overwrite the helper
# function `f` defined in an earlier cell.
with open("../config.yaml", "r") as config_file:
    args = yaml.safe_load(config_file)

# NOTE(review): 123 appears to be a sentinel in config.yaml meaning "run on CPU" — confirm.
args["device"] = 'cpu' if args["device"] == 123 else f"cuda:{args['device']}"
# Prefix the configured experiment name with today's date and the encoder name.
args["exp_name"] = f"Date -> {date.today()}. Experiment_{args['sentence_encoder']}_{args['exp_name']}"

# Notebook-specific overrides: paths are relative to this notebook's directory.
args["encoder_path"] = '../lang_models'
args["dataRoot"] = '../data'
args["custom_dataRoot"] = '../custom_data'
args["dataset"] = "cora"
args["batch_count"] = 5

args

{'exp_name': 'Date -> 2024-01-30. Experiment_ST_evaluation-mode',
 'dataRoot': '../data',
 'custom_dataRoot': '../custom_data',
 'dataset': 'cora',
 'model_type': 'MLP',
 'sentence_encoder': 'ST',
 'encoder_path': '../lang_models',
 'state_dict_path': './state_dicts',
 'lr': 0.001,
 'epochs': 200,
 'batch_count': 5,
 'batch_size': 5,
 'weight_decay': 0.001,
 'dropout': 0.3,
 'seed': None,
 'num_workers': 10,
 'device': 'cuda:0',
 'eval_only': False,
 'n_way': 3,
 'n_shot': 3,
 'n_query': 2}

In [13]:
# Build the project's dataloader wrapper from the full configuration dictionary.
dl = GetDataloader(**args)
dl

<utils.dataloader.GetDataloader at 0x7f0c702287f0>

In [14]:
# Number of classes as reported by the project dataloader.
dl.get_num_classes()

7

In [15]:
# Materialise the training sampler to inspect what it yields.
# NOTE(review): this displays every sampled batch; consider showing only the first one.
list(dl.trn_smplr)

[[{6: [797, 959, 1610, 2217, 1554],
   3: [2246, 2555, 2116, 299, 58],
   1: [579, 445, 1766, 2419, 660]},
  {6: [2277, 2191, 1087, 221, 160],
   0: [518, 1800, 2414, 2622, 886],
   1: [1220, 495, 2397, 1195, 445]},
  {3: [993, 2116, 841, 1776, 58],
   0: [2458, 9, 1736, 1878, 2622],
   1: [1017, 1195, 445, 250, 495]},
  {2: [1602, 1263, 1499, 1751, 658],
   3: [1004, 2555, 739, 856, 841],
   0: [518, 2325, 1583, 886, 1800]},
  {5: [1895, 2205, 2594, 789, 2482],
   3: [2137, 534, 993, 739, 841],
   6: [221, 2347, 1971, 1554, 797]}],
 [{6: [2013, 1087, 2299, 1554, 1970],
   2: [954, 2150, 547, 1602, 2466],
   4: [2392, 516, 507, 1246, 1950]},
  {3: [856, 534, 1302, 2348, 2263],
   1: [250, 2701, 1220, 1195, 72],
   0: [1186, 518, 1875, 1736, 2414]},
  {6: [2191, 2337, 2217, 1234, 2299],
   1: [1017, 380, 2397, 445, 1],
   2: [845, 576, 547, 954, 1263]},
  {2: [954, 231, 658, 2242, 174],
   5: [1418, 1519, 2482, 1562, 2181],
   0: [886, 2590, 412, 322, 1878]},
  {4: [1315, 1642, 2519, 19

In [16]:
# Names of every attribute stored on the graph, in iteration order.
a = [key for key, _ in cora]
print(a)

# Keep only the attributes that PyG reports as node-level.
node_attrs = [key for key in a if cora.is_node_attr(key)]
node_attrs

['raw_text', 'y', 'label_names', 'edge_index', 'x', 'category_names', 'x_text_feat', 'label_text_feat', 'edge_text_feat', 'prompt_text_edge_feat', 'prompt_text_feat', 'prompt_edge_feat', 'edge_label_feat', 'train_mask', 'val_mask', 'test_mask']


['raw_text',
 'y',
 'x',
 'category_names',
 'x_text_feat',
 'train_mask',
 'val_mask',
 'test_mask']

In [17]:
# Move the graph's tensors to the CPU, then display the graph summary.
cora.to("cpu")
cora

Data(raw_text=[2708], y=[2708], label_names=[7], edge_index=[2, 10858], x=[2708, 384], category_names=[2708], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [18]:
# Show the Python type of every stored attribute (lists vs. tensors).
for k, v in cora:
    print(k, type(v))

raw_text <class 'list'>
y <class 'torch.Tensor'>
label_names <class 'list'>
edge_index <class 'torch.Tensor'>
x <class 'torch.Tensor'>
category_names <class 'list'>
x_text_feat <class 'torch.Tensor'>
label_text_feat <class 'torch.Tensor'>
edge_text_feat <class 'torch.Tensor'>
prompt_text_edge_feat <class 'torch.Tensor'>
prompt_text_feat <class 'torch.Tensor'>
prompt_edge_feat <class 'torch.Tensor'>
edge_label_feat <class 'torch.Tensor'>
train_mask <class 'torch.Tensor'>
val_mask <class 'torch.Tensor'>
test_mask <class 'torch.Tensor'>


In [19]:
# int32 index tensor holding the node ids 0..999.
idx = torch.arange(1000, dtype=torch.int)

# Subgraph restricted to those nodes.
sg = cora.subgraph(idx)
sg

Data(raw_text=[1000], y=[1000], label_names=[7], edge_index=[2, 556], x=[1000, 384], category_names=[1000], x_text_feat=[1000, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[1000], val_mask=[1000], test_mask=[1000])

In [20]:
# Split the subgraph's edge list into its two endpoint rows.
i, j = sg.edge_index

# Count the distinct node ids that appear in the first row.
i.unique().numel()

292

In [21]:
# Clear the two list-valued node attributes so only tensors (plus label_names) remain.
# NOTE(review): this mutates the object held by custom_cora in place, so the cell is not idempotent
# with respect to earlier cells that displayed these attributes.
cora.raw_text = None
cora.category_names = None
cora

Data(y=[2708], label_names=[7], edge_index=[2, 10858], x=[2708, 384], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [22]:
from torch_geometric.loader import NeighborLoader, NodeLoader

# One-hop sampler seeded at nodes 0, 1 and 2; num_neighbors=[-1] keeps every neighbour.
# subgraph_type is given PyG's default value explicitly: a keyword left without a value
# is a SyntaxError and the cell cannot run.
loader = NeighborLoader(data=cora,
                        num_neighbors=[-1],
                        input_nodes=torch.LongTensor([0, 1, 2]),
                        subgraph_type="directional")

In [23]:
# Take the first sampled subgraph from the neighbour loader (this rebinds `sg`).
sg = next(iter(loader))
sg

Data(y=[6], label_names=[7], edge_index=[2, 5], x=[6, 384], x_text_feat=[6, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[6], val_mask=[6], test_mask=[6], n_id=[6], e_id=[5], num_sampled_nodes=[2], num_sampled_edges=[1], input_id=[1], batch_size=1)

In [28]:
# Print every attribute of the sampled subgraph together with its value.
for k, v in sg:
    print(k, v)

y tensor([0, 0, 4, 0, 0, 0])
label_names ['Rule_Learning', 'Neural_Networks', 'Case_Based', 'Genetic_Algorithms', 'Theory', 'Reinforcement_Learning', 'Probabilistic_Methods']
edge_index tensor([[1, 2, 3, 4, 5],
        [0, 0, 0, 0, 0]])
x tensor([[ 0.0087,  0.0294,  0.0358,  ...,  0.0481,  0.0118,  0.0109],
        [-0.0271,  0.0552,  0.0554,  ...,  0.0173,  0.0651, -0.0354],
        [-0.0349, -0.0170,  0.0448,  ..., -0.0737, -0.0317,  0.0582],
        [ 0.0180, -0.0094,  0.0315,  ...,  0.0871,  0.0109,  0.0333],
        [-0.0007,  0.0180, -0.0089,  ..., -0.0348,  0.0422,  0.0276],
        [ 0.0344,  0.0096, -0.0080,  ...,  0.0427,  0.0105,  0.0244]])
x_text_feat tensor([[ 0.0178, -0.0104, -0.0473,  ..., -0.0147, -0.0497,  0.0113],
        [-0.0117,  0.0400, -0.0301,  ..., -0.0606, -0.0444, -0.0081],
        [-0.0722, -0.0166, -0.0333,  ...,  0.0085, -0.0148,  0.0588],
        [ 0.0063,  0.0369, -0.0176,  ...,  0.0040, -0.0289, -0.0065],
        [ 0.0278, -0.0036, -0.0721,  ...,  0.040

In [26]:
# Unpack the two rows of the edge list.
row, col = cora.edge_index

# Positions in the edge list whose first-row entry is node 0.
np.where(row == 0)[0]

array([2377, 2451, 3633, 4965, 9395])

In [27]:
class Collator:
    """Placeholder batch collator that only stores its configuration.

    The call hook is a stub: it ignores the batch and returns None.
    """

    def __init__(self, params):
        """Keep `params` on the instance for later use."""
        self.params = params

    def __call__(self, batch):
        """Not implemented yet; accepts a batch and returns None."""
        return None