In [1]:

from ogb.nodeproppred import PygNodePropPredDataset
import os
import os.path as osp
import sys
import torch_geometric
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import InMemoryDataset, download_url

# Make the repository root (the parent of the notebook's working directory)
# importable so that the `utils` and `utils_data` packages below resolve.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
PROJECT_ROOT = osp.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
print(sys.path)

from utils.encoder import SentenceEncoder
from utils_data.custom_pyg import CustomPygDataset

['/home/prateek/graphs-with-llms-experiments/utils_data', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python310.zip', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/lib-dynload', '', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/site-packages', '/home/prateek/graphs-with-llms-experiments']


In [2]:
# Root directory of the raw data. It is passed as `dataRoot` to CoraPyGDataset,
# which reads `cora/cora.pt` and `cora/categories.csv` from underneath it.
# Nothing is loaded in this cell; it only records the path.
data_root = "../data"

In [3]:
class CoraPyGDataset(InMemoryDataset, CustomPygDataset):
    """Cora graph augmented with sentence-encoder embeddings of its texts.

    The raw graph is read from ``<dataRoot>/cora/cora.pt`` and the label
    descriptions from ``<dataRoot>/cora/categories.csv``. The processed result
    is cached under ``<custom_dataRoot>/cora_<sentence_encoder.name>``, so every
    sentence encoder gets its own cache directory.

    Args:
        dataRoot: Directory containing the raw ``cora`` folder.
        custom_dataRoot: Directory under which the processed dataset is stored.
        sentence_encoder: Object exposing a ``name`` attribute and an
            ``encode(texts)`` method. Required; the ``None`` default is kept
            only so the signature stays unchanged.
        transform: Forwarded to ``InMemoryDataset``.
        pre_transform: Forwarded to ``InMemoryDataset``.
        pre_filter: Forwarded to ``InMemoryDataset``.

    Raises:
        ValueError: If ``sentence_encoder`` is ``None``.
    """

    def __init__(self, dataRoot="../data", custom_dataRoot="../custom_data", sentence_encoder=None, transform=None, pre_transform=None, pre_filter=None):
        # The cache directory name is derived from the encoder's name, so an
        # encoder is mandatory. Fail with a clear message instead of the
        # AttributeError that `None.name` would otherwise produce below.
        if sentence_encoder is None:
            raise ValueError(
                "CoraPyGDataset requires a sentence_encoder "
                "(an object with a `name` attribute and an `encode` method)."
            )

        # These attributes must be set before super().__init__(), because the
        # base class may call process() during construction.
        self.data_root = dataRoot
        self.custom_data_root = custom_dataRoot
        self.sentence_encoder = sentence_encoder
        self.custom_data_dir = osp.join(self.custom_data_root, f"cora_{self.sentence_encoder.name}")

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.custom_data_dir, exist_ok=True)

        super().__init__(self.custom_data_dir, transform, pre_transform, pre_filter)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared; the raw data is read manually in generate_custom_data()."""
        return []

    @property
    def processed_file_names(self):
        """Processed artifacts: the collated graph and the prompt texts it was built from."""
        return ["data.pt", "texts.pkl"]

    def text_to_embed(self, texts):
        """Encode ``texts`` with the configured sentence encoder.

        Returns ``None`` when ``texts`` is ``None``.

        Raises:
            NotImplementedError: If no sentence encoder is set.
        """
        if self.sentence_encoder is None:
            raise NotImplementedError("Sentence Encoder is not passed")
        if texts is None:
            return None
        # NOTE(review): the result presumably lives on the encoder's device — confirm.
        return self.sentence_encoder.encode(texts)

    def encode_texts(self, texts):
        """Encode a flat list of strings, or each group in a list of string lists.

        An empty ``texts`` yields an empty list instead of raising IndexError.
        """
        if len(texts) > 0 and isinstance(texts[0], str):
            return self.text_to_embed(texts)
        return [self.text_to_embed(group) for group in texts]

    def generate_custom_data(self):
        """Load the raw Cora graph and build every prompt text used for it.

        Returns:
            A pair ``(raw_cora_data, texts)`` where ``texts`` is the list
            ``[node_texts, label_texts, edge_text, prompt_node_edge_text,
            prompt_node_text, prompt_edge_text, edge_label_text]``.

        Raises:
            ValueError: If a label name has no row in ``categories.csv``.
        """
        # Load the raw cora dataset
        data_path = osp.join(self.data_root, "cora", "cora.pt")
        raw_cora_data = torch.load(data_path)

        texts = raw_cora_data.raw_text
        label_names = raw_cora_data.label_names

        # Label and label description
        categories_path = osp.join(self.data_root, "cora", "categories.csv")
        category_desc = pd.read_csv(categories_path, sep=",").values

        # Sort the label desc by the order of label_names
        ordered_desc = []
        for label in label_names:
            matches = category_desc[category_desc[:, 0] == label, 1]
            if len(matches) == 0:
                raise ValueError(f"No description found for label {label!r} in {categories_path}")
            # Use the first matching row when a label appears more than once.
            ordered_desc.append((label, matches[0]))

        # Prompts for nodes/edges in original graph (can be changed accordingly)
        node_texts = ["Feature Node.\n Paper Title and abstract: " + t for t in texts]
        edge_text = ["Feature Edge.\n Connected papers are cited together by other papers."]

        # Node classification : Prompts for prompt node and label node (can be changed accordingly)
        prompt_node_text = ["Prompt Node.\n Node Classification on the paper's category"]
        label_texts = ["Prompt Node.\n Literature Category and Description: " + desc[0] + " + " + desc[1] for desc in ordered_desc]

        # Link prediction : Prompts for prompt node and edge labels (can be changed accordingly)
        prompt_node_edge_text = ["Prompt Node.\n Link Prediction on the papers that are cited together"]
        edge_label_text = ["Prompt Node.\n Two papers have co-citation",
                           "Prompt Node.\n Two papers do not have co-citation"]

        # Prompt for edge b/w prompt node and labels (can be changed accordingly)
        prompt_edge_text = ["Prompt Edge."]

        return raw_cora_data, [node_texts, label_texts, edge_text, prompt_node_edge_text, prompt_node_text, prompt_edge_text, edge_label_text]

    def process(self):
        """Embed all texts, attach them to the graph, and write the processed files.

        NOTE(review): ``pre_transform`` and ``pre_filter`` are forwarded to the
        base class by ``__init__`` but are not applied here.
        """
        # raw cora dataset is not in any library, so we process and load it manually in self.generate_custom_data()
        cora_data, texts = self.generate_custom_data()
        texts_embed = self.encode_texts(texts)

        torch.save(texts, self.processed_paths[1])

        # The index order below mirrors the list returned by generate_custom_data().
        cora_data.x_text_feat = texts_embed[0]
        cora_data.label_text_feat = texts_embed[1]
        cora_data.edge_text_feat = texts_embed[2]
        cora_data.prompt_text_edge_feat = texts_embed[3]
        cora_data.prompt_text_feat = texts_embed[4]
        cora_data.prompt_edge_feat = texts_embed[5]
        cora_data.edge_label_feat = texts_embed[6]

        # Keep only the first of the provided splits as the canonical masks.
        cora_data.train_mask = cora_data.train_masks[0]
        cora_data.val_mask = cora_data.val_masks[0]
        cora_data.test_mask = cora_data.test_masks[0]

        cora_data.train_masks = None
        cora_data.val_masks = None
        cora_data.test_masks = None

        # collate() expects a list, so wrap the single graph.
        data, slices = self.collate([cora_data])

        torch.save((data, slices), self.processed_paths[0])
        print("Cora is processed. Saved.")

In [4]:
# Build the "ST" sentence encoder and the Cora dataset that uses it.
# CoraPyGDataset stores its processed files under ../custom_data/cora_<encoder name>.
# NOTE(review): device=2 is hard-coded; presumably a GPU index — confirm it exists on the target machine.
LMencoder = SentenceEncoder("ST", root="../lang_models", device=2)
custom_cora = CoraPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder)

In [5]:
# Same construction as the previous cell, with the "roberta" sentence encoder.
# Each encoder gets its own processed-data directory (cora_<encoder name>).
# NOTE(review): device=2 is hard-coded; presumably a GPU index — confirm it exists on the target machine.
LMencoder2 = SentenceEncoder("roberta", root="../lang_models", device=2)
custom_cora2 = CoraPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder2)

In [6]:
# Inspect the dataset's collated graph through its private `_data` attribute.
# NOTE(review): `cora` is presumably the same object the dataset holds, not a
# copy, so in-place edits to `cora` would also change `custom_cora` — confirm.
cora = custom_cora._data
cora

Data(raw_text=[2708], y=[2708], label_names=[7], edge_index=[2, 10858], x=[2708, 384], category_names=[2708], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [7]:
# Display the same `_data` attribute again, this time straight from the dataset.
custom_cora._data

Data(raw_text=[2708], y=[2708], label_names=[7], edge_index=[2, 10858], x=[2708, 384], category_names=[2708], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from torch_geometric.loader import DataLoader

# CoraPyGDataset.process() collates a single graph, so iterating the loader is
# expected to yield one batch even though batch_size is 12.
dl = DataLoader(custom_cora, batch_size=12)

for batch in dl:
    print(batch)

DataBatch(raw_text=[1], y=[2708], label_names=[1], edge_index=[2, 10858], x=[2708, 384], category_names=[1], x_text_feat=[2708, 768], label_text_feat=[7, 768], edge_text_feat=[1, 768], prompt_text_edge_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], edge_label_feat=[2, 768], train_mask=[2708], val_mask=[2708], test_mask=[2708], batch=[2708], ptr=[2])


In [9]:
# Check that the dataset is an instance of CustomPygDataset, one of its two base classes.
isinstance(custom_cora, CustomPygDataset)

True

In [10]:
def f(var : CustomPygDataset) -> None:
    """Print ``var``; used to try passing the dataset where a CustomPygDataset is annotated."""
    print(var)

# The annotation is not enforced at runtime; this call simply prints the dataset's repr.
f(custom_cora)

CoraPyGDataset()


In [19]:
# Take the label tensor with any size-1 dimensions removed, print it,
# and show which device it lives on.
label = cora.y.squeeze()
print(label)
label.device

tensor([0, 1, 2,  ..., 1, 1, 1])


device(type='cpu')

In [25]:
# Number of classes reported by the dataset. `num_classes` is not defined in
# CoraPyGDataset itself, so it comes from one of its base classes.
custom_cora.num_classes

7

In [26]:
# Mark training nodes by mapping their labels y -> -1 - y (always negative).
# Work on a clone: `cora` is `custom_cora._data`, so writing into `cora.y`
# would alter the dataset's own labels. The mapping is also its own inverse,
# so an in-place version silently undoes itself when the cell is re-run.
y_train_marked = cora.y.clone()
y_train_marked[cora.train_mask] = -1 - y_train_marked[cora.train_mask]
y_train_marked

tensor([ 0, -2, -3,  ...,  1,  1,  1])