In [2]:
from ogb.nodeproppred import PygNodePropPredDataset
import os
import os.path as osp
import sys
import torch_geometric
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import InMemoryDataset, download_url
from torch_geometric.utils import remove_self_loops
from datasets import load_dataset

# Make the parent directory importable so the project-local `utils` and
# `utils_data` packages imported right after this resolve.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
repo_root = osp.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)
print(sys.path)

from utils.encoder import SentenceEncoder
from utils_data.custom_pyg import CustomPygDataset
from utils.dataloader import GetDataloader

['/home/prateek/graphs-with-llms-experiments/utils_data', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python310.zip', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/lib-dynload', '', '/home/prateek/miniconda3/envs/torch_pyg_dgl/lib/python3.10/site-packages', '/home/prateek/graphs-with-llms-experiments']


In [3]:
# Load the ogbn-products dataset through OGB's PyG wrapper.
# `data_root` is reused by later cells as the root for the dataset files.
# NOTE: the name `products` is rebound further down to the processed graph
# (`custom_products._data`), so cells run out of order will see a different type.
data_root = "../data"
products = PygNodePropPredDataset(name='ogbn-products', root=data_root)

In [4]:
# Take element 0 of the loaded dataset; the recorded repr shows it is a single
# `Data` graph object (so `dataset` here names a graph, not a collection).
dataset = products[0]
dataset

Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])

In [5]:
# Peek at the edge list; per the repr of `dataset` it is a [2, num_edges] tensor.
ei = dataset.edge_index
ei

tensor([[      0,  152857,       0,  ..., 2449028,   53324, 2449028],
        [ 152857,       0,   32104,  ...,  162836, 2449028,   53324]])

In [6]:
def get_node_feature(data_root):
    """Build one text prompt per ogbn-products node from the raw Amazon-3M text.

    The node-index -> ASIN mapping is joined against the combined raw train and
    test records (matched on ``asin`` == ``uid``) and each node's title and
    content are formatted into a prompt string.

    The result is positionally aligned with node indices: entry ``i`` belongs
    to node ``i``. To guarantee that, the mapping is sorted by node index,
    duplicate ``uid`` rows in the raw text are dropped (first occurrence wins),
    and a left join is used so that nodes without raw text are kept (with empty
    title/content) instead of being silently removed.

    Args:
        data_root: Directory containing ``ogbn_products/mapping/nodeidx2asin.csv.gz``
            and ``ogbn_products/Amazon-3M.raw/{trn,tst}.json.gz``.

    Returns:
        The ``.values`` array of prompt strings, one per row of the mapping.
    """
    mapping_path = osp.join(data_root, "ogbn_products/mapping/nodeidx2asin.csv.gz")
    # Sort by node index so row position equals node index even if the file is not ordered.
    nodeidx2asin = pd.read_csv(mapping_path, index_col="node idx").sort_index()

    raw_train = load_dataset("json", data_files=osp.join(data_root, "ogbn_products/Amazon-3M.raw/trn.json.gz"))
    raw_test = load_dataset("json", data_files=osp.join(data_root, "ogbn_products/Amazon-3M.raw/tst.json.gz"))

    # Both loads are read back through their "train" key, including the test file.
    raw_train_df = raw_train["train"].to_pandas()
    raw_test_df = raw_test["train"].to_pandas()
    raw_combined_df = pd.concat([raw_train_df, raw_test_df], ignore_index=True)
    # A repeated uid would duplicate the matching node row in the join below.
    raw_combined_df = raw_combined_df.drop_duplicates(subset="uid")

    # Left join: every node keeps exactly one row, matched or not.
    products_titdesc = pd.merge(nodeidx2asin, raw_combined_df, how="left", left_on="asin", right_on="uid")

    # str + NaN yields NaN, so missing text must become "" before concatenation.
    title = products_titdesc["title"].fillna("")
    content = products_titdesc["content"].fillna("")

    # Prompt for the feature of nodes (can be changed accordingly)
    node_feature_prompt = ("Feature Node.\n"
                           + "Product Title and Description : "
                           + title
                           + " + "
                           + content)

    node_feature_prompt_list = node_feature_prompt.values
    return node_feature_prompt_list

In [7]:
def get_label_feature(data_root):
    """Return one text prompt per product-category label.

    Reads ``ogbn_products/mapping/labelidx2productcategory.csv.gz`` under
    ``data_root`` (indexed by ``label idx``), overrides three entries, and
    prefixes each category name with the prompt-node header.

    Args:
        data_root: Directory containing the ``ogbn_products/mapping`` folder.

    Returns:
        The ``.values`` array of prompt strings, in label-index order of the file.
    """
    mapping_file = osp.join(data_root, "ogbn_products/mapping/labelidx2productcategory.csv.gz")
    categories = pd.read_csv(mapping_file, index_col="label idx")

    # Manual corrections for a few entries of the mapping.
    corrections = {
        24: "Label 25",
        45: "Furniture & Decor",
        46: "Label 47",  # replacing '#508510'
    }
    for label_idx, category_name in corrections.items():
        categories.loc[label_idx] = category_name

    # Prompt for the label nodes (can be changed accordingly)
    header = "Prompt Node.\n" + "Product Category : "
    prompts = header + categories["product category"]
    return prompts.values

In [8]:
class ProductsPyGDataset(InMemoryDataset):
    """ogbn-products graph augmented with sentence-encoder embeddings of text prompts.

    Processed output is stored under
    ``<custom_dataRoot>/ogbn_products_<sentence_encoder.name>``, so every
    encoder gets its own directory.

    Args:
        dataRoot: Root directory passed to ``PygNodePropPredDataset`` and to the
            raw-text helpers ``get_node_feature`` / ``get_label_feature``.
        custom_dataRoot: Parent directory for the encoder-specific output.
        sentence_encoder: Object exposing ``name`` and ``encode(texts)``.
            Required; the default ``None`` is rejected because the output
            directory name is derived from ``sentence_encoder.name``.
        transform, pre_transform, pre_filter: Forwarded unchanged to
            ``InMemoryDataset.__init__``.

    Raises:
        ValueError: If ``sentence_encoder`` is ``None``.
    """

    def __init__(self, dataRoot="../data", custom_dataRoot="../custom_data", sentence_encoder=None, transform=None, pre_transform=None, pre_filter=None):
        if sentence_encoder is None:
            # Fail early with a clear message instead of an AttributeError on `.name`.
            raise ValueError("ProductsPyGDataset requires a sentence_encoder (got None)")

        self.data_root = dataRoot
        self.custom_data_root = custom_dataRoot
        self.sentence_encoder = sentence_encoder
        self.custom_data_dir = osp.join(self.custom_data_root, f"ogbn_products_{self.sentence_encoder.name}")

        # exist_ok avoids the check-then-create race of a separate osp.exists() test.
        os.makedirs(self.custom_data_dir, exist_ok=True)

        super().__init__(self.custom_data_dir, transform, pre_transform, pre_filter)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared; raw inputs are read from ``data_root`` in ``process``."""
        return []

    @property
    def processed_file_names(self):
        """Processed artifacts: the collated graph and the list of prompt texts."""
        return ["data.pt", "texts.pkl"]

    def text_to_embed(self, texts):
        """Encode ``texts`` with the sentence encoder.

        Returns ``None`` when ``texts`` is ``None``.

        Raises:
            NotImplementedError: If no sentence encoder is set on the instance.
        """
        if self.sentence_encoder is None:
            raise NotImplementedError("Sentence Encoder is not passed")
        if texts is None:
            return None
        # Original note: the result is returned on the encoder's device (unverified here).
        return self.sentence_encoder.encode(texts)

    def encode_texts(self, texts):
        """Encode a flat list of strings, or each sub-list of a list of lists.

        ``None`` or an empty sequence is returned unchanged, since there is no
        first element to inspect and nothing to encode.
        """
        if texts is None or len(texts) == 0:
            return texts
        if isinstance(texts[0], str):
            return self.text_to_embed(texts)
        return [self.text_to_embed(t) for t in texts]

    def generate_custom_data(self):
        """Collect all prompt texts.

        Returns:
            ``[node_texts, label_texts, edge_texts, prompt_texts, prompt_edge_texts]``
            where the first two come from the raw-data helpers and the last three
            are single-element lists of fixed prompts.
        """
        node_texts = get_node_feature(self.data_root).tolist()
        label_texts = get_label_feature(self.data_root).tolist()

        # Prompt for prompt node/edge and edge texts (can be changed accordingly)
        edge_texts = ["Feature Edge.\n Co-purchased. Two products were purchased together on Amazon"]
        prompt_texts = ["Prompt Node.\n Node Classification of Product Category"]
        prompt_edge_texts = ["Prompt Edge."]

        return [node_texts, label_texts, edge_texts, prompt_texts, prompt_edge_texts]

    def process(self):
        """Build the augmented graph and write both processed files.

        Not called directly in this class; PyG's dataset base class is expected
        to invoke it when the processed files are missing.
        """
        products_data = PygNodePropPredDataset(name="ogbn-products", root=self.data_root)
        graph = products_data._data  # the single graph object held by the OGB dataset

        graph.edge_index = remove_self_loops(graph.edge_index)[0]  # remove self-loops from graph
        graph.y = graph.y.squeeze()  # to flatten the y tensor

        texts = self.generate_custom_data()
        texts_embed = self.encode_texts(texts)

        # The raw prompt texts are stored with torch.save despite the .pkl file name.
        torch.save(texts, self.processed_paths[1])

        # Order matches the list returned by generate_custom_data().
        graph.x_text_feat = texts_embed[0]  # node text feature
        graph.label_text_feat = texts_embed[1]  # label text feature
        graph.edge_text_feat = texts_embed[2]  # edge text feature
        graph.prompt_text_feat = texts_embed[3]  # prompt node text feature
        graph.prompt_edge_feat = texts_embed[4]  # prompt edge text feature

        # get dataset split
        split_idx = products_data.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

        # NOTE(review): despite the *_mask names these store the split tensors
        # exactly as returned by get_idx_split() (node indices), not boolean masks
        # of length num_nodes. Kept as-is because consumers may rely on it.
        graph.train_mask = train_idx
        graph.val_mask = valid_idx
        graph.test_mask = test_idx

        data, slices = self.collate([graph])  # Pass the data_list as a list

        torch.save((data, slices), self.processed_paths[0])
        print("Products is processed. Saved.")

In [9]:
# Build the dataset with the "ST" sentence encoder. ProductsPyGDataset stores its
# output in a directory named after the encoder (ogbn_products_<encoder.name>).
# NOTE(review): device=1 is hard-coded — presumably a GPU index; confirm against SentenceEncoder.
LMencoder = SentenceEncoder(root="../lang_models", name="ST", device=1)
custom_products = ProductsPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder)

In [10]:
# Build a second dataset with the "roberta" encoder. `LMencoder` is rebound here, so
# after this cell it no longer refers to the encoder used for `custom_products`.
# NOTE(review): device=1 is hard-coded — presumably a GPU index; confirm against SentenceEncoder.
LMencoder = SentenceEncoder(root="../lang_models", name="roberta", device=1)
custom_products2 = ProductsPyGDataset(dataRoot=data_root, sentence_encoder=LMencoder)

In [11]:
# Pull the underlying graph object out of `custom_products` (the "ST" dataset) for inspection.
# NOTE: this rebinds `products`, which an earlier cell bound to the raw
# PygNodePropPredDataset; from here on `products` is a single Data graph.
products = custom_products._data
products

Data(num_nodes=2449029, edge_index=[2, 123718024], x=[2449029, 100], y=[2449029], x_text_feat=[2449029, 768], label_text_feat=[47, 768], edge_text_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], train_mask=[196615], val_mask=[39323], test_mask=[2213091])

In [12]:
# List every attribute stored on the graph together with its Python type.
for k, v in products:
    print(k, type(v))

num_nodes <class 'int'>
edge_index <class 'torch.Tensor'>
x <class 'torch.Tensor'>
y <class 'torch.Tensor'>
x_text_feat <class 'torch.Tensor'>
label_text_feat <class 'torch.Tensor'>
edge_text_feat <class 'torch.Tensor'>
prompt_text_feat <class 'torch.Tensor'>
prompt_edge_feat <class 'torch.Tensor'>
train_mask <class 'torch.Tensor'>
val_mask <class 'torch.Tensor'>
test_mask <class 'torch.Tensor'>


In [15]:
from torch_geometric.loader import NeighborLoader, NodeLoader

# Sample the full 2-hop neighbourhood (-1 = take all neighbours at each hop)
# around three seed nodes. batch_size=1 is spelled out so it is visible that
# each seed node produces its own subgraph, as in the recorded output.
loader = NeighborLoader(data=products,
                        num_neighbors=[-1, -1],
                        batch_size=1,
                        input_nodes=torch.tensor([0, 1, 2], dtype=torch.long))

In [16]:
# Iterate over the loader and print each sampled subgraph
# (the recorded output shows one batch per seed node, batch_size=1).
for sg in loader:
    print(sg)

Data(num_nodes=5740, edge_index=[2, 19354], x=[5740, 100], y=[5740], x_text_feat=[5740, 768], label_text_feat=[47, 768], edge_text_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], train_mask=[196615], val_mask=[39323], test_mask=[2213091], n_id=[5740], e_id=[19354], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[1], batch_size=1)
Data(num_nodes=8199, edge_index=[2, 14826], x=[8199, 100], y=[8199], x_text_feat=[8199, 768], label_text_feat=[47, 768], edge_text_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], train_mask=[196615], val_mask=[39323], test_mask=[2213091], n_id=[8199], e_id=[14826], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[1], batch_size=1)
Data(num_nodes=3504, edge_index=[2, 4549], x=[3504, 100], y=[3504], x_text_feat=[3504, 768], label_text_feat=[47, 768], edge_text_feat=[1, 768], prompt_text_feat=[1, 768], prompt_edge_feat=[1, 768], train_mask=[196615], val_mask=[39323], test_mask=[2213091], n_id=[3504], e_id=[

In [17]:
# Ask PyG whether the processed graph is directed (recorded result: False).
products.is_directed()

False