In [1]:
from typing import List
from torch_geometric.data import InMemoryDataset
from torch_geometric.distributed.local_graph_store import LocalGraphStore
from torch_geometric.datasets.web_qsp_dataset import *
import torch
import datasets
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from raw_qsp_dataset import RawWebQSPDataset

In [3]:
# Build the raw WebQSP dataset. force_reload=True presumably forces re-processing
# instead of reusing a cached copy — TODO confirm against RawWebQSPDataset.
dataset = RawWebQSPDataset(force_reload=True)

Processing...
Done!


In [4]:
# Inspect the underlying raw dataset (feature names and row count).
dataset.raw_dataset

Dataset({
    features: ['id', 'question', 'answer', 'q_entity', 'a_entity', 'graph', 'choices'],
    num_rows: 4700
})

In [5]:
from large_graph_indexer import LargeGraphIndexer, TripletLike

In [6]:
def preprocess_triplet(triplet: TripletLike):
    """Normalize a triplet by lower-casing each of its elements.

    Returns the lower-cased elements as a tuple, in their original order.
    """
    lowered = (element.lower() for element in triplet)
    return tuple(lowered)

Test basic collation

In [7]:
# Index the triplets of the first sample, lower-casing every triplet beforehand.
indexer = LargeGraphIndexer.from_triplets(dataset.raw_dataset[0]['graph'], pre_transform=preprocess_triplet)

In [8]:
# Index the triplets of the second sample with the same pre-transform.
indexer2 = LargeGraphIndexer.from_triplets(dataset.raw_dataset[1]['graph'], pre_transform=preprocess_triplet)

In [9]:
# Merge the two per-sample indexers into a single indexer.
bigger_indexer = LargeGraphIndexer.collate([indexer, indexer2])

In [10]:
# Collation must behave like a set union of the keys: nodes/edges present in
# both indexers are counted once (inclusion-exclusion on the key sets).
assert len(indexer._nodes) + len(indexer2._nodes) - len(indexer._nodes.keys() & indexer2._nodes.keys()) == len(bigger_indexer._nodes)
assert len(indexer._edges) + len(indexer2._edges) - len(indexer._edges.keys() & indexer2._edges.keys()) == len(bigger_indexer._edges)

In [11]:
# No two node keys (or edge keys) may share the same mapped value after collation.
assert len(set(bigger_indexer._nodes.values())) == len(bigger_indexer._nodes)
assert len(set(bigger_indexer._edges.values())) == len(bigger_indexer._edges)

In [12]:
# The 'pid' attribute stored at each node's mapped position must be the node key itself.
for node in indexer._nodes.keys():
    assert indexer.node_attr['pid'][indexer._nodes[node]] == node, f'{node} is not {indexer.node_attr["pid"][indexer._nodes[node]]}'

In [13]:
import tqdm
from multiprocessing import Pool

Test collation on the entire dataset (capped at the first `LIMIT` graphs)

In [14]:
# Maximum number of graphs to process; None means no cap.
LIMIT=100
def get_next_graph(dataset, limit=None):
    """Lazily yield the ``'graph'`` entry of each sample in ``dataset``.

    Iteration stops once ``limit`` graphs have been produced. With
    ``limit=None`` every sample is visited.
    """
    for produced, sample in enumerate(dataset):
        # `produced` is the number of graphs yielded so far.
        if produced == limit:
            return
        yield sample['graph']
# get_next_graph is a generator function, so `graphs` can only be consumed once.
graphs = get_next_graph(dataset.raw_dataset, limit=LIMIT)

In [15]:
indexers = []
# Single-argument wrapper that fixes pre_transform, so it can be handed to Pool.imap.
def from_trips_with_pretransform(triplets):
    """Build a LargeGraphIndexer from ``triplets`` using ``preprocess_triplet`` as pre_transform."""
    return LargeGraphIndexer.from_triplets(triplets, pre_transform=preprocess_triplet)
# NOTE(review): the worker count (40) is hard-coded — confirm it suits the target machine.
with Pool(40) as p:
    # Progress-bar total: LIMIT when a cap is set, otherwise the full dataset size.
    indexers = list(tqdm.tqdm(p.imap(from_trips_with_pretransform, graphs), total=len(dataset.raw_dataset) if LIMIT is None else LIMIT))

100%|██████████| 100/100 [00:01<00:00, 55.45it/s]


In [16]:
# Time the collation of all per-sample indexers into one large indexer.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
big_indexer = LargeGraphIndexer.collate(indexers)
# Elapsed seconds, displayed as the cell output.
time.perf_counter() - start

0.5066280364990234

Naive method, concatenating all the triplets

In [17]:
from itertools import chain

In [18]:
# Flatten the per-sample graphs into one lazy stream of triplets (single-use iterator).
large_graph_dataset = chain.from_iterable(get_next_graph(dataset.raw_dataset, limit=LIMIT))

In [19]:
# Total number of triplets; only used as the tqdm progress-bar total.
# The counting pass below is disabled, so total_size stays 0.
total_size = 0
#for g in tqdm.tqdm(get_next_graph(dataset.raw_dataset), total=4700):
#    total_size += len(g)

In [20]:
# Naive baseline: index the whole flattened triplet stream in a single pass,
# without building and collating per-sample indexers.
large_indexer = LargeGraphIndexer.from_triplets(tqdm.tqdm(large_graph_dataset, total=total_size), pre_transform=preprocess_triplet)

423763it [00:02, 204343.22it/s]


In [21]:
# Both approaches must discover exactly the same node keys and edge keys.
assert set(large_indexer._nodes) == set(big_indexer._nodes)
assert set(large_indexer._edges) == set(big_indexer._edges)

## Phase I: Indexing Large Graph

In [22]:
# Materialize the unique node features of the collated indexer as a list.
node_attributes = list(big_indexer.get_unique_node_features())

In [23]:
# Materialize the unique values of edge feature "r" as a list
# (presumably the relation strings — TODO confirm).
edge_attributes = list(big_indexer.get_unique_edge_features("r"))

In [24]:
# Number of unique node attributes that will be embedded.
len(node_attributes)

105413

In [25]:
# Number of unique edge attributes that will be embedded.
len(edge_attributes)

3095

In [26]:
from torch_geometric.nn.text import text2embedding, SentenceTransformer

In [27]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the sentence encoder used to embed node/edge text and move it to `device`.
model = SentenceTransformer("sentence-transformers/all-roberta-large-v1").to(device)
# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

inherit model weights from sentence-transformers/all-roberta-large-v1


SentenceTransformer(
  (bert_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
           

In [28]:
# Show which device was selected.
print(device)

cuda


In [29]:
from more_itertools import chunked

In [30]:
# Item counts, used to size the progress bars of the embedding loops.
nodes_to_process, edges_to_process = len(node_attributes), len(edge_attributes)

In [31]:
# Number of attribute strings handed to text2embedding per call.
BATCH_SIZE = 32

In [32]:
# Embed the unique node attribute strings in batches of BATCH_SIZE.
# Ceiling division so the final, possibly partial batch is counted; with floor
# division tqdm overruns its total and falls back to a bare counter.
num_node_batches = (nodes_to_process + BATCH_SIZE - 1) // BATCH_SIZE
# Each batch is moved to the CPU right away, then all batches are concatenated
# along dim 0 into a single tensor.
node_emb_batches = [
    text2embedding(model, device, nbatch, BATCH_SIZE).cpu()
    for nbatch in tqdm.tqdm(chunked(node_attributes, BATCH_SIZE), total=num_node_batches)
]
node_embs = torch.cat(node_emb_batches, 0)



3295it [01:07, 48.80it/s]                          


In [33]:
# Sanity check: the first dimension should equal len(node_attributes).
node_embs.shape

torch.Size([105413, 1024])

In [34]:
# Attach the node embeddings to the indexer under the feature name "embs".
big_indexer.add_node_feature(new_feature_name="embs", new_feature_vals=node_embs)

In [35]:
from typing import Iterable, Callable, Optional
from large_graph_indexer import TripletLike, ordered_set
from torch_geometric.typing import FeatureTensorType
from torch_geometric.data import Data

In [36]:
# Embed the unique edge attribute strings in batches of BATCH_SIZE.
# Ceiling division so the final, possibly partial batch is counted; with floor
# division tqdm overruns its total and falls back to a bare counter.
num_edge_batches = (edges_to_process + BATCH_SIZE - 1) // BATCH_SIZE
# NOTE(review): unlike the node-embedding loop, no batch size is passed to
# text2embedding here — confirm that relying on its default is intended.
edge_emb_batches = [
    text2embedding(model, device, ebatch).cpu()
    for ebatch in tqdm.tqdm(chunked(edge_attributes, BATCH_SIZE), total=num_edge_batches)
]
# Concatenate the per-batch results along dim 0 into a single tensor.
edge_embs = torch.cat(edge_emb_batches, 0)


97it [00:01, 51.27it/s]                        


In [37]:
# Sanity check: the first dimension should equal len(edge_attributes).
edge_embs.shape

torch.Size([3095, 1024])

In [38]:
# Attach the edge embeddings under the feature name "embs". edge_embs has one row
# per unique "r" value, so map_from_feature="r" presumably tells the indexer which
# existing feature those rows correspond to — TODO confirm against LargeGraphIndexer.
big_indexer.add_edge_feature(new_feature_name="embs", new_feature_vals=edge_embs, map_from_feature="r")

In [39]:
from large_graph_indexer import get_features_for_triplets

In [40]:
import networkx as nx

In [41]:
# Raw triplet lists of the first 10 samples.
first_10_trips = dataset.raw_dataset[:10]['graph']

In [42]:
#TODO: Parallelize
# Look up the indexed features for each sample's triplets, one result per sample
# (presumably a Data object each — TODO confirm against get_features_for_triplets).
first_10 = [get_features_for_triplets(big_indexer, trip_lst, pre_transform=preprocess_triplet) for trip_lst in tqdm.tqdm(first_10_trips)]

100%|██████████| 10/10 [00:01<00:00,  5.64it/s]


In [43]:
# Reference outputs: build the first 10 samples with the original dataset pipeline
# so they can be compared against the LargeGraphIndexer results.
# with_process=True presumably enables the original per-sample encoding — TODO confirm.
old_dataset = RawWebQSPDataset(force_reload=True, with_process=True, limit=10)

Processing...


inherit model weights from sentence-transformers/all-roberta-large-v1
Encoding graphs...


100%|██████████| 10/10 [01:01<00:00,  6.11s/it]
Done!


In [44]:
def results_are_close_enough(ground_truth: Data, new_method: Data, thresh=.8):
    """Heuristically check that two graph samples encode the same graph.

    Node features (``x``) and edge features (``edge_attr``) are compared
    after sorting every column independently, so the comparison does not
    depend on the order in which nodes/edges were indexed. The topology
    (``edge_index``) is compared through the Weisfeiler-Lehman graph hashes
    of the corresponding undirected graphs.

    Args:
        ground_truth: Sample produced by the reference pipeline.
        new_method: Sample produced by the pipeline under test.
        thresh: Minimum fraction of entries in each sorted row that must be
            numerically close for that row to count as matching.

    Returns:
        bool: ``True`` if all three checks pass, ``False`` otherwise
        (including when the feature tensors differ in shape).
    """
    def _sorted_tensors_are_close(tensor1, tensor2):
        # Differently shaped tensors can never match; bail out instead of
        # letting torch.isclose raise or silently broadcast.
        if tensor1.shape != tensor2.shape:
            return False
        close = torch.isclose(tensor1.sort(dim=0)[0], tensor2.sort(dim=0)[0])
        # Always hand back a plain bool rather than a 0-d tensor.
        return bool(torch.all(close.float().mean(dim=1) > thresh))

    def _graphs_are_same(tensor1, tensor2):
        def _wl_hash(edge_index):
            # Convert to plain Python ints first: tensor scalars hash by
            # object identity, so used directly as nodes every edge endpoint
            # would become a distinct node and the structure would be lost.
            return nx.weisfeiler_lehman_graph_hash(nx.Graph(edge_index.T.tolist()))
        return _wl_hash(tensor1) == _wl_hash(tensor2)

    return (_sorted_tensors_are_close(ground_truth.x, new_method.x)
            and _sorted_tensors_are_close(ground_truth.edge_attr, new_method.edge_attr)
            and _graphs_are_same(ground_truth.edge_index, new_method.edge_index))

In [45]:
# Compare every reference sample with its LargeGraphIndexer counterpart.
for reference_sample, indexed_sample in zip(old_dataset, first_10):
    print(results_are_close_enough(reference_sample, indexed_sample))

True
True
True
True
True
True
True
True
True
True


Test saving, loading, and equality (`==`)

In [46]:
# Persist the indexer under the path 'indexer'.
big_indexer.save('indexer')

In [47]:
# Round-trip check: the indexer loaded back from disk must compare equal to the in-memory one.
assert big_indexer == LargeGraphIndexer.from_disk('indexer')