In [1]:
from profiling_utils import create_remote_backend_from_triplets
from rag_feature_store import SentenceTransformerFeatureStore
from rag_graph_store import NeighborSamplingRAGGraphStore
from torch_geometric.loader import RAGQueryLoader
from torch_geometric.datasets import UpdatedWebQSPDataset
from torch_geometric.nn.nlp import SentenceTransformer
from torch_geometric.datasets.updated_web_qsp_dataset import preprocess_triplet, retrieval_via_pcst
from torch_geometric.data import get_features_for_triplets_groups, Data
from itertools import chain
import torch
from typing import Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = UpdatedWebQSPDataset("small_ds", force_reload=True, limit=10)

Processing...


Loading graph...
Encoding questions...
Retrieving subgraphs...


10it [00:00, 58.23it/s]00:00<?, ?it/s]
100%|██████████| 10/10 [00:01<00:00,  5.79it/s]


Saving subgraphs...


Done!


In [3]:
triplets = list(chain.from_iterable((d['graph'] for d in ds.raw_dataset)))

In [4]:
questions = ds.raw_dataset['question']
questions

['what is the name of justin bieber brother',
 'what character did natalie portman play in star wars',
 'what country is the grand bahama island in',
 'what kind of money to take to bahamas',
 'what character did john noble play in lord of the rings',
 'who does joakim noah play for',
 'where are the nfl redskins from',
 'where did saki live',
 'who did draco malloy end up marrying',
 'which countries border the us']

In [5]:
ground_truth_graphs = get_features_for_triplets_groups(ds.indexer, (d['graph'] for d in ds.raw_dataset), pre_transform=preprocess_triplet)
num_edges = len(ds.indexer._edges)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer().to(device)

In [7]:
fs, gs = create_remote_backend_from_triplets(triplets=triplets, node_embedding_model=model, node_method_to_call="encode", path="backend", pre_transform=preprocess_triplet, node_method_kwargs={"batch_size": 256}, graph_db=NeighborSamplingRAGGraphStore, feature_db=SentenceTransformerFeatureStore).load()

In [11]:
query_loader = RAGQueryLoader(data=(fs, gs), seed_nodes_kwargs={"k_nodes": 10}, seed_edges_kwargs={"k_edges": 10}, sampler_kwargs={"num_neighbors": [40]*10})

In [12]:
# Accuracy Metrics to be added to Profiler
def _eidx_helper(subg: Data, ground_truth: Data):
    subg_eidx, gt_eidx = subg.edge_idx, ground_truth.edge_idx
    if isinstance(subg_eidx, torch.Tensor):
        subg_eidx = subg_eidx.tolist()
    if isinstance(gt_eidx, torch.Tensor):
        gt_eidx = gt_eidx.tolist()
    subg_e = set(subg_eidx)
    gt_e = set(gt_eidx)
    return subg_e, gt_e
def check_retrieval_accuracy(subg: Data, ground_truth: Data, num_edges: int):
    subg_e, gt_e = _eidx_helper(subg, ground_truth)
    total_e = set(range(num_edges))
    tp = len(subg_e & gt_e)
    tn = len(total_e-(subg_e | gt_e))
    return (tp+tn)/num_edges
def check_retrieval_precision(subg: Data, ground_truth: Data):
    subg_e, gt_e = _eidx_helper(subg, ground_truth)
    return len(subg_e & gt_e) / len(subg_e)
def check_retrieval_recall(subg: Data, ground_truth: Data):
    subg_e, gt_e = _eidx_helper(subg, ground_truth)
    return len(subg_e & gt_e) / len(gt_e)

In [13]:
query_loader.query(questions[0]).edge_index

tensor([[  12,    9,   29,  ..., 4320, 4320,  203],
        [   0,    1,    2,  ..., 4335, 4336, 4336]])

In [14]:
for subg, gt in zip((query_loader.query(q) for q in questions), ground_truth_graphs):
    print(check_retrieval_accuracy(subg, gt, num_edges), check_retrieval_precision(subg, gt), check_retrieval_recall(subg, gt))

10it [00:00, 58.83it/s]


0.5298204220736663 0.22839287819832568 0.4442151129457631
0.5446847555380784 0.1108633135747403 0.4599464329194059
0.5419845327041551 0.05783118608816223 0.4625693160813309
0.5353781622755276 0.1497463408945128 0.4497373029772329
0.5505046532966312 0.038434331525583586 0.4387617765814266
0.5276445143531262 0.07287290651431132 0.4701843151427539
0.5448420500720933 0.10575959441136591 0.44996237772761477
0.5472276838379866 0.06082667919210897 0.4482907832107313
0.5396775462052694 0.10320740438907969 0.4516046867040244
0.5173417223751474 0.17134230030194894 0.4847778813296055


In [15]:
def apply_retrieval_via_pcst(graph: Data, query: str, topk: int = 3, topk_e: int = 3, cost_e: float = 0.5) -> Tuple[Data, str]:
    q_emb = model.encode(query)
    textual_nodes = ds.textual_nodes.iloc[graph["node_idx"]].reset_index()
    textual_edges = ds.textual_edges.iloc[graph["edge_idx"]].reset_index()
    out_graph, desc = retrieval_via_pcst(graph, q_emb, textual_nodes, textual_edges, topk, topk_e, cost_e)
    out_graph["desc"] = desc
    return graph

In [16]:
query_loader = RAGQueryLoader(data=(fs, gs), seed_nodes_kwargs={"k_nodes": 10}, seed_edges_kwargs={"k_edges": 10}, sampler_kwargs={"num_neighbors": [40]*10}, local_filter=apply_retrieval_via_pcst)

In [17]:
for subg, gt in zip((query_loader.query(q) for q in questions), ds):
    print(check_retrieval_accuracy(subg, gt, num_edges), check_retrieval_precision(subg, gt), check_retrieval_recall(subg, gt))

0.5534146021759078 0.0007042666823170373 0.6
0.5555118626294403 0.00017691808692575338 0.75
0.5525756979944947 0.0001757881167233095 0.42857142857142855
0.5580547909293485 0.00011862396204033214 1.0
0.5605977192292568 5.966587112171838e-05 0.3333333333333333
0.5426923581072225 0.00017200848575196377 0.3333333333333333
0.5530475815965396 0.00011729517330361856 1.0
0.553126228863547 0.0002933239469670304 0.5
0.5531000131078778 0.00029337557941676935 0.35714285714285715
0.5271464149954123 5.545389009038984e-05 0.16666666666666666
