## Test KBQA model on question pairs
* Example notebook that predicts answers to given questions

In [1]:
import os 
from pathlib import Path 
import sys 

# Use the parent of the current working directory as the project root and put it on
# the import path; presumably this is what lets `utils` and `models_nhop` resolve below.
root_dir = Path.cwd().parents[0]
sys.path.append(str(root_dir))

from utils import data_utils
from utils.dataset_utils import QADataset
import models_nhop

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel

### File Paths
* Enter your model path and configuration 

In [2]:
# MetaQA dataset folder under the project root, and the knowledge-base file inside it
data_dir = root_dir / 'datasets' / 'MetaQA'
kg_path = data_dir / 'kb.txt'

# enter your parameters
NUM_HOPS = 1  # forwarded to the model as its `num_hops` argument
# Build the checkpoint path from its components instead of writing a backslash by hand:
# '\e' is an invalid escape sequence (it triggers a warning on current Python versions)
# and a literal backslash is only a path separator on Windows.
# The path stays relative to the current working directory, as before.
CKPT_PATH = str(Path('checkpoints') / 'epoch=0-step=750-v3.ckpt')
# NOTE: the 'ENOCDER' spelling is kept on purpose; the model-loading cell uses these names.
ENOCDER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'  # model id given to AutoTokenizer/AutoModel
ENOCDER_SIZE = 384 # hidden size

### Load data and model

In [3]:
# create kb matrices
triplets, entity_to_idx, relation_to_idx, idx_to_entity, idx_to_relation = data_utils.load_triplets_metaqa(kg_path)

subject_matrix, rel_matrix, object_matrix = data_utils.create_differentiable_kg(triplets, entity_to_idx, relation_to_idx)
# swap the two axes of the object matrix before it is handed to the model
object_matrix = torch.transpose(object_matrix, 0, 1)

# load models
tokenizer = AutoTokenizer.from_pretrained(ENOCDER_NAME)
trans_model = AutoModel.from_pretrained(ENOCDER_NAME)

# `load_from_checkpoint` is a classmethod that builds the model itself from the given
# constructor arguments, so call it on the class. Calling it on a freshly built instance
# constructs the model twice, and (assuming KBLightning is a Lightning module) recent
# Lightning releases reject the instance call outright.
net = models_nhop.KBLightning.load_from_checkpoint(
    CKPT_PATH,
    trans_model=trans_model,
    subject_matrix=subject_matrix,
    rel_matrix=rel_matrix,
    object_matrix=object_matrix,
    num_hops=NUM_HOPS,
    trans_output_size=ENOCDER_SIZE,
)
# this notebook only runs inference: switch off dropout and other train-time behaviour
net.eval()
print('checkpoint loaded')

num entities: 43234
num relations: 18
num triplets  269482
checkpoint loaded


### Predict on any question
* Update eval_pairs with your own questions
* The questions below are all one-hop questions, since that's the model being tested here. A two-hop model can be loaded instead to test two-hop question pairs

Key Points:
* For practical use cases, this model would also have to be trained on unanswerable questions. For example, look at the 3rd question below: the model answers it as if it asked for the movies acted in, whereas it should return no answers

In [14]:
# Evaluation examples, one row each: (question text, answer entities, subject entities).
# The answer entities are not used by the model, so if you do not know them
# just put any entity from the KB in that slot.
eval_pairs = [
    ("Ruggero Raimondi appears in what films", ["Carmen"], ["Ruggero Raimondi"]),
    ('what does Laura Harring act in', ['Mulholland Drive', 'Derailed'], ['Laura Harring']),
    # unanswerable question from KB
    ('Which country does Laura Harring live in', ['Mulholland Drive'], ['Laura Harring']),
]

# Every entity listed above must exist in the KB; stop early if any is missing.
missing_entities = data_utils.santity_check(eval_pairs, entity_to_idx)
if missing_entities:
    raise ValueError(f'All entities have to be in the KB. There are {len(missing_entities)} entities in the questions missing: {missing_entities}')

# Tokenize the question strings (first field of each row), then wrap rows and tokens
# in a dataset and a data loader.
eval_tokens = tokenizer(
    [question for question, _answers, _subjects in eval_pairs],
    padding=True,
    truncation=True,
    max_length=200,
    return_tensors='pt',
)
eval_dataset = QADataset(eval_pairs, eval_tokens, entity_to_idx)
eval_dl = DataLoader(eval_dataset, batch_size=16)

# Reverse lookup: entity index -> entity name.
idx_to_entity = dict(zip(entity_to_idx.values(), entity_to_idx.keys()))

number of entities missing 0: []


## Predict answers

In [19]:
# cut-off forwarded to `interpret_follow_output` when turning logits into entity names
threshold = 0.70

# Inference only: make sure train-time layers (e.g. dropout) are off, and skip autograd
# bookkeeping so no computation graph is built or kept alive for the outputs.
net.eval()

predictions = []
with torch.no_grad():
    for batch in eval_dl:
        # object_labels (the gold answers) are unpacked but not needed for prediction
        trans_input, subject_vector, object_labels = batch
        # the loader output is transposed before being passed to the model
        subject_vector2 = torch.transpose(subject_vector, 0, 1)
        object_logits = net(trans_input, subject_vector2)

        object_names, output_ids = models_nhop.interpret_follow_output(object_logits, idx_to_entity, threshold=threshold)
        # one list of predicted entity names per question, in input order
        predictions.extend(object_names)

print("predicted answers", predictions)

predicted answers [['Carmen'], ['Mulholland Drive', 'Derailed'], ['Mulholland Drive', 'Derailed']]
