In [1]:
import json
import pickle
import random
import re
import sys
from pathlib import Path

import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

from wikidataintegrator import wdi_core
from wikidata.client import Client
import wikidata
import en_core_web_sm
nlp = en_core_web_sm.load()

from IPython.display import clear_output
from IPython.core.debugger import set_trace
import matplotlib.pyplot as plt
%matplotlib inline

#############################################################
from utils import get_triplets_by_idd, get_description_name
from datasets import load_rubq, load_simple_questions, combined_dataset_non_stochastic
from models import EncoderBERT, get_projection_module_simple, get_tokenizer
from reject import reject_by_metric
from train import train_ensemble
from eval_models import eval_ensemble
from get_props import presearch_sq, presearch_rubq

%load_ext autoreload
%autoreload 1

2022-08-10 07:08:35.155315: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-10 07:08:35.159724: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-08-10 07:08:35.159744: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Preferred GPU index for this notebook. It is only used when that many GPUs are
# actually visible; otherwise we fall back instead of failing later with an
# "invalid device ordinal" error on the first .to(device) call.
PREFERRED_CUDA_INDEX = 2

if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > PREFERRED_CUDA_INDEX:
    device = torch.device(f"cuda:{PREFERRED_CUDA_INDEX}")
else:
    # Fewer GPUs than the preferred index: use the first visible one.
    device = torch.device("cuda:0")
print(device)

cuda:2


### Dataset and Dataloaders

In [4]:
# Paths to the full list of embeddings and the full list of ids
# (one-to-one correspondence with the embeddings).
PATH_TO_EMBEDDINGS_Q = "../new_data/entitie_embeddings_ru.json"
PATH_TO_IDS = "../new_data/entitie_ids_ru_filtered.json"
PATH_TO_EMBEDDINGS_P = "../new_data/entitie_P_embeddings_ru.json"

# Read each JSON file inside a context manager so the handle is closed as soon
# as parsing finishes, and decode explicitly as UTF-8 rather than relying on
# the platform's default locale encoding.
# NOTE(review): "Q"/"P" presumably stand for Wikidata entities/properties - confirm.
with open(PATH_TO_EMBEDDINGS_Q, encoding="utf-8") as embeddings_q_file:
    graph_embeddings_Q = json.load(embeddings_q_file)
with open(PATH_TO_EMBEDDINGS_P, encoding="utf-8") as embeddings_p_file:
    graph_embeddings_P = json.load(embeddings_p_file)

In [5]:
%autoreload

# Seed handed to both dataset loaders below.
MASTER_SEED = 42

# RuBQ: train / validation splits plus the test questions and their answers.
(
    questions_train, relations_train, entities_train, answers_train,
    questions_val, relations_val, entities_val, answers_val,
    questions_test, answers_test,
) = load_rubq(MASTER_SEED, graph_embeddings_Q, graph_embeddings_P)

# SimpleQuestions: train / validation splits.
simple_questions_train, simple_questions_val = load_simple_questions(
    MASTER_SEED,
    graph_embeddings_Q,
    graph_embeddings_P,
)

  answers_train = np.array(answers)[train_ids]
  answers_val = np.array(answers)[val_ids]


308
296
1186
16414


100% 16414/16414 [00:00<00:00, 214039.94it/s]

8327





In [6]:
# Pre-searched candidates for the RuBQ test questions, one entry per question.
# NOTE(review): allow_pickle=True unpickles Python objects from the file - only
# load files produced by this project.
rubq_candidates = list(
    np.load(
        "./data/presearched_fixed_rubq_test.npy",
        allow_pickle=True,
    )
)

In [7]:
# `pickle` is imported in the notebook's top import cell.

# Candidate entities for each SimpleQuestions test question.
entities = np.load('data/candidate_entities_sq_test.npy', allow_pickle=True)

# SECURITY: np.load(..., allow_pickle=True) and pickle.load can execute arbitrary
# code embedded in the file - only load artifacts produced by this project.
with open('data/entity_subgraphs_sq_test.pickle', 'rb') as subgraphs_file:
    entity_subgraphs = pickle.load(subgraphs_file)

In [8]:
# For every test question, map each of its candidate entities to that entity's
# pre-computed subgraph. The order follows `entities`.
sq_candidates = [
    {entity: entity_subgraphs[entity] for entity in question_entities.item()}
    for question_entities in entities
]

In [9]:
simple_questions_test = np.load("../new_data/simple_questions_test.npy")

# Keep only the (e, p, a, q) rows for which e and a are present in the Q
# embeddings and p is present in the P embeddings.
simple_questions_filtered = [
    (e, p, a, q)
    for e, p, a, q in tqdm(simple_questions_test)
    if e in graph_embeddings_Q and a in graph_embeddings_Q and p in graph_embeddings_P
]

# Question texts and single-element answer lists, aligned with the filtered rows.
questions_sq = [row[3] for row in simple_questions_filtered]
answers_sq = [[row[2]] for row in simple_questions_filtered]

100% 4751/4751 [00:00<00:00, 230993.76it/s]


### What is the top accuracy the model can achieve given this NER?

#### RuBQ

In [14]:
# Record one 1.0 per RuBQ test question for which at least one candidate (the
# first element of a candidate pair) is contained in the gold answer.
# NOTE(review): zip stops at the shorter input - confirm rubq_candidates is
# aligned one-to-one with answers_test.
present = []
for answer, candidates in zip(answers_test, rubq_candidates):
    # Flatten the candidates of every detected entity into one list.
    cand_answers = [pair[0] for pairs in candidates.values() for pair in pairs]

    if any(cand_ans in answer for cand_ans in cand_answers):
        present.append(1.0)

In [18]:
# Share of RuBQ test questions that have a correct answer among their candidates.
n_hits = len(present)
answer_present = n_hits / len(answers_test)
print('At least answer present in candidates: ', answer_present)

At least answer present in candidates:  0.7951096121416527


#### SQ

In [19]:
# Same check as for RuBQ, on SimpleQuestions: one 1.0 per question whose gold
# answer list contains at least one candidate (first element of a pair).
# NOTE(review): zip stops at the shorter input - confirm sq_candidates is
# aligned one-to-one with the filtered answers_sq.
present = []
for answer, candidates in zip(answers_sq, sq_candidates):
    cand_answers = []
    for pairs in candidates.values():
        cand_answers += [pair[0] for pair in pairs]

    found = next((True for cand_ans in cand_answers if cand_ans in answer), False)
    if found:
        present.append(1.0)

In [20]:
# Share of SimpleQuestions test questions with a correct answer among their candidates.
n_questions_sq = len(answers_sq)
answer_present = len(present) / n_questions_sq
print('At least answer present in candidates: ', answer_present)

At least answer present in candidates:  0.6542247744052502


### mGENRE + Stanza

In [23]:
import stanza

# Fetch the English Stanza models ("en" is the default `language` of UE_estimate).
# NOTE(review): the recorded run of this cell failed with
# "No module named '_lzma'" - presumably the interpreter was built without
# lzma support; confirm the environment before relying on the cells below.
stanza.download(lang='en')

ModuleNotFoundError: No module named '_lzma'

In [None]:
# Loaded Stanza pipelines keyed by language code. Building a pipeline loads its
# models, so it is done once per language instead of once per call.
_STANZA_PIPELINES = {}


def stanza_nlp(text, device, language):
    """Return the text of every named entity Stanza finds in ``text``.

    Args:
        text: Input string to run tokenization + NER on.
        device: Accepted for call compatibility only; it is not used because
            the pipeline is always created with ``use_gpu = False``.
        language: Language code passed as ``lang`` to ``stanza.Pipeline``.

    Returns:
        A flat list of entity strings, sentence by sentence, in the order
        Stanza reports them.
    """
    pipeline = _STANZA_PIPELINES.get(language)
    if pipeline is None:
        # A distinct local name also avoids shadowing the notebook-level spaCy `nlp`.
        pipeline = stanza.Pipeline(lang=language, processors='tokenize,ner', verbose= False, use_gpu = False)
        _STANZA_PIPELINES[language] = pipeline
    doc = pipeline(text)
    return [ent.text for sent in doc.sentences for ent in sent.ents]

def NER_Stanza(sentence, language, device=device):
    """Mark every entity Stanza detects in ``sentence`` with ``[START] ... [END]``.

    Entities returned by ``stanza_nlp`` are located in ``sentence`` from left
    to right, and each one is replaced by ``"[START] <entity> [END]"``. All
    other text, including the full remainder of the sentence, is kept verbatim.

    Args:
        sentence: The text to annotate.
        language: Language code forwarded to ``stanza_nlp``.
        device: Forwarded to ``stanza_nlp``.

    Returns:
        The annotated sentence, or ``sentence`` unchanged when no entity is
        detected or none of the detected entities can be located in it.
    """
    res = stanza_nlp(text = sentence, device = device, language = language)

    pieces = []
    cursor = 0  # index in `sentence` up to which text has already been emitted
    for mention in res:
        # Search only after the previous mention so repeated entity strings map
        # to successive occurrences and nothing is wrapped twice.
        start = sentence.find(mention, cursor) if mention else -1
        if start == -1:
            # Entity text not present verbatim: leave the sentence as is here
            # (the previous split-based code raised IndexError in this case).
            continue
        pieces.append(sentence[cursor:start])
        pieces.append("[START] " + mention + " [END]")
        cursor = start + len(mention)

    # Keep everything after the last mention. The split-based version dropped
    # the tail when the entity occurred more than once, and the token-based
    # version never matched multi-word entities.
    pieces.append(sentence[cursor:])
    return "".join(pieces)

def UE_estimate(
    data,
    model,
    ue_metrics = ['entropy', 'maxprob', 'delta', 'BALD' ,'expected entropy', 'predicted entropy'],
    number_of_samples = 100,
    beams = 5,
    seed = 13,
    task = "Question Answering (object detection)",
    target_col = "object",
    NER = None,
    dataset = "Simple Questions",
    language = "en"
):
    n = number_of_samples
    rang = range(n)
    df = data.sample(n = n, replace = False, random_state=seed)
    
    elif NER == "Stanza":
        df = df.reset_index().drop(['index'], axis = 1)        
        da = pd.DataFrame(df['question'].apply(lambda x: string.capwords(x)))
        print("Started preparing text using NER")
        for i in tqdm(range(len(da))):
            #print("before: ", da.loc[i, "question"])
            #print("before df: ", df.loc[i, "question"])
            da.loc[i, "question"] = NER_Stanza(da.loc[i, "question"], language)
            
            #print("after: ", da.loc[i, "question"])
            #print("correct answer: ", df.loc[i, "subject"])
        
        df["question"] = da["question"]
        print("Finished preparing text using NER")
        
    print("Started sampling variants using mGENRE")
    model_mGENRE_mcdropout_result = model.sample(list(df['question']),
                                                      beam = beams,
                                                      prefix_allowed_tokens_fn=lambda batch_id, sent: [
                                                          e for e in trie.get(sent.tolist())
                                                          if e < len(model.task.target_dictionary)
                                                      ],
                                                      text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(" >> ")))], key=lambda y: int(y[1:])),
                                                      marginalize=True,
                                                      verbose = True,
                                                      seed = seed)