In [2]:
import json
import numpy as np
import random
import sys
from tqdm import tqdm
import re

import torch
import torch.nn as nn
from transformers import DistilBertModel, DistilBertTokenizer

from wikidataintegrator import wdi_core
from wikidata.client import Client
import wikidata
import en_core_web_sm
nlp = en_core_web_sm.load()

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

#############################################################
from utils import get_triplets_by_idd, get_description_name

In [None]:
device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")
print(device)

### Loading Graph Embeddings and Questions

In [3]:
#path to full list of embeddings and full list of ids (one2one correspondence with embeddings)
PATH_TO_EMBEDDINGS_Q = "../new_data/entitie_embeddings_ru.json" 
PATH_TO_IDS = "../new_data/entitie_ids_ru_filtered.json"
PATH_TO_EMBEDDINGS_P = "../new_data/entitie_P_embeddings_ru.json" 

graph_embeddings_Q = json.load(open(PATH_TO_EMBEDDINGS_Q))
graph_embeddings_P = json.load(open(PATH_TO_EMBEDDINGS_P))

In [4]:
questions = list(np.load("../new_data/all_EN_rubq_val_questions_1_hop_uri.npy"))
relations = list(np.load("../new_data/all_rubq_val_relations_1_hop_uri.npy"))
entities = list(np.load("../new_data/all_rubq_val_entities_1_hop_uri.npy"))
answers = list(np.load("../new_data/all_rubq_val_answers_1_hop_uri.npy",allow_pickle=True))

questions_test = list(np.load("../new_data/all_EN_rubq_test_questions_1_hop_uri.npy"))
answers_test = list(np.load("../new_data/all_rubq_test_answers_1_hop_uri.npy", allow_pickle=True))


yes = []
no = []
no_ids = []

for i, answer in enumerate(answers):
    flag = True
    for a in answer:
        if not a in graph_embeddings_Q:
            flag = False
    if flag and relations[i] in graph_embeddings_P and entities[i] in graph_embeddings_Q:
        yes.append(answer)
    else:
        no.append(answer)
        no_ids.append(i)
        
for i in no_ids[::-1]:
    del answers[i]
    del questions[i]
    del relations[i]
    del entities[i]
      
        
print(len(questions))

296


In [6]:
simple_questions = np.load("../new_data/simple_questions_train.npy")

In [7]:
simple_questions_filtered = []
print(len(simple_questions))
for e, p, a, q in tqdm(simple_questions):
    if e in graph_embeddings_Q and a in graph_embeddings_Q:
        simple_questions_filtered.append((e, p, a, q))
print(len(simple_questions_filtered))

simple_questions = simple_questions_filtered

16414


100%|██████████| 16414/16414 [00:00<00:00, 219327.31it/s]

8327





### mBert Text Encoder

In [8]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)

class EncoderBERT(nn.Module):
    def __init__(self):
        super(EncoderBERT,self).__init__()
        self.encoder =  model
    def forward(self,questions_ids):
        q_ids = torch.tensor(questions_ids)
        last_hidden_states = self.encoder(q_ids)[0]
        q_emb = last_hidden_states.mean(1)
        return q_emb
    
encoder = EncoderBERT().to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Trainable projection modules

In [9]:

projection_E = nn.Sequential(
    nn.Linear(768,512),
    nn.ELU(),
    nn.Linear(512,512),
    nn.ELU(),
    nn.Linear(512,200),
).to(device).train()

projection_Q = nn.Sequential(
    nn.Linear(768,512),
    nn.ELU(),
    nn.Linear(512,512),
    nn.ELU(),
    nn.Linear(512,200),
).to(device).train()

projection_P = nn.Sequential(
    nn.Linear(768,512),
    nn.ELU(),
    nn.Linear(512,512),
    nn.ELU(),
    nn.Linear(512,200),
).to(device).train()


### Dataset and Dataloaders

In [10]:
import random

MAX_LEN_Q = 32
class my_dataset_simple_questions(torch.utils.data.Dataset):
    def __init__(self, simple_questions, graph_Q, graph_P):
        self.simple_questions = simple_questions
        self.graph_Q = graph_Q
        self.graph_P = graph_P
    def __len__(self):
        return len(self.simple_questions)
    def __getitem__ (self,i):
        id_e, id_p, id_a, q = self.simple_questions[i]
        input_ids = torch.tensor([tokenizer.encode(q, max_length=MAX_LEN_Q, add_special_tokens=True, pad_to_max_length=True)]).to(device)[0]
        answer = id_a
        if id_p[0] == "P":
            relation = id_p
        else:
            relation = "P" + id_p[1:]
        graph_Q_embedding = torch.FloatTensor(self.graph_Q[answer])
        graph_P_embedding = torch.FloatTensor(self.graph_P[relation])
        return (input_ids.to(device), graph_Q_embedding.to(device), graph_P_embedding.to(device))
    
class my_dataset(torch.utils.data.Dataset):
    def __init__(self, questions, answers, relations, graph_Q, graph_P):
        self.questions = questions
        self.answers = answers
        self.relations = relations
        self.graph_Q = graph_Q
        self.graph_P = graph_P
    def __len__(self):
        return len(self.questions)
    def __getitem__ (self,i):
        input_ids = torch.tensor([tokenizer.encode(self.questions[i], max_length=MAX_LEN_Q, add_special_tokens=True, pad_to_max_length=True)]).to(device)[0]
        answer = self.answers[i]
        relation = self.relations[i]
        graph_Q_embedding = torch.FloatTensor(self.graph_Q[answer])
        graph_P_embedding = torch.FloatTensor(self.graph_P[relation])
        return (input_ids.to(device), graph_Q_embedding.to(device), graph_P_embedding.to(device))
    
class combined_dataset(torch.utils.data.Dataset):
    def __init__(self, questions, answers, entities, relations, graph_Q, graph_P, simple_questions):
        self.simple_questions = simple_questions
        self.questions = questions
        self.answers = answers
        self.entities = entities
        self.relations = relations
        self.graph_Q = graph_Q
        self.graph_P = graph_P
        
    def __len__(self):
        return len(self.simple_questions)
    def __getitem__ (self,i):
        if i % 5 > 0:
            id_e, id_p, id_a, q = self.simple_questions[i]
            input_ids = torch.tensor([tokenizer.encode(q, max_length=MAX_LEN_Q, add_special_tokens=True, pad_to_max_length=True)]).to(device)[0]
            answer = id_a
            if id_p[0] == "P":
                relation = id_p
            else:
                relation = "P" + id_p[1:]
            graph_E_embedding = torch.FloatTensor(self.graph_Q[id_e])
            graph_Q_embedding = torch.FloatTensor(self.graph_Q[answer])
            graph_P_embedding = torch.FloatTensor(self.graph_P[relation])
            return (input_ids.to(device), graph_E_embedding.to(device), graph_Q_embedding.to(device), graph_P_embedding.to(device))
        else:
            i = random.randint(0,len(self.questions) - 1)
            input_ids = torch.tensor([tokenizer.encode(self.questions[i], max_length=MAX_LEN_Q, add_special_tokens=True, pad_to_max_length=True)]).to(device)[0]
            entity = self.entities[i]
            answer = self.answers[i]
            ind = random.randint(0,len(answer) - 1)
            answer = answer[ind]
            relation = self.relations[i]
            graph_E_embedding = torch.FloatTensor(self.graph_Q[entity])
            graph_Q_embedding = torch.FloatTensor(self.graph_Q[answer])
            graph_P_embedding = torch.FloatTensor(self.graph_P[relation])
            return (input_ids.to(device), graph_E_embedding.to(device), graph_Q_embedding.to(device), graph_P_embedding.to(device))
            
    
    
train_dataset = combined_dataset(questions, answers, entities, relations, graph_embeddings_Q, graph_embeddings_P, simple_questions)
train_batch_generator = torch.utils.data.DataLoader(train_dataset,batch_size=64, shuffle=True)

In [11]:
opt = torch.optim.AdamW(
    list(projection_Q.parameters()) + \
    list(projection_P.parameters()) + \
    list(projection_E.parameters()) + \
    list(encoder.parameters()), 
    lr=1e-4)

### Training loop

In [12]:
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    for X, y_e, y_q, y_p in train_batch_generator:
        projection_Q.train()
        projection_E.train()
        projection_P.train()
        
        y_pred_e = projection_E(encoder(X))
        y_pred_q = projection_Q(encoder(X))
        y_pred_p = projection_P(encoder(X))
        
        loss = nn.MSELoss()(y_q,y_pred_q) + nn.MSELoss()(y_p,y_pred_p) + nn.MSELoss()(y_e,y_pred_e)
        loss.backward()
        
        opt.step()
        opt.zero_grad()
        
    print("EPOCH", epoch, loss.item())
        

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  # Remove the CWD from sys.path while we load stuff.


EPOCH 0 0.17019693553447723
EPOCH 1 0.16221800446510315
EPOCH 2 0.16502806544303894
EPOCH 3 0.16466915607452393
EPOCH 4 0.21004383265972137
EPOCH 5 0.12261909246444702
EPOCH 6 0.17267999053001404
EPOCH 7 0.08153051882982254
EPOCH 8 0.19384366273880005
EPOCH 9 0.09176598489284515


### RuBQ 2.0 Test

In [13]:
embeddings_Q = graph_embeddings_Q
ids_list = list(graph_embeddings_Q.keys())
embeddings_Q = [embeddings_Q[Q] for Q in ids_list]
embeddings_tensor_Q = torch.FloatTensor(embeddings_Q)

embeddings_P = graph_embeddings_P
embeddings_P = [embeddings_P[P] for P in graph_embeddings_P.keys()]
embeddings_tensor_P = torch.FloatTensor(embeddings_P)

In [18]:
# from natasha import (
#     Segmenter,
#     MorphVocab,
    
#     NewsEmbedding,
#     NewsMorphTagger,
#     NewsSyntaxParser,
#     NewsNERTagger,
    
#     PER,
#     NamesExtractor,

#     Doc
# )


segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# names_extractor = NamesExtractor(morph_vocab)

# text = "Кто автор поэмы 'энеида'?"
# doc = Doc(text)

# doc.segment(segmenter)
# doc.tag_morph(morph_tagger)
# doc.tag_ner(ner_tagger)
# for span in doc.spans:
#     span.normalize(morph_vocab)
    

### Entity candidate selection fuctions

In [15]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc)

from natasha.doc import DocSpan, DocToken

class Kostil_phrase_normalization():
    def __init__(self):
        self.segmenter = Segmenter()
        self.morph_vocab = MorphVocab()

        self.emb = NewsEmbedding()
        self.morph_tagger = NewsMorphTagger(self.emb)
        self.syntax_parser = NewsSyntaxParser(self.emb)
        self.doc = None
        
        
        
    def phrase_preprocess(self, phrase):
        self.doc = Doc(phrase)
        self.doc.segment(self.segmenter)
        self.doc.tag_morph(self.morph_tagger)
        
        
        
    def get_tokens(self, phrase, tokens):
        local_tokens = phrase.split()
        result_tokens = []
        for token in tokens:
            if token.text in local_tokens:
                result_tokens.append(token)
        return result_tokens
    
    
    def normalize(self, phrase):
        self.phrase_preprocess(phrase)
        
        tokens = self.get_tokens(phrase, self.doc.tokens)
        span = DocSpan('0', '2', type='LOC', text=phrase, tokens=tokens)
        span.normalize(self.morph_vocab)
        return span.normal



normalizer = Kostil_phrase_normalization()
phrase = 'Ленинградской области'
print(normalizer.normalize(phrase))

Ленинградская область


In [19]:
import spacy
nlp = spacy.load("ru_core_news_sm") 

def get_nouns(text):
    text = text.replace('?', '')
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
        
    ents = [str(ent.normal) for ent in doc.spans]
    
    text1 = text.replace('\'', '').replace('\"', '')
    doc = Doc(text1)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)
    
    ents += [str(ent.normal) for ent in doc.spans]
    
    if '"' in text:
        text3 = text[text.find('"')+1:]
        text3 = text3[0:text3.find('"')]
        doc = Doc(text3)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span.normalize(morph_vocab)

        ents += [str(ent.normal) for ent in doc.spans]
        ents += [str(normalizer.normalize(text3))]
        ents += [str(text3)]
        
    if '«' in text:
        text4 = text[text.find('«')+1:]
        text4 = text4[0:text4.find('»')]
        doc = Doc(text4)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            span.normalize(morph_vocab)

        ents += [str(ent.normal) for ent in doc.spans]
        ents += [str(normalizer.normalize(text4))]
        ents += [str(text4)]
        

    if len(ents) == 0:
        doc = nlp(text)
        ents = [token.lemma_ for token in doc if token.pos_ == "NOUN" or token.pos_ == "PROPN"]
        
        bigrams = [normalizer.normalize(" ".join(b)) for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]
        ents += bigrams
    nouns_set = set(ents)
    if "" in nouns_set:
        nouns_set.remove("")
    return list(nouns_set)

def get_top_ids_second_hop(text, first_hop_graph_E, second_hop_graph_Q, second_hop_graph_P, second_hop_ids_filtered_Q, second_hop_ids_filtered_P, topk):
        projection_E.eval()
        projection_P.eval()
        projection_Q.eval()
        bert_tokenize = lambda text: torch.tensor([tokenizer.encode(text, max_length=MAX_LEN_Q, add_special_tokens=True,pad_to_max_length=True)]).to(device)[0]
        X = torch.tensor([tokenizer.encode(text, max_length=MAX_LEN_Q, add_special_tokens=True,pad_to_max_length=True)]).to(device)[0].to(device)
        y_pred_e = projection_E(encoder(X[None,:]))
        y_pred_q = projection_Q(encoder(X[None,:]))
        y_pred_p = projection_P(encoder(X[None,:]))

        embeddings_tensor_E = torch.FloatTensor(first_hop_graph_E)
        embeddings_tensor_Q = torch.FloatTensor(second_hop_graph_Q)
        embeddings_tensor_P = torch.FloatTensor(second_hop_graph_P)
        
        
        cosines_descr_E = torch.cosine_similarity(embeddings_tensor_E.cpu(),y_pred_e.cpu())
        cosines_descr_E = nn.Softmax()(cosines_descr_E)
        
        cosines_descr_Q = torch.cosine_similarity(embeddings_tensor_Q.cpu(),y_pred_q.cpu())
        cosines_descr_Q = nn.Softmax()(cosines_descr_Q)

        cosines_descr_P = torch.cosine_similarity(embeddings_tensor_P.cpu(),y_pred_p.cpu())
        cosines_descr_P = nn.Softmax()(cosines_descr_P)
        
        cosines_aggr = cosines_descr_P * cosines_descr_Q * cosines_descr_E
        inds = torch.topk(cosines_aggr,topk,sorted=True).indices.cpu().numpy()
        return np.array(second_hop_ids_filtered_Q)[inds], cosines_aggr[inds]
    

def mp_get_second_hop_entities_by_idd(idd,d):
    client = wikidata.client.Client()
    entity = client.get(idd, load = True)
    second_hop_qp = []
    for x in tqdm(list(entity)): # Iterate over properties
        prop = client.get(x.id, load = True)
        try:
            if type(prop) is wikidata.entity.Entity and type(entity[prop]) is wikidata.entity.Entity:
                second_hop_qp.append((str(entity[prop].id),str(prop.id)))
        except:
            pass
    d[idd] = second_hop_qp


### Test

In [20]:
#multiprocessing
import multiprocessing
from multiprocessing import Manager, Process

MAX_PRESEARCH = 7
q_list = []
a_list = []
a_predicts = []
inv_ranks = []
top1_scores = []


for q, a in zip(questions_test, answers_test):
    print("question: ", q)

    nouns = get_nouns(str(q))
    nouns = list(set(nouns))
    print("nouns: ", nouns)

    ids_q = []
    for noun in nouns:
        ids_q += wdi_core.WDItemEngine.get_wd_search_results(noun)[0:MAX_PRESEARCH]
    ids_q = list(set(ids_q))
    if len(ids_q) != 0:
        second_hop_ids_QP = Manager().dict()
        processes = []
        for idd_q in ids_q:
            processes.append(Process(target=mp_get_second_hop_entities_by_idd, args=(idd_q,second_hop_ids_QP)))
        print(f"# PROCESSES: {len(processes)}")
        for p in processes:
            p.start()
        for p in processes:
            p.join()
    
    if len(second_hop_ids_QP) > 0:
        first_hop_graph_E = []
        second_hop_graph_Q = []
        second_hop_ids_filtered_Q = []
        second_hop_graph_P = []
        second_hop_ids_filtered_P = []
        for key in second_hop_ids_QP.keys():
            for (idd_q, idd_p) in second_hop_ids_QP[key]:
                if idd_q in graph_embeddings_Q and idd_p in graph_embeddings_P and key in graph_embeddings_Q:
                    first_hop_graph_E.append(graph_embeddings_Q[key])
                    second_hop_ids_filtered_Q.append(idd_q)
                    second_hop_graph_Q.append(graph_embeddings_Q[idd_q])
                    second_hop_ids_filtered_P.append(idd_p)
                    second_hop_graph_P.append(graph_embeddings_P[idd_p])

        if len(first_hop_graph_E) > 0:
            predicts, scores = get_top_ids_second_hop(q, first_hop_graph_E, second_hop_graph_Q, second_hop_graph_P, second_hop_ids_filtered_Q, second_hop_ids_filtered_P, len(second_hop_graph_Q))
            top1_scores.append(torch.max(scores).item())
        else:
            predicts = []
            top1_scores.append(-1)
    else:
        predicts = []
        top1_scores.append(-1)
    
    
        
        
    print("Predicts: ",predicts)
    a_predicts.append(predicts)
    
    clear_output(wait=True)
    
    inv_rs = []
    if len(predicts) > 0:
        for a_i in a:
            if a_i not in predicts:
                inv_r = 0
            else:
                inv_r = 1 / (list(predicts).index(a_i) + 1)
            inv_rs.append(inv_r)
        inv_ranks.append(max(inv_rs))
    else:
        inv_ranks.append(0)

    print()
    print("#$"*30)
    top1 = np.array(inv_ranks)[np.array(inv_ranks) == 1]
    print("Processed questions:", len(inv_ranks))
    print("Accuracy: ", len(top1) / len(inv_ranks))
    print("MEAN MRR: ", np.mean(inv_ranks))
    print("inv_ranks",[round(r,4) for r in inv_ranks])
    print("#$"*30)
    print()



#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$
Processed questions: 3
Accuracy:  0.6666666666666666
MEAN MRR:  0.6666666666666666
inv_ranks [0, 1.0, 1.0]
#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$#$

question:  What is the name of the capital of Romania?
nouns:  ['capital of', 'the name', 'name of', 'is the', 'What is', 'of Romania', 'of the', 'the capital']


KeyboardInterrupt: 

# Simple questions

In [None]:
simple_questions_test = np.load("/simple_questions_test.npy")

In [None]:
simple_questions_filtered = []
questions_sq = []
answers_sq = []
print(len(simple_questions))
for e, p, a, q in tqdm(simple_questions_test):
    if e in graph_embeddings_Q and a in graph_embeddings_Q and p in graph_embeddings_P:
        simple_questions_filtered.append((e, p, a, q))
        questions_sq.append(q)
        answers_sq.append(a)

### Entity candidate selection functions

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm") 

def get_nouns(text):
    text = text.replace('?', '')
    
    doc = nlp(text)
    import spacy

    lemmas = []
    for token in doc:
        lemmas.append(str(token.lemma_))
    lemmatized_text = " ".join(lemmas)
        
    ents = [str(ent) for ent in doc.ents]
    if '"' in text:
        text3 = text[text.find('"')+1:]
        text3 = text3[0:text3.find('"')]
        ents += [str(text3)]
        
    if '«' in text:
        text4 = text[text.find('«')+1:]
        text4 = text4[0:text4.find('»')]
        ents += [str(text4)]
        

    doc = nlp(text)
    ents = [token.lemma_ for token in doc if token.pos_ == "NOUN" or token.pos_ == "PROPN"]

    bigrams = [" ".join(b) for b in zip(text.split(" ")[:-1], text.split(" ")[1:])]
    ents += bigrams
    bigrams = [" ".join(b) for b in zip(lemmatized_text.split(" ")[:-1], lemmatized_text.split(" ")[1:])]
    ents += bigrams

    bigrams = [" ".join(b) for b in zip(text.split(" ")[:-2], text.split(" ")[1:], text.split(" ")[2:])]
    ents += bigrams
    bigrams = [" ".join(b) for b in zip(lemmatized_text.split(" ")[:-2], lemmatized_text.split(" ")[1:], lemmatized_text.split(" ")[2:])]
    ents += bigrams
        
    nouns_set = set(ents)
    if "" in nouns_set:
        nouns_set.remove("")
    return list(nouns_set)


def get_top_ids_second_hop(text, first_hop_graph_E, second_hop_graph_Q, second_hop_graph_P, second_hop_ids_filtered_Q, second_hop_ids_filtered_P, topk):
        projection_E.eval()
        projection_P.eval()
        projection_Q.eval()
        bert_tokenize = lambda text: torch.tensor([tokenizer.encode(text, max_length=MAX_LEN_Q, add_special_tokens=True,pad_to_max_length=True)]).to(device)[0]
        X = torch.tensor([tokenizer.encode(text, max_length=MAX_LEN_Q, add_special_tokens=True,pad_to_max_length=True)]).to(device)[0].to(device)
        y_pred_e = projection_E(encoder(X[None,:]))
        y_pred_q = projection_Q(encoder(X[None,:]))
        y_pred_p = projection_P(encoder(X[None,:]))

        embeddings_tensor_E = torch.FloatTensor(first_hop_graph_E)
        embeddings_tensor_Q = torch.FloatTensor(second_hop_graph_Q)
        embeddings_tensor_P = torch.FloatTensor(second_hop_graph_P)
        
        cosines_descr_E = torch.cosine_similarity(embeddings_tensor_E.cpu(),y_pred_e.cpu())
        cosines_descr_E = nn.Softmax()(cosines_descr_E)
        
        cosines_descr_Q = torch.cosine_similarity(embeddings_tensor_Q.cpu(),y_pred_q.cpu())
        cosines_descr_Q = nn.Softmax()(cosines_descr_Q)

        cosines_descr_P = torch.cosine_similarity(embeddings_tensor_P.cpu(),y_pred_p.cpu())
        cosines_descr_P = nn.Softmax()(cosines_descr_P)

        cosines_aggr = cosines_descr_P * cosines_descr_Q * cosines_descr_E
        inds = torch.topk(cosines_aggr,topk,sorted=True).indices.cpu().numpy()
        return np.array(second_hop_ids_filtered_Q)[inds], cosines_aggr[inds]
    

def mp_get_second_hop_entities_by_idd(idd,d):
    client = wikidata.client.Client()
    entity = client.get(idd, load = True)
    second_hop_qp = []
    for x in tqdm(list(entity)): # Iterate over properties
        prop = client.get(x.id, load = True)
        try:
            if type(prop) is wikidata.entity.Entity and type(entity[prop]) is wikidata.entity.Entity:
                second_hop_qp.append((str(entity[prop].id),str(prop.id)))
        except:
            pass
    d[idd] = second_hop_qp

            
            


### Test

In [None]:
#multiprocessing
import multiprocessing
from multiprocessing import Manager, Process


MAX_PRESEARCH = 3
q_list = []
a_list = []
a_predicts = []
inv_ranks = []
top1_scores = []

i = 0

for q, a in zip(questions_sq, answers_sq):
    print("question: ", q)
    a = [a]

    nouns = get_nouns(str(q))
    nouns = list(set(nouns))
    print("nouns: ", nouns)

    ids_q = []
    for noun in nouns:
        ids_q += wdi_core.WDItemEngine.get_wd_search_results(noun)[0:MAX_PRESEARCH]
    ids_q = list(set(ids_q))
    if len(ids_q) != 0:
        second_hop_ids_QP = Manager().dict()
        processes = []
        for idd_q in ids_q:
            processes.append(Process(target=mp_get_second_hop_entities_by_idd, args=(idd_q,second_hop_ids_QP)))
        print(f"# PROCESSES: {len(processes)}")
        for p in processes:
            p.start()
        for p in processes:
            p.join()

    
    if len(second_hop_ids_QP) > 0:
        first_hop_graph_E = []
        second_hop_graph_Q = []
        second_hop_ids_filtered_Q = []
        second_hop_graph_P = []
        second_hop_ids_filtered_P = []
        for key in second_hop_ids_QP.keys():
            for (idd_q, idd_p) in second_hop_ids_QP[key]:
                if idd_q in graph_embeddings_Q and idd_p in graph_embeddings_P and key in graph_embeddings_Q:
                    first_hop_graph_E.append(graph_embeddings_Q[key])
                    second_hop_ids_filtered_Q.append(idd_q)
                    second_hop_graph_Q.append(graph_embeddings_Q[idd_q])
                    second_hop_ids_filtered_P.append(idd_p)
                    second_hop_graph_P.append(graph_embeddings_P[idd_p])

        if len(first_hop_graph_E) > 0:
            predicts, scores = get_top_ids_second_hop(q, first_hop_graph_E, second_hop_graph_Q, second_hop_graph_P, second_hop_ids_filtered_Q, second_hop_ids_filtered_P, len(second_hop_graph_Q))
            top1_scores.append(torch.max(scores).item())
        else:
            predicts = []
            top1_scores.append(-1)
    else:
        predicts = []
        top1_scores.append(-1)
    
    
        
        
    print("Predicts: ",predicts)
    a_predicts.append(predicts)
    
    clear_output(wait=True)
    
    inv_rs = []
    if len(predicts) > 0:
        for a_i in a:
            if a_i not in predicts:
                inv_r = 0
            else:
                inv_r = 1 / (list(predicts).index(a_i) + 1)
            inv_rs.append(inv_r)
        inv_ranks.append(max(inv_rs))
    else:
        inv_ranks.append(0)

    print()
    print("#$"*30)
    top1 = np.array(inv_ranks)[np.array(inv_ranks) == 1]
    print("Processed questions:", len(inv_ranks))
    print(f"found: {sum(np.array(inv_ranks)>0)}")
    print("Accuracy: ", len(top1) / sum(np.array(inv_ranks)>0))
    print("Absolute Accuracy: ", len(top1) / len(inv_ranks))
    print("MEAN MRR: ", np.mean(inv_ranks))
    print("inv_ranks",[round(r,4) for r in inv_ranks])
    print("#$"*30)
    print()

