In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from numpy.random import randint
from tqdm.notebook import tqdm
import re

# Load the two pickled DataFrames that are compared later in the notebook:
# one produced by "our" approach (NN-DTE) and one by the Mikolov approach.
# NOTE(review): unpickling executes arbitrary code, so only load files from a
# trusted source.
with open('NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl', 'rb') as our_pickle:
    our_approach_df = pd.read_pickle(our_pickle)

with open('Mikolov_to_phiyodr_bert-base-finetuned-squad2.pkl', 'rb') as mik_pickle:
    mik_approach_df = pd.read_pickle(mik_pickle)

print('Necessary Files loaded...')

# Both the tokenizer and the QA model come from the same checkpoint; only the
# model's input-embedding layer is needed for the similarity analysis.
checkpoint_name = 'phiyodr/bert-base-finetuned-squad2'
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_name)
model_embeddings = model.get_input_embeddings()
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

print('Model and tokenizer loaded...')

Necessary Files loaded...
Model and tokenizer loaded...


In [2]:
# Score accumulators for the two approaches. They are not filled in this cell
# but are kept so that other cells referencing them continue to work.
our_sim_scores = []
mik_sim_scores = []

# Cosine similarity along dim=1 (the default), used by the analysis cells.
cos = torch.nn.CosineSimilarity()
model_elements_subset = {}

# Tokens containing digits are excluded because numbers do not tell us
# anything useful about similarity. The eligible vocabulary is enumerated up
# front so we know exactly how many tokens can be selected.
eligible_tokens = []
for idx in range(len(tokenizer)):
    token = tokenizer.convert_ids_to_tokens(idx)
    if token is not None and not re.search(r'\d', token):
        eligible_tokens.append((idx, token))

# The previous version looped until `count` equalled the full vocabulary size,
# which can never happen once digit tokens are rejected (infinite loop).
# Cap the target at the number of tokens that can actually be selected.
number_of_elements_to_select = min(len(tokenizer), len(eligible_tokens))

# Sample without replacement via a random permutation instead of rejection
# sampling, so every draw yields a new token.
selected_positions = torch.randperm(len(eligible_tokens))[:number_of_elements_to_select].tolist()
for position in selected_positions:
    idx, token = eligible_tokens[position]
    if token not in model_elements_subset:
        # Each stored value is the embedding lookup for a single token id.
        model_elements_subset[token] = model_embeddings(torch.LongTensor([idx]))

# Number of tokens actually stored.
count = len(model_elements_subset)

In [4]:
def compute_sim_stats(df, topk=20):
    """Print cosine-similarity statistics between entities and model tokens.

    Every row of ``df`` is scored against every embedding stored in the
    notebook-level ``model_elements_subset`` dict, using the notebook-level
    ``cos`` module. The mean, the standard deviation and the ``topk``
    highest-scoring ``(Entity, token)`` pairs are printed.

    Args:
        df: DataFrame with an ``Entity`` column and a ``UMLS_Embedding``
            column holding one tensor per row. Assumes each tensor is a
            single embedding vector with the same dimension as the model
            embeddings -- TODO confirm against the pickled data.
        topk: Number of highest-scoring pairs to print.

    Returns:
        None. Results are printed only.
    """
    sim_scores = {}
    token_names = list(model_elements_subset.keys())
    if token_names:
        # The model embeddings are the same for every row, so detach and stack
        # them once into a (num_tokens, dim) matrix instead of per pair.
        model_matrix = torch.cat(
            [model_elements_subset[name].detach().reshape(1, -1) for name in token_names],
            dim=0,
        )
        # itertuples() has no length, so pass total= to get a real progress bar.
        for row in tqdm(df.itertuples(index=False), total=len(df)):
            entity_vector = row.UMLS_Embedding.detach().reshape(1, -1)
            # One broadcast call scores this entity against all tokens at once.
            row_scores = cos(entity_vector, model_matrix).tolist()
            for name, score in zip(token_names, row_scores):
                # Same keying as before: a repeated (Entity, token) pair
                # overwrites the earlier score.
                sim_scores[(row.Entity, name)] = score
    scores = torch.FloatTensor(list(sim_scores.values()))
    print(f'Mean Sim: {torch.mean(scores)}')
    print(f'Std. Dev. Sim: {torch.std(scores)}')
    print(f'...\nTop {topk} similar pairs (Entity, Model)')
    print(sorted(sim_scores.items(), key=lambda x: x[1], reverse=True)[0:topk])

# Report similarity statistics for our approach first, then for the Mikolov
# approach, so the two printed summaries can be compared side by side.
for approach_df in (our_approach_df, mik_approach_df):
    compute_sim_stats(approach_df)

0it [00:00, ?it/s]

Mean Sim: 0.042095281183719635
Std. Dev. Sim: 0.030935749411582947
...
Top 20 similar pairs (Entity, Model)
[(('Lipopolysaccharides', 'weathered'), 0.17903827130794525), (('Proteins', 'weathered'), 0.1770404428243637), (('Specimen', 'forbade'), 0.17694877088069916), (('artesunate', 'weathered'), 0.1739923655986786), (('described', '##ss'), 0.17359139025211334), (('Calpain', 'reforms'), 0.17279590666294098), (('described', 'worries'), 0.17182792723178864), (('entry - cluster', 'filters'), 0.1717856526374817), (('Elements', '##th'), 0.17068587243556976), (('paired basic amino acid cleaving enzyme', 'weathered'), 0.1702965497970581), (('Specimen', 'charlton'), 0.16923679411411285), (('Fowls, Domestic', 'reforms'), 0.1685318797826767), (('TNFSF10 protein, human', '##tting'), 0.16850689053535461), (('described', 'forbade'), 0.16848257184028625), (('Peptides', 'weathered'), 0.1684201955795288), (('Biological Processes', 'weathered'), 0.16840548813343048), (('Specimen', '##tting'), 0.16827613

0it [00:00, ?it/s]

Mean Sim: 0.3559246063232422
Std. Dev. Sim: 0.3457118272781372
...
Top 20 similar pairs (Entity, Model)
[(('customary', 'customary'), 0.915919303894043), (('137', 'ᄑ'), 0.8276524543762207), (('boulevard', 'ث'), 0.8276291489601135), (('volcano', 'ᄑ'), 0.8275499939918518), (('statute', 'ᄑ'), 0.8264901041984558), (('volcano', 'ث'), 0.826195478439331), (('137', 'ث'), 0.8257742524147034), (('statute', 'ث'), 0.8237239122390747), (('boulevard', 'ᄑ'), 0.8236469626426697), (('190', 'ᄑ'), 0.8234977722167969), (('cologne', 'ث'), 0.822898268699646), (('68', 'ث'), 0.8225765228271484), (('bonus', 'ᄑ'), 0.8223711252212524), (('190', 'ث'), 0.8221819400787354), (('indigo', 'ᄑ'), 0.8220299482345581), (('payment', 'ث'), 0.8216512203216553), (('licking', 'ᄑ'), 0.821648120880127), (('payment', 'ᄑ'), 0.8211519718170166), (('conductor', 'ᄑ'), 0.8203153610229492), (('indigo', 'ث'), 0.8201801180839539)]
