In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from numpy.random import randint
from tqdm.notebook import tqdm
import re

# The full-approach frame is left unloaded; re-enable the two lines below
# before running any cell that references `full_approach_df`.
#with open('NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl', 'rb') as f:
#    full_approach_df = pd.read_pickle(f)

# NOTE(review): unpickling can execute code stored in the file -- only load
# pickles that come from a trusted source.
with open('UMLS_Only_NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl', 'rb') as umls_pickle:
    umls_only_approach_df = pd.read_pickle(umls_pickle)

with open('Mikolov_to_phiyodr_bert-base-finetuned-squad2.pkl', 'rb') as mik_pickle:
    mik_approach_df = pd.read_pickle(mik_pickle)

print('Necessary Files loaded...')

# One checkpoint name shared by the tokenizer and the model so they cannot drift apart.
qa_checkpoint = 'phiyodr/bert-base-finetuned-squad2'
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
# Input (token) embedding layer of the model; used later to look up token vectors by id.
model_embeddings = model.get_input_embeddings()

print('Model and tokenizer loaded...')

Necessary Files loaded...
Model and tokenizer loaded...


In [3]:
our_sim_scores = []
mik_sim_scores = []

# Cosine similarity module with torch's default arguments.
cos = torch.nn.CosineSimilarity()
# Maps a sampled token string to the embedding returned by `model_embeddings`
# for that token's id.
model_elements_subset = {}

count = 0
number_of_elements_to_select = 5000

# Random sample of distinct, digit-free vocabulary tokens.
# NOTE(review): no random seed is set, so the sample differs between runs.
vocab_size = len(tokenizer)
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
digit_pattern = re.compile(r'\d+')
# Ids already drawn; lets us detect that the vocabulary is exhausted instead of
# looping forever when fewer eligible tokens exist than were requested.
tried_ids = set()

while count != number_of_elements_to_select:
    if len(tried_ids) == vocab_size:
        raise RuntimeError(
            f'Only {count} eligible tokens found in a vocabulary of {vocab_size}; '
            f'cannot select {number_of_elements_to_select}.'
        )
    idx = int(randint(0, vocab_size))
    if idx in tried_ids:
        continue
    tried_ids.add(idx)
    token = tokenizer.convert_ids_to_tokens(idx)
    if token is None:
        # Id without a vocabulary entry; nothing to sample.
        continue
    #We are trying to exclude numbers from the computation cause that doesn't really tell us anything
    if (not digit_pattern.search(token)) and (token not in model_elements_subset):
        model_elements_subset[token] = model_embeddings(torch.LongTensor([idx]))
        count += 1

def compute_sim_stats(df, topk=20):
    """Print cosine-similarity statistics between entity and model-token embeddings.

    For every row of ``df`` and every token in the module-level
    ``model_elements_subset``, the similarity between the row's
    ``UMLS_Embedding`` and the token's embedding is computed with the
    module-level ``cos``. The mean, the standard deviation and the ``topk``
    highest-scoring ``(Entity, token)`` pairs are printed; nothing is returned.

    Args:
        df: DataFrame with an ``Entity`` column and a ``UMLS_Embedding`` column
            holding torch tensors (assumed to be shape-compatible with the
            model embeddings -- TODO confirm against the pickled frames).
        topk: Number of highest-scoring pairs to print.

    Note:
        Scores are keyed by ``(Entity, token)``, so rows that share an
        ``Entity`` value overwrite each other's scores.
    """
    # Detach the sampled model embeddings once instead of once per DataFrame row.
    model_vectors = [(token, emb.detach()) for token, emb in model_elements_subset.items()]

    sim_scores = {}
    # `total` lets the progress bar show real progress; itertuples() has no length.
    for row in tqdm(df.itertuples(index=False), total=len(df)):
        entity_vector = row.UMLS_Embedding.detach()
        for token, token_vector in model_vectors:
            sim_scores[(row.Entity, token)] = cos(entity_vector, token_vector).item()

    if not sim_scores:
        # Mean/std of an empty tensor are NaN; report the situation explicitly instead.
        print('No similarity scores to report (empty DataFrame or empty token subset).')
        return

    scores = torch.FloatTensor(list(sim_scores.values()))
    print(f'Mean Sim: {torch.mean(scores)}')
    print(f'Std. Dev. Sim: {torch.std(scores)}')
    print(f'...\nTop {topk} similar pairs (Entity, Model)')
    print(sorted(sim_scores.items(), key=lambda x: x[1], reverse=True)[0:topk])

In [4]:
# Similarity statistics for the frame loaded from
# 'Mikolov_to_phiyodr_bert-base-finetuned-squad2.pkl'.
compute_sim_stats(mik_approach_df)

0it [00:00, ?it/s]

Mean Sim: 0.3562013506889343
Std. Dev. Sim: 0.3457609713077545
...
Top 20 similar pairs (Entity, Model)
[(('adventure', 'adventure'), 0.9725835919380188), (('financial', 'financial'), 0.8610132932662964), (('volcano', 'ᄎ'), 0.8396756649017334), (('137', 'ᄅ'), 0.8387606739997864), (('137', 'ᄎ'), 0.8387260437011719), (('statute', 'ᄎ'), 0.8375660181045532), (('137', '##ذ'), 0.8375254273414612), (('licking', 'ᄎ'), 0.8374002575874329), (('137', 'त'), 0.8367186188697815), (('190', 'ᄅ'), 0.8358256220817566), (('190', '##ذ'), 0.8351693749427795), (('68', 'ᄅ'), 0.8335087895393372), (('statute', 'ᄅ'), 0.833187997341156), (('cologne', '##ذ'), 0.8331410884857178), (('172', 'ᄅ'), 0.8325009346008301), (('statute', '##ذ'), 0.8322805166244507), (('hydrogen', 'ᄎ'), 0.8321632146835327), (('cologne', 'ᄅ'), 0.831396222114563), (('cologne', 'ᄎ'), 0.8310816287994385), (('volcano', 'ᄅ'), 0.8309412002563477)]


In [None]:
# `full_approach_df` only exists when its (currently commented-out) load in the
# first cell is re-enabled; guard so Restart & Run All does not stop on a NameError.
if 'full_approach_df' in globals():
    compute_sim_stats(full_approach_df)
else:
    print('full_approach_df is not loaded; skipping.')

In [5]:
# Similarity statistics for the frame loaded from
# 'UMLS_Only_NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl'.
compute_sim_stats(umls_only_approach_df)

0it [00:00, ?it/s]

Mean Sim: 0.016827397048473358
Std. Dev. Sim: 0.026941044256091118
...
Top 20 similar pairs (Entity, Model)
[(('Maximum oxygen uptake', 'fireplace'), 0.10459917038679123), (('Coach (vehicle)', 'fireplace'), 0.10459746420383453), (('Imitation', 'fireplace'), 0.10459674894809723), (('Lebanese race', 'fireplace'), 0.10459446161985397), (('Surveys', 'fireplace'), 0.10459356009960175), (('Three months', 'fireplace'), 0.1045934334397316), (('plasma concentration', 'fireplace'), 0.10459332913160324), (('Severe (severity modifier)', 'fireplace'), 0.10459267348051071), (('BARRIER', 'fireplace'), 0.10459262877702713), (('Observation parameter', 'fireplace'), 0.104592464864254), (('Data', 'fireplace'), 0.10459242761135101), (('Alteration', 'fireplace'), 0.10459233075380325), (('Congenital MeSH qualifier', 'fireplace'), 0.10459231585264206), (('Site of', 'fireplace'), 0.10459229350090027), (('Classification', 'fireplace'), 0.10459228605031967), (('Numerical value', 'fireplace'), 0.1045922785997390

In [None]:
# No visible cell of this notebook defines `def_only_approach_df`; guard so
# Restart & Run All does not stop on a NameError. Load the frame first to enable this.
if 'def_only_approach_df' in globals():
    compute_sim_stats(def_only_approach_df)
else:
    print('def_only_approach_df is not loaded; skipping.')