In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from numpy.random import randint
from tqdm.notebook import tqdm

with open('NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl', 'rb') as f:
    our_approach_df = pd.read_pickle(f)

with open('Mikolov_to_phiyodr_bert-base-finetuned-squad2.pkl', 'rb') as f:
    mik_approach_df = pd.read_pickle(f)

In [2]:
model = AutoModelForQuestionAnswering.from_pretrained('phiyodr/bert-base-finetuned-squad2')
model_embeddings = model.get_input_embeddings()
tokenizer = AutoTokenizer.from_pretrained('phiyodr/bert-base-finetuned-squad2')
print('Model and tokenizer loaded...')

Model and tokenizer loaded...


In [16]:
our_sim_scores = []
mik_sim_scores = []

cos = torch.nn.CosineSimilarity()
model_elements_subset = {}

for idx in randint(0, tokenizer.__len__(), size=1000):
    model_elements_subset[tokenizer._convert_id_to_token(idx)] = model_embeddings(torch.LongTensor([idx]))

In [17]:
def compute_sim_stats(df, topk=20):
    sim_scores = {}
    for row in tqdm(df.itertuples(index=False)):
        for mod_el in model_elements_subset.items():
            sim_scores[(row.Entity, mod_el[0])] = cos(row.UMLS_Embedding.detach(), mod_el[1].detach())
    scores = torch.FloatTensor(list(sim_scores.values()))
    print(f'Mean Sim: {torch.mean(scores)}')
    print(f'Std. Dev. Sim: {torch.std(scores)}')
    print(f'...\nTop {topk} similar pairs (Entity, Model)')
    print(sorted(sim_scores.items(), key=lambda x: x[1], reverse=True)[0:topk])

compute_sim_stats(our_approach_df)
compute_sim_stats(mik_approach_df)

0it [00:00, ?it/s]

Mean Sim: 0.04276169463992119
Std. Dev. Sim: 0.030586203560233116
...
Top 20 similar pairs (Entity, Model)
[(('Lipopolysaccharides', 'solemnly'), tensor([0.1666])), (('Released (action)', '##வ'), tensor([0.1649])), (('Nervous system structure', 'solemnly'), tensor([0.1615])), (('Released (action)', 'considerations'), tensor([0.1615])), (('Esthesia', 'solemnly'), tensor([0.1613])), (('Specimen', 'fantasies'), tensor([0.1611])), (('Reproduction', 'solemnly'), tensor([0.1609])), (('described', '##வ'), tensor([0.1592])), (('CDKN2A gene', 'solemnly'), tensor([0.1591])), (('Staff Member', 'accompanying'), tensor([0.1585])), (('Specimen', 'solemnly'), tensor([0.1574])), (('Public Health Surveillance', 'solemnly'), tensor([0.1573])), (('entry - cluster', 'aforementioned'), tensor([0.1572])), (('Open Reading Frames', 'solemnly'), tensor([0.1568])), (('Released (action)', 'fantasies'), tensor([0.1568])), (('Cell Nucleolus', 'solemnly'), tensor([0.1565])), (('cytokine secretion', 'solemnly'), ten

0it [00:00, ?it/s]

Mean Sim: 0.3580394685268402
Std. Dev. Sim: 0.34967029094696045
...
Top 20 similar pairs (Entity, Model)
[(('141', '141'), tensor([0.9869])), (('415', '415'), tensor([0.9686])), (('305', '337'), tensor([0.9137])), (('85', '337'), tensor([0.9135])), (('350', '337'), tensor([0.9121])), (('218', '337'), tensor([0.9116])), (('65', '337'), tensor([0.9113])), (('333', '337'), tensor([0.9113])), (('900', '337'), tensor([0.9112])), (('670', '337'), tensor([0.9109])), (('140', '337'), tensor([0.9105])), (('900', '820'), tensor([0.9098])), (('85', '820'), tensor([0.9098])), (('4000', '337'), tensor([0.9095])), (('275', '337'), tensor([0.9090])), (('320', '337'), tensor([0.9088])), (('200', '337'), tensor([0.9087])), (('3000', '820'), tensor([0.9086])), (('325', '337'), tensor([0.9080])), (('400', '337'), tensor([0.9079]))]
