In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the two pickled entity-embedding tables that are compared below.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# pandas opens the path itself, so no explicit file handle is needed.
our_approach_df = pd.read_pickle('NN-DTE-to-phiyodr-bert-base-finetuned-squad2.pkl')
mik_approach_df = pd.read_pickle('Mikolov_to_phiyodr_bert-base-finetuned-squad2.pkl')

In [2]:
# Collect the distinct entities each table covers ...
our_entities = set(our_approach_df["Entity"].to_list())
mik_entities = set(mik_approach_df["Entity"].to_list())

# ... and keep only those present in both, so the comparison is like-for-like.
common_entities = list(our_entities & mik_entities)

print(f"Number of Common entities b/w us & mik: {len(common_entities)}")

Number of Common entities b/w us & mik: 23


In [3]:
# Token -> vocabulary-index mapping of the reference checkpoint's tokenizer.
model_vocab = AutoTokenizer.from_pretrained(
    'phiyodr/bert-base-finetuned-squad2'
).get_vocab()

# The fine-tuned QA model and the input-embedding module it exposes; the
# module is called with token indices further down to fetch reference vectors.
model = AutoModelForQuestionAnswering.from_pretrained(
    'phiyodr/bert-base-finetuned-squad2'
)
model_embeddings = model.get_input_embeddings()

In [4]:
import torch
import numpy as np

# For every entity shared by both tables, compare each approach's embedding
# with the QA model's own input embedding for that entity's vocabulary token.
our_sim_scores = []
mik_sim_scores = []

# dim=1 is the module's default; it is spelled out because the model lookup
# below is done on a single-index batch (presumably shape (1, hidden_size)).
cos = torch.nn.CosineSimilarity(dim=1)

# This is evaluation only. Model embedding weights are normally trainable
# parameters, so without no_grad() each similarity would carry an autograd
# graph that is never used (and the final tensor conversion would be fed
# grad-requiring tensors).
with torch.no_grad():
    for term in common_entities:
        # A KeyError here means the entity is not a token in the model vocab.
        model_emb = model_embeddings(torch.LongTensor([model_vocab[term]]))

        # First matching row per entity. as_tensor leaves an already-matching
        # tensor untouched and also accepts array-like stored embeddings.
        our_emb = torch.as_tensor(
            our_approach_df.query("Entity==@term").UMLS_Embedding.to_numpy()[0],
            dtype=model_emb.dtype,
        )
        mik_emb = torch.as_tensor(
            mik_approach_df.query("Entity==@term").UMLS_Embedding.to_numpy()[0],
            dtype=model_emb.dtype,
        )

        our_sim_scores.append(cos(our_emb, model_emb))
        mik_sim_scores.append(cos(mik_emb, model_emb))

# Pack the per-entity scores into float32 tensors for the summary statistics.
our_sim_scores = torch.FloatTensor(our_sim_scores)
mik_sim_scores = torch.FloatTensor(mik_sim_scores)

In [5]:
# Report mean and standard deviation of the similarity scores per approach.
for header, scores in (
    ('Our Embeddings...', our_sim_scores),
    ('Mik Embeddings...', mik_sim_scores),
):
    print(header)
    print(f'Mean Sim: {scores.mean()}')
    print(f'Std. Dev. Sim: {scores.std()}')

Our Embeddings...
Mean Sim: 0.04175913706421852
Std. Dev. Sim: 0.03247544914484024
Mik Embeddings...
Mean Sim: 0.4302298128604889
Std. Dev. Sim: 0.2847978174686432
