# Quick demo to try to find similar words
This could be a (hackish?) way to highlight results from ML/vector/semantic searches.


In [1]:
from transformers import BertModel, BertTokenizer
import torch
from scipy.spatial.distance import cosine

In [2]:
# Load tokenizer and model.
# Both must come from the same checkpoint so the token ids produced by the
# tokenizer match the model's vocabulary; keep the name in one place.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [3]:
# Function to get word embeddings
def get_word_embeddings(sentence):
    """Encode ``sentence`` with the module-level tokenizer and model.

    Args:
        sentence: Text to encode; passed straight to the tokenizer call.

    Returns:
        A ``(embeddings, input_ids)`` tuple: the model's ``last_hidden_state``
        and the tokenizer's ``input_ids`` for the same input.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    # Inference only: disable autograd so no computation graph is recorded
    # for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the last hidden state
    return outputs.last_hidden_state, inputs['input_ids']

In [4]:
# Function to find similar words
def find_similar_words(sentence1, sentence2):
    """Find, for each token of ``sentence1``, its closest token in ``sentence2``.

    Both sentences are embedded with ``get_word_embeddings`` and every token
    vector of the first is compared to every token vector of the second using
    cosine similarity.

    Args:
        sentence1: Sentence whose tokens become the keys of the result.
        sentence2: Sentence searched for the best-matching token.

    Returns:
        A dict mapping each decoded token of ``sentence1`` to a
        ``(token2, similarity)`` tuple holding the best-scoring token of
        ``sentence2``. The dict is keyed by token text, so a token that occurs
        several times in ``sentence1`` shares a single entry holding the best
        match found over all of its occurrences.
    """
    embeddings1, ids1 = get_word_embeddings(sentence1)
    embeddings2, ids2 = get_word_embeddings(sentence2)

    # Map ids back to tokens (every id the tokenizer produced is kept).
    tokens1 = [tokenizer.decode([token_id]) for token_id in ids1[0]]
    tokens2 = [tokenizer.decode([token_id]) for token_id in ids2[0]]

    # Convert the first (only) batch entry to numpy once, instead of
    # re-converting the same rows for every token pair.
    vectors1 = embeddings1[0].detach().numpy()
    vectors2 = embeddings2[0].detach().numpy()

    # Calculate similarities
    similarities = {}
    for token1, vector1 in zip(tokens1, vectors1):
        for token2, vector2 in zip(tokens2, vectors2):
            # scipy's ``cosine`` is a distance; similarity is its complement.
            sim = 1 - cosine(vector1, vector2)
            # Strict comparison: on ties the first match found is kept.
            if token1 not in similarities or similarities[token1][1] < sim:
                similarities[token1] = (token2, sim)

    return similarities

In [5]:
# Demo inputs: the two sentences whose tokens are compared with each other.
sentence1, sentence2 = (
    "The boy smiled at his teacher.",
    "The child was eating ice cream.",
)

In [6]:
# For every token of the first sentence, look up its best match in the
# second sentence and show the resulting mapping.
similar_words = find_similar_words(sentence1=sentence1, sentence2=sentence2)
print(similar_words)

{'[CLS]': ('[CLS]', 0.8606757052455114), 'the': ('the', 0.73774177663601), 'boy': ('child', 0.6595289324990087), 'smiled': ('was', 0.4766085492943497), 'at': ('eating', 0.407170474189551), 'his': ('the', 0.4130035502665621), 'teacher': ('child', 0.49251745015367443), '.': ('[SEP]', 0.9145055804767722), '[SEP]': ('[SEP]', 0.8814502295370953)}
