In [1]:
import os
import torch 
torch.set_default_device('mps')
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, AutoModel

# NOTE(review): hard-coded local proxy; adjust or remove this line when running
# on a machine that does not have a proxy listening on 127.0.0.1:7890.
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

# Checkpoint used for every embedding in this notebook.
# ('distilbert-base-uncased' was previously assigned here and immediately
# overwritten; swap the string below to try it instead.)
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [2]:
import torch.nn.functional as F

@torch.inference_mode()
def get_word_embedding(sentence: str, word: str, model, tokenizer):
    """Return the contextual embedding of ``word`` as it appears in ``sentence``.

    The sentence is encoded with special tokens and passed through ``model``;
    the word is encoded without special tokens and located as a contiguous
    run of token ids inside the sentence. The result is the mean of the
    last-layer hidden states over the first such run.

    Args:
        sentence: Text that contains ``word``.
        word: Word (or short phrase) to look up; may span several tokens.
        model: Callable taking a ``(1, seq_len)`` tensor of token ids and
            returning a mapping with a ``'last_hidden_state'`` entry.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            return_tensors='pt')``.

    Returns:
        A 1-D tensor: the hidden states of the matched tokens averaged over
        the token axis.

    Raises:
        ValueError: If ``word`` tokenizes to nothing, or its token ids do not
            occur contiguously in the tokenized sentence.
    """
    s_tokens = tokenizer.encode(sentence, add_special_tokens=True, return_tensors='pt')[0]
    w_tokens = tokenizer.encode(word, add_special_tokens=False, return_tensors='pt')[0]

    # Search on plain Python lists: cheap, device-agnostic, and avoids one
    # tensor comparison (and device sync) per candidate position.
    s_ids = s_tokens.tolist()
    w_ids = w_tokens.tolist()
    span = len(w_ids)
    if span == 0:
        # An empty pattern would "match" everywhere and average an empty slice.
        raise ValueError(f"word {word!r} produced no tokens")

    # Only positions where a full window still fits can match.
    idx = next(
        (i for i in range(len(s_ids) - span + 1) if s_ids[i:i + span] == w_ids),
        -1,
    )
    if idx == -1:
        # Explicit raise instead of `assert`, which is stripped under `python -O`
        # and would let a bogus slice be averaged silently.
        raise ValueError(f"word {word!r} not found in tokenized sentence {sentence!r}")

    # Run the model only once we know the word is present.
    outputs = model(s_tokens.view(1, -1))
    hidden_states = outputs['last_hidden_state'][0]
    assert len(hidden_states) == len(s_tokens)

    return torch.mean(hidden_states[idx:idx + span], dim=0)


In [3]:
def compare_similarity(s1: str, s2: str, word: str):
    """Cosine similarity between the embeddings of ``word`` in two sentences.

    Each embedding comes from ``get_word_embedding`` using the notebook-level
    ``model`` and ``tokenizer``; the two vectors are compared along dim 0.
    """
    first_vec = get_word_embedding(s1, word, model, tokenizer)
    second_vec = get_word_embedding(s2, word, model, tokenizer)
    return F.cosine_similarity(first_vec, second_vec, dim=0)

In [4]:
@torch.inference_mode()
def get_sentence_embedding(text, pooling: str = "cls"):
    """Embed one or more texts with the notebook-level ``model``/``tokenizer``.

    Args:
        text: Input passed straight to ``tokenizer`` (with padding and
            truncation enabled).
        pooling: How token states are reduced to one vector per input.
            ``"cls"`` (default, the original behaviour) takes the hidden state
            at position 0. ``"mean"`` averages the hidden states over the
            positions marked by the attention mask, so padding is ignored;
            this is the pooling described on the all-MiniLM-L6-v2 model card.

    Returns:
        The pooled last-layer hidden states, one row per input.

    Raises:
        ValueError: If ``pooling`` is not ``"cls"`` or ``"mean"``.
    """
    # Validate before doing any model work.
    if pooling not in ("cls", "mean"):
        raise ValueError(f"unknown pooling {pooling!r}; expected 'cls' or 'mean'")

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    hidden = outputs["last_hidden_state"]

    if pooling == "cls":
        return hidden[:, 0, :]

    # Masked mean: zero out padded positions, then divide by the real token count.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

In [5]:
# Compare the contextual embedding of "king" across two different sentences.
compare_similarity("The king and the queen are happy.", "The angry and unhappy king", "king")

tensor(0.8779, device='mps:0')

In [6]:
# Compare the contextual embedding of "bank" in two full sentences.
compare_similarity("The bank wired her the money","We live next door to the bank", "bank")

tensor(0.9222, device='mps:0')

In [7]:
# Same surface word in two different senses (river bank vs. savings bank).
compare_similarity("the river bank", "the savings bank", "bank")

tensor(0.7967, device='mps:0')

In [9]:
from datasets import load_dataset

In [10]:
# Load the "quora" dataset and keep only its 'train' split.
# NOTE(review): the fork warning printed below suggests setting the
# TOKENIZERS_PARALLELISM environment variable before the tokenizer is first used.
quora = load_dataset("quora")['train']

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading data:   0%|          | 0.00/35.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [12]:
# Preview the first rows as a DataFrame (columns: questions, is_duplicate).
quora.to_pandas().head()

Unnamed: 0,questions,is_duplicate
0,"{'id': [1, 2], 'text': ['What is the step by s...",False
1,"{'id': [3, 4], 'text': ['What is the story of ...",False
2,"{'id': [5, 6], 'text': ['How can I increase th...",False
3,"{'id': [7, 8], 'text': ['Why am I mentally ver...",False
4,"{'id': [9, 10], 'text': ['Which one dissolve i...",False


In [17]:
# Texts of the first question pair in the dataset.
quora['questions'][0]['text']

['What is the step by step guide to invest in share market in india?',
 'What is the step by step guide to invest in share market?']

In [18]:
from sentence_transformers import util

In [None]:
# NOTE(review): bare reference, never executed (no call, no arguments) —
# presumably a placeholder for the next step; confirm intent.
util.pytorch_cos_sim