In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

# --- Step 1: Load Pre-trained BERT Model and Tokenizer ---
# The tokenizer and the model must come from the same checkpoint so that the
# token IDs produced by one match the vocabulary of the other. Naming the
# checkpoint once keeps the two in sync if it is ever changed.
MODEL_NAME = 'bert-base-uncased'

print("Loading BERT model and tokenizer...")
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
# Inference only: explicitly select evaluation mode so dropout layers are inactive.
model.eval()
print("BERT model loaded successfully! ✅")

In [3]:
# --- Step 2: The two example sentences ---
# Each sentence uses the word "bank": once next to a river, once for money.
river_sentence = "I went to the river bank to fish."
money_sentence = "I need to go to the bank to withdraw money."

sentences = [river_sentence, money_sentence]

In [4]:
# --- Step 3: Tokenize the sentences and get BERT embeddings ---
# Tokenizer call, option by option:
#   padding=True         -> pad so both sentences end up the same length
#   truncation=True      -> cut off anything beyond the model's max length
#   return_tensors='pt'  -> hand back PyTorch tensors
# The tokenizer also adds the special tokens ([CLS], [SEP]) and maps tokens to IDs.
inputs = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Run the forward pass with gradient tracking switched off, and pull the
# embeddings out of `last_hidden_state`.
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

In [None]:
# --- Step 4: Extract the vector for the word "bank" from each sentence ---


def _token_embedding(sentence_idx, word):
    """Look up one token of one sentence and return its hidden-state vector.

    Args:
        sentence_idx: Row of `inputs['input_ids']` / `last_hidden_states` to use.
        word: Token string to search for in that row's token list.

    Returns:
        A tuple `(tokens, position, vector)`: the row's token strings, the
        index of the first occurrence of `word`, and
        `last_hidden_states[sentence_idx, position, :]`.

    Raises:
        ValueError: If `word` is not among the tokens (for example because the
            tokenizer split it into sub-word pieces). The message lists the
            tokens that were searched.
    """
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][sentence_idx])
    try:
        position = tokens.index(word)
    except ValueError:
        raise ValueError(
            f"Token {word!r} not found in sentence {sentence_idx}; tokens were {tokens}"
        ) from None
    return tokens, position, last_hidden_states[sentence_idx, position, :]


# For sentence 1
tokens_1, bank_index_1, bank_vec_1 = _token_embedding(0, 'bank')
print(f"\nSentence 1 Tokens: {tokens_1}")
print(f"Index of 'bank' in Sentence 1: {bank_index_1}")


# For sentence 2
tokens_2, bank_index_2, bank_vec_2 = _token_embedding(1, 'bank')
print(f"Sentence 2 Tokens: {tokens_2}")
print(f"Index of 'bank' in Sentence 2: {bank_index_2}")


# "river" from sentence 1 (the token list is already held in tokens_1, so only
# the position and the vector are kept here)
river_index, river_vec = _token_embedding(0, 'river')[1:]


In [6]:
# --- Step 5: Compare the vectors using Cosine Similarity ---


def _as_row(vector):
    """Turn a tensor into a NumPy array of shape (1, n).

    `detach()` and `cpu()` are applied before `numpy()` so the conversion also
    works when the tensor is attached to an autograd graph or lives on a
    non-CPU device; for a plain CPU tensor they change nothing. The reshape to
    a single row is needed because `cosine_similarity` is called with 2-D
    inputs below.
    """
    return vector.detach().cpu().numpy().reshape(1, -1)


bank_vec_1_reshaped = _as_row(bank_vec_1)
bank_vec_2_reshaped = _as_row(bank_vec_2)
river_vec_reshaped = _as_row(river_vec)

# `cosine_similarity` is indexed with [0][0] to pull the single score out of its result.

# Compare the two "bank" vectors
similarity_banks = cosine_similarity(bank_vec_1_reshaped, bank_vec_2_reshaped)[0][0]

# Compare "bank" (from sentence 1) with "river"
similarity_bank_river = cosine_similarity(bank_vec_1_reshaped, river_vec_reshaped)[0][0]

In [None]:
# --- Step 6: Print the results ---
# One print call; sep="\n" puts the header and each score on its own line.
print(
    "\n--- Cosine Similarity Results ---",
    f"Similarity between 'bank' (river context) and 'bank' (money context): {similarity_banks:.4f}",
    f"Similarity between 'bank' (river context) and 'river': {similarity_bank_river:.4f}",
    sep="\n",
)