This notebook demonstrates sentence-level semantic similarity using BERT embeddings



Import required libraries

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

Load pre-trained BERT model and tokenizer

Here, **bert-base-uncased** is a good general-purpose model; "uncased" means its tokenizer converts text to lowercase before tokenizing

In [None]:
# Single source of truth for the checkpoint name, so the tokenizer and the
# model are always loaded from the same pre-trained weights/vocabulary.
MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# This notebook only runs inference. from_pretrained is documented to return
# the model in evaluation mode already; calling eval() makes that explicit so
# dropout stays disabled even if the loading code changes later.
model.eval()

# List of example sentences to compare
# You can modify this list with your own sentences
sentences = [
    "I love programming in Python",
    "Python is my favorite programming language",
    "The weather is beautiful today",
    "I enjoy coding and software development",
    "It's raining outside right now",
    "Programming in Python is really fun",
]

Tokenize all sentences at once. The tokenizer arguments are:

1. padding=True pads every sequence to the length of the longest sequence in the batch, so all sequences have the same length
2. truncation=True cuts off sequences longer than max_length
3. return_tensors="pt" returns PyTorch tensors
4. max_length=128 limits sequence length to 128 tokens

In [None]:
# Tokenize the whole batch in one call; the result holds PyTorch tensors
# that can be unpacked straight into the model's forward pass.
inputs = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Forward pass without gradient tracking, since this is inference only
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state holds one contextual embedding per token of each sentence
embeddings = outputs.last_hidden_state

In [None]:
# Show the dimensions of the token-level embedding tensor
print(embeddings.size())

torch.Size([6, 9, 768])


Get sentence embeddings by mean pooling, i.e., averaging all of the token embeddings in each sequence (padding tokens are excluded using the attention mask).

This will convert the above shape from [6, 9, 768] to [6, 768]

In [None]:

# Mean pooling: average the token embeddings of each sentence while ignoring
# padding positions, producing one fixed-size vector per sentence.
attention_mask = inputs["attention_mask"]  # 1 for real tokens, 0 for padding
# Expand attention mask to same dimensions as embeddings
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
# Apply mask to zero out padding token embeddings
masked_embeddings = embeddings * mask
# Sum all token embeddings for each sentence
summed = torch.sum(masked_embeddings, 1)
# Count number of real tokens in each sentence.
# attention_mask is an integer tensor, so the count is cast to float *before*
# clamping: clamping integers with a fractional floor only works through
# int->float type promotion, which not every PyTorch release performs for
# clamp. With a float count the 1e-9 floor reliably guards the division below
# against a sentence with zero real tokens.
counts = torch.clamp(
    torch.sum(attention_mask, 1, keepdim=True).float(), min=1e-9
)
# Calculate mean by dividing sum by count (counts broadcasts over the hidden dim)
sentence_embeddings = (summed / counts).numpy()

print(sentence_embeddings.shape)


(6, 768)


Transform the query sentence to embeddings

In [None]:
# Define query sentence to compare against our sentence list
query = "I really enjoy Python programming"
print(f"\nQuery: {query}\n")

# Process query sentence the same way as above
# First, tokenize the query
query_inputs = tokenizer(
    [query], padding=True, truncation=True, return_tensors="pt", max_length=128
)

# Generate BERT embeddings for query
with torch.no_grad():
    query_outputs = model(**query_inputs)
    query_embeddings = query_outputs.last_hidden_state

# Mean pooling for query embedding
query_attention_mask = query_inputs["attention_mask"]
query_mask = query_attention_mask.unsqueeze(-1).expand(query_embeddings.size()).float()
query_masked_embeddings = query_embeddings * query_mask
query_summed = torch.sum(query_masked_embeddings, 1)
query_counts = torch.clamp(torch.sum(query_attention_mask, 1, keepdim=True), min=1e-9)
query_embedding = (query_summed / query_counts).numpy()


Query: I really enjoy Python programming



Calculate cosine similarity between query and all sentences
1. Cosine similarity measures the cosine of the angle between two vectors
2. Values closer to 1 indicate higher similarity

In [None]:
# Pairwise cosine similarity: one row per query, one column per sentence
similarities = cosine_similarity(query_embedding, sentence_embeddings)

# There is a single query, so its scores are the first (only) row
query_scores = similarities[0]

# argsort is ascending; reverse it to rank from most to least similar
similar_sentence_indices = np.argsort(query_scores)[::-1]

# Report every sentence together with its score, best match first
print("Most similar sentences (in order of similarity):")
for idx in similar_sentence_indices:
    print(f"Similarity: {query_scores[idx]:.4f} - {sentences[idx]}")

Most similar sentences (in order of similarity):
Similarity: 0.8981 - I love programming in Python
Similarity: 0.8463 - I enjoy coding and software development
Similarity: 0.7940 - Python is my favorite programming language
Similarity: 0.7896 - Programming in Python is really fun
Similarity: 0.5652 - The weather is beautiful today
Similarity: 0.5038 - It's raining outside right now
