#### Part 1: Setup and Corpus Definition

In [11]:
# Import libraries
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Toy document collection: nine short company-related sentences.
# Trailing notes mark entries written to overlap in topic with an earlier
# sentence (sentence numbers are 1-based positions in this list).
corpus = [
    "Our company offers a comprehensive healthcare plan for all full-time employees.",
    "Employees are entitled to 20 paid vacation days per year.",
    "The new software update includes enhanced security features and a revamped user interface.",
    "Quarterly financial reports indicate a 15% growth in revenue.",
    "For technical support, please email support@examplecorp.com or call our helpline.",
    # Similar to sentence 1
    "The healthcare benefits package covers medical, dental, and vision insurance.",
    # Related to sentence 2
    "To request time off, submit a form through the employee portal at least two weeks in advance.",
    # Similar to sentence 3
    "Security protocols have been upgraded across all company platforms following the recent patch.",
    # Related to sentence 5
    "Our customer service team is available 24/7 to assist with any issues.",
]

In [13]:
# Natural-language question; it is embedded later and compared against every corpus sentence.
query = "What are the healthcare benefits?"

#### Part 2: Embedding Generation

In [14]:
# Instantiate the sentence-embedding model used for both corpus and query.
# 'all-MiniLM-L6-v2' is chosen as a fast model that is adequate at this scale.
model_name = "all-MiniLM-L6-v2"
print(f"Loading embedding model: {model_name}...")
model = SentenceTransformer(model_name)

Loading embedding model: all-MiniLM-L6-v2...


In [15]:
# Turn the corpus sentences and the query into embedding vectors.
# convert_to_tensor=False asks encode() for NumPy arrays instead of tensors.
print("Embedding corpus...")
corpus_embeddings = model.encode(
    corpus,
    convert_to_tensor=False,
)

print("Embedding query...")
query_embedding = model.encode(
    query,
    convert_to_tensor=False,
)


Embedding corpus...
Embedding query...


In [16]:
# Sanity check: report the array shapes of the corpus and query embeddings.
print(f"Corpus embeddings shape: {corpus_embeddings.shape}") # expected (num_sentences, embedding_dim)
print(f"Query embedding shape: {query_embedding.shape}")   # either (embedding_dim,) or (1, embedding_dim)

Corpus embeddings shape: (9, 384)
Query embedding shape: (384,)


In [22]:
# Report the number of array axes (ndim) of each embedding; the similarity
# step later reshapes the query when it is 1-D.
print(f"Corpus dimension: {corpus_embeddings.ndim}")
print(f"Query dimension: {query_embedding.ndim}")

Corpus dimension: 2
Query dimension: 1


#### Part 3: Similarity Calculation & Retrieval (Brute-Force)

In [9]:
# Score the query against every corpus sentence with cosine similarity.
# cosine_similarity works on 2-D inputs, so a 1-D query vector is promoted to
# a single-row matrix; anything else is passed through unchanged.
query_embedding_2d = (
    query_embedding.reshape(1, -1) if query_embedding.ndim == 1 else query_embedding
)
similarities = cosine_similarity(query_embedding_2d, corpus_embeddings)
# The result is 2-D (one row per query row, e.g. [[s1, s2, s3, ...]]);
# keep the first row as the flat vector of per-sentence scores.
similarity_scores = similarities[0]

In [19]:
# Rank the corpus by similarity to the query and report the best matches.
top_k = 3

# np.argsort orders ascending, so the scores are negated to get a
# highest-first ordering; the leading top_k positions are the winners.
sorted_indices = np.argsort(-similarity_scores)
top_k_indices = sorted_indices[:top_k]

print(f"\nQuery: \"{query}\"")
print(f"\nTop {top_k} most similar sentences from the corpus:")
for rank, idx in enumerate(top_k_indices, start=1):
  print(f"  {rank}. \"{corpus[idx]}\" (Score: {similarity_scores[idx]:.4f})")


Query: "What are the healthcare benefits?"

Top 3 most similar sentences from the corpus:
  1. "The healthcare benefits package covers medical, dental, and vision insurance." (Score: 0.6462)
  2. "Our company offers a comprehensive healthcare plan for all full-time employees." (Score: 0.4301)
  3. "The new software update includes enhanced security features and a revamped user interface." (Score: 0.2337)
