In [3]:
import os
print("HF_HOME:", os.getenv("HF_HOME"))


HF_HOME: None


In [4]:
pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-base-en-v1.5')


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Optional: recommended prefix for better retrieval performance
query_instruction = "Represent this sentence for retrieval: "

In [None]:

# Test document chunks (simulate a few text segments from your dataset)
chunk_texts = [
    "This insurance policy covers medical emergencies during travel.",
    "In case of accidental damage, notify the provider within 24 hours.",
    "Premiums must be paid annually to keep the policy active."
]

# Encode document chunks (no instruction prefix)
chunk_embeddings = model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

# Print shape and confirm
print(f"Generated {len(chunk_embeddings)} document chunk embeddings.")
print(f"Example embedding vector (first chunk):\n{chunk_embeddings[0][:5]}...")  # preview first 5 values

# Example query embedding (for semantic search)
query_instruction = "Represent this sentence for retrieval: "
user_query = "What does the insurance policy cover during travel?"

# Encode query with prefix
query_embedding = model.encode(query_instruction + user_query)

# Print query embedding preview
print("\nQuery embedding vector preview:")
print(query_embedding[:5])  # preview first 5 values


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Generated 3 document chunk embeddings.
Example embedding vector (first chunk):
[-0.02651693 -0.00640625  0.0255814   0.03877122  0.03975612]...

Query embedding vector preview:
[-0.02143209  0.02615947  0.03398245  0.05225211  0.0672538 ]


In [None]:
import json
from sentence_transformers import SentenceTransformer

# Step 1: Load your cleaned + chunked JSON file
with open("chunked_documents.json", "r") as f:
    cleaned_chunks = json.load(f)

# Step 2: Initialize the BGE model
model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Step 3: Extract text content from each chunk
chunk_texts = [item["text"] for item in cleaned_chunks]

# Step 4: Generate embeddings for each chunk
chunk_embeddings = model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

# Step 5: Merge embeddings back into the chunk metadata
embedded_chunks = []
for chunk, embedding in zip(cleaned_chunks, chunk_embeddings):
    chunk["embedding"] = embedding.tolist()  # convert numpy array to list
    embedded_chunks.append(chunk)

# Step 6: Save the final embedded chunks
with open("embedded_chunks.json", "w") as f:
    json.dump(embedded_chunks, f, indent=2)

print("✅ Document chunk embedding complete! Saved to embedded_chunks.json")

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


✅ Document chunk embedding complete! Saved to embedded_chunks.json


In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load embedded chunks
with open("embedded_chunks.json", "r") as f:
    embedded_chunks = json.load(f)

# Load model
model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Function to encode query with instruction
def encode_query(query):
    instruction = "Represent this question for retrieving supporting documents: "
    return model.encode(instruction + query)

# Convert stored embeddings to numpy array
chunk_embeddings = np.array([item["embedding"] for item in embedded_chunks])

# Step 1: Get user query
user_query = input("Enter your question: ")
query_embedding = encode_query(user_query).reshape(1, -1)

# Step 2: Compute cosine similarities
similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]

# Step 3: Get top-k results
top_k = 5
top_indices = similarities.argsort()[-top_k:][::-1]

# Step 4: Display results
print(f"\n🔍 Top {top_k} most relevant chunks:\n")
for idx in top_indices:
    chunk = embedded_chunks[idx]
    print(f"📄 File: {chunk['file_name']} | Chunk ID: {chunk['chunk_id']}")
    print(f"🔗 Score: {similarities[idx]:.4f}")
    print(f"📝 Text: {chunk['text'][:500]}...\n")


Enter your question:  What are Special Conditions applicable to Personal Accident Covers?


  return forward_call(*args, **kwargs)



🔍 Top 5 most relevant chunks:

📄 File: doc3.pdf | Chunk ID: doc3-C101
🔗 Score: 0.7781
📝 Text: Any exclusion mentioned in the 'General Exclusions” section of this Policy. c. Special Conditions applicable to Personal Accident Covers-Common Carrier (AD&PTD): 1. In the event of partial loss or impairment of the function of one of the above parts of the body or senses, the appropriate proportion of the percentage as stated in the “Table of Beneﬁts” will be considered for payment. 2. If the accident impairs a number of physical or mental functions, the degree of disablement given in the Table ...

📄 File: doc3.pdf | Chunk ID: doc3-C44
🔗 Score: 0.7498
📝 Text: by childbirth, maternity or pregnancy or in consequence thereof, venereal disease or inﬁrmity. 10. Payment of compensation in respect of accidental death, injury or disablement of the Insured/Insured Person, due to or arising out of or directly connected with or traceable to act of terrorism or terrorist activities. 11. Any exclusion me

In [None]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp310-cp310-win_amd64.whl.metadata (9.0 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp310-cp310-win_amd64.whl.metadata (5.1 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting pypika>=0.48.9 (from chr



In [None]:
import json
import chromadb
from chromadb.config import Settings

In [None]:
# Initialize ChromaDB client and collection
# Use PersistentClient instead of the old Client
chroma_client = chromadb.PersistentClient(
    path="chroma_db"  # your desired storage directory
)

collection = chroma_client.get_or_create_collection(name="document_chunks")

# Load embedded chunks
with open("embedded_chunks.json", "r") as f:
    data = json.load(f)

# Insert each chunk into the ChromaDB collection
for i, item in enumerate(data):
    collection.add(
        ids=[f"chunk_{i}"],
        embeddings=[item["embedding"]],
        documents=[item["text"]],
        metadatas=[{
            "file_name": item["file_name"],
            "chunk_id": item["chunk_id"]
        }]
    )

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

In [None]:
# 1. Load embedding model
model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# 2. Load ChromaDB collection
chroma_client = chromadb.PersistentClient(
    path="chroma_db"  # your desired storage directory
)

collection = chroma_client.get_collection("document_chunks")

In [None]:
# 3. Define a function for semantic retrieval
def semantic_search(query, top_k=5):
    formatted_query = "Represent this sentence for searching relevant passages: " + query
    query_embedding = model.encode(formatted_query)

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

     # Display results with document source info
    print(f"\n🔍 Top {top_k} results for query: \"{query}\"\n")
    for i in range(top_k):
        doc_text = results["documents"][0][i]
        doc_score = round(results["distances"][0][i], 4)

        # ✅ Accessing metadata properly (first list is for batch size = 1)
        doc_meta = results["metadatas"][0][i]
        doc_source = doc_meta.get("file_name", "Unknown")
        chunk_id = doc_meta.get("chunk_id", "N/A")

        print(f"Result {i+1} (Score: {doc_score})")
        print(f"📄 Source: {doc_source} | Chunk ID: {chunk_id}")
        print(f"📝 Text:\n{doc_text}\n")

In [None]:
# 4. Example usage
if __name__ == "__main__":
    example_queries = [
        "What are Special Conditions applicable to Personal Accident Covers?",
        "What is mean by Grace Period?",
        "What is the process to file a cashless insurance claim?",
        "waiting period for health insurance"
    ]

    for q in example_queries:
        semantic_search(q, top_k=3)


🔍 Top 3 results for query: "What are Special Conditions applicable to Personal Accident Covers?"

Result 1 (Score: 0.4714)
📄 Source: doc3.pdf | Chunk ID: doc3-C101
📝 Text:
Any exclusion mentioned in the 'General Exclusions” section of this Policy. c. Special Conditions applicable to Personal Accident Covers-Common Carrier (AD&PTD): 1. In the event of partial loss or impairment of the function of one of the above parts of the body or senses, the appropriate proportion of the percentage as stated in the “Table of Beneﬁts” will be considered for payment. 2. If the accident impairs a number of physical or mental functions, the degree of disablement given in the Table of Beneﬁts will be added together, but the amount payable shall not exceed 100% of the Sum Insured as speciﬁed in the Policy Schedule. 3. If the accident aﬀects parts of the body or senses whose loss or inability to function is not dealt with above, the governing factor in determining the beneﬁt amount in such a case will be 