In [1]:
import json

with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

print("Total chunks loaded:", len(chunks))
print("Sample chunk:", chunks[0])


Total chunks loaded: 271
Sample chunk: {'id': 'doc0_sec0_chunk0', 'text': 'FinSolve Technologies Engineering Document FinSolve Technologies Engineering Document', 'meta': {'chunk_id': 'doc0_sec0_chunk0', 'source': 'files\\engineering\\engineering_master_doc.md', 'filename': 'engineering_master_doc.md', 'doc_id': 0, 'section_index': 0, 'section_heading': 'FinSolve Technologies Engineering Document', 'title': 'FinSolve Technologies Engineering Document', 'department': 'engineering', 'roles': ['Engineering', 'C-Level'], 'token_count': 8}}


In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model loaded successfully")


  from .autonotebook import tqdm as notebook_tqdm


Embedding model loaded successfully


In [3]:
from tqdm import tqdm

texts = [chunk["text"] for chunk in chunks]

embeddings = model.encode(
    texts,
    show_progress_bar=True,
    batch_size=32
)

print("Total embeddings generated:", len(embeddings))

Batches: 100%|██████████| 9/9 [00:05<00:00,  1.60it/s]

Total embeddings generated: 271





In [4]:
import os
print(os.getcwd())

c:\Users\prath\OneDrive\Desktop\Infosys-project


In [5]:
import chromadb

client = chromadb.PersistentClient(
    path=r"C:\Users\prath\OneDrive\Desktop\Infosys-project\chroma_db"
)

collection = client.get_or_create_collection(
    name="company_documents"
)

print("Client initialized")


Client initialized


In [6]:
collection = client.get_or_create_collection(
    name="company_documents"
)

In [7]:
def clean_metadata(meta: dict):
    cleaned = {}
    for key, value in meta.items():
        if value is None:
            continue  
        if isinstance(value, list):
            cleaned[key] = ", ".join(map(str, value))
        else:
            cleaned[key] = value
    return cleaned


In [8]:
def semantic_search(query, top_k=5):
    query_embedding = model.encode([query]).tolist()
    
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=top_k
    )
    
    return results

In [9]:
from tqdm import tqdm

for idx, chunk in enumerate(tqdm(chunks)):
    collection.add(
        ids=[chunk["id"]],
        documents=[chunk["text"]],
        embeddings=[embeddings[idx].tolist()],
        metadatas=[clean_metadata(chunk["meta"])]
    )

print("Indexing done")

100%|██████████| 271/271 [00:12<00:00, 22.31it/s]

Indexing done





In [10]:
query = "cloud infrastructure security"

results = semantic_search(query, top_k=3)

for i in range(len(results["documents"][0])):
    print(f"\nResult {i+1}")
    print("Text:", results["documents"][0][i][:200], "...")
    print("Metadata:", results["metadatas"][0][i])


Result 1
Text: FinSolve Technologies Engineering Document 2.3.5 Infrastructure * **AWS**: Primary cloud provider utilizing EC2, ECS, Lambda, RDS, S3, CloudFront, and other managed services. * **Kubernetes**: Contain ...
Metadata: {'department': 'engineering', 'section_index': 14, 'token_count': 52, 'doc_id': 0, 'roles': 'Engineering, C-Level', 'source': 'files\\engineering\\engineering_master_doc.md', 'title': 'FinSolve Technologies Engineering Document', 'filename': 'engineering_master_doc.md', 'section_heading': '2.3.5 Infrastructure', 'chunk_id': 'doc0_sec14_chunk0'}

Result 2
Text: FinSolve Technologies Engineering Document 7.2.1 Cloud Infrastructure * **Terraform Modules**: * Network infrastructure (VPC, subnets, security groups) * Compute resources (EC2, ECS, Lambda) * Databas ...
Metadata: {'section_index': 75, 'source': 'files\\engineering\\engineering_master_doc.md', 'roles': 'Engineering, C-Level', 'department': 'engineering', 'section_heading': '7.2.1 Cloud Infrastructure',

In [11]:
print("Total documents in DB:", collection.count())

Total documents in DB: 271


In [12]:
print("Chunks loaded:", len(chunks))

Chunks loaded: 271


In [13]:
print("Total documents in DB:", collection.count())
collection.peek(limit=1)["metadatas"]

Total documents in DB: 271


[{'source': 'files\\engineering\\engineering_master_doc.md',
  'roles': 'Engineering, C-Level',
  'department': 'engineering',
  'doc_id': 0,
  'section_index': 0,
  'title': 'FinSolve Technologies Engineering Document',
  'chunk_id': 'doc0_sec0_chunk0',
  'filename': 'engineering_master_doc.md',
  'section_heading': 'FinSolve Technologies Engineering Document',
  'token_count': 8}]

In [14]:
import time

test_queries = [
    "API Gateway authentication",
    "database scalability strategy",
    "security compliance standards"
]

for q in test_queries:
    start_time = time.time()
    res = semantic_search(q, top_k=3)
    end_time = time.time()

    print("\nQuery:", q)
    print("Time Taken:", round(end_time - start_time, 4), "seconds")

    docs = res.get("documents", [[]])[0]

    if docs:
        print("Top Result Preview:", docs[0][:120], "...")
    else:
        print("No results found")

    print("-" * 60)




Query: API Gateway authentication
Time Taken: 0.0215 seconds
Top Result Preview: FinSolve Technologies Engineering Document 2.3.2 API Gateway * Centralized entry point for all client requests * Impleme ...
------------------------------------------------------------

Query: database scalability strategy
Time Taken: 0.0215 seconds
Top Result Preview: FinSolve Technologies Engineering Document 2.4.2 Database Scalability * PostgreSQL uses range-based sharding for high-vo ...
------------------------------------------------------------

Query: security compliance standards
Time Taken: 0.0181 seconds
Top Result Preview: Employee Handbook Company Commitment to Data Protection - Regular security audits and employee training. - Data stored o ...
------------------------------------------------------------
