- Sankaran S

In [1]:
from pathlib import Path

kb_dir = Path("../kb")
docs = []
# Iterate in sorted order: glob order depends on the filesystem, and the
# position of each chunk derived from these documents later becomes its
# vector id in the FAISS index, so the ordering should be reproducible.
for path in sorted(kb_dir.glob("*.md")):
    # A single read with errors="replace" returns exactly what a strict
    # decode would when the file is clean, and substitutes U+FFFD for
    # undecodable sequences instead of raising UnicodeDecodeError
    # (minor data loss, but no crash and no second read of the file).
    text = path.read_text(encoding="utf-16", errors="replace")
    docs.append({"path": str(path), "text": text})
# Guard the preview so an empty or missing KB directory shows (0, None)
# rather than raising IndexError on docs[0].
len(docs), (docs[0].keys() if docs else None)


(3, dict_keys(['path', 'text']))

In [2]:
def simple_chunk(text, max_chars=400):
    """Greedily pack blank-line-separated paragraphs into chunks.

    The text is split on blank lines ("\\n\\n"); each paragraph is stripped
    and empty ones are dropped. Consecutive paragraphs are merged (re-joined
    with "\\n\\n") as long as the merged chunk, separator included, stays
    within ``max_chars``.

    Args:
        text: The document text to split.
        max_chars: Inclusive upper bound on the length of a merged chunk.

    Returns:
        A list of chunk strings in document order. A single paragraph that
        is longer than ``max_chars`` is never split; it is emitted as its
        own oversize chunk. Empty or whitespace-only text yields ``[]``.
    """
    separator = "\n\n"
    paragraphs = [p.strip() for p in text.split(separator) if p.strip()]
    chunks = []
    current = ""
    for paragraph in paragraphs:
        if not current:
            # First paragraph of a new chunk is always accepted, even if it
            # alone exceeds max_chars.
            current = paragraph
        elif len(current) + len(separator) + len(paragraph) <= max_chars:
            # Count the separator too, otherwise the merged chunk could
            # overshoot max_chars.
            current += separator + paragraph
        else:
            chunks.append(current)
            current = paragraph
    if current:
        chunks.append(current)
    return chunks

# Flatten every document into chunk records that remember their source file.
kb_chunks = [
    {"doc_path": doc["path"], "content": piece}
    for doc in docs
    for piece in simple_chunk(doc["text"])
]
len(kb_chunks)


6

In [3]:
# Display the chunk records (source path + chunk text) built from the KB documents.
kb_chunks

[{'doc_path': '..\\kb\\faq.md',
  'content': "Frequently Asked Questions\n\nQ: How do I reset my company email password?\nA: Visit the internal IT portal, click on 'Forgot Password,' and follow the instructions sent to your registered mobile number.\n\nQ: Who do I contact for technical support?\nA: You can email support@company.com or call the IT help desk at extension 200."},
 {'doc_path': '..\\kb\\faq.md',
  'content': "Q: What is the company’s dress code?\nA: Smart casuals are allowed; formal attire is required during client meetings.\n\nQ: How can I apply for leave?\nA: Log in to the HR portal, select 'Leave Application,' fill out the required details, and submit for manager approval."},
 {'doc_path': '..\\kb\\policy.md', 'content': 'Company Work Policy'},
 {'doc_path': '..\\kb\\policy.md',
  'content': '- Working Hours: Employees are expected to work from 9:00 AM to 6:00 PM, Monday to Friday. Remote work is allowed with prior approval.\n- Leave Policy: Each employee is entitled to

In [4]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the KB chunks and, later, the queries
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name)

# Embed the text of every chunk, keeping the same order as kb_chunks
texts = [record["content"] for record in kb_chunks]
embeddings = embed_model.encode(
    texts,
    batch_size=16,
    show_progress_bar=True,
)
(len(embeddings), len(embeddings[0]))


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]


(6, 384)

In [5]:
import faiss
import numpy as np

# Build a float32 copy of the embeddings; the copy is what gets normalized below
np_embeddings = np.array(embeddings, dtype="float32")
embedding_dim = len(embeddings[0])

# L2-normalize the rows so that inner product equals cosine similarity
faiss.normalize_L2(np_embeddings)

# Flat inner-product index holding one vector per chunk
index = faiss.IndexFlatIP(embedding_dim)
index.add(np_embeddings)

# Sanity check: the index should hold as many vectors as there are chunks
print("Number of vectors in index:", index.ntotal)


Number of vectors in index: 6


In [6]:
import pickle

# Persist the FAISS index to disk in binary form
faiss.write_index(index, 'faiss_index.bin')

# Persist the chunk records (chunk text and source path) alongside the index
with open('metadata.pkl', 'wb') as metadata_file:
    pickle.dump(kb_chunks, metadata_file)


In [7]:
def load_faiss_index_and_metadata(index_path='faiss_index.bin', metadata_path='metadata.pkl'):
    """Load a saved FAISS index and its pickled chunk metadata from disk.

    Args:
        index_path: Location of the FAISS index file. Defaults to
            'faiss_index.bin' in the current working directory.
        metadata_path: Location of the pickled metadata file. Defaults to
            'metadata.pkl' in the current working directory.

    Returns:
        A tuple ``(index, metadata)``: the object returned by
        ``faiss.read_index`` and the object unpickled from ``metadata_path``.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    index = faiss.read_index(str(index_path))
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only load metadata files that this notebook (or a trusted party) wrote.
    with open(metadata_path, 'rb') as f:
        metadata = pickle.load(f)
    return index, metadata

def retrieve_top_k_faiss(query, embed_model, index, metadata, k=3):
    """Return up to ``k`` metadata entries most similar to ``query``.

    The query is embedded with ``embed_model``, L2-normalized, and searched
    against ``index``. Each hit is mapped back to ``metadata`` by its
    position.

    Args:
        query: The query string to embed.
        embed_model: Object whose ``encode([query])`` returns the query embedding.
        index: Index object exposing ``search(vectors, k)``.
        metadata: Sequence of dicts with "doc_path" and "content" keys,
            positionally aligned with the vectors in ``index``.
        k: Maximum number of results to request.

    Returns:
        A list of dicts with keys "doc_path", "content" and "score", in the
        order returned by the index. The list holds fewer than ``k`` items
        when the index has fewer than ``k`` vectors.
    """
    q_embedding = embed_model.encode([query])
    q_embedding = np.array(q_embedding, dtype='float32')
    faiss.normalize_L2(q_embedding)

    distances, indices = index.search(q_embedding, k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with label -1; without this guard
        # metadata[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        entry = metadata[int(idx)]
        results.append({
            "doc_path": entry["doc_path"],
            "content": entry["content"],
            # Plain Python float instead of a NumPy scalar.
            "score": float(dist),
        })
    return results


In [8]:
# Reload the persisted index and metadata, then run a sample query against them
index, metadata = load_faiss_index_and_metadata()
query = "What is the company policy on leave?"
top_k = retrieve_top_k_faiss(query, embed_model, index, metadata, k=3)
for rank, item in enumerate(top_k, start=1):
    print(f"Result {rank} (Score: {item['score']}):\n{item['content']}\n---")


Result 1 (Score: 0.7193081378936768):
Company Work Policy
---
Result 2 (Score: 0.5233244895935059):
- Working Hours: Employees are expected to work from 9:00 AM to 6:00 PM, Monday to Friday. Remote work is allowed with prior approval.
- Leave Policy: Each employee is entitled to 12 days of paid leave per year, not including public holidays.
- Code of Conduct: All team members should maintain a professional attitude and respect colleagues, clients, and company property.
- Data Privacy: Sensitive company and user data must not be shared outside the organization.
- Attendance: All employees should mark their daily attendance in the HR portal. In case of absence, inform your manager by email.
---
Result 3 (Score: 0.3595171272754669):
Q: What is the company’s dress code?
A: Smart casuals are allowed; formal attire is required during client meetings.

Q: How can I apply for leave?
A: Log in to the HR portal, select 'Leave Application,' fill out the required details, and submit for manager ap