In [1]:
# =========================
# CELL 1 — Install packages
# =========================
# Purpose:
# - Installs all required libraries for:
#   1) Loading PDFs
#   2) Chunking text into token-sized pieces
#   3) Creating embeddings via OpenAI
#   4) Storing/searching embeddings in ChromaDB
# Run:
# - Only once per new environment (e.g. when the examiner runs the notebook on a fresh machine)

!pip install -r requirements.txt

# openai – Used to generate embeddings and produce final LLM answers from the OpenAI models.
# chromadb – Used as the local vector database to store and retrieve embedded document chunks.
# pypdf – Used to read and extract text from policy PDF files.
# python-dotenv – Used to securely load the OpenAI API key from a .env file.
# tiktoken – Used to split text into token-based chunks for better embedding and retrieval accuracy.




In [16]:
# ==========================================
# CELL 2 — Load API key (.env) + OpenAI client
# ==========================================
# Purpose:
# - Loads OPENAI_API_KEY from a local .env file (so the key is never hardcoded in the notebook).
# - Creates the OpenAI client object used later for embeddings + answers.
# Notes for examiner:
# - To run this notebook end-to-end on the examiner's machine, the examiner needs their own OPENAI_API_KEY
#   with API billing enabled.

from dotenv import load_dotenv
import os
from openai import OpenAI

# Loads .env from the current working directory (same folder where we have this notebook).
# override=True asks python-dotenv to let values from .env replace variables that are
# already set in the process environment.
load_dotenv(override=True)

key = os.getenv("OPENAI_API_KEY")
if not key:
    raise RuntimeError(
        "OPENAI_API_KEY not found. Create a .env file in the SAME folder as this notebook with:\n"
        "OPENAI_API_KEY=sk-...\n"
        "Do not share your API key publicly."
    )

# Hand the validated key to the client explicitly so it uses exactly the value checked above.
client = OpenAI(api_key=key)

# Deliberately print no part of the key: cell outputs are saved inside the notebook
# file and travel with it whenever the notebook is shared or committed.
print("OPENAI_API_KEY loaded ✅")
print("OpenAI client initialized ✅")


OPENAI_API_KEY loaded ✅ sk-proj...
OpenAI client initialized ✅


In [17]:
# ==========================================
# CELL 3 — Configuration (paths + model choices)
# ==========================================
# Purpose:
# - Sets:
#   1) Where the PDFs are located (relative path)
#   2) Where ChromaDB should store its persistent index (relative path)
#   3) Which OpenAI models to use
#   4) Chunking and retrieval parameters
# Portability:
# - Use relative paths so the examiner can run the project directly after downloading or cloning it.


from pathlib import Path

# Both paths are relative to the current working directory; their resolved
# absolute locations are printed at the end of this cell for verification.
DATA_DIR = Path("./Policy documents")   # put PDFs here
CHROMA_DIR = Path("./chroma_db")        # will be created on first index run

# Models
# EMBED_MODEL is handed to OpenAIEmbedder; CHAT_MODEL is the model answer() calls.
EMBED_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-4o-mini"

# RAG settings
# CHUNK_TOKENS / CHUNK_OVERLAP are passed by make_chunks() to chunk_text():
# the maximum tokens per chunk and the tokens shared by consecutive chunks.
# TOP_K is the default number of chunks retrieve()/answer() request per query.
CHUNK_TOKENS = 700
CHUNK_OVERLAP = 120
TOP_K = 5

# Name of the ChromaDB collection that holds the embedded chunks.
COLLECTION_NAME = "office_policy_rag"

# Show where the relative paths actually point on this machine.
print("DATA_DIR:", DATA_DIR.resolve())
print("CHROMA_DIR:", CHROMA_DIR.resolve())


DATA_DIR: D:\Gen_AI\Final Project\Policy documents
CHROMA_DIR: D:\Gen_AI\Final Project\chroma_db


In [18]:
# ==========================================
# CELL 4 — Load PDFs (extract text page-by-page)
# ==========================================
# Purpose:
# - Reads all PDFs under DATA_DIR (including subfolders).
# - Extracts text from each page and stores metadata for citations.
# Output:
# - raw_pages: list of dicts with keys: text, source, path, page

from pypdf import PdfReader

def load_pdfs(folder: Path):
    """Extract text page-by-page from every PDF under ``folder`` (recursively).

    Files are selected by a case-insensitive ``.pdf`` suffix (so ``report.PDF``
    is picked up on every platform, not only on case-insensitive file systems)
    and are processed in sorted path order, which makes the page order - and
    therefore the chunk order downstream - reproducible between runs.
    Directories whose name happens to end in ``.pdf`` are ignored.

    Args:
        folder: Directory to search, including all of its sub-directories.

    Returns:
        A list of dicts, one per page that produced non-empty text, with keys
        ``text`` (stripped page text), ``source`` (file name), ``path`` (the
        file path as a string) and ``page`` (1-based page number).

    Raises:
        FileNotFoundError: If no PDF file is found under ``folder``.
    """
    pdf_files = sorted(
        candidate
        for candidate in folder.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        raise FileNotFoundError(
            f"No PDFs found in {folder}. Create the folder and add PDFs, e.g.\n"
            f"{folder}/policy1.pdf"
        )

    pages = []
    for pdf_path in pdf_files:
        reader = PdfReader(str(pdf_path))
        for page_num, page in enumerate(reader.pages, start=1):
            # Guard against a missing/empty extraction result and skip pages
            # that contain nothing but whitespace.
            text = (page.extract_text() or "").strip()
            if text:
                pages.append({
                    "text": text,
                    "source": pdf_path.name,
                    "path": str(pdf_path),
                    "page": page_num
                })
    return pages

raw_pages = load_pdfs(DATA_DIR)
print("Total extracted pages:", len(raw_pages))

# load_pdfs() only keeps pages with non-empty text, so the list can be empty even
# though PDFs were found. Fail with an actionable message instead of the opaque
# IndexError the preview below would raise.
if not raw_pages:
    raise RuntimeError(
        f"PDFs were found in {DATA_DIR}, but no page produced extractable text. "
        "Scanned/image-only PDFs need OCR before they can be indexed."
    )

# Preview one page to confirm extraction looks correct
print("Example page:", raw_pages[0]["source"], "page", raw_pages[0]["page"])
print(raw_pages[0]["text"][:300])


Total extracted pages: 11
Example page: XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf page 1
XYZ Dummy Company LTD - India Employee Policy Handbook (Fictitious)
Page 1
 XYZ Dummy Company LTD
 India Employee Policy Handbook
Version 1.0 | Effective date: 15 Feb 2026
Disclaimer (for academic/demo use): This handbook is a fictitious sample created for a GenAI/RAG
capstone. It is not legal advic


In [19]:
# ==========================================
# CELL 5 — Chunk pages into token-sized chunks
# ==========================================
# Purpose:
# - Splits long page text into overlapping chunks for better retrieval.
# - Uses token-based chunking (more stable for LLM context than characters).
# Output:
# - chunks: list of dicts with keys: chunk, source, path, page, chunk_id

import tiktoken

# Shared tokenizer (tiktoken's "cl100k_base" encoding). chunk_text() uses it both to
# encode page text into tokens and to decode token slices back into chunk text.
enc = tiktoken.get_encoding("cl100k_base")

def chunk_text(text: str, chunk_tokens: int, overlap: int):
    """Split ``text`` into overlapping chunks of at most ``chunk_tokens`` tokens.

    The text is tokenized with the module-level ``enc`` tokenizer. Consecutive
    chunks share ``overlap`` tokens, i.e. each chunk starts
    ``chunk_tokens - overlap`` tokens after the previous one. The last chunk may
    be shorter. Chunks that decode to only whitespace are dropped.

    Args:
        text: Text to split.
        chunk_tokens: Maximum number of tokens per chunk; must be >= 1.
        overlap: Number of tokens repeated between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_tokens``.

    Returns:
        List of chunk strings in document order (empty for empty text).

    Raises:
        ValueError: If ``chunk_tokens`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_tokens`` would never advance the
            window (an endless loop), and a negative overlap would silently
            skip tokens between chunks.
    """
    if chunk_tokens < 1:
        raise ValueError(f"chunk_tokens must be >= 1, got {chunk_tokens}")
    if not 0 <= overlap < chunk_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_tokens, "
            f"got overlap={overlap}, chunk_tokens={chunk_tokens}"
        )

    tokens = enc.encode(text)
    total = len(tokens)
    step = chunk_tokens - overlap  # distance between consecutive chunk starts (always >= 1)

    chunks = []
    for start in range(0, total, step):
        end = min(start + chunk_tokens, total)
        chunk = enc.decode(tokens[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        # Once a chunk reaches the final token, any further window would only
        # repeat the overlapping tail, so stop here.
        if end == total:
            break

    return chunks

def make_chunks(pages):
    """Turn extracted pages into chunk records ready for indexing.

    Each page's text is split with ``chunk_text`` using the notebook-level
    ``CHUNK_TOKENS`` / ``CHUNK_OVERLAP`` settings.

    Chunk IDs have the form ``<file name>:p<page>:c<index>``. Because PDFs are
    collected recursively, two files in different sub-folders can share a file
    name; for such files the full path replaces the file name in the ID so that
    every ``chunk_id`` stays unique. IDs of uniquely named files are unaffected.

    Args:
        pages: Iterable of page dicts with keys ``text``, ``source``, ``path``
            and ``page`` (as produced by ``load_pdfs``).

    Returns:
        List of dicts with keys ``chunk``, ``source``, ``path``, ``page`` and
        ``chunk_id``.
    """
    pages = list(pages)  # iterated twice below, so materialize any iterator first

    # Collect every distinct path seen for each file name to spot name clashes.
    paths_by_name = {}
    for p in pages:
        paths_by_name.setdefault(p["source"], set()).add(p["path"])

    out = []
    for p in pages:
        name_is_unique = len(paths_by_name[p["source"]]) == 1
        id_prefix = p["source"] if name_is_unique else p["path"]

        page_chunks = chunk_text(p["text"], CHUNK_TOKENS, CHUNK_OVERLAP)
        for i, ch in enumerate(page_chunks):
            out.append({
                "chunk": ch,
                "source": p["source"],
                "path": p["path"],
                "page": p["page"],
                "chunk_id": f'{id_prefix}:p{p["page"]}:c{i}'
            })
    return out

# Build the notebook-level chunk list from all extracted pages; the indexing cell
# reads `chunks` to fill the ChromaDB collection.
chunks = make_chunks(raw_pages)
print("Total chunks created:", len(chunks))
# Sanity preview: ID of the first chunk followed by its first 300 characters.
print("Example chunk_id:", chunks[0]["chunk_id"])
print(chunks[0]["chunk"][:300])


Total chunks created: 11
Example chunk_id: XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p1:c0
XYZ Dummy Company LTD - India Employee Policy Handbook (Fictitious)
Page 1
 XYZ Dummy Company LTD
 India Employee Policy Handbook
Version 1.0 | Effective date: 15 Feb 2026
Disclaimer (for academic/demo use): This handbook is a fictitious sample created for a GenAI/RAG
capstone. It is not legal advic


In [20]:
# ==========================================
# CELL 6 — Create/load ChromaDB + index documents (with safety checks + optional rebuild)
# ==========================================
# Purpose:
# - Creates/loads a persistent local vector DB (ChromaDB) in CHROMA_DIR.
# - Uses OpenAI embeddings to embed your chunks and store them in a collection.
# - Skips indexing if already indexed, unless you set REBUILD_INDEX = True.

import chromadb
from chromadb.utils.embedding_functions import EmbeddingFunction

# ---- (A) Pre-flight check: stop early, with instructions, if earlier cells were skipped
required = ["CHROMA_DIR", "COLLECTION_NAME", "EMBED_MODEL", "chunks", "client"]
missing = list(filter(lambda name: name not in globals(), required))
if len(missing) > 0:
    raise RuntimeError(
        f"Missing required variables: {missing}\n"
        "Fix:\n"
        "- Run the earlier cells in order (especially the OpenAI client init cell that sets `client = OpenAI()`).\n"
        "- Ensure chunking cell ran and created `chunks`.\n"
    )

# ---- (B) Rebuild switch
# Leave False to reuse an existing index. Flip to True after adding, removing or
# editing PDFs so the collection is dropped and re-indexed from scratch.
REBUILD_INDEX = False  # <-- change to True when documents change

class OpenAIEmbedder(EmbeddingFunction):
    """Chroma embedding function that delegates to the OpenAI embeddings endpoint.

    Attributes:
        client: Object exposing ``embeddings.create(model=..., input=...)``.
        model: Name of the embedding model sent with every request.
        batch_size: Maximum number of texts sent in a single request.
    """

    def __init__(self, client, model, batch_size=64):
        self.client, self.model, self.batch_size = client, model, batch_size

    def __call__(self, texts):
        """Embed ``texts`` batch by batch and return one vector per text, in input order."""
        vectors = []
        batch_starts = range(0, len(texts), self.batch_size)
        for begin in batch_starts:
            window = texts[begin:begin + self.batch_size]
            reply = self.client.embeddings.create(model=self.model, input=window)
            for record in reply.data:
                vectors.append(record.embedding)
        return vectors

# Persistent Chroma DB on disk
chroma_client = chromadb.PersistentClient(path=str(CHROMA_DIR))
embedder = OpenAIEmbedder(client, EMBED_MODEL, batch_size=64)

# If documents changed, wipe the old collection so we don't mix old + new content
if REBUILD_INDEX:
    try:
        chroma_client.delete_collection(name=COLLECTION_NAME)
        print("Old collection deleted ✅ (rebuild requested)")
    except Exception as exc:
        # Best-effort delete: a missing collection is expected on a first run.
        # The actual error is shown so that a different failure (permissions,
        # corrupted store, ...) is not mistaken for "nothing to delete".
        print(f"No existing collection to delete (continuing): {type(exc).__name__}: {exc}")

# Create/get collection
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedder
)

# Index chunks only if empty
if collection.count() == 0:
    ids = [c["chunk_id"] for c in chunks]
    documents = [c["chunk"] for c in chunks]
    metadatas = [{"source": c["source"], "path": c["path"], "page": c["page"]} for c in chunks]

    # Add in slices so a large corpus is not sent to Chroma in one call.
    BATCH = 256
    for i in range(0, len(ids), BATCH):
        collection.add(
            ids=ids[i:i+BATCH],
            documents=documents[i:i+BATCH],
            metadatas=metadatas[i:i+BATCH]
        )

    print("Indexed chunks ✅", collection.count())
else:
    print("Collection already indexed ✅", collection.count())
    # A non-empty collection is reused as-is. If its size differs from the chunks
    # produced in this run, the index is stale (documents changed) or incomplete
    # (an earlier indexing run stopped part-way), so tell the user how to fix it.
    if collection.count() != len(chunks):
        print(
            f"⚠️ Index holds {collection.count()} chunks but this run produced {len(chunks)}. "
            "Set REBUILD_INDEX = True and re-run this cell to re-index from scratch."
        )



Collection already indexed ✅ 11


In [21]:
# ==========================================
# CELL 7 — Retrieval (vector search)
# ==========================================
# Purpose:
# - Given a user query, retrieves TOP_K most relevant chunks from ChromaDB.
# Output:
# - List of hits with chunk text + metadata for citations.

def retrieve(query: str, top_k: int = TOP_K):
    """Fetch the ``top_k`` chunks that the collection ranks closest to ``query``.

    Args:
        query: Free-text question or search phrase.
        top_k: Number of results requested from the collection.

    Returns:
        List of dicts with keys ``id`` (chunk ID), ``text`` (chunk text) and
        ``meta`` (the metadata stored with the chunk), in the order returned
        by the collection.
    """
    result = collection.query(query_texts=[query], n_results=top_k)
    # Exactly one query text was sent, so read the first (only) entry of each field.
    docs, metas, chunk_ids = (
        result[field][0] for field in ("documents", "metadatas", "ids")
    )
    return [
        {"id": chunk_id, "text": doc, "meta": meta}
        for doc, meta, chunk_id in zip(docs, metas, chunk_ids)
    ]


In [22]:
# ==========================================
# CELL 8 — Answer generation (RAG: retrieve then generate)
# ==========================================
# Purpose:
# - Uses retrieved chunks as grounded context.
# - Forces the model to answer ONLY from provided excerpts.
# - If not found, the model should say it couldn't find it.
# Output:
# - final answer text + the retrieved hits for transparency.

# System instructions for the chat model. The citation rule spells out what goes
# into each citation: a bare "(Source, page)" template tends to be echoed literally
# (e.g. "(Source, page 3)") instead of being filled in with the PDF file name.
SYSTEM_PROMPT = """You are a policy assistant for India-based employees.
Answer ONLY using the provided policy excerpts.
If the answer is not in the excerpts, say: "I couldn't find this in the provided policies."
Keep the answer concise and practical.
Always end with citations in this format: (<source file name>, page <page number>), using the file name and page number shown in the header of each excerpt you relied on."""

def answer(query: str, top_k: int = TOP_K):
    """Answer ``query`` from retrieved policy excerpts (retrieve, then generate).

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve and pass to the model as context.

    Returns:
        Tuple ``(answer_text, hits)``: the model's reply and the retrieved hits
        that were supplied to it as context.
    """
    hits = retrieve(query, top_k=top_k)

    # One block per hit: a header line with chunk ID, file name and page, then the text.
    excerpt_blocks = []
    for hit in hits:
        meta = hit["meta"]
        header = f"[{hit['id']}] ({meta['source']}, page {meta['page']})"
        excerpt_blocks.append(f"{header}\n{hit['text']}")
    context = "\n\n".join(excerpt_blocks)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Question: {query}\n\nPolicy excerpts:\n{context}"},
    ]
    resp = client.responses.create(model=CHAT_MODEL, input=messages)

    return resp.output_text, hits


In [23]:
# ==========================================
# CELL 9 — Interactive demo (multiple questions)
# ==========================================
# Purpose:
# - Lets the user/examiner ask multiple questions without editing the code each time.
# - Each question is sent to the RAG pipeline:
#   1) retrieve() pulls the most relevant TOP_K chunks from ChromaDB
#   2) answer() sends those chunks + the question to the LLM to generate a grounded response
# - Prints the final answer plus the supporting sources (PDF name + page) for verification.
# How to use:
# - Type a question and press Enter.
# - Type 'exit' to stop the demo loop.

# Ask questions until the user types 'exit' (or the input stream ends).
while True:
    try:
        query = input("Mention Policy name that you want to know about? (or 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted the prompt: end the demo
        # cleanly instead of leaving a traceback in the notebook.
        print("\nInput closed - ending demo.")
        break

    if query.lower() == "exit":
        break

    if not query:
        # A blank line carries no question; skip it rather than spending API
        # calls on it (the embeddings endpoint may reject empty input).
        print("Please type a question (or 'exit').")
        continue

    final_answer, hits = answer(query)

    print("\nANSWER:\n", final_answer)
    print("\nTOP SOURCES USED:")
    for h in hits:
        print("-", h["meta"]["source"], "| page", h["meta"]["page"], "|", h["id"])
    print("\n" + "="*60 + "\n")



Mention Policy name that you want to know about? (or 'exit'):  dress



ANSWER:
 The dress code at XYZ Dummy Company LTD supports a professional workplace and is business casual by default. However, stricter attire may be required at client sites. Employees should avoid clothing with offensive text or images, and safety gear must be used in designated areas (Source, page 3).

TOP SOURCES USED:
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 3 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p3:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 11 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p11:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 2 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p2:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 6 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p6:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 1 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p1:c0




Mention Policy name that you want to know about? (or 'exit'):  leave



ANSWER:
 Employees are entitled to various types of leave, including annual/earned leave and sick leave, depending on employment type and location. Requests for leave should be submitted in the HR system in advance where possible, and medical documentation may be required for extended sick leave. It's important for employees to inform their manager/HR as early as possible and provide required documentation to avoid unexplained absences, which may lead to disciplinary actions (Source, page 6).

TOP SOURCES USED:
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 6 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p6:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 5 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p5:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 10 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p10:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 8 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p8:c0
- XY

Mention Policy name that you want to know about? (or 'exit'):  food



ANSWER:
 I couldn't find this in the provided policies.

TOP SOURCES USED:
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 11 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p11:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 9 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p9:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 3 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p3:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 6 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p6:c0
- XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf | page 4 | XYZ_Dummy_Company_LTD_India_Employee_Policies.pdf:p4:c0




Mention Policy name that you want to know about? (or 'exit'):  exit
