In [1]:
import os
import re
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document

In [2]:
load_dotenv()

True

## Read Data

In [3]:
DATA_PATH = "documents"

documents = []

for file in os.listdir(DATA_PATH):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(DATA_PATH, file))
        docs = loader.load()
        documents.extend(docs)

print(f"Loaded {len(documents)} pages.")


Loaded 26 pages.


## Adaptive Chunking with Metadata

In [4]:
def extract_metadata_from_filename(filename):
    mapping = {
        "1_": "overview",
        "2_": "onboarding",
        "3_": "employment",
        "4_": "conduct",
        "5_": "compensation",
        "6_": "benefits",
        "7_": "leave",
        "8_": "performance",
        "9_": "it_security",
        "10_": "safety"
    }
    for key, value in mapping.items():
        if filename.startswith(key):
            return value
    return "general"


In [5]:
## Section-aware Chunking 

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100
)

enhanced_docs = []

for doc in documents:
    filename = os.path.basename(doc.metadata.get("source", "unknown"))
    category = extract_metadata_from_filename(filename)

    chunks = text_splitter.split_text(doc.page_content)

    for chunk in chunks:
        enhanced_docs.append(
            Document(
                page_content=chunk,
                metadata={
                    "source_file": filename,
                    "category": category
                }
            )
        )

print(f"Total Chunks Created: {len(enhanced_docs)}")
print(enhanced_docs[0].metadata)


Total Chunks Created: 178
{'source_file': '6_ Employee Benefits and Perks.pdf', 'category': 'benefits'}


## Create Embedding Store

In [6]:
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

vector_store = FAISS.from_documents(enhanced_docs, embeddings)
vector_store.save_local("adaptive_vector_store")

print("Vector store created.")


  embeddings = OpenAIEmbeddings(


Vector store created.


## Retrieval 

In [46]:
retriever = vector_store.as_retriever(
    search_kwargs={
        "k": 8
    }
)

## Lightweight Re-Ranking

In [10]:
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0
)

def rerank(query, docs, top_n=4):
    scored_docs = []

    for doc in docs:
        prompt = f"""
        Rate relevance of this passage to the query on scale 1-10.
        Query: {query}
        Passage: {doc.page_content}
        Only return number.
        """
        score = llm.predict(prompt)
        
        try:
            score = int(score.strip())
        except:
            score = 5

        scored_docs.append((doc, score))

    scored_docs.sort(key=lambda x: x[1], reverse=True)

    return [doc for doc, score in scored_docs[:top_n]]


  llm = ChatOpenAI(


## Putting it together 

In [45]:
query = "What are the company's policies on remote work?"

In [47]:
initial_docs = retriever.get_relevant_documents(query)

In [48]:
initial_docs

[Document(id='724936f5-78ed-4a4f-82af-5ea9fc152232', metadata={'source_file': '3_ General Employment Policies.pdf', 'category': 'employment'}, page_content='remote\n \nwork\n \nfor\n \npart\n \nof\n \nthe\n \nweek\n \nor\n \nfull-time,\n \ndepending\n \non\n \nteam\n \nneeds.\n \nIf\n \nyour\n \nposition\n \nis\n \neligible\n \nfor\n \nremote\n \nor\n \nhybrid\n \nwork,\n \nyou\n \nwill\n \ndiscuss\n \nthe\n \ndetails\n \nwith\n \nyour\n \nmanager\n \n(e.g.,\n \nwhich\n \ndays\n \nyou\n \ncome\n \nto\n \nthe\n \noffice\n \nversus\n \nwork\n \nfrom\n \nhome).\n \nAll\n \nremote\n \nwork\n \narrangements\n \nmust\n \nensure\n \nyou\n \nhave\n \na\n \nproper\n \nworkspace\n \nand\n \nreliable\n \ninternet.\n \nWe\n \nexpect\n \nremote\n \nemployees\n \nto\n \nbe\n \nfully\n \nengaged\n \nduring\n \nwork\n \nhours\n \n‚Äì\n \nthat\n \nmeans\n \nbeing\n \navailable\n \nonline,\n \nattending'),
 Document(id='6c4286f3-19c1-4188-bda8-2613a9e876ef', metadata={'source_file': '3_ General Employme

In [8]:
def adaptive_rag(query):

    print("\nüîç Retrieving...")
    initial_docs = retriever.get_relevant_documents(query)

    print(f"Initial Retrieved: {len(initial_docs)}")

    reranked_docs = rerank(query, initial_docs)

    print(f"After ReRank: {len(reranked_docs)}")

    context = "\n\n".join([doc.page_content for doc in reranked_docs])

    final_prompt = f"""
    You are an HR policy assistant.
    Use ONLY the context below to answer.
    If not found, say 'The document does not specify this.'

    Context:
    {context}

    Question:
    {query}
    """

    answer = llm.predict(final_prompt)

    return answer


In [14]:
response = adaptive_rag(query)


üîç Retrieving...
Initial Retrieved: 8
After ReRank: 4


In [15]:
print(response)

The company allows remote work for part of the week or full-time, depending on team needs. If your position is eligible for remote or hybrid work, you will discuss the details with your manager, including which days you will come to the office versus work from home. All remote work arrangements must ensure you have a proper workspace and reliable internet. Remote employees are expected to be fully engaged during work hours, which includes being available online, attending meetings, and meeting productivity expectations similar to in-office staff. The company supports work-life balance and offers flexible work arrangements. Additionally, a stipend or company equipment may be provided to support your home office setup if needed.


## Version 2

In [68]:
def detect_category_from_query(query):
    query = query.lower()

    mapping = {
        "leave": ["leave", "vacation", "holiday", "sick", "parental"],
        "benefits": ["401k", "insurance", "hsa", "benefits", "medical"],
        "compensation": ["salary", "bonus", "payroll", "raise"],
        "it_security": ["password", "mfa", "security", "phishing"],
        "safety": ["fire", "evacuation", "emergency", "injury"],
        "performance": ["review", "promotion", "pip", "career"],
        "conduct": ["harassment", "confidential", "nda", "ethics"],
        "onboarding": ["orientation", "new hire", "first day"],
        "employment": ["work hours", "attendance", "remote"]
    }

    for category, keywords in mapping.items():
        if any(keyword in query for keyword in keywords):
            return category

    return None


In [69]:
def adaptive_metadata_retrieval(query):

    category = detect_category_from_query(query)

    if category:
        print(f"üîé Detected Category: {category}")

        retriever = vector_store.as_retriever(
            search_kwargs={
                "k": 6,
                "filter": {"category": category}
            }
        )
    else:
        print("‚ö† No category detected. Using global search.")
        retriever = vector_store.as_retriever(search_kwargs={"k": 6})

    docs = retriever.get_relevant_documents(query)

    return docs


In [72]:
category = detect_category_from_query(query)
category

'it_security'

In [70]:
docs = adaptive_metadata_retrieval(query)

üîé Detected Category: it_security


In [71]:
docs

[Document(id='e0738924-83f6-44fb-b979-e8331e782ca5', metadata={'source_file': '9_ IT Usage and Security Policies.pdf', 'category': 'it_security'}, page_content='account  compromise  (e.g.,  you  accidentally  entered  your  password  on  a  phishing  site),  report  it  \nto\n \nIT\n \nimmediately\n \nso\n \nwe\n \ncan\n \nsecure\n \nthe\n \naccount.\n \nData  Protection  and  Privacy:  Depending  on  your  role,  you  might  handle  sensitive  data  ‚Äì  like  \ncustomer\n \ninformation,\n \nfinancial\n \nrecords,\n \nor\n \npersonal\n \ndata\n \nof\n \nemployees.\n \nAlways\n \nstore\n \nsuch\n \ndata\n \nin\n \napproved\n \nlocations\n \n(like\n \ncompany\n \ncloud\n \nstorage\n \nor\n \nservers)\n \nand\n \nnot\n \non\n \nunapproved\n \npersonal\n \ndrives\n.\n \nOur\n \ncompany\n \nsystems\n \n(like\n \nthe\n \ncorporate\n \nGoogle'),
 Document(id='72eb327b-3d8e-4a7e-9a9e-6e116d12b7f2', metadata={'source_file': '9_ IT Usage and Security Policies.pdf', 'category': 'it_security'}, p

In [18]:
def adaptive_rag_with_metadata(query):

    docs = adaptive_metadata_retrieval(query)

    reranked_docs = rerank(query, docs)

    context = "\n\n".join([doc.page_content for doc in reranked_docs])

    final_prompt = f"""
    You are an HR policy assistant.
    Use only the context below.
    If not found, say 'The document does not specify this.'

    Context:
    {context}

    Question:
    {query}
    """

    return llm.predict(final_prompt)


In [None]:
response = adaptive_rag_with_metadata("How many paid holidays do employees get?")

üîé Detected Category: leave


In [20]:
response

'Employees get 10 paid holidays each year.'

In [22]:
response = adaptive_rag_with_metadata("What are the company's policies on remote work?")

üîé Detected Category: employment


In [24]:
print(response)

The document specifies that the company supports remote work for part of the week or full-time, depending on team needs. If a position is eligible for remote or hybrid work, employees will discuss the details with their manager, including which days to come to the office versus working from home. All remote work arrangements must ensure a proper workspace and reliable internet. Remote employees are expected to be fully engaged during work hours, which includes being available online, attending meetings, and meeting productivity expectations similar to in-office staff. The company also provides a stipend or company equipment to support home office setups if needed.


## Upgrade 

In [51]:
intent_prompt = f"""
Classify this HR question into one category:
[leave, benefits, compensation, safety, conduct, onboarding, performance, it_security, employment]

Question: {query}

Only return category name.
"""
intent = llm.predict(intent_prompt)
intent

'employment'

## Version 3.0

In [73]:
def rewrite_query(query):
    rewrite_prompt = f"""
    Rewrite the HR policy question to be more specific and retrieval-friendly.
    Preserve meaning. Expand vague references.

    Question:
    {query}

    Rewritten Question:
    """
    rewritten = llm.predict(rewrite_prompt)
    return rewritten.strip()


In [74]:
response =rewrite_query("What are the company's policies on remote work?")
print(response)

What are the specific policies and guidelines established by the company regarding remote work arrangements, including eligibility criteria, communication expectations, and any required documentation or approval processes?


In [75]:
from IPython.display import Markdown, display

# Render the existing `response` variable as Markdown in the notebook
display(Markdown(f"### Response\n\n{response}"))

### Response

What are the specific policies and guidelines established by the company regarding remote work arrangements, including eligibility criteria, communication expectations, and any required documentation or approval processes?

In [34]:
#!pip install rank_bm25


In [54]:
from rank_bm25 import BM25Okapi

# Prepare corpus
tokenized_corpus = [doc.page_content.split() for doc in enhanced_docs]
bm25 = BM25Okapi(tokenized_corpus)


In [36]:
bm25

<rank_bm25.BM25Okapi at 0x13bb40110>

In [76]:
def hybrid_retrieve(query, top_k=8, category=None):

    # Semantic retrieval
    if category:
        semantic_retriever = vector_store.as_retriever(
            search_kwargs={"k": top_k, "filter": {"category": category}}
        )
    else:
        semantic_retriever = vector_store.as_retriever(search_kwargs={"k": top_k})

    semantic_docs = semantic_retriever.get_relevant_documents(query)

    # BM25 retrieval
    tokenized_query = query.split()
    bm25_scores = bm25.get_scores(tokenized_query)

    top_bm25_indices = sorted(
        range(len(bm25_scores)),
        key=lambda i: bm25_scores[i],
        reverse=True
    )[:top_k]

    bm25_docs = [enhanced_docs[i] for i in top_bm25_indices]

    # Merge and deduplicate
    combined = {doc.page_content: doc for doc in semantic_docs + bm25_docs}

    return list(combined.values())


### Re-Ranking Layer 

    We retrieve ~12 candidates ‚Üí re-rank ‚Üí keep top 4.

In [77]:
def rerank(query, docs, top_n=4):

    scored = []

    for doc in docs:
        prompt = f"""
        Score relevance (1-10).

        Query:
        {query}

        Passage:
        {doc.page_content}

        Only return a number.
        """
        score = llm.predict(prompt)

        try:
            score = int(score.strip())
        except:
            score = 5

        scored.append((doc, score))

    scored.sort(key=lambda x: x[1], reverse=True)

    return [doc for doc, score in scored[:top_n]]


In [78]:
def generate_answer(query, context_docs):

    context = "\n\n".join(
        f"[Source: {doc.metadata['source_file']}]\n{doc.page_content}"
        for doc in context_docs
    )

    final_prompt = f"""
    You are an HR policy assistant.

    Use ONLY the provided context.
    Provide:

    1. Clear Answer
    2. Bullet list of Source Documents used.

    If not found, say:
    "The document does not specify this."

    Context:
    {context}

    Question:
    {query}

    Answer:
    """

    return llm.predict(final_prompt)


In [58]:
def adaptive_rag_v3(user_query):

    print("Original Query:", user_query)

    # 1. Rewrite
    rewritten_query = rewrite_query(user_query)
    print("Rewritten Query:", rewritten_query)

    # 2. Intent Detection
    category = detect_category_from_query(rewritten_query)
    print("Detected Category:", category)

    # 3. Hybrid Retrieval
    retrieved_docs = hybrid_retrieve(
        rewritten_query,
        top_k=8,
        category=category
    )

    print("Retrieved Candidates:", len(retrieved_docs))

    # 4. ReRank
    top_docs = rerank(rewritten_query, retrieved_docs)
    print("After ReRank:", len(top_docs))

    # 5. Generate
    answer = generate_answer(rewritten_query, top_docs)

    return answer


In [42]:
response = adaptive_rag_v3("How many paid holidays do employees get?")

Original Query: How many paid holidays do employees get?
Rewritten Query: What is the total number of paid holidays that employees are entitled to each year according to the company's HR policy?
Detected Category: leave
Retrieved Candidates: 14
After ReRank: 4


In [43]:
display(Markdown(f"## Response\n\n{response}"))

## Response

1. Employees are entitled to 10 paid holidays each year according to the company's HR policy.

2. Bullet list of Source Documents used:
   - [Source: 7_ Leave and Time Off Policies.pdf]

In [62]:
#query = "What behaviors are considered violations of the code of conduct?"
query = "What should I do if I suspect a security breach?"
response = adaptive_rag_v3(query)

Original Query: What should I do if I suspect a security breach?
Rewritten Query: What specific steps should I take if I suspect a security breach involving sensitive employee data or company information within our organization?
Detected Category: it_security
Retrieved Candidates: 14
After ReRank: 4


In [65]:
response

'The document does not specify this. \n\n- Source Documents used:\n  - 9_ IT Usage and Security Policies.pdf\n  - 4_ Code of Conduct and Ethics.pdf'

In [66]:
display(Markdown(f"## Response\n\n{response}"))

## Response

The document does not specify this. 

- Source Documents used:
  - 9_ IT Usage and Security Policies.pdf
  - 4_ Code of Conduct and Ethics.pdf

In [None]:
## Additional Code
benchmark = [
    {
        "query": "How many paid holidays?",
        "expected_category": "leave",
        "expected_doc": "7_ Leave and Time Off Policies.pdf"
    },
    ...
]

def evaluate_retrieval(benchmark, retriever):

    correct = 0

    for item in benchmark:
        docs = retriever.get_relevant_documents(item["query"])

        retrieved_sources = [
            doc.metadata["source_file"]
            for doc in docs
        ]

        if item["expected_doc"] in retrieved_sources:
            correct += 1

    recall = correct / len(benchmark)

    return recall
