###### S1. I have the chunks in the format List[Dict] with required metadata and text
###### S2. Next, create a function to create index or if it already exist 
###### S3. create a function to upsert the embeddings into the already created index
###### S4. create a function to check if the if the particular document is already there
###### S5. Create a function 

In [1]:
from src.ingestion import ingest

In [2]:
ready_chunks = ingest("docs\HR Handbook 2025 for website.pdf")

In [4]:
ready_chunks[1]

{'id': 'chunk-1-HR Handbook 2025 for website.pdf',
 'page_no': '2',
 'source': 'HR Handbook 2025 for website.pdf',
 'chunk_text': 'Rules 7-9 2) JOINING 2.1 General Terms of Offer of Letter 10 2.2 Joining Forms 10 2.3 Joining Formalities and subsequent follow-up by HR 11 2.4 Fixation of Salary 11 2.5 Terms mentioned in the Service Contract Agreement 11-12 3) WORK CYCLE 3.1 Personal File & Service Book 13 3.2 Time & Attendance 13 3.3 Leave Entitlement 14-15 3.4 Promotion, Upgradation and Career Progression Process 15-16 3.5 Salary and Increments 16 3.6 National Pension System (NPS) 16-17 3.7 Gratuity 17 3.8 Non-Performing St'}

In [6]:
from pinecone import Pinecone
from src.config import (
    PINECONE_API_KEY,
    CLOUD,
    INDEX_NAME,
    REGION
)

In [7]:
pc = Pinecone(
    api_key= PINECONE_API_KEY
)

In [8]:
import time

def get_or_create_index(index_name : str = INDEX_NAME,
                          cloud : str = CLOUD,
                          region : str = REGION):
    if not pc.has_index(index_name):
        pc.create_index_for_model(
            name = index_name,
            cloud= cloud,
            region= region,
            embed={"model" : "llama-text-embed-v2", "field_map" : {'text' : "chunk_text"}})
        print(f"Index- {INDEX_NAME} is creating...")
        while not pc.describe_index(index_name).status.get('ready', False):
            time.sleep(1)
        print(f"Index- {INDEX_NAME} is created")
    else:
        print(f"Index- {INDEX_NAME} is already created")

    return pc.Index(index_name)


  


In [5]:
from typing import List, Dict

In [None]:


BATCH_SIZE = 96

def upsert_chunks(chunks: List[Dict], index_name: str):
    index = get_or_create_index(index_name)

    records = []

    for chunk in chunks:
        records.append({
            "_id": chunk["id"],
            "chunk_text": chunk["chunk_text"],
            "source": chunk["source"],
            "page_no": chunk["page_no"]
        })

    # Batch upsert
    for i in range(0, len(records), BATCH_SIZE):
        batch = records[i:i+BATCH_SIZE]
        index.upsert_records("example-namespace", records=batch)


In [55]:
chunks = ingest("docs\HR Handbook 2025 for website.pdf")


In [56]:
upsert_chunks(chunks=chunks, index_name=INDEX_NAME)

Index- rag-pipeline-reranker is already created


In [11]:
index = get_or_create_index(INDEX_NAME)

results = index.search(
    namespace="example-namespace", 
    query={
        "inputs": {"text": "What are the Joining Formalities and subsequent follow-up by HR Division"}, 
        "top_k": 5
    },
    fields=["source", "chunk_text", "page_no"]
)

#temp_result = results

Index- rag-pipeline-reranker is already created


In [12]:
results

{'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '3409',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 17 Feb 2026 14:40:50 GMT',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '186',
                                    'x-pinecone-api-version': '2025-10',
                                    'x-pinecone-max-indexed-lsn': '5',
                                    'x-pinecone-response-duration-ms': '188'}},
 'result': {'hits': [{'_id': 'chunk-38-HR Handbook 2025 for website.pdf',
                      '_score': 0.5703608989715576,
                      'fields': {'chunk_text': 'and may recommend disciplinary '
                                               'action as deemed fit. 2.2 '
                                               'Joining Forms At the time of '
        

In [77]:
new = temp_result.get("result", {}).get("hits", {})

In [78]:
new

[{'_id': 'chunk-38-HR Handbook 2025 for website.pdf',
  '_score': 0.5703608989715576,
  'fields': {'chunk_text': 'and may recommend disciplinary action as deemed '
                           'fit. 2.2 Joining Forms At the time of joining the '
                           'institute, the selected candidate has to report to '
                           'HR Division to complete the joining formalities by '
                           'filling the following forms: 1. Joining undertaking '
                           '2. Joining Report 3. Insurance Forms (Health & Term '
                           'Insurance) 4. Hometown & Dependent Declaration form '
                           '5. HRA Undertaking 6. ID Card 7. Medical Fitness '
                           'Certificate 8. NPS Account Creation Form 11 9. '
                           'Gratuity nominee form 10. Service book relevant',
             'page_no': '10,11',
             'source': 'HR Handbook 2025 for website.pdf'}},
 {'_id': 'chunk-1-HR

In [79]:
retreived = []

for r in new:
    d = {}
    d['id'] = r.get("_id", "")
    d['score'] = r.get("_score", 0)
    d['chunk_text'] = r.get("fields", {}).get("chunk_text")
    d['page_no'] = r.get("fields", {}).get("page_no", "")
    d['source'] = r.get("fields", {}).get("source", "")
    retreived.append(d)

In [80]:
retreived

[{'id': 'chunk-38-HR Handbook 2025 for website.pdf',
  'score': 0.5703608989715576,
  'chunk_text': 'and may recommend disciplinary action as deemed fit. 2.2 Joining Forms At the time of joining the institute, the selected candidate has to report to HR Division to complete the joining formalities by filling the following forms: 1. Joining undertaking 2. Joining Report 3. Insurance Forms (Health & Term Insurance) 4. Hometown & Dependent Declaration form 5. HRA Undertaking 6. ID Card 7. Medical Fitness Certificate 8. NPS Account Creation Form 11 9. Gratuity nominee form 10. Service book relevant',
  'page_no': '10,11',
  'source': 'HR Handbook 2025 for website.pdf'},
 {'id': 'chunk-1-HR Handbook 2025 for website.pdf',
  'score': 0.5012588500976562,
  'chunk_text': 'Rules 7-9 2) JOINING 2.1 General Terms of Offer of Letter 10 2.2 Joining Forms 10 2.3 Joining Formalities and subsequent follow-up by HR 11 2.4 Fixation of Salary 11 2.5 Terms mentioned in the Service Contract Agreement 11-12 

In [1]:
from pinecone import Pinecone
from src.config import PINECONE_API_KEY, INDEX_NAME
from typing import List, Dict

In [2]:
pc = Pinecone(api_key=PINECONE_API_KEY)
def search_vector_db(index_name : str = INDEX_NAME, namespace : str = "example-namespace", top_k : int = 5, query : str = "Explain about Promotion, Upgradation, And Career Progression Process") -> List[Dict]:
    index = pc.Index(index_name)

    results = index.search(
    namespace=namespace, 
    query={
        "inputs": {"text": query}, 
        "top_k": top_k
    },
    fields=["source", "chunk_text", "page_no"]
)
    retreived = []

    for r in results.get("result", {}).get("hits", {}):
        d = {}
        d['id'] = r.get("_id", "")
        d['score'] = r.get("_score", 0)
        d['chunk_text'] = r.get("fields", {}).get("chunk_text")
        d['page_no'] = r.get("fields", {}).get("page_no", "")
        d['source'] = r.get("fields", {}).get("source", "")
        retreived.append(d)
    
    return retreived
    

In [3]:
search_vector_db(INDEX_NAME)

[{'id': 'chunk-56-HR Handbook 2025 for website.pdf',
  'score': 0.5592735409736633,
  'chunk_text': 'on a case-by-case basis. 3.4 Promotion, Upgradation, And Career Progression Process This section deals with furtherance of the work cycle of the staff after joining and includes general norms for promotion and financial upgradation, in line with the Recruitment & Promotion Rules for non-teaching staff 2024. General Norms i. In case there are enough internal candidates available for promotion to the next cadre level (i.e., Pay Level-6, 8, etc.), the essential qualifications and experience specif',
  'page_no': '15',
  'source': 'HR Handbook 2025 for website.pdf'},
 {'id': 'chunk-60-HR Handbook 2025 for website.pdf',
  'score': 0.5211267471313477,
  'chunk_text': 'be subject to the completion of the following: i. Qualifying service at the required Pay Level ii. Satisfactory Performance as per Appraisal reports and performance benchmarks iii. Subject to the availability of a vacancy as per

## rerank

In [39]:
def search_vector_db_reranker(index_name : str = INDEX_NAME, namespace : str = "example-namespace", top_k : int = 5, query : str = "What is a work cycle?") -> List[Dict]:
    pc = Pinecone(api_key=PINECONE_API_KEY)
    index = pc.Index(index_name)
    results = index.search(
    namespace=namespace, 
    query={
        "inputs": {"text": query}, 
        "top_k": top_k
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 5,
        "rank_fields": ["chunk_text"]
        },
    fields=["source", "chunk_text", "page_no"]
)
    
    retreived = []

    for r in results.get("result", {}).get("hits", {}):
        d = {}
        d['id'] = r.get("_id", "")
        d['score'] = r.get("_score", 0)
        d['chunk_text'] = r.get("fields", {}).get("chunk_text")
        d['page_no'] = r.get("fields", {}).get("page_no", "")
        d['source'] = r.get("fields", {}).get("source", "")
        retreived.append(d)
    
    return retreived
    

In [40]:
retreived_result =search_vector_db_reranker(INDEX_NAME)

In [41]:
retreived_result

[{'id': 'chunk-46-HR Handbook 2025 for website.pdf',
  'score': 0.3631950318813324,
  'chunk_text': 'al information- employee code, designation, division, PRAN etc. ii. Personal information – Date of birth, nationality, contact information, emergency contact information, educational qualification, category, address (correspondence & permanent), hometown, family details, height, blood group & identification mark etc. iii. Employee work cycle updates- joining details, pay fixation, probation, medical fitness, term/promotion/financial upgradation, subsequent educational qualifications (acquired),',
  'page_no': '13',
  'source': 'HR Handbook 2025 for website.pdf'},
 {'id': 'chunk-1-HR Handbook 2025 for website.pdf',
  'score': 0.09585607796907425,
  'chunk_text': 'Rules 7-9 2) JOINING 2.1 General Terms of Offer of Letter 10 2.2 Joining Forms 10 2.3 Joining Formalities and subsequent follow-up by HR 11 2.4 Fixation of Salary 11 2.5 Terms mentioned in the Service Contract Agreement 11-12 3)

## Generation -->

In [42]:
retreived_result

[{'id': 'chunk-46-HR Handbook 2025 for website.pdf',
  'score': 0.3631950318813324,
  'chunk_text': 'al information- employee code, designation, division, PRAN etc. ii. Personal information – Date of birth, nationality, contact information, emergency contact information, educational qualification, category, address (correspondence & permanent), hometown, family details, height, blood group & identification mark etc. iii. Employee work cycle updates- joining details, pay fixation, probation, medical fitness, term/promotion/financial upgradation, subsequent educational qualifications (acquired),',
  'page_no': '13',
  'source': 'HR Handbook 2025 for website.pdf'},
 {'id': 'chunk-1-HR Handbook 2025 for website.pdf',
  'score': 0.09585607796907425,
  'chunk_text': 'Rules 7-9 2) JOINING 2.1 General Terms of Offer of Letter 10 2.2 Joining Forms 10 2.3 Joining Formalities and subsequent follow-up by HR 11 2.4 Fixation of Salary 11 2.5 Terms mentioned in the Service Contract Agreement 11-12 3)

In [43]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="openai/gpt-oss-120b",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [44]:
SYSTEM_PROMPT = """You are a helpful assistant that answers questions based on the provided context.
Use ONLY the context below to answer. If the answer is not in the context, say "I don't have enough information to answer that."

CITATION RULES:
- Each context chunk is labeled [1], [2], etc. with its source document and page number(s).
- When you use information from a chunk, cite it inline like [1], [2], etc.
- At the end of your answer, add a "References" section listing each cited source with page numbers.
- Format: [n] source_filename, p.X

"""

In [45]:
def build_context_block(chunks: List[Dict]) -> str:
    """Format retrieved chunks into a numbered context string with page info."""
    parts = []
    for i, c in enumerate(chunks, 1):
        source = c.get("source", "unknown")
        pages = c.get("pages", "")
        text = c.get("chunk_text", "")
        page_label = f", p.{pages}" if pages else ""
        parts.append(f"[{i}] (source: {source}{page_label})\n{text}")
    return "\n\n".join(parts)



def generate_answer(query: str, chunks: List[Dict]) -> str:
    context = build_context_block(chunks)

    messages = [
        ("system", SYSTEM_PROMPT),
        ("human", f"Context:\n{context}\n\n---\nQuestion: {query}"),
    ]

    ai_msg = llm.invoke(messages)
    return ai_msg.content

In [46]:
generate_answer(query="What is a work cycle?", chunks = retreived_result)

'A **work cycle** is the series of HR‑managed processes and record‑keeping activities that track an employee’s service from the moment of joining through all subsequent updates.  It includes maintaining a personal file and service book, monitoring time‑and‑attendance, leave entitlement, promotion, up‑gradation and career‑progression steps, salary and increments, NPS and gratuity, as well as documenting specific updates such as pay fixation, probation status, medical fitness, term‑promotions/financial up‑gradation, newly acquired educational qualifications, internal transfers, any misconduct or penalties, and long‑leave details【2】.  These elements together constitute the employee’s “work cycle” throughout their tenure【3】【4】.  \n\n**References**  \n[2] HR Handbook 2025 for website.pdf, p.13‑17 (WORK CYCLE sections)  \n[3] HR Handbook 2025 for website.pdf, p.4‑5 (EMPLOYEE WORK CYCLE UPDATES)  \n[4] HR Handbook 2025 for website.pdf, p.6 (EMPLOYEE WORK CYCLE UPDATES)'