In [1]:
print('hello world')

hello world


In [2]:
import sys
import os

# Go two levels up from the notebook's working directory to the project root so
# that the `src.*` imports used below resolve. The membership check keeps the
# cell idempotent: re-running it does not append the same entry again.
_project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
from dotenv import load_dotenv

load_dotenv()  # reads variables from a .env file and sets them in os.environ

True

In [4]:
from src.fs_utils import get_root_dir
get_root_dir()

PosixPath('/home/sagar/python/rag-tutorial/minor-project')

In [5]:
from src.vector_store import get_collection

In [6]:
root_dir = get_root_dir()

In [7]:
ordinance_collection = get_collection('ordinance')

In [8]:
from langchain_community.document_loaders import PyPDFLoader

In [12]:
b_tech_ordinance_loader = PyPDFLoader(root_dir / "data" / "ordinance" / "b_tech_ordinance.pdf")

In [15]:
b_tech_ordinance_docs = b_tech_ordinance_loader.load()

In [50]:
b_tech_ordinance_docs[0].page_content

'ORDINANCES AND REGULATIONS\nBACHELOR OF TECHNOLOGY (B.TECH.)\nNATIONAL INSTITUTE OF TECHNOLOGY RAIPUR\nOffice of Dean Academics\nNIT Raipur\nCG, India, 492010\ni'

In [24]:
len(ordinance_collection.get()['ids'])

184

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Plain (unstructured) Gemini chat model, invoked once with a greeting.
# Presumably a smoke test that the model name and API credentials work.
test_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_retries=2
)
test_llm.invoke('hello, who are you?')

AIMessage(content='I am a large language model, trained by Google.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash-lite', 'safety_ratings': [], 'grounding_metadata': {}, 'model_provider': 'google_genai'}, id='lc_run--7ce3abb2-5b75-406a-b0eb-b180349782b5-0', usage_metadata={'input_tokens': 7, 'output_tokens': 11, 'total_tokens': 18, 'input_token_details': {'cache_read': 0}})

In [8]:
from pydantic import BaseModel, Field
from typing import List

# Schema the summarization model must fill in for each chunk; it is passed to
# `with_structured_output` below. Documented with comments on purpose: the
# `description` strings are runtime text, so they are left untouched.
class LLMStructureOutput(BaseModel):
    # Cleaned summary of the chunk; later stored as the document text.
    summary: str = Field(description='this is the summary of the chunk after cleaning')
    # Free-form key/value tags; later stored as the entry's metadata.
    metadata_tags: dict = Field(description='this is the metadata applied to the chunk. Keep this precise dont add more than 3-4 key-value paired tags')
    # True when the chunk should be dropped instead of stored.
    skip: bool = Field(description='if the whole chunk is redundant, set this to true')
    

In [9]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model used for chunk summarization. `with_structured_output` wraps it so
# that responses come back as LLMStructureOutput objects rather than raw messages.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_retries=2
).with_structured_output(LLMStructureOutput)

In [10]:
# System message prepended to every chunk sent to `llm` in get_summarized_chunks.
# It asks for a summary, a skip decision and a few metadata tags per chunk.
system_prompt = {"role": "system", "content": "You are an AI assistant helping to summarize text chunks extracted from the ordinance documents of NIT Raipur. Each chunk will be stored in a vector database for a RAG system. Produce a concise, factual summary that keeps all important details. Remove filler, formatting, or irrelevant text. If the entire chunk adds no importance to the ordinance content (like headers, title, index, introduction pages, page numbers, or signatures etc), set skip to true. Generate 3–4 short metadata tags describing the main topics in the chunk."}

In [11]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from tqdm import tqdm

# Shared splitter used by get_summarized_chunks to cut loaded documents into
# chunks before they are sent to the LLM.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # max characters per chunk
    chunk_overlap=100,    # overlap for context continuity
    length_function=len,  # default
)

from tqdm import tqdm

def get_summarized_chunks(docs: list, metadatas: dict | None = None, batch_size: int = 8) -> list:
    if metadatas is None:
        metadatas = {}

    res = []
    chunks = splitter.split_documents(docs)

    # Create all message payloads first
    all_batches = []
    for chunk in chunks:
        messages = [
            system_prompt,
            {"role": "user", "content": chunk.page_content}
        ]
        all_batches.append(messages)

    # Process in batches using llm.batch()
    for i in tqdm(range(0, len(all_batches), batch_size)):
        batch = all_batches[i:i + batch_size]
        try:
            responses = llm.batch(batch)
        except Exception as e:
            print(f"Batch {i // batch_size} failed: {e}")
            continue

        for resp, chunk in zip(responses, chunks[i:i + batch_size]):
            if not resp or getattr(resp, "skip", False):
                continue

            # merge metadata properly
            try:
                resp.metadata_tags |= metadatas
            except Exception:
                pass

            res.append(resp)

    return res
        

In [218]:
# Summarize the B.Tech ordinance from page index 7 onward (the earlier pages are
# presumably front matter; confirm) and tag every chunk with its provenance.
# NOTE(review): no add_to_collection(summarized_chunks) call is visible in this
# notebook; confirm the B.Tech chunks were actually ingested.
summarized_chunks = get_summarized_chunks(
    docs=b_tech_ordinance_docs[7:],
    metadatas=dict(
        degree='B.tech',
        source='https://nitrr.ac.in/downloads/ordinance/B.Tech%20Oridnance%20Updated%20with%20Attachement.pdf',
        program_level='Undergraduate',
    ),
)

100%|███████████████████████████████████████| 12/12 [02:31<00:00, 12.62s/it]


In [122]:
len(summarized_chunks)

90

In [12]:
from uuid import uuid4
def add_to_collection(summarized_chunks):
    """Store summarized chunks in ``ordinance_collection``.

    Each chunk's ``summary`` becomes the document text and its
    ``metadata_tags`` the entry's metadata; every entry gets a fresh random
    hex id.

    Args:
        summarized_chunks: Objects exposing ``summary`` and ``metadata_tags``,
            as returned by ``get_summarized_chunks``.

    Returns:
        None. An empty input is a no-op.
    """
    if not summarized_chunks:
        # Nothing to store; the collection's add() is not called with empty lists.
        return

    ordinance_collection.add(
        ids=[uuid4().hex for _ in summarized_chunks],
        documents=[chunk.summary for chunk in summarized_chunks],
        # Chroma's metadata validation rejects an empty dict but accepts None,
        # so chunks without any tags are stored with no metadata.
        metadatas=[chunk.metadata_tags or None for chunk in summarized_chunks],
    )

In [219]:
# Load the B.Arch ordinance PDF, then summarize it from page index 7 onward
# (earlier pages are presumably front matter; confirm).
b_arch_ordinance_docs = PyPDFLoader(
    root_dir / "data" / "ordinance" / "b_arch_ordinance.pdf"
).load()
b_arch_summarized_chunks = get_summarized_chunks(
    docs=b_arch_ordinance_docs[7:],
    metadatas=dict(degree='B.Arch', program_level='Undergraduate'),
)

100%|███████████████████████████████████████| 12/12 [00:17<00:00,  1.49s/it]


In [224]:
b_arch_summarized_chunks[10]

LLMStructureOutput(summary='Students must register for audit and/or honors courses during semester registration. The Departmental Academic Committee (DAC) must approve courses taken for honors. Registration each semester is mandatory. Students who fail to register for one or more semesters may be allowed to re-register under extraordinary circumstances if they provide justification (medical certificates, affidavits, etc.) and the DAC recommends it.', metadata_tags={'degree': 'B.Arch', 'program_level': 'Undergraduate'}, skip=False)

In [222]:
len(b_arch_summarized_chunks)

94

In [225]:
add_to_collection(b_arch_summarized_chunks)

In [37]:
m_tech_ordinance_docs = PyPDFLoader(root_dir / "data" / "ordinance" / "m_tech_ordinance.pdf").load()

In [38]:
# Summarize the M.Tech ordinance from page index 5 onward (earlier pages are
# presumably front matter; confirm) and tag each chunk with degree and level.
m_tech_summarized_chunks = get_summarized_chunks(
    docs=m_tech_ordinance_docs[5:],
    metadatas=dict(degree='M.Tech', program_level='Postgraduate'),
)

100%|█████████████████████████████████████████| 8/8 [00:14<00:00,  1.85s/it]


In [39]:
add_to_collection(m_tech_summarized_chunks)

In [63]:
mca_ordinance_docs = PyPDFLoader(root_dir / "data" / "ordinance" / "mca_ordinance.pdf").load()

In [65]:
# Summarize the MCA ordinance from page index 7 onward (earlier pages are
# presumably front matter; confirm) and tag each chunk with degree and level.
mca_summarized_chunks = get_summarized_chunks(
    docs=mca_ordinance_docs[7:],
    metadatas=dict(degree='MCA', program_level='Postgraduate'),
)

100%|███████████████████████████████████████| 11/11 [00:17<00:00,  1.61s/it]


In [66]:
add_to_collection(mca_summarized_chunks)

In [77]:
phd_ordinance_docs = PyPDFLoader(root_dir / "data" / "ordinance" / "phd_ordinance.pdf").load()

Ignoring wrong pointing object 2 65536 (offset 0)
Ignoring wrong pointing object 49 65536 (offset 0)
Ignoring wrong pointing object 78 65536 (offset 0)
Ignoring wrong pointing object 88 65536 (offset 0)
Ignoring wrong pointing object 91 65536 (offset 0)
Ignoring wrong pointing object 101 65536 (offset 0)
Ignoring wrong pointing object 104 65536 (offset 0)
Ignoring wrong pointing object 107 65536 (offset 0)
Ignoring wrong pointing object 110 65536 (offset 0)
Ignoring wrong pointing object 113 65536 (offset 0)
Ignoring wrong pointing object 116 65536 (offset 0)
Ignoring wrong pointing object 127 65536 (offset 0)
Ignoring wrong pointing object 130 65536 (offset 0)
Ignoring wrong pointing object 133 65536 (offset 0)
Ignoring wrong pointing object 136 65536 (offset 0)
Ignoring wrong pointing object 139 65536 (offset 0)
Ignoring wrong pointing object 142 65536 (offset 0)
Ignoring wrong pointing object 145 65536 (offset 0)
Ignoring wrong pointing object 148 65536 (offset 0)
Ignoring wrong poi

In [79]:
# Summarize the PhD ordinance from page index 4 onward (earlier pages are
# presumably front matter; confirm) and tag each chunk with degree and level.
phd_summarized_chunks = get_summarized_chunks(
    docs=phd_ordinance_docs[4:],
    metadatas=dict(degree='PHD', program_level='Postgraduate'),
)

100%|███████████████████████████████████████| 12/12 [02:44<00:00, 13.73s/it]


In [80]:
add_to_collection(phd_summarized_chunks)

In [112]:
msc_ordinance_docs = PyPDFLoader(root_dir / "data" / "ordinance" / "msc_ordinance.pdf").load()

In [114]:
# Summarize the M.Sc ordinance from page index 6 onward (earlier pages are
# presumably front matter; confirm) and tag each chunk with degree and level.
msc_summarized_chunks = get_summarized_chunks(
    docs=msc_ordinance_docs[6:],
    metadatas=dict(degree='M.SC', program_level='Postgraduate'),
)

100%|███████████████████████████████████████| 10/10 [02:39<00:00, 15.96s/it]


In [115]:
add_to_collection(msc_summarized_chunks)

# Testing the ordinance collection

In [116]:
from langchain.tools import tool

@tool
def query_vector_store(query: str, filters: dict = {}) -> str:
    """
        Tool: query_vector_store
        
        Purpose: 
        Use this tool to find official rules, policies, and regulations from the NIT Raipur Ordinance documents. 
        It is the best tool for any question related to academic rules, grading, credits, or policies.
        
        Input Fields:
        
        1. query (string):
           A concise semantic search query, derived from the user's question.
           Example: If the user asks, "How is the SPI for B.Tech students calculated?", the query should be "SPI calculation for B.Tech".
        
        2. filters (dict):
           A ChromaDB filter dictionary to narrow the search. Infer the correct filter from the question and provide it
           whenever the question names or implies a degree or program level. Omit it only when no filter applies.
        
        Metadata & Filter Rules:
        
        Available Filter Fields:
        * degree (string): Valid values are "B.tech", "B.Arch", "M.Tech", "MCA", "PHD" OR "M.SC".
          Values are case-sensitive: copy them exactly as listed (note the lowercase "t" in "B.tech").
        * program_level (string): Valid values are "Undergraduate" or "Postgraduate".
        
        Filter Syntax (Strictly Enforced):
        You MUST generate a valid Chroma filter dictionary.
        
        * For a single condition: Use the field name as the top-level key.
            Example: {"degree": {"$eq": "B.tech"}}
        
        * For multiple conditions: You MUST use a single top-level $and operator.
            Example: {"$and": [{"degree": {"$eq": "B.tech"}}, {"program_level": {"$eq": "Undergraduate"}}]}
        
        CRITICAL ERROR TO AVOID:
        Never use multiple fields at the top level.
        * WRONG: {"degree": "B.tech", "program_level": "Undergraduate"}
        * RIGHT: {"$and": [{"degree": {"$eq": "B.tech"}}, {"program_level": {"$eq": "Undergraduate"}}]}
        
        Agent Strategy Hint:
        
        If a user's query asks to compare two different domains (e.g., "How is SPI calculated for B.Tech and B.Arch students?"), do not use an $or filter.
        
        Instead, call this tool multiple times (once for B.Tech, once for B.Arch). This ensures you retrieve the distinct, correct context for each program, which is essential for an accurate answer.
        """
    # The `{}` default is kept so the tool's argument schema is unchanged; it is
    # only read here, never mutated.
    if filters:
        results = ordinance_collection.query(query_texts=[query], n_results=5, where=filters)
    else:
        results = ordinance_collection.query(query_texts=[query], n_results=3)
    docs = results["documents"][0]
    metas = results["metadatas"][0]

    if not docs:
        # Give the model an explicit signal instead of an empty tool message.
        return "No matching ordinance passages were found."

    # Combine results for LLM
    passages = []
    for doc, meta in zip(docs, metas):
        meta = meta or {}  # guard against entries whose metadata is None
        passages.append(f"Source: {meta.get('source')} (Page {meta.get('page', '?')})\n{doc}")
    return "\n\n".join(passages)

In [117]:
# Gemini chat model with the retrieval tool bound to it.
# NOTE(review): `vector_llm` is not referenced by any other visible cell; the
# demo further down builds its own `model_with_tools`. Confirm which is intended.
vector_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_retries=2,
).bind_tools([query_vector_store])

In [121]:
# System prompt for the tool-using assistant. The tool is referred to by its
# registered name, `query_vector_store`, so the instructions match the tool the
# model is actually given.
vector_llm_system_prompt = """
System Prompt: NIT Raipur Academic Assistant

You are an academic assistant for National Institute of Technology Raipur. Your primary function is to provide students with factual and clear information regarding academic regulations and policies.

### 1. Core Directive: Use the Ordinance Tool

Your most important task is to retrieve information.

* You MUST use the `query_vector_store` tool for *any* query that may relate to institute rules, regulations, or policies.
* This includes, but is not limited to: admission, grading, SPI/CPI, attendance, credits, examinations, degree requirements, and academic penalties.

### 2. Rules for Answering

After you receive context from the tool, follow these rules strictly:

* Be 100% Factual: Base your entire answer *only* on the retrieved context. DO NOT add any information, assumptions, or external knowledge, even if it seems helpful.
* Cite Your Source: You MUST conclude your answer by citing the source and page number from the metadata (e.g., "*(Source: Academic Ordinance 2024, Page 15)*").
* Handle Missing Information: If the retrieved context does not contain the answer or is ambiguous, you MUST state: "The provided ordinance does not specify this clearly."
* Handle Multiple Rules: If the context provides different rules (e.g., for B.Tech vs. M.Tech), clearly explain the conditions for each.

### 3. Style Guide

* Tone: Maintain a formal, professional, and helpful student-facing tone.
* Clarity: Use simple, direct language. Avoid jargon.
* Format: Prefer short paragraphs and bullet points for scannability.
* Relevance: Keep your answer strictly focused on the user's specific query. Do not add irrelevant details.

### 4. Summary of Your Workflow

1.  Analyze Query: Does this question *sound* like it's about a rule?
2.  Call Tool: If yes, formulate a query and call `query_vector_store`.
3.  If not, answer it by yourself if you feel confident. Otherwise give a clear response that you are not able to answer that with your current knowledge.
4.  Read Context: Carefully analyze the text and metadata returned by the tool.
5.  Synthesize Answer: Compose a factual answer based *only* on the context, following all style and citation rules.
"""

In [122]:
from langchain.tools import tool
from langchain.messages import HumanMessage, ToolMessage
from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize and bind (potentially multiple) tools to the model.
# NOTE(review): this builds a new gemini-2.5-flash-lite model instead of reusing
# `vector_llm`, and `vector_llm_system_prompt` is not part of `messages`.
# Confirm whether running without the system prompt is intentional.
model_with_tools = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite").bind_tools([query_vector_store])

# Step 1: Model generates tool calls for the user question
messages = [HumanMessage("what kinds of degrees are offered by NIT Raipur?")]
ai_msg = model_with_tools.invoke(messages)
messages.append(ai_msg)

# Check the tool calls in the response
print(ai_msg.tool_calls)

# Step 2: Execute tools and collect results.
# If the model requested no tool calls, this loop does nothing and Step 3
# re-invokes the model with only the question and its own first reply.
for tool_call in ai_msg.tool_calls:
    # Execute the tool with the generated arguments
    tool_result = query_vector_store.invoke(tool_call)
    messages.append(tool_result)

# Step 3: Pass results back to model for final response
final_response = model_with_tools.invoke(messages)

[{'name': 'query_vector_store', 'args': {'query': 'degrees offered by NIT Raipur'}, 'id': '2deb7e38-6c22-4b74-8de4-553ac2fcd60d', 'type': 'tool_call'}]


In [123]:
final_response.content

'The NIT Raipur offers the following degrees:\n- B.Arch (Bachelor of Architecture)\n- M.Tech (Master of Technology)\n- B.Tech (Bachelor of Technology) - Implied by the context of CCMT and SPI calculations for B.Tech students.'