In [1]:
print('hello world')

hello world


In [2]:
import sys
import os

# go two levels up to project root
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

In [3]:
from src.fs_utils import get_root_dir
get_root_dir()

PosixPath('/home/sagar/python/rag-tutorial/minor-project')

In [4]:
from src.vector_store import get_collection

Collection(name=tmp)

In [8]:
root_dir = get_root_dir()

In [9]:
ordinance_collection = get_collection('ordinance')

In [10]:
from langchain_community.document_loaders import PyPDFLoader

In [12]:
b_tech_ordinance_loader = PyPDFLoader(root_dir / "data" / "ordinance" / "b_tech_ordinance.pdf")

In [15]:
b_tech_ordinance_docs = b_tech_ordinance_loader.load()

In [50]:
b_tech_ordinance_docs[0].page_content

'ORDINANCES AND REGULATIONS\nBACHELOR OF TECHNOLOGY (B.TECH.)\nNATIONAL INSTITUTE OF TECHNOLOGY RAIPUR\nOffice of Dean Academics\nNIT Raipur\nCG, India, 492010\ni'

In [114]:
ordinance_collection.delete(where={'degree': 'B.tech'})

In [115]:
ordinance_collection.get()

{'ids': [],
 'embeddings': None,
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': []}

In [116]:
from pydantic import BaseModel, Field
from typing import List

class LLMStructureOutput(BaseModel):
    summary: str = Field(description='this is the summary of the chunk after cleaning')
    metadata_tags: dict = Field(description='this is the metadata applied to the chunk. Keep this precise dont add more than 3-4 key-value paired tags')
    skip: bool = Field(description='if the whole chunk is redundant, set this to true')
    

In [117]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
os.environ['GOOGLE_API_KEY'] = 'AIzaSyAq_9Jgg7rM13m1ng89s_0IX0j7qIf5uSw'
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_retries=2
).with_structured_output(LLMStructureOutput)

In [118]:
system_prompt = {"role": "system", "content": "You are an AI assistant helping to summarize text chunks extracted from the ordinance documents of NIT Raipur. Each chunk will be stored in a vector database for a RAG system. Produce a concise, factual summary that keeps all important details. Remove filler, formatting, or irrelevant text. If the entire chunk adds no importance to the ordinance content (like headers, title, index, introduction pages, page numbers, or signatures etc), set skip to true. Generate 3–4 short metadata tags describing the main topics in the chunk."}

In [205]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from tqdm import tqdm

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # max characters per chunk
    chunk_overlap=100,    # overlap for context continuity
    length_function=len,  # default
)

from tqdm import tqdm

def get_summarized_chunks(docs: list, metadatas: dict | None = None, batch_size: int = 8) -> list:
    if metadatas is None:
        metadatas = {}

    res = []
    chunks = splitter.split_documents(docs)

    # Create all message payloads first
    all_batches = []
    for chunk in chunks:
        messages = [
            system_prompt,
            {"role": "user", "content": chunk.page_content}
        ]
        all_batches.append(messages)

    # Process in batches using llm.batch()
    for i in tqdm(range(0, len(all_batches), batch_size)):
        batch = all_batches[i:i + batch_size]
        try:
            responses = llm.batch(batch)
        except Exception as e:
            print(f"Batch {i // batch_size} failed: {e}")
            continue

        for resp, chunk in zip(responses, chunks[i:i + batch_size]):
            if not resp or getattr(resp, "skip", False):
                continue

            # merge metadata properly
            try:
                resp.metadata_tags.extend(
                    [{"key": k, "value": v} for k, v in metadatas.items()]
                )
            except Exception:
                pass

            res.append(resp)

    return res
        

In [121]:
summarized_chunks = get_summarized_chunks(
    b_tech_ordinance_docs[7:],
    {
        'degree': 'B.tech',
        'source': 'https://nitrr.ac.in/downloads/ordinance/B.Tech%20Oridnance%20Updated%20with%20Attachement.pdf',
        'program_level': 'Undergraduate',
    }
)

100%|███████████████████████████████████████| 93/93 [04:36<00:00,  2.97s/it]


In [122]:
len(summarized_chunks)

90

In [203]:
from uuid import uuid4
def add_to_collection(summarized_chunks):
    ordinance_collection.add(
        ids = [uuid4().hex for i in range(len(summarized_chunks))],
        documents=[chunk.summary for chunk in summarized_chunks],
        metadatas=[chunk.metadata_tags for chunk in summarized_chunks]
    )

In [206]:
b_arch_ordinance_docs = PyPDFLoader(root_dir / "data" / "ordinance" / "b_arch_ordinance.pdf").load()
b_arch_summarized_chunks = get_summarized_chunks(
    b_arch_ordinance_docs[7:],
    metadatas={
        'degree': 'B.Arch',
        'program_level': 'Undergraduate'
    }
)
add_to_collection(b_arch_summarized_chunks)

100%|███████████████████████████████████████| 12/12 [00:20<00:00,  1.75s/it]


ValueError: Expected metadata to be a non-empty dict, got 0 metadata attributes in add.

In [None]:
len(b_arch_summarized_chunks)

# Testing the ordinance collection

In [181]:
from langchain.tools import tool

@tool
def query_vector_store(query: str) -> str:
    """ Use this to look up official rules or policies from the NIT Raipur Ordinance documents. 
    The input should be a clear academic question.
    You can apply the following metadata filters:
    1. degree: [B.tech],
    2. program_level: [Undergraduate]
    """
    results = ordinance_collection.query(query_texts=[query], n_results=3)
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    # Combine results for LLM
    res =  "\n\n".join(
        f"Source: {meta.get('source')} (Page {meta.get('page', '?')})\n{doc}"
        for doc, meta in zip(docs, metas)
    )
    return res

In [182]:
vector_llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    max_retries=2,
).bind_tools([query_vector_store])

In [188]:
vector_llm_system_prompt = """You are an academic assistant for National Institute of Technology Raipur.
You have access to an internal ordinance knowledge base that includes summarized and structured information about all academic regulations, rules, and policies.

Your goals:

Use the OrdinanceRetriever tool whenever a question may relate to institute rules, regulations, or policies (like admission, grading, attendance, credits, or degree requirements).

Read the retrieved context carefully and compose a factual, clear, and concise answer in simple language.

If multiple rules exist, explain the differences or conditions.

If you are not fully sure or context is missing, say:
“The provided ordinance does not specify this clearly.”

Always include the source and page number (from metadata) if available.

Style rules:

Use formal, student-facing tone.

Avoid assumptions or made-up information.

Prefer short paragraphs and bullet points for clarity.

Keep the response strictly relevant to the user’s query.

Example reasoning flow:

Interpret query → decide if ordinance info is needed → call OrdinanceRetriever → summarize retrieved text → answer factually with sources."""

In [199]:
from langchain.tools import tool
from langchain.messages import HumanMessage, ToolMessage
from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize and bind (potentially multiple) tools to the model
model_with_tools = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite").bind_tools([query_vector_store])

# Step 1: Model generates tool calls
messages = [HumanMessage("explain about the minor project that students do in their 7th semester. Explain what is the weightage of different components.")]
ai_msg = model_with_tools.invoke(messages)
messages.append(ai_msg)

# Check the tool calls in the response
print(ai_msg.tool_calls)

# Step 2: Execute tools and collect results
for tool_call in ai_msg.tool_calls:
    # Execute the tool with the generated arguments
    tool_result = query_vector_store.invoke(tool_call)
    messages.append(tool_result)

# Step 3: Pass results back to model for final response
final_response = model_with_tools.invoke(messages)

[{'name': 'query_vector_store', 'args': {'degree': 'B.tech', 'program_level': 'Undergraduate', 'query': 'Minor project in 7th semester and its weightage'}, 'id': '9fb2f00a-deec-458c-808f-7467d736db8f', 'type': 'tool_call'}]


In [200]:
final_response.content

'Students undertake their minor project in the 7th semester. The evaluation is based on a mid-semester evaluation, a field evaluation report from their supervisor, and a final presentation and viva-voce.\n\nThe weightage of different components is as follows:\n*   **Continuous evaluation:** 40 marks (by the project guide)\n*   **Mid-term review:** 20 marks\n*   **End-semester viva-voce:** 40 marks\n\nDepartments are responsible for developing rubrics for evaluating project work, which include components such as topic selection, problem statement, literature review, methodology, oral and written presentation, and viva-voce examination. Plagiarism is considered unethical and will lead to strict disciplinary action.'