In [None]:
!pip install pypdf



In [None]:
from tqdm import tqdm

### Reading the PDF and storing the page-wise content

In [None]:
from pypdf import PdfReader

# Open the rule book, report its page count, then extract the text of every
# page in page order (tqdm only adds a progress bar around the iteration).
reader = PdfReader("UG_RULE_BOOK.pdf")
print(f"Pages: {len(reader.pages)}")
page_wise_text = [page.extract_text() for page in tqdm(reader.pages)]

Pages: 52


100%|██████████| 52/52 [00:02<00:00, 21.44it/s]


In [None]:
print(page_wise_text[4])

5                                      
Move to Index 
6.9 Display of In-semester performance 32 
6.10 Disclosing evaluated material in a course 32 
6.11 Answer script retention period 32 
6.12 Compensatory time to PwD students 32 
7 REGISTRATION OF NPTEL/ SWAYAM COURSES 33 
7.1 Credit Equivalence of NPTEL/SWAYAM courses with IIT Bombay 
courses 33 
7.2 Policy for registering/ award of grades for NPTEL/ SWAYAM/ Semes-
ter Exchange courses 33 
8 CHANGE OF BRANCH 34 
8.1 Conversion to LASE programme 34 
9 PERFORMANCE REQUIREMENTS 36 
9.1 Award of degree 36 
9.2 Award of Medals 37 
9.3 Transfer to Academic Rehabilitation Programme (ARP) 37 
9.4 Exit Degree option for Undergraduates 37 
9.5 Early Exit for Dual Degree/ IDDDP Students 38 
10 UNDERGRADUATE RESEARCH AWARDS (URA) 40 
11 ACADEMIC REHABILITATION PROGRAMME (ARP) 41 
12 
ALLOWING B. TECH. AND B.S. STUDENTS TO LEAVE AFTER COMPLET-
ING THEIR CREDIT REQUIREMENTS FOR B.TECH./B.S. EARLIER THAN 
4 YEARS 
41 
13 CONVERSION FROM B.TECH. TO

### Making an Embedding Class to ensure Local Embeddings rather than API Calls

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings
from typing import List

class LocalEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by a SentenceTransformer model.

    Vectors are computed in-process by ``self.model`` instead of by a remote
    embedding API, so the class can be handed to any LangChain component that
    expects an ``Embeddings`` object.
    """

    def __init__(self, model_name: str):
        """Load the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts.

        Args:
            texts: The strings to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.
            An empty input yields an empty list.
        """
        texts = list(texts)
        if not texts:
            return []
        # A single encode() call lets sentence-transformers batch the texts
        # internally instead of running the model once per text.
        return self.model.encode(texts).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string and return its vector as a list of floats."""
        return self.model.encode(query).tolist()

In [None]:
# Instantiate the local embedding wrapper around the "all-MiniLM-L6-v2" SentenceTransformer.
model = LocalEmbeddings("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


### Indexation

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

### Splitting the page texts into overlapping chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Tag every chunk with the 1-based number of the page it came from, so that
# retrieved chunks can be traced back to the rule book (page_wise_text holds
# one string per page, in page order).
page_metadatas = [{"page": page_no} for page_no in range(1, len(page_wise_text) + 1)]
chunks = splitter.create_documents(page_wise_text, metadatas=page_metadatas)


In [None]:
len(chunks)

174

In [None]:
## preparing the vector_store
# `embeddings` is just another name for the LocalEmbeddings instance bound to `model`.
embeddings = model
# Embed every chunk with that model and index the resulting vectors in a FAISS store.
vector_store = FAISS.from_documents(chunks, embeddings)

In [None]:
vector_store.index_to_docstore_id

{0: 'ca695774-1517-4ca1-a728-e2562b448f66',
 1: '84319ce6-283d-4c42-ab04-f81be3156f2e',
 2: '9f107847-ea09-4ea3-b121-de8970d2f72a',
 3: '99ab3aa0-2f9b-4a16-922a-09cad0b5a486',
 4: 'c49589bd-a821-400e-b6b5-d9b932536db7',
 5: 'fa2c6f51-e1f5-4755-95fa-9963db02fbf2',
 6: '4c3144b2-2007-4fd3-a134-b93d49c528db',
 7: 'c3baefb3-1517-4798-98a8-d301a4c64fc8',
 8: 'a6b41261-3211-44ee-95fb-6f868ffc9012',
 9: '9d80a399-5ef8-49cb-b039-3e1f8f13bb70',
 10: 'fc528e70-142a-441b-8915-0b947e6227bd',
 11: '44c6672f-9036-4534-bfc8-18994a983500',
 12: '5bf224ce-cf93-48bb-a299-0c07275b3abf',
 13: '859f1c26-3437-4690-8332-6f1df502b1e8',
 14: 'c674fe88-a3f3-428c-bf31-aa84ad95e653',
 15: '07c2706b-f015-45c6-a6b8-a08ca8ffc430',
 16: 'b9d02d85-c2c1-4a46-844b-3f221cbc0d09',
 17: '08eee5d8-f8a7-446d-b0d0-903514c3cdf7',
 18: 'bc25f894-ce0b-4d62-97c1-0e4ba9eeba63',
 19: '0f2fbc5f-7c6a-40a2-b9ca-057d60738185',
 20: '65bad9bd-00ab-4b58-aac3-600205785a27',
 21: 'a9042b4e-3c57-4d9c-a651-3c56bef1a953',
 22: 'aebf6723-b678-

### Retrieval

In [None]:
## creating the retriever object
# Similarity search over the vector store, returning k = 5 chunks per query.
retriever = vector_store.as_retriever(search_type = 'similarity', search_kwargs = {'k': 5})

In [None]:
retriever.invoke('What is the max number of credits in a semester?')

[Document(id='27d22d99-6cb3-4e86-a2ab-0e66a6559f4a', metadata={}, page_content='5.2 Permissible Registration Load (Ref: 235th Senate meeting) \nStudents are allowed to register as per their academic standing subject to a maximum credit limit \nas given below: \nAcademic Standing Maximum Credits Allowed \nCategory I  54 credits \nCategory II, III, IV  48 credits \nCategory V 36 credits (Ref: 261st Senate meeting) \nCategory VI (ARP) 24 credits \nThe Faculty Adviser is the competent authority to approve course registration in all cases. \nEvery student must register for a minimum of 18 credits each semester. In case a student has com-\npleted most of the minimum credit requirements, she/he may register for the remaining courses \nwhich are available in that semester, which may be less than 18 credits. \nThe maximum credits allowed as given above cannot be exceeded under any circumstances. While \nthese are the maximum allowed credits the Faculty Adviser may prescribe lower credits especi

In [None]:
!pip install langchain_google_genai

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.6-py3-none-any.whl.metadata (7.0 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain_google_genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Downloading langchain_google_genai-2.1.6-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: filetype, google-ai-generativelanguage, langchain_google_genai
  Attempting uninstall: google-ai-generativelangu

In [None]:
import os
from google.colab import userdata
# Copy the Colab secret named 'GEMINI_KEY' into the GOOGLE_API_KEY environment
# variable (presumably where the Gemini client looks for its key -- confirm).
os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI_KEY')

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI


llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5)

# Quick smoke test that the model is reachable. A dedicated variable name is
# used because `prompt` is bound to the PromptTemplate further down the
# notebook; reusing the name for a plain string invites stale-state bugs.
smoke_test_prompt = "Explain what RAG is"
response = llm.invoke(smoke_test_prompt)
print(response.content)

RAG stands for **Retrieval-Augmented Generation**. It's a technique used in natural language processing (NLP) that combines the strengths of two different approaches:

*   **Retrieval:**  This involves searching a large external knowledge base (like a database, a collection of documents, or the internet) to find relevant information.
*   **Generation:** This uses a large language model (LLM) to generate text, such as answering a question, summarizing a document, or creating content.

**How RAG works:**

1.  **User Input (Query):**  The process starts with a user asking a question or providing a prompt.

2.  **Retrieval:**
    *   The user's query is used to search a knowledge base.  This search is often done using techniques like semantic search or keyword search to find documents or passages that are relevant to the query.
    *   The retrieved information is a set of relevant documents or passages from the knowledge base.

3.  **Augmentation:**
    *   The original user query is comb

### Augmentation

In [None]:
from langchain_core.prompts import PromptTemplate

In [None]:
# Prompt that restricts the model to the retrieved rule book text. The
# retrieved chunks are substituted for {context} and the user's question for
# {question}. The wording says "rule book" (not "transcript") because the
# context is text extracted from the UG rule book PDF.
prompt = PromptTemplate(
    template = """
      You are a helpful assistant.
      Answer about the UG academic rules of IITB ONLY from the provided rule book context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

### Generation (Setting up the main LangChain chain)

In [None]:
## call invoke once, and the entire pipeline runs
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda

In [None]:
def format_docs(retrieved_docs):
  """Merge retrieved documents into a single context string.

  Args:
    retrieved_docs: Iterable of objects exposing a ``page_content`` string.

  Returns:
    The ``page_content`` values, in order, separated by a blank line.
  """
  separator = "\n\n"
  page_texts = [document.page_content for document in retrieved_docs]
  return separator.join(page_texts)

In [None]:
# Fan the incoming question out to two branches and collect the results in a
# dict with the keys the prompt template expects.
parallel_chain = RunnableParallel(
    context=retriever | RunnableLambda(format_docs),  # retrieved chunks joined into one context string
    question=RunnablePassthrough(),  # the question itself, forwarded unchanged
)

In [None]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [None]:
# Full pipeline: build {context, question} -> fill the prompt template -> call the LLM -> parse the reply with StrOutputParser.
main_chain = parallel_chain | prompt | llm | parser

In [None]:
main_chain.invoke("What is the max number of credits that can be done in a semester?")

'The maximum number of credits allowed in a semester depends on the academic standing:\n\n*   Category I: 54 credits\n*   Category II, III, IV: 48 credits\n*   Category V: 36 credits\n*   Category VI (ARP): 24 credits'

In [None]:
main_chain.invoke("What do these categories mean?")

'The academic standing of a student is categorized as follows:\n\n*   **Category I:** A student who has a CPI of at least 8 and no outstanding FR/DX/DR/W grade in a core course.\n*   **Category II:** A student with a CPI less than 8 and no outstanding FR/DX/DR/W grade in a core course.\n*   **Category III:** A student who has at least one outstanding FR/DX/DR/W in core courses and at most one FR or DX grade in any other course in two preceding regular registered semesters, subject to having earned at least 18 credits in each of the semesters. These 18 credits may be earned in courses with any TAG.\n*   **Category IV:** A student who has at least one outstanding FR/DX/DR/W in core courses and more than one FR or DX grade in any other course in two preceding regular registered semesters, subject to having earned at least 18 credits in each of the semesters. These 18 credits may be earned in courses with any TAG.\n*   **Category V:** A student who has not earned at least 18 credits (in co

In [None]:
main_chain.invoke("What is the difference between minors and honours?")

'A minor is an additional credential a student may earn if they do 30 credits worth of additional courses in a discipline other than their major discipline. Honours is an additional credential a student may earn if they opt for the extra 24 credits needed for this in their own discipline.'

In [None]:
main_chain.invoke("What is an 'II' grade? How does it get converted to another grade?")

'An "II" grade is a place holder and gets converted to an appropriate grade after a semester-end re-examination, as per applicability.\n\nHere\'s how it can be converted:\n\n*   **Project-based courses:** For R&D projects like BTP-1, BTP-2, DDP-1, and the first stage of certain BS/MSc projects, a letter grade must replace the II grade within 20 days after the last date for grade submission of the previous semester. Otherwise, it becomes an FR. For the final stage of BS/MSc projects and DDP-2, there\'s an initial 20-day window, with a possible 15-day extension (total 35 days) if recommended by the guide and approved by the DUGC. After this, it becomes an FR.\n*   **Re-examination:** If a student is awarded an II grade, they are expected to appear for a re-examination. If they fail to appear for the re-examination, the instructor will award an "FR" grade again. If the absence in the re-exam is due to medical / valid reasons, the student must submit supporting documents within seven days 

In [None]:
print(format_docs(retriever.invoke("What are credits?")))

other activities like seminar, project, etc., every semester.  
The curriculum is designed to permit B.Tech., B.S. and B.De s. students, who are not identified as 
academically weak, to optionally take additional courses. The freedom to take about six credits 
every semester after the first year, permits a student to satisfy her/his interests / abilities and aspi-
rations. 
It is expected that all students with reasonably good academic standing, utilize this surplus time for 
enhancing their academic learning experience, though the initiative is left entirely to them. They 
can use it to credit an assortment of courses/projects anywhere in the Institute, (subject to require-
ments of each of these courses being met), to gain a wide exposure. These additional academic 
accomplishments will find a separate mention in the transcript (refer Sec.6.8.4). They can also credit 
courses in specific areas which can qualify as a minor/ honours. They may alternatively devote part

18 credits may b

In [None]:
main_chain.invoke("In what all matters does the DEAN have the final say?")

'Based on the provided text, the Dean (Academic Programme) has the final say on matters colored yellow and underlined. The Dean also approves courses offered during the summer.'

In [None]:
main_chain.invoke("Which matters are yellow and underlined")

'Based on the provided text, a rule that is colored yellow and underlined means that the final authority is the Associate Dean (Academic Programme)/ Dean (Academic Programme).'

In [None]:
main_chain.invoke("What are credits?")

'A certain quantum of academic work measured in terms of credits is laid down as the requirement for a particular degree. A student earns credits by satisfactorily clearing courses/ other academic activities every semester. The credits associated with a course/ other academic activity are dependent on the number of hours of work expected to be put in by the student per week.'

In [None]:
main_chain.invoke("Can we change branches? I need the latest info")

'Yes, branch change is possible with certain restrictions.\n\n*   **JEE Advanced 2023 onwards:** Branch change into JOSAA administered branches is not allowed for students admitted through JEE (Advanced) 2023 onwards.\n*   **LASE Programme:** Branch change to the Liberal Arts, Sciences and Engineering Programme (LASE) is possible from the academic year 2022-2023. Admission to LASE is based on a written test, personal interview, and academic performance (CPI).\n*   **Timing:** Branch changes can only be done once at the beginning of the second academic year.\n*   **Eligibility Criteria:**\n    *   Completion of prescribed course credits in the first two semesters.\n    *   No FR/DX/DR/W grades at the end of the first two regular registered semesters.\n    *   NP grade in NOCS is not a bar.\n    *   NP grade in GC 101 is not a bar.\n    *   "Eligibility-CPI" of at least 7.0.\n*   **Other criteria:**\n    *   "Branch-Change-CPI" of at least 9 AND a seat is available in the desired branch.

In [None]:
main_chain.invoke("Is branch change still allowed?")

'Branch change into any of the JOSAA administered branches of admission is not allowed for students admitted through JEE (Advanced) 2023 onwards. However, branch change to the LASE program is possible.'

In [None]:
main_chain.invoke("Is there an attendance policy?")

'Yes, there is an attendance policy. Here\'s what the provided text says about it:\n\n*   Attendance in classes is mandatory from the very beginning of the semester. Students who miss even a single lecture from among the first three lectures of a course, are liable to have themselves deregistered from the corresponding course.\n*   It is expected that all students should have 100% attendance in courses.\n*   If a student has less than 80% of the total expected attendance, the instructor may award a ‘Drop due to inadequate attendance’, ‘DX’ grade in that course. This grade is treated as equivalent to a ‘Course Drop’. No specific concession is given for lack of attendance on medical grounds.\n*   If a student has 80% attendance or more, they cannot be awarded ‘Drop due to inadequate attendance, unless the instructor has declared in the first week that no DX grade will be awarded.\n*   For summer courses, a minimum of 80% attendance is compulsory. Instructors may award an "FR" grade to st

In [None]:
main_chain.invoke('Inter Disciplinary Dual Degree Program?')

"Here's what the document says about the Interdisciplinary Dual Degree Programme (IDDDP):\n\n*   There are guidelines for the Interdisciplinary Dual Degree Programme (IDDDP), as per the 259th Senate Meeting.\n*   For students admitted into the IDDDP program, the CMInDS may prescribe additional pre-requisite courses.\n*   To be eligible for the IDDDP in AI and ML, students need to complete specific requirements including completing equivalent courses by the 6th semester (waived for the first year of admission), completing 4 PG level courses, and completing two stages of the Dual Degree Project."

In [None]:
main_chain.invoke('IDDDP?')

'Here\'s what the provided text says about the IDDDP:\n\n*   **General Information:**\n    *   IDDDP is only for the movement of students from one academic unit to another.\n    *   A DD specialization / M.Tech. program usually requires the completion of 8 to 9 courses of 6 credits and a DD/M.Tech. project (DDP/MTP) of 74 - 92 credits.\n    *   IDDDP students (2021 batch onwards) should complete the B.Tech./ B.S. of their parent academic unit.\n    *   If the host academic unit offering M.Tech. has a UG programme with honours, IDDDP students will be required to do 24 credits from the Honours basket and at least 24 credits from PG.\n    *   For M.Sc./ MBA, requirements will be as approved by the Senate.\n    *   An admitting academic unit can prescribe additional courses over and above that required for IDDDP, which may differ depending on the parent academic unit of the incoming student.\n    *   An admitting academic unit must prescribe required courses to the student at the time of a

In [None]:
main_chain.invoke('W grade?')

'W Course Withdrawn'

### Saving the Vector Store & the Embedding Model for future use

In [None]:
# Persist the FAISS vector store under the local path 'ugac_rb' so it can be reused later.
vector_store.save_local('ugac_rb')

In [None]:
# Persist the SentenceTransformer wrapped by LocalEmbeddings (`embeddings.model`) under "Transformer_Model".
embeddings.model.save_pretrained("Transformer_Model")

### Recognising the color

In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
import fitz  # PyMuPDF

def extract_background_highlights(pdf_path):
    """Sample the rendered colour at the centre of every word of a PDF.

    Each word reported by ``page.get_text("words")`` is rendered on its own
    (the page clipped to the word's bounding box, 150 dpi) and the pixel at
    the centre of that rendering is read.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A list with one dict per word: ``{"page": 1-based page number,
        "text": the word, "color": (r, g, b)}`` with components scaled to
        0-1. ``color`` is ``None`` when the rendering has no pixel to sample.
    """
    results = []
    # The context manager closes the document even if rendering fails part-way.
    with fitz.open(pdf_path) as doc:
        for pno, page in tqdm(enumerate(doc, start=1), total=len(doc)):
            words = page.get_text("words")  # list of (x0,y0,x1,y1,word, block, line, word_no)
            for x0, y0, x1, y1, word, *_ in words:
                rect = fitz.Rect(x0, y0, x1, y1)
                pix = page.get_pixmap(clip=rect, dpi=150)
                samples = pix.samples
                # Byte offset of the centre pixel. pix.n is the number of bytes
                # per pixel (3 for RGB without alpha, 4 with alpha); assuming a
                # fixed 4 reads the wrong, channel-misaligned bytes.
                idx = (pix.height // 2) * pix.stride + (pix.width // 2) * pix.n
                if pix.n < 3 or idx + 3 > len(samples):
                    color = None  # empty or non-RGB rendering: nothing to sample
                else:
                    r, g, b = samples[idx], samples[idx + 1], samples[idx + 2]
                    # Convert to float 0-1
                    color = (r/255, g/255, b/255)
                results.append({"page": pno, "text": word, "color": color})
    return results


data = extract_background_highlights("UG_RULE_BOOK.pdf")
# Preview only the first few words: printing one line per word of the whole
# PDF floods the notebook output.
for item in data[:20]:
    print(item)


52it [00:48,  1.06it/s]

{'page': 1, 'text': 'INDIAN', 'color': (0.0, 0.03529411764705882, 0.0)}
{'page': 1, 'text': 'INSTITUTE', 'color': (0.03529411764705882, 0.0, 0.0)}
{'page': 1, 'text': 'OF', 'color': (0.03529411764705882, 0.0, 0.0)}
{'page': 1, 'text': 'TECHNOLOGY', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'BOMBAY', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'Rules', 'color': (0.03529411764705882, 0.0, 0.0)}
{'page': 1, 'text': '&', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'Regulations', 'color': (0.03529411764705882, 0.0, 0.0)}
{'page': 1, 'text': 'for', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'Undergraduate', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'Programmes', 'color': (0.0, 0.0, 0.03529411764705882)}
{'page': 1, 'text': 'Applicable', 'color': (0.0, 0.0, 0.03529411764705882)}
{'page': 1, 'text': 'to', 'color': (1.0, 1.0, 1.0)}
{'page': 1, 'text': 'the', 'color': (1.0, 1.0, 0.4823529411764706)}
{'page': 1, 'text': 'B.Tech.,', 'color': (1.0, 1.0, 0.3764705882352941)}
{'page': 




In [None]:
# Show a slice rather than the full per-word list, which is far too long to display.
data[:20]

[{'page': 1, 'text': 'INDIAN', 'color': (0.0, 0.03529411764705882, 0.0)},
 {'page': 1, 'text': 'INSTITUTE', 'color': (0.03529411764705882, 0.0, 0.0)},
 {'page': 1, 'text': 'OF', 'color': (0.03529411764705882, 0.0, 0.0)},
 {'page': 1, 'text': 'TECHNOLOGY', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'BOMBAY', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'Rules', 'color': (0.03529411764705882, 0.0, 0.0)},
 {'page': 1, 'text': '&', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'Regulations', 'color': (0.03529411764705882, 0.0, 0.0)},
 {'page': 1, 'text': 'for', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'Undergraduate', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'Programmes', 'color': (0.0, 0.0, 0.03529411764705882)},
 {'page': 1, 'text': 'Applicable', 'color': (0.0, 0.0, 0.03529411764705882)},
 {'page': 1, 'text': 'to', 'color': (1.0, 1.0, 1.0)},
 {'page': 1, 'text': 'the', 'color': (1.0, 1.0, 0.4823529411764706)},
 {'page': 1, 'text': 'B.Tech.,', 'color': (1.0, 1.0, 0

In [None]:
import fitz  # PyMuPDF
import io
import re
from langchain_core.documents.base import Document

# Step 1: Extract sentences + bbox per page
def extract_sentences_with_bbox(pdf_path):
    """Split every page of a PDF into sentences and estimate a box for each.

    For each page, the text spans from ``page.get_text("dict", sort=True)``
    are joined with single spaces. The joined text is split after ``.``,
    ``!`` or ``?`` followed by whitespace. A sentence's box is the union of
    the boxes of all spans whose characters overlap the sentence.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        ``(sentences_data, doc)``. ``sentences_data`` is a list of
        ``(page_number, sentence, bbox)`` tuples with a 1-based page number
        and ``bbox`` a ``fitz.Rect`` (``None`` if no span overlapped).
        ``doc`` is the still-open document; the caller should close it when
        done with it.
    """
    doc = fitz.open(pdf_path)
    sentences_data = []
    for pno, page in enumerate(doc, start=1):
        spans = []
        struct = page.get_text("dict", sort=True)
        for block in struct["blocks"]:
            for line in block.get("lines", []):
                for span in line["spans"]:
                    spans.append((span["text"], fitz.Rect(span["bbox"])))
        text = " ".join(t for t, _ in spans)

        # Character range [first, last) that each span occupies inside `text`.
        span_ranges = []
        offset = 0
        for t, r in spans:
            span_ranges.append((offset, offset + len(t), r))
            offset += len(t) + 1  # +1 for the joining space

        cursor = 0
        for sent in re.split(r'(?<=[.!?])\s+', text):
            sent = sent.strip()
            if not sent:
                continue
            # Locate the sentence in the joined text instead of assuming that
            # exactly one character separates consecutive sentences: the split
            # consumes whole whitespace runs, so a running counter drifts.
            start = text.find(sent, cursor)
            if start < 0:  # defensive; the pieces are substrings of `text`
                start = cursor
            end = start + len(sent)

            sbbox = None
            for first, last, r in span_ranges:
                if first >= end:
                    break  # spans are in text order; nothing later can overlap
                if last > start:
                    # Copy the first rect so the span's own Rect is never aliased.
                    sbbox = fitz.Rect(r) if sbbox is None else sbbox | r
            sentences_data.append((pno, sent, sbbox))
            cursor = end
    return sentences_data, doc

# Step 2: Sample background color via pixmap
def sample_color_for_rect(page, rect, dpi=150):
    """Return the rendered colour at the centre of ``rect`` on ``page``.

    Args:
        page: Object with a ``get_pixmap(clip=..., dpi=...)`` method (a
            PyMuPDF page in this notebook).
        rect: Region to render, or ``None``.
        dpi: Resolution used for the rendering.

    Returns:
        ``(r, g, b)`` with components scaled to 0-1, or ``None`` when ``rect``
        is ``None`` or the rendering has no RGB pixel to sample.
    """
    if rect is None:
        return None
    pix = page.get_pixmap(clip=rect, dpi=dpi)
    samples = pix.samples
    # Byte offset of the centre pixel. pix.n is the number of bytes per pixel
    # (3 for RGB without alpha, 4 with alpha); hard-coding 4 reads the wrong,
    # channel-misaligned bytes for pixmaps rendered without alpha.
    idx = (pix.height // 2) * pix.stride + (pix.width // 2) * pix.n
    if pix.n < 3 or idx + 3 > len(samples):
        return None
    r, g, b = samples[idx:idx + 3]
    return (r/255, g/255, b/255)

# Map color to human label
def label_color(rgb):
    """Name a sampled colour.

    Args:
        rgb: ``(r, g, b)`` sequence with components in 0-1, or ``None``.

    Returns:
        ``"none"`` for ``None``, ``"yellow"`` when red and green are within
        0.1 of 1 and blue is within 0.1 of 0, otherwise ``"other"``.

    Note: a later cell of this notebook defines another ``label_color``;
    whichever definition ran last is the one in effect.
    """
    if rgb is None:
        return "none"
    tolerance = 0.1
    looks_yellow = (
        abs(rgb[0] - 1) < tolerance
        and abs(rgb[1] - 1) < tolerance
        and abs(rgb[2]) < tolerance
    )
    return "yellow" if looks_yellow else "other"

# Step 3: Build Documents with color metadata
sentences, doc_obj = extract_sentences_with_bbox("UG_RULE_BOOK.pdf")
documents = []
try:
    for page_num, sent, bbox in sentences:
        page = doc_obj[page_num-1]  # page numbers in `sentences` are 1-based
        rgb = sample_color_for_rect(page, bbox)
        lbl = label_color(rgb)
        documents.append(Document(page_content=sent, metadata={"page": page_num, "color": lbl}))
finally:
    # The open PDF handle is only needed for colour sampling; release it.
    doc_obj.close()

# Step 4: Split the sentence Documents into chunks.
# (Nothing is embedded or indexed in this cell; note that `splitter` and
# `chunks` are rebound here, replacing the objects from the Indexation cell.)
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)

In [None]:
len(documents), len(chunks)

(1117, 1128)

In [None]:
main_chain.invoke("What do the highlight colours mean?")

"Here's what the colors indicate regarding the final authority for a rule:\n\n*   **Green:** The final authority for the rule is the Convener DUGC.\n*   **Yellow and Underlined:** The final authority is Associate Dean (Academic Programme)/ Dean (Academic Programme).\n*   **Yellow (without underline):** The Convener, UGAPEC is the authority.\n*   **Blue:** The final authority is the Senate.\n*   **Uncolored:** The rule is to be implemented strictly."

In [None]:
import fitz  # PyMuPDF

def extract_paragraphs_with_color(pdf_path):
    """Collect the text blocks of a PDF together with one colour per block.

    For every text block (``type == 0``) from
    ``page.get_text("dict", sort=True)``, the span texts are concatenated
    and the first non-zero ``span["color"]`` value is kept.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A list of dicts ``{"page": 1-based page number, "text": stripped
        block text, "color": first non-zero span colour or None}``. Blocks
        whose text is empty after stripping are skipped.

    NOTE(review): PyMuPDF documents ``span["color"]`` as the *text* colour
    (sRGB integer), not a background/highlight fill -- confirm this is the
    colour the analysis needs. Span texts are concatenated without any
    separator between lines -- verify that words at line breaks do not fuse.
    """
    results = []
    # The context manager closes the document once all blocks are collected.
    with fitz.open(pdf_path) as doc:
        for pno, page in enumerate(doc, start=1):
            struct = page.get_text("dict", sort=True)
            for block in struct["blocks"]:
                if block.get("type", None) != 0 or "lines" not in block:
                    continue
                text = ""
                colors = []
                for line in block["lines"]:
                    for span in line["spans"]:
                        text += span["text"]
                        colors.append(span.get("color", None))
                if text.strip():
                    # First colour that is neither missing nor 0 (0 is black).
                    color = next((c for c in colors if c), None)
                    results.append({"page": pno, "text": text.strip(), "color": color})
    return results

# One dict per non-empty text block of the rule book: {"page", "text", "color"}.
paragraphs = extract_paragraphs_with_color("UG_RULE_BOOK.pdf")


In [None]:
def rgb_from_int(color):
    """Unpack a ``0xRRGGBB`` integer into an ``(r, g, b)`` tuple of floats in 0-1."""
    return ((color >> 16) & 255)/255, ((color >> 8) & 255)/255, (color & 255)/255


# Every label that label_color can produce; such strings are passed through unchanged.
_COLOR_LABELS = ("green", "yellow", "blue", "none", "other")


def label_color(c):
    """Map a colour value to one of the labels used in document metadata.

    Args:
        c: ``None``, a ``0xRRGGBB`` integer, a list/tuple whose first three
            items are r, g, b (compared against 0-1 thresholds, so presumably
            already scaled to 0-1 -- confirm), or a label string previously
            returned by this function.

    Returns:
        ``"green"``, ``"yellow"`` or ``"blue"`` when the colour passes the
        corresponding threshold test; ``"none"`` for ``None`` or any other
        colour; ``"other"`` for sequences with fewer than three components or
        unrecognised strings. An existing label is returned unchanged, so the
        function is safe to apply to data that has already been labelled.
    """
    if c is None:
        return "none"
    # Already a label (e.g. records produced by grouping on label_color's own
    # output): shifting a str as if it were an int would raise TypeError.
    if isinstance(c, str):
        return c if c in _COLOR_LABELS else "other"
    # Unpack only RGB, regardless of extra channels
    if isinstance(c, (list, tuple)):
        if len(c) < 3:
            return "other"
        r, g, b = c[:3]
    else:
        # c is an integer: convert to an RGB tuple
        r, g, b = rgb_from_int(c)

    # Now classify colors
    if g > 0.8 and r < 0.5 and b < 0.5:
        return "green"
    if r > 0.8 and g > 0.8 and b < 0.5:
        return "yellow"
    if b > 0.8 and r < 0.5 and g < 0.5:
        return "blue"
    return "none"



In [None]:
from langchain_core.documents.base import Document

# One LangChain Document per extracted text block; the metadata keeps the
# source page and the colour label derived from the block's raw colour value.
docs = [
    Document(
        page_content=para["text"],
        metadata={"page": para["page"], "color": label_color(para["color"])}
    )
    for para in paragraphs
]


In [None]:
len(docs)

1004

In [None]:
from collections import defaultdict

# Bucket the paragraph texts by (page number, colour label), keeping the order
# in which the paragraphs were extracted inside each bucket.
groups = defaultdict(list)
for para in paragraphs:
    groups[(para["page"], label_color(para["color"]))].append(para["text"])

# Collapse each bucket into a single record whose text is the bucket's
# paragraphs joined by newlines. "color" holds the label string here, not the
# raw colour value.
merged = [
    {"page": page_no, "color": colour_label, "text": "\n".join(bucket)}
    for (page_no, colour_label), bucket in groups.items()
]


In [None]:
len(merged)

54

In [None]:
merged[0]['text']

'INDIAN INSTITUTE OF TECHNOLOGY BOMBAY\nRules & Regulations  for Undergraduate Programmes\nApplicable to the B.Tech., B.S., B.Des.,   Dual Degree students admitted from the\nAcademic Year 2007 - 2008\nUpdated: June, 2025'

In [None]:
# peek into the color values
# Print the "color" field of the first ten merged records together with its
# Python type, to check what kind of value `merged` stores there.
sample = merged[:10]
for item in sample:
    print(item["color"], type(item["color"]))

none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>
none <class 'str'>


In [None]:
# One Document per merged (page, colour) group.
docs = [
    Document(
        page_content=para["text"],
        # `merged` already stores the colour *label* (a string) for each group,
        # so it is used as is; running label_color on it again raised
        # "TypeError: unsupported operand type(s) for >>: 'str' and 'int'".
        metadata={"page": para["page"], "color": para["color"]}
    )
    for para in merged
]


TypeError: unsupported operand type(s) for >>: 'str' and 'int'