# 1. Loading The PDFs

In [1]:
import os
import fitz  # This is the import for PyMuPDF
from tqdm import tqdm

# Define the path to your folder containing the PDF files
pdf_folder_path = "PDFs"

# Check if the PDF folder exists
if not os.path.exists(pdf_folder_path):
    print(f"Error: The folder '{pdf_folder_path}' was not found.")
else:
    # Get a list of all PDF files in the folder
    pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf")]

    if not pdf_files:
        print(f"No PDF files found in the '{pdf_folder_path}' folder.")
    else:
        print(f"Found {len(pdf_files)} PDF files to process.")

Found 65 PDF files to process.


# 2. Extract Text from all PDFs

In [2]:
import os
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io
from tqdm import tqdm

# --- CONFIGURATION ---
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pdf_folder_path = "PDFs"
pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf")]
# -------------------

processed_documents = []

print("Starting optimized extraction (text, tables, and images)...")

for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
    file_path = os.path.join(pdf_folder_path, pdf_file)
    full_text = ""

    # --- COMBINED PyMuPDF STEP: Open the PDF once for text and images ---
    try:
        doc_fitz = fitz.open(file_path)
        # 1. Extract Plain Text and Images in the same loop
        for page_num, page in enumerate(doc_fitz):
            # Extract plain text
            full_text += page.get_text()

            # Extract text from images on the same page
            image_list = page.get_images(full=True)
            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = doc_fitz.extract_image(xref)
                image_bytes = base_image["image"]
                
                try:
                    image = Image.open(io.BytesIO(image_bytes))
                    ocr_text = pytesseract.image_to_string(image)
                    if ocr_text.strip():
                        full_text += f"\n\n--- IMAGE OCR DATA ---\n{ocr_text.strip()}\n--- END IMAGE OCR ---\n"
                except Exception as e:
                    pass # Ignore images that Tesseract can't read
        doc_fitz.close()
    except Exception as e:
        print(f"Error processing {pdf_file} with PyMuPDF: {e}")


    # --- 2. Extract Table Data using pdfplumber (still a separate step) ---
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                tables = page.extract_tables()
                for table in tables:
                    if table:
                        table_text = "\n".join([" | ".join(map(str, row)) for row in table])
                        full_text += f"\n\n--- TABLE DATA ---\n{table_text}\n--- END TABLE ---\n"
    except Exception as e:
        print(f"Could not process tables in {pdf_file} with pdfplumber: {e}")

    processed_documents.append(full_text)

print(f"\nOptimized extraction complete. Loaded {len(processed_documents)} documents.")

Starting optimized extraction (text, tables, and images)...


Processing PDFs:   0%|          | 0/65 [00:00<?, ?it/s]

MuPDF error: library error: FT_New_Memory_Face(LNWSSX+NotoSerif-Italic): unknown file format



Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:   5%|▍         | 3/65 [01:28<31:28, 30.45s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:   8%|▊         | 5/65 [02:06<22:50, 22.84s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:   9%|▉         | 6/65 [02:33<23:52, 24.28s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:  18%|█▊        | 12/65 [04:17<14:39, 16.59s/it]Cannot set gray stroke color because /'P0' is an invalid 

MuPDF error: library error: FT_New_Memory_Face(OXKTKW+NotoSerif-Italic): unknown file format



Processing PDFs:  26%|██▌       | 17/65 [07:54<36:02, 45.06s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:  29%|██▉       | 19/65 [10:28<49:19, 64.34s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:  31%|███       | 20/65 [10:48<38:21, 51.14s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Processing PDFs:  32%|███▏      | 21/65 [11:53<40:27, 55.17s/it]Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an inval


Optimized extraction complete. Loaded 65 documents.





# Implementing the Hybrid Chunking Strategy

In [3]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
import re

# --- 1. Initialize the embedding model and the Semantic Chunker ---
# The semantic chunker uses an embedding model to measure the distance between sentences.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

semantic_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile" # Uses a percentile of distance scores as the breakpoint
)

# --- 2. Implement the Hybrid Chunking Logic ---
final_chunks = []
print("Starting hybrid chunking process...")

for doc_text in tqdm(processed_documents, desc="Applying Hybrid Chunking"):
    
    # --- Specialized Handling for Atomic Units (Tables) ---
    # Find all table blocks, treat them as atomic chunks, and remove them from the main text
    table_pattern = r"--- TABLE DATA ---(.*?)--- END TABLE ---"
    tables = re.findall(table_pattern, doc_text, re.DOTALL)
    for table_content in tables:
        final_chunks.append(f"--- TABLE DATA ---\n{table_content.strip()}\n--- END TABLE ---")
    
    # Create a new text version with tables removed for further chunking
    text_without_tables = re.sub(table_pattern, "", doc_text)

    # --- Level 1 - Structural Partitioning (A Simple Approach) ---
    # We'll use double newlines as a proxy for paragraph breaks as a simple structural split.
    # A more advanced version could use regex for "Chapter X" or "Section Y".
    structural_blocks = text_without_tables.split('\n\n')
    
    # --- Level 2 - Semantic Subdivision ---
    # Process each structural block with the semantic chunker
    for block in structural_blocks:
        if len(block.strip()) > 200: # A threshold to avoid chunking very small leftover blocks
            semantic_chunks = semantic_splitter.create_documents([block])
            for chunk in semantic_chunks:
                final_chunks.append(chunk.page_content)

print(f"\nSuccessfully created {len(final_chunks)} chunks using the hybrid strategy.")

# Let's replace the old 'chunks' variable with our new, intelligently created ones
chunks = final_chunks

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


Starting hybrid chunking process...


Applying Hybrid Chunking: 100%|██████████| 65/65 [09:43<00:00,  8.98s/it]


Successfully created 4242 chunks using the hybrid strategy.





## How This Code Implements Your Strategy
Atomic Units (Tables) 🧱: The code first uses regular expressions (re.findall) to find every block of text you marked as --- TABLE DATA ---. It pulls them out, adds them to our final_chunks list as complete units, and then removes them from the main text.

Level 1 (Structural) 🏛️: It then takes the remaining text and performs a simple structural split using double newlines (\n\n) as a separator. This treats each paragraph as a distinct "structural block."

Level 2 (Semantic) 🧩: Finally, it loops through each of these structural blocks. For each block, it applies the SemanticChunker, which further subdivides the paragraph into even smaller, thematically unified chunks based on the semantic meaning of the sentences.

You have now successfully implemented the advanced chunking strategy you outlined. The chunks variable now holds a much higher quality of retrieval-ready information.

# Hybrid Chunking

In [4]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# 1. Initialize the embedding model we need
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"Starting to create embeddings for all {len(chunks)} chunks...")

# 2. Create the FAISS vector database from the text chunks and the model
db = FAISS.from_texts(chunks, embeddings)

# 3. Save the completed vector database to your local disk
db.save_local("faiss_index")

print("\nSuccessfully created and saved the FAISS vector database to the 'faiss_index' folder.")

Starting to create embeddings for all 4242 chunks...

Successfully created and saved the FAISS vector database to the 'faiss_index' folder.


# llm

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

print("Reverting to a more robust chunking strategy...")

# This splitter is better for textbooks as it tries to keep paragraphs together.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=250,
    separators=["\n\n", "\n", ". ", " "]
)

# --- CORRECTED LINES ---
# The create_documents method splits our list of texts into Document objects.
new_chunks_docs = recursive_splitter.create_documents(processed_documents)
# We then extract the plain text content from each Document object.
new_chunks = [doc.page_content for doc in new_chunks_docs]
# ---------------------

# Overwrite the old, bad chunks variable
chunks = new_chunks

print(f"Successfully re-chunked the documents into {len(chunks)} more coherent chunks.")

Reverting to a more robust chunking strategy...
Successfully re-chunked the documents into 2281 more coherent chunks.


In [6]:
print(f"Starting to rebuild the vector database with {len(chunks)} new chunks...")

# This will create a new database in memory with the better chunks
db = FAISS.from_texts(chunks, embeddings)

# This will overwrite the old 'faiss_index' folder with the new, corrected one
db.save_local("faiss_index")

print("Successfully rebuilt and saved the new FAISS vector database.")

Starting to rebuild the vector database with 2281 new chunks...
Successfully rebuilt and saved the new FAISS vector database.


In [7]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda, RunnableParallel
from langchain.schema.output_parser import StrOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder

# --- 1. Basic Setup (Same as before) ---
os.environ["GOOGLE_API_KEY"] = "AIzaSyDZdvehzgwS3j90fiIF5iWXlPoq4QhrRsQ"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.7)
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_kwargs={"k": 20})
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# --- 2. Query Expansion Chain (Same as before) ---
query_expansion_prompt = PromptTemplate(
    template="""
    You are a helpful AI assistant. Your task is to expand a short user question into a more detailed and descriptive search query for a vector database.
    Focus on including key terms and concepts related to the original question.

    Original question: {question}

    Expanded query:
    """,
    input_variables=["question"],
)
query_expansion_chain = query_expansion_prompt | llm | StrOutputParser()

# --- 3. Re-ranking Logic (Same as before) ---
def rerank_documents(inputs):
    question = inputs['question']
    docs = inputs['context']
    doc_pairs = [[question, doc.page_content] for doc in docs]
    scores = cross_encoder.predict(doc_pairs)
    scored_docs = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)
    reranked_docs = [doc for score, doc in scored_docs[:5]]
    return "\n\n".join([doc.page_content for doc in reranked_docs])

# --- 4. Final Answer Generation Prompt (Same as before) ---
final_prompt = PromptTemplate(
    template="""
    You are a helpful AI assistant for students of Indian History from NCERT books.
    Answer the question based ONLY on the following context.
    If you don't know the answer from the context, just say that you don't know.

    Context: {context}

    Question: {question}

    Answer:
    """,
    input_variables=["context", "question"],
)

# --- 5. Build the CORRECTED RAG Chain ---
# This dictionary prepares the initial context and question.
setup_and_retrieval = RunnableParallel(
    {"context": query_expansion_chain | retriever, "question": RunnablePassthrough()}
)

# This is the main chain with the corrected data flow.
full_rag_chain = (
    setup_and_retrieval
    | {
        "context": RunnableLambda(rerank_documents), # The reranker gets the dict and outputs a string
        "question": lambda x: x["question"],       # This lambda gets the dict and passes the question through
    }
    | final_prompt
    | llm
    | StrOutputParser()
)


# --- 6. Test the Final Chain! ---
print("Full RAG chain with Query Expansion and Re-ranking is ready. Asking the test question... 🚀")
test_question = "What were the main features of the Harappan civilization's town planning?"
response = full_rag_chain.invoke(test_question)

print("\n--- Sample Qustion ---")
print(test_question)
print("\n--- FINAL RESPONSE ---")
print(response)

Full RAG chain with Query Expansion and Re-ranking is ready. Asking the test question... 🚀

--- Sample Qustion ---
What were the main features of the Harappan civilization's town planning?

--- FINAL RESPONSE ---
The Harappan civilization's town planning included a carefully planned drainage system with roads and streets laid out in a grid pattern, intersecting at right angles.  Streets with drains were laid out first, and houses were built along them.  The settlements were planned and implemented accordingly, using standardized bricks with a ratio where length and breadth were four and twice the height respectively.  Most settlements had a smaller, higher western part (Citadel) and a larger, lower eastern section (Lower Town), although there were variations.  Larger cities were built according to precise plans and had wide streets often oriented to cardinal directions.  Many cities were surrounded by fortifications.


## The Journey: From Raw PDFs to a Smart AI
We have completed the entire backend process. We started with dozens of raw PDF files and navigated through a complex debugging journey to reach this point:

Advanced Extraction: We pulled not just text, but also data from tables and images.

Intelligent Chunking: We diagnosed the failure of our initial advanced chunking and rebuilt our knowledge base with a more robust, paragraph-aware strategy.

Multi-Stage RAG: We built a sophisticated query engine that uses Query Expansion to understand the user's intent, a Retriever to find a broad set of facts, a Re-ranker to pinpoint the most relevant context, and the Gemini LLM to generate a final, trustworthy answer.

## Sample Questions

In [8]:
# This is a special cell for debugging our RAG pipeline

# --- 1. Define our test question ---
test_question = "What were the main features of the Harappan civilization's town planning?"
print(f"--- ORIGINAL QUESTION ---\n{test_question}\n")


# --- 2. Step 1: See the Expanded Query ---
print("--- 1. EXPANDED QUERY ---")
expanded_question = query_expansion_chain.invoke(test_question)
print(f"{expanded_question}\n")


# --- 3. Step 2: See the Initial Retrieved Documents (Before Re-ranking) ---
print("--- 2. INITIALLY RETRIEVED DOCS (Top 20) ---")
initial_docs = retriever.invoke(expanded_question)
for i, doc in enumerate(initial_docs):
    print(f"--- DOC {i+1} ---\n{doc.page_content}\n")


# --- 4. Step 3: See the Final Re-ranked Documents ---
print("--- 3. RE-RANKED DOCS (Top 5) ---")
# Manually run the re-ranking function to see its output
reranked_context = rerank_documents(
    {"question": test_question, "context": initial_docs}
)
print(reranked_context)

--- ORIGINAL QUESTION ---
What were the main features of the Harappan civilization's town planning?

--- 1. EXPANDED QUERY ---
Expanded query:

"Harappan civilization urban planning features AND (grid pattern OR planned cities OR standardized brick sizes OR drainage systems OR citadels OR lower town OR residential areas OR public buildings OR sanitation infrastructure OR water management OR street layout OR evidence of planning OR architectural design OR urban morphology)  NOT (rural settlements OR villages OR archaeological interpretations ONLY)"


This expanded query aims to:

* **Specify the subject:** Clearly identifies the Harappan civilization as the focus.
* **Broaden the search terms:** Includes synonyms and related concepts for "town planning," such as grid pattern, planned cities, etc., to capture a wider range of relevant results.
* **Use Boolean operators:** Employs "AND" to ensure all specified aspects are present in the results and "OR" to broaden the search within specif