In [3]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)

In [4]:
# --- Import necessary libraries ---
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
import os

# --- Step 1: Define your directory path ---
pdf_directory = "C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science"

# --- Step 2: Use DirectoryLoader to load all PDFs using PyMuPDFLoader ---
print("\nЁЯУД Loading all PDF files using PyMuPDFLoader...\n")

try:
    pdf_loader = DirectoryLoader(
        path=pdf_directory,
        glob="*.pdf",               # Loads only PDF files
        loader_cls=PyMuPDFLoader,
        show_progress=True
    )

    # --- Step 3: Load all documents (each page = one Document) ---
    pdf_docs = pdf_loader.load()
    print(f"тЬЕ Loaded {len(pdf_docs)} pages from all PDFs in '{pdf_directory}'\n")

    # --- Step 4: Group documents by file for better previews ---
    # Create a dictionary to group pages by their PDF source file
    pdf_files = {}
    for doc in pdf_docs:
        file_path = doc.metadata.get("source", "Unknown file")
        pdf_files.setdefault(file_path, []).append(doc)

    # --- Step 5: Preview each PDF file ---
    print("ЁЯУШ --- PDF File Previews ---\n")
    for file_path, docs in pdf_files.items():
        file_name = os.path.basename(file_path)
        print(f"ЁЯУД File: {file_name}")
        print(f"   Total pages loaded: {len(docs)}")

        # Preview first few lines of the first page
        first_page_text = docs[0].page_content.strip().replace("\n", " ")
        preview_text = first_page_text[:400] + ("..." if len(first_page_text) > 400 else "")
        print(f"   ЁЯУЭ Preview (first 400 chars): {preview_text}")
        print("-" * 100)

except Exception as e:
    print(f"тЭМ Error while loading PDFs: {e}")



ЁЯУД Loading all PDF files using PyMuPDFLoader...



100%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИ| 29/29 [00:02<00:00, 14.10it/s]

тЬЕ Loaded 498 pages from all PDFs in 'C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science'

ЁЯУШ --- PDF File Previews ---

ЁЯУД File: hecu101.pdf
   Total pages loaded: 7
   ЁЯУЭ Preview (first 400 chars): Chapter 1тАЙтАФтАЙExploring the Investigative World of Science 1 Exploring the  Investigative  World of Science 1 Dear Young Scientists,  Welcome back! On the first page of each chapter, you will find a set of questions.  These are not meant for any examтАФтАЙthey are unique invitations to spark your  curiosity to explore the world of science! Why is one side of a puri thinner than the other? Are there more...
----------------------------------------------------------------------------------------------------
ЁЯУД File: hecu102.pdf
   Total pages loaded: 20
   ЁЯУЭ Preview (first 400 chars): 8 Curiosity тАФ Textbook of Science for Grade 8 2 The Invisible Living  World: Beyond Our  Naked Eye Probe and ponder 	 z Have you ever wondered what you might see if the invisible 




In [5]:
import os
import re
import unicodedata
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

# -----------------------------
# SmartPDFProcessor Class
# -----------------------------
class SmartPDFProcessor:
    def __init__(self, chunk_size=400, chunk_overlap=20):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" ", "\n"]
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        loader = PyMuPDFLoader(pdf_path)
        pages = loader.load()
        processed_chunks = []

        for page_num, page in enumerate(pages):
            cleaned_text = self._clean_text(page.page_content)
            if len(cleaned_text.strip()) < 50:
                continue
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text),
                    "source_file": os.path.basename(pdf_path)
                }]
            )
            processed_chunks.extend(chunks)

        print(f"тЬЕ Processed {len(processed_chunks)} chunks from '{os.path.basename(pdf_path)}'")
        return processed_chunks

    def process_directory(self, folder_path: str) -> List[Document]:
        all_chunks = []
        pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
        if not pdf_files:
            print("тЪая╕П No PDF files found in the directory.")
            return []

        print(f"\nЁЯУБ Found {len(pdf_files)} PDFs in '{folder_path}'\n")
        for pdf_file in pdf_files:
            pdf_path = os.path.join(folder_path, pdf_file)
            chunks = self.process_pdf(pdf_path)
            if chunks:
                print(f"\nЁЯУШ Preview of '{pdf_file}':")
                print(chunks[0].page_content[:300], "...\n")
            all_chunks.extend(chunks)

        print(f"\nтЬЕ Total Chunks Created from Directory: {len(all_chunks)}")
        return all_chunks

    def _clean_text(self, text: str) -> str:
        text = unicodedata.normalize("NFKC", text)
        text = text.replace("ямБ", "fi").replace("ямВ", "fl")
        text = text.replace("тАЩ", "'").replace("тАШ", "'")
        text = text.replace("тАЬ", '"').replace("тАЭ", '"')
        text = text.replace("тАФ", "-").replace("тАУ", "-")
        text = re.sub(r"[\x00-\x1f\x7f-\x9f]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text


In [6]:
folder_path = "C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science"

processor = SmartPDFProcessor(chunk_size=400, chunk_overlap=20)
all_chunks = processor.process_directory(folder_path)

# Preview first 2 chunks
for i, chunk in enumerate(all_chunks[:2]):
    print(f"\nЁЯУД Chunk {i+1} (Page {chunk.metadata['page']} of {chunk.metadata['total_pages']}):")
    print(chunk.page_content[:300], "...")


ЁЯУБ Found 29 PDFs in 'C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science'

тЬЕ Processed 35 chunks from 'hecu101.pdf'

ЁЯУШ Preview of 'hecu101.pdf':
Chapter 1 - Exploring the Investigative World of Science 1 Exploring the Investigative World of Science 1 Dear Young Scientists, Welcome back! On the first page of each chapter, you will find a set of questions. These are not meant for any exam- they are unique invitations to spark your curiosity to ...

тЬЕ Processed 112 chunks from 'hecu102.pdf'

ЁЯУШ Preview of 'hecu102.pdf':
8 Curiosity - Textbook of Science for Grade 8 2 The Invisible Living World: Beyond Our Naked Eye Probe and ponder z Have you ever wondered what you might see if the invisible world around you became visible? z How do you think your observation of this hidden world might change the way you think abou ...

тЬЕ Processed 93 chunks from 'hecu103.pdf'

ЁЯУШ Preview of 'hecu103.pdf':
28 Curiosity - Textbook of Science for Grade 8 Immersion into social med

In [9]:
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
print(f"тЪЩя╕П Loading Hugging Face embeddings model: {model_name}")
embeddings = HuggingFaceEmbeddings(model_name=model_name)
print("тЬЕ Model loaded successfully.")


тЪЩя╕П Loading Hugging Face embeddings model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
тЬЕ Model loaded successfully.


In [10]:
texts = [chunk.page_content for chunk in all_chunks]

print("\nЁЯФв Generating embeddings for all chunks...")
chunk_embeddings = embeddings.embed_documents(texts)

print(f"тЬЕ Generated embeddings for {len(chunk_embeddings)} chunks")
print(f"Each embedding vector has {len(chunk_embeddings[0])} dimensions")

# Preview first embedding vector (first 10 values)
print("\nЁЯФ╣ Example embedding for first chunk (first 10 values):")
print(chunk_embeddings[0][:10])



ЁЯФв Generating embeddings for all chunks...
тЬЕ Generated embeddings for 2796 chunks
Each embedding vector has 384 dimensions

ЁЯФ╣ Example embedding for first chunk (first 10 values):
[0.1356445848941803, 0.11795692890882492, 0.24925336241722107, 0.1432991623878479, -0.02940603345632553, -0.3068436086177826, -0.14760467410087585, 0.3451717793941498, 0.012838182970881462, -0.1846027821302414]


In [11]:
import chromadb
from langchain_community.vectorstores import Chroma

# Create a list of texts and corresponding metadatas
texts = [chunk.page_content for chunk in all_chunks]
metadatas = [chunk.metadata for chunk in all_chunks]

# Initialize Chroma DB (persistent directory)
persist_directory = "chroma_db"
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    persist_directory=persist_directory
)

# Persist to disk
vectorstore.persist()
print(f"тЬЕ Stored {len(texts)} chunks in Chroma DB at '{persist_directory}'")


тЬЕ Stored 2796 chunks in Chroma DB at 'chroma_db'


  vectorstore.persist()


In [12]:
# Reload the Chroma vectorstore
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

query = """рдРрд╕рдпрд╛ рджреЗрдЦрдпрд╛ рдЧреНтАНрдпрдпрд╛ рд╣реИ рдХрдХ рдЧреИрд╕реЛрдВ рдХреА рдХрд╡рд▓реЗреНтАНрдпрддрдпрд╛ рддрдпрд╛рдкрдореЗрдпрд╛рди рдореЗреЗрдВ рд╡реГрдХрдзрд┐ 
рдХреЗ рд╕рдпрд╛рде рд╕рдпрд╛рдореЗрдпрд╛рдиреНреНтАНрдпрддрдГ рдШрдЯрддреА рд╣реИред рдардВрдбреЗ рдЬрд▓ рдореЗреЗрдВ рдЕрдХрд┐рдХ рдСрдХреНрд╕реАрдЬрди 
рдШреНрдпрд▓ рд╕рдХрддреА рд╣реИ рдХрдЬрд╕рд╕реЗ рдЬрд▓реАреНтАНрдп рдЬреАрд╡рди рдХреЗ рдХрд▓рдП рдкреНтАНрдпрдпрд╛рдпреБрдкреНрдд 
рдСрдХреНрд╕реАрдЬрди рдХреА рдореЗрдпрд╛рддреНрд░рдпрд╛ рд╕реНрдпрдХрдирдХрд╢реНрд┐рдд рд╣реЛрддреА рд╣реИ (рдХрд┐рддреНрд░ 9.8)ред рджрдпреВрд╕рд░реА 
рдУрд░ рдЬрдм рдЬрд▓ рдЧрд░рдореЗ рд╣реЛрддрдпрд╛ рд╣реИ рддрдм рдСрдХреНрд╕реАрдЬрди рдХреА рдХрд╡рд▓реЗреНтАНрдпрддрдпрд╛ рдШрдЯ 
рдЬрдпрд╛рддреА рд╣реИред"""
results = vectorstore.similarity_search(query, k=3)

print(f"\nЁЯФН Top 3 search results for query: '{query}'")
for i, r in enumerate(results):
    print(f"\nResult {i+1} (Page {r.metadata['page']}, File: {r.metadata['source_file']}):")
    print(r.page_content[:500], "...")




ЁЯФН Top 3 search results for query: 'рдРрд╕рдпрд╛ рджреЗрдЦрдпрд╛ рдЧреНтАНрдпрдпрд╛ рд╣реИ рдХрдХ рдЧреИрд╕реЛрдВ рдХреА рдХрд╡рд▓реЗреНтАНрдпрддрдпрд╛ рддрдпрд╛рдкрдореЗрдпрд╛рди рдореЗреЗрдВ рд╡реГрдХрдзрд┐ 
рдХреЗ рд╕рдпрд╛рде рд╕рдпрд╛рдореЗрдпрд╛рдиреНреНтАНрдпрддрдГ рдШрдЯрддреА рд╣реИред рдардВрдбреЗ рдЬрд▓ рдореЗреЗрдВ рдЕрдХрд┐рдХ рдСрдХреНрд╕реАрдЬрди 
рдШреНрдпрд▓ рд╕рдХрддреА рд╣реИ рдХрдЬрд╕рд╕реЗ рдЬрд▓реАреНтАНрдп рдЬреАрд╡рди рдХреЗ рдХрд▓рдП рдкреНтАНрдпрдпрд╛рдпреБрдкреНрдд 
рдСрдХреНрд╕реАрдЬрди рдХреА рдореЗрдпрд╛рддреНрд░рдпрд╛ рд╕реНрдпрдХрдирдХрд╢реНрд┐рдд рд╣реЛрддреА рд╣реИ (рдХрд┐рддреНрд░ 9.8)ред рджрдпреВрд╕рд░реА 
рдУрд░ рдЬрдм рдЬрд▓ рдЧрд░рдореЗ рд╣реЛрддрдпрд╛ рд╣реИ рддрдм рдСрдХреНрд╕реАрдЬрди рдХреА рдХрд╡рд▓реЗреНтАНрдпрддрдпрд╛ рдШрдЯ 
рдЬрдпрд╛рддреА рд╣реИред'

Result 1 (Page 3, File: hhcu108.pdf):
рд╕рдорд░реВрдкреА рдорд┐рд╢реНрд░рдг рд╣реИред рдЕрдзрд┐рдХрд╛рдВрд╢ рдЬреАрд╡реЛрд╡реЛреЛрдВ рдХреЗ рдЬреАрд╡рд┐рддрдд рд░рд╣рдиреЗ рдХреЗ рд▓рд┐р

  vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
