## Load PDF files

In [1]:
from langchain_community.document_loaders import(
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)

In [None]:
## PyPDFLoader
# Demo cell: load the sample PDF with PyPDFLoader and print a short summary
# (number of loaded documents, the first 100 characters of the first one,
# and its metadata dict).

print("PypdfLoader")

try:
    pdf_loader = PyPDFLoader("data/pdf/contextual_learning.pdf")
    pdf_docs = pdf_loader.load()

    # The count is printed before the first element is accessed, so if the
    # loader returns nothing this line still shows and the [0] lookup below
    # is what fails (and is reported by the except branch).
    print(f" Loaded {len(pdf_docs)} pages")
    print(f" Page 1 content: {pdf_docs[0].page_content[:100]}...")
    print(f" Metadata: {pdf_docs[0].metadata}")

# Best-effort demo: any failure is printed rather than raised so the
# notebook keeps running.
except Exception as e:
    print(f"Error: {e}")


PypdfLoader
 Loaded 16 pages
 Page 1 content: © Authors. Terms and conditions of Creative Commons Attribution 4.0 International (CC BY 4.0) apply....
 Metadata: {'producer': 'Adobe PDF Library 15.0', 'creator': 'Acrobat PDFMaker 17 for Word', 'creationdate': '2017-10-27T14:12:15+03:00', 'author': '', 'comments': '', 'company': '', 'keywords': '', 'moddate': '2017-10-27T14:12:21+03:00', 'sourcemodified': 'D:20171027111205', 'subject': '', 'title': '', 'source': 'data/pdf/contextual_learning.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}


In [11]:
## PyMuPDFLoader (Fast and accurate)
# Demo cell: load the same sample PDF with PyMuPDFLoader and print the same
# summary as the PyPDFLoader cell (document count, first 100 characters of
# the first document, its metadata) so the two loaders can be compared.

print("PyMuPDFLoader")

try:
    pymupdf_loader = PyMuPDFLoader("data/pdf/contextual_learning.pdf")
    pymupdf_docs = pymupdf_loader.load()

    # Count first, then index [0]: an empty result still prints the count
    # and the failing lookup is reported by the except branch.
    print(f" Loaded {len(pymupdf_docs)} pages")
    print(f" Page 1 content: {pymupdf_docs[0].page_content[:100]}...")
    print(f" Metadata: {pymupdf_docs[0].metadata}")
# Best-effort demo: any failure is printed rather than raised.
except Exception as e:
    print(f"Error: {e}")


PyMuPDFLoader
 Loaded 16 pages
 Page 1 content: © Authors. Terms and conditions of Creative Commons Attribution 4.0 International (CC BY 4.0) apply....
 Metadata: {'producer': 'Adobe PDF Library 15.0', 'creator': 'Acrobat PDFMaker 17 for Word', 'creationdate': '2017-10-27T14:12:15+03:00', 'source': 'data/pdf/contextual_learning.pdf', 'file_path': 'data/pdf/contextual_learning.pdf', 'total_pages': 16, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2017-10-27T14:12:21+03:00', 'trapped': '', 'modDate': "D:20171027141221+03'00'", 'creationDate': "D:20171027141215+03'00'", 'page': 0}


In [14]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [25]:
from typing import List
from langchain_core.documents import Document
class SmartPDFProcessor:
    """Load a PDF page by page, clean its text and split it into chunks.

    Each page returned by ``PyPDFLoader`` is normalised with
    :meth:`_clean_text`, pages with too little text are dropped, and the rest
    is split with a ``RecursiveCharacterTextSplitter``. Every chunk carries
    the page's loader metadata plus a few extra keys (see :meth:`process_pdf`).
    """

    # Latin f-ligature code points mapped to their plain-letter spelling.
    # Written as \u escapes on purpose: a literal ligature character is easily
    # normalised to ordinary letters by editors or copy/paste, which silently
    # turns the replacement into a no-op.
    _LIGATURES = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        """Configure the splitter and the empty-page threshold.

        Args:
            chunk_size: Passed to the text splitter as ``chunk_size``.
            chunk_overlap: Passed to the text splitter as ``chunk_overlap``.
            min_page_chars: Pages whose cleaned text is shorter than this
                many characters are skipped by :meth:`process_pdf`.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        # _clean_text collapses every whitespace run to a single space, so a
        # space is the only boundary left in the text to split on.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Load ``pdf_path`` and return its cleaned, chunked contents.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.

        Returns:
            The chunks of all non-skipped pages, in page order. Each chunk's
            metadata is the page's loader metadata with these keys set or
            overridden:

            * ``page``: 1-based position of the page in the loaded list
              (replaces any ``page`` value supplied by the loader).
            * ``total_pages``: number of pages returned by the loader.
            * ``chunk_method``: the constant ``"smart_pdf_processor"``.
            * ``char_count``: length of the cleaned text of the whole page,
              not of the individual chunk.
        """
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks = []

        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages. The cleaned text has no leading or
            # trailing whitespace, so its length can be compared directly.
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Later keys win, so the explicit entries override the loader's.
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text),
                }],
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Normalise text extracted from a PDF page.

        Collapses every run of whitespace (including newlines) to a single
        space, trims both ends, and expands the f-ligature characters listed
        in ``_LIGATURES`` into plain letters.

        Args:
            text: Raw page text.

        Returns:
            The cleaned text; an empty string if ``text`` is empty or
            whitespace only.
        """
        text = " ".join(text.split())

        # Fix common PDF extraction issues: single-glyph ligatures.
        return text.translate(self._LIGATURES)


In [28]:
# Build a processor with the constructor defaults (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPDFProcessor()

In [31]:
## Process a PDF if available
# Demo cell: run the sample PDF through `preprocessor` and print how many
# chunks came back, followed by the metadata of the first chunk.

try:
    smart_chunks = preprocessor.process_pdf("data/pdf/contextual_learning.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Show enhanced metadata

    # Guard against an empty result before indexing the first chunk.
    if smart_chunks:
        print("\n Sample Chunk metadata:")

        # One "key: value" line per metadata entry of the first chunk.
        for key, value in smart_chunks[0].metadata.items():
            print(f" {key}: {value}")

# Best-effort demo: any failure (e.g. the file cannot be loaded) is printed
# rather than raised.
except Exception as e:
    print(f"Processing error: {e}")

Processed into 77 smart chunks

 Sample Chunk metadata:
 producer: Adobe PDF Library 15.0
 creator: Acrobat PDFMaker 17 for Word
 creationdate: 2017-10-27T14:12:15+03:00
 author: 
 comments: 
 company: 
 keywords: 
 moddate: 2017-10-27T14:12:21+03:00
 sourcemodified: D:20171027111205
 subject: 
 title: 
 source: data/pdf/contextual_learning.pdf
 total_pages: 16
 page: 1
 page_label: 1
 chunk_method: smart_pdf_processor
 char_count: 3886
