## Load PDF Files

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)

In [2]:
### PyPDFLoader: load the PDF and inspect what comes back
print("PyPdfloader")

try:
    pypdf_loader = PyPDFLoader(
        "data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf"
    )
    pypdf_docs = pypdf_loader.load()

    # Full dump of the loaded documents, followed by a short summary.
    print(pypdf_docs)
    print(f"  Loaded {len(pypdf_docs)} pages")

    # Peek at the first document only: a text preview plus its metadata.
    first_page = pypdf_docs[0]
    print(f"  Page 1 content: {first_page.page_content[:100]}...")
    print(f"  Metadata: {first_page.metadata}")

except Exception as err:
    # Any failure is reported as a message rather than a traceback.
    print(f"Error : {err}")

PyPdfloader
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-04-16T12:53:31+04:00', 'moddate': '2025-04-21T12:56:58+04:00', 'source': 'data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf', 'total_pages': 32, 'page': 0, 'page_label': '1'}, page_content='EMIRATES NBD BANK (P.J.S.C.) \nGROUP CONDENSED CONSOLIDATED INTERIM FINANCIAL STATEMENTS \nFOR THE THREE MONTHS PERIOD ENDED 31 MARCH 2025'), Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-04-16T12:53:31+04:00', 'moddate': '2025-04-21T12:56:58+04:00', 'source': 'data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf', 'total_pages': 32, 'page': 1, 'page_label': '2'}, page_content='EMIRATES NBD BANK (P.J.S.C.) \n \n \n \nGROUP CONDENSED CONSOLIDATED INTERIM FINANCIAL STATEMENTS \n \n \n \n \n \n  Contents                       Page \n \nINDEP

In [3]:
# PyMuPDFLoader: load the same PDF with the second loader for comparison
print("\n3️⃣ PyMuPDFLoader")
try:
    pymupdf_loader = PyMuPDFLoader(
        "data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf"
    )
    pymupdf_docs = pymupdf_loader.load()

    # Summary lines first, then the full dump of the loaded documents.
    page_total = len(pymupdf_docs)
    print(f"  Loaded {page_total} pages")
    print("  Includes detailed metadata")
    print(pymupdf_docs)
except Exception as err:
    # Any failure is reported as a message rather than a traceback.
    print(f"  Error: {err}")


3️⃣ PyMuPDFLoader
  Loaded 32 pages
  Includes detailed metadata
[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-04-16T12:53:31+04:00', 'source': 'data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf', 'file_path': 'data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf', 'total_pages': 32, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-04-21T12:56:58+04:00', 'trapped': '', 'modDate': "D:20250421125658+04'00'", 'creationDate': "D:20250416125331+04'00'", 'page': 0}, page_content='EMIRATES NBD BANK (P.J.S.C.) \nGROUP CONDENSED CONSOLIDATED INTERIM FINANCIAL STATEMENTS \nFOR THE THREE MONTHS PERIOD ENDED 31 MARCH 2025'), Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-04-16T12:53:31+04:00', 'source': 'data/pdf/emirates_nbd_financial_statements_

In [4]:
# 📊 PDF Loader Comparison
# Each entry is one output line; a leading "\n" yields the blank line that
# separates the sections in the printed summary.
comparison_lines = (
    "\n📊 PDF Loader Comparison:",
    "\nPyPDFLoader:",
    "  ✅ Simple and reliable",
    "  ✅ Good for most PDFs",
    "  ✅ Preserves page numbers",
    "  ❌ Basic text extraction",
    "  Use when: Standard text PDFs",
    "\nPyMuPDFLoader:",
    "  ✅ Fast processing",
    "  ✅ Good text extraction",
    "  ✅ Image extraction support",
    "  Use when: Speed is important",
)
print(*comparison_lines, sep="\n")


📊 PDF Loader Comparison:

PyPDFLoader:
  ✅ Simple and reliable
  ✅ Good for most PDFs
  ✅ Preserves page numbers
  ❌ Basic text extraction
  Use when: Standard text PDFs

PyMuPDFLoader:
  ✅ Fast processing
  ✅ Good text extraction
  ✅ Image extraction support
  Use when: Speed is important


### Handling PDF Challenges 
🎯 Purpose of This Section
PDFs are notoriously difficult to parse because they:

- Store text in complex ways (not just simple text)
- Can have formatting issues
- May contain scanned images (requiring OCR)
- Often have extraction artifacts


In [5]:

# Hand-written sample used to demonstrate clean_text below. It contains
# ligature glyphs (ﬁ, ﬂ) in place of the letter pairs "fi"/"fl", uneven
# indentation, runs of blank lines, and a "Page 1 of 10" footer line.
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

# Define the cleaning function
def clean_text(text):
    """Normalise text extracted from a PDF.

    Two repairs are applied:

    1. Every run of whitespace (spaces, tabs, newlines) is collapsed to a
       single space, and leading/trailing whitespace is dropped.
    2. Latin ligature glyphs (U+FB00-U+FB06) are expanded to the plain
       letters they stand for, e.g. "ﬁ" -> "fi" and "ﬃ" -> "ffi".

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned, single-line string ("" for empty or whitespace-only input).
    """
    # Written as escapes because the glyphs are hard to tell apart from the
    # plain letter pairs in most fonts.
    ligatures = {
        "\ufb00": "ff",   # ﬀ
        "\ufb01": "fi",   # ﬁ
        "\ufb02": "fl",   # ﬂ
        "\ufb03": "ffi",  # ﬃ
        "\ufb04": "ffl",  # ﬄ
        "\ufb05": "st",   # ﬅ (long s + t)
        "\ufb06": "st",   # ﬆ
    }

    # Remove excessive whitespace
    text = " ".join(text.split())

    # Fix ligatures in a single pass
    return text.translate(str.maketrans(ligatures))

# Run the cleaner on the sample and show both versions via repr(), so the
# whitespace and ligature characters are visible in the output.
cleaned = clean_text(raw_pdf_text)
print(
    "BEFORE:",
    repr(raw_pdf_text),
    "\nAFTER:",
    repr(cleaned),
    sep="\n",
)

# Output:
# BEFORE:
# 'Company Financial Report\n\n\n    The ﬁnancial performance for ﬁscal year 2024\n    shows signiﬁcan'
# 
# AFTER:
# 'Company Financial Report The financial performance for fiscal year 2024 shows significant growth in'

BEFORE:
"Company Financial Report\n\n\n    The ﬁnancial performance for ﬁscal year 2024\n    shows signiﬁcant growth in proﬁtability.\n\n\n\n    Revenue increased by 25%.\n\nThe company's efﬁciency improved due to workﬂow\noptimization.\n\n\nPage 1 of 10\n"

AFTER:
"Company Financial Report The financial performance for fiscal year 2024 shows significant growth in profitability. Revenue increased by 25%. The company's efficiency improved due to workflow optimization. Page 1 of 10"


In [6]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor:
    """Load a PDF, clean each page's text, and split it into chunks.

    Every chunk carries the page's original metadata plus a 1-based page
    number, the total page count, a chunking-method tag and the character
    count of the cleaned page text.

    Errors raised while loading or splitting are not caught here; they
    propagate to the caller of ``process_pdf``.
    """

    # Pages whose cleaned text is shorter than this many characters are skipped.
    MIN_PAGE_CHARS = 50

    # Latin ligature glyphs (U+FB00-U+FB06) mapped to the letters they stand for.
    _LIGATURE_TABLE = str.maketrans({
        "\ufb00": "ff",   # ﬀ
        "\ufb01": "fi",   # ﬁ
        "\ufb02": "fl",   # ﬂ
        "\ufb03": "ffi",  # ﬃ
        "\ufb04": "ffl",  # ﬄ
        "\ufb05": "st",   # ﬅ (long s + t)
        "\ufb06": "st",   # ﬆ
    })

    def __init__(self,chunk_size=1000,chunk_overlap=100):
        """Store the chunking parameters and build the text splitter.

        Args:
            chunk_size: Chunk size handed to RecursiveCharacterTextSplitter.
            chunk_overlap: Chunk overlap handed to RecursiveCharacterTextSplitter.
        """
        # Plain assignments: a trailing comma here (`x = value,`) would store
        # a 1-tuple such as (1000,) instead of the number itself.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # _clean_text collapses all whitespace to single spaces, so a
            # space is the only separator left in the text being split.
            separators=[" "],
        )

    def process_pdf(self,pdf_path:str)->List[Document]:
        """Process a PDF into cleaned chunks with enhanced metadata.

        Args:
            pdf_path: Path of the PDF file, passed to PyPDFLoader.

        Returns:
            The chunks of all pages, in page order. Pages whose cleaned text
            is shorter than ``MIN_PAGE_CHARS`` characters contribute nothing.
        """
        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks = []

        # Count from 1 so the stored "page" value is a human-style page number.
        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages
            if len(cleaned_text.strip()) < self.MIN_PAGE_CHARS:
                continue

            # Keys listed after **page.metadata override same-named keys
            # coming from the loader (e.g. its own "page" entry).
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and expand ligatures.

        Args:
            text: Raw page text.

        Returns:
            The cleaned string, with no leading or trailing whitespace.
        """
        # Remove excessive whitespace
        text = " ".join(text.split())

        # Fix common PDF extraction issues: ligature glyphs -> plain letters
        return text.translate(self._LIGATURE_TABLE)

    
            


In [8]:
preprocessor=SmartPDFProcessor()

In [9]:
preprocessor

<__main__.SmartPDFProcessor at 0x1f97fed06b0>

In [10]:
## Process a PDF if available
try:
    smart_chunks=preprocessor.process_pdf("data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Show enhanced metadata
    if smart_chunks:
        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"  {key}: {value}")

except Exception as e:
    print(f"Processing error: {e}")

Processed into 72 smart chunks

Sample chunk metadata:
  producer: Microsoft® Word for Microsoft 365
  creator: Microsoft® Word for Microsoft 365
  creationdate: 2025-04-16T12:53:31+04:00
  moddate: 2025-04-21T12:56:58+04:00
  source: data/pdf/emirates_nbd_financial_statements_q1_2025_english.pdf
  total_pages: 32
  page: 1
  page_label: 1
  chunk_method: smart_pdf_processor
  char_count: 134
