## Handling PDF Challenges

PDFs are notoriously difficult to parse because they:
- Store text in complex ways (not just simple text)
- Can have formatting issues
- May contain scanned images (requiring OCR)
- Often have extraction artifacts

## Setup and Import Libraries

In [8]:
from typing import List
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

## Raw PDF Extraction

In [None]:
# Sample text imitating raw PDF extraction output. It deliberately contains
# the artifacts the cleaning step has to deal with: runs of blank lines,
# inconsistent indentation, whitespace-only lines, ligature characters
# (U+FB01 "ﬁ", U+FB02 "ﬂ") in place of the letter pairs "fi"/"fl", and a
# "Page 1 of 10" footer line.
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

In [4]:
# Latin ligature code points (Unicode "Alphabetic Presentation Forms") that
# PDF text extraction commonly emits, mapped to the plain letters they
# represent. Built once so every call is a single translate pass.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",  # long-s + t ligature
    "\ufb06": "st",
})


def clean_text(text: str) -> str:
    """
    Normalize text extracted from a PDF.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, leading/trailing whitespace is removed, and ligature characters
    are expanded to their plain-letter equivalents.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned, single-line text. An empty or whitespace-only input
        yields an empty string.
    """
    # split() with no arguments splits on any whitespace run and drops empty
    # pieces, so joining with " " both collapses and trims in one step.
    collapsed = " ".join(text.split())

    # Expand all ligatures (not only fi/fl) so words such as "efficient",
    # which extractors may emit with the single "ffi" glyph, come out intact.
    return collapsed.translate(_LIGATURE_TABLE)

In [6]:
# Clean the sample and show it next to the raw input. repr() is used so the
# newlines and extra spaces in the raw text are visible in the output.
cleaned_text = clean_text(raw_pdf_text)
print("BEFORE", repr(raw_pdf_text), sep="\n")
print("AFTER", repr(cleaned_text), sep="\n")

BEFORE
" Company Financial Report\n\n\n\n    The financial performance for fiscal year 2024\n    shows significant growth in profitability,\n\n\n\n    Revenue increased by 25%\n\n\nThe company's efficiency improved due to workflow \noptimization\n\n\nPage 1 of 10\n"
AFTER
"Company Financial Report The financial performance for fiscal year 2024 shows significant growth in profitability, Revenue increased by 25% The company's efficiency improved due to workflow optimization Page 1 of 10"


## Common PDF Challenges

In [10]:
class SmartPDFProcessor:
    """
    Load a PDF page by page, clean each page's text, and split it into chunks
    that carry page-level metadata.

    Args:
        chunk_size: Chunk size passed to RecursiveCharacterTextSplitter.
        chunk_overlap: Chunk overlap passed to RecursiveCharacterTextSplitter.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are skipped as nearly empty.
    """

    # Latin ligature code points (Unicode "Alphabetic Presentation Forms")
    # that PDF text extraction commonly emits, mapped to plain letters.
    _LIGATURES = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",  # long-s + t ligature
        "\ufb06": "st",
    })

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100,
                 min_page_chars: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # _clean_text collapses all whitespace to single spaces, so a
            # space is the only boundary left in the text to split on.
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """
        Process PDF with Smart Chunking and Metadata Enhancement

        Args:
            pdf_path: Path of the PDF file, passed to PyPDFLoader.

        Returns:
            The chunks of every non-skipped page, in page order. Each chunk's
            metadata is the loader's page metadata plus "page" (1-based),
            "total_pages", "chunk_method" and "char_count".

        Errors raised while loading or splitting are not caught here; they
        propagate to the caller.
        """
        # Load PDF
        loader = PyPDFLoader(file_path=pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        # Process Each Page
        processed_chunks = []

        # Numbering starts at 1 and counts skipped pages too, so "page"
        # always refers to the position in the loaded document.
        for page_number, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip Nearly Empty Pages (cleaned text is already trimmed)
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Create Chunks with Enhanced Metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    # Keys below override same-named keys from the loader.
                    "page": page_number,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    # Length of the whole cleaned page, not of the chunk.
                    "char_count": len(cleaned_text),
                }],
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """
        Clean Extracted Text

        Collapses every whitespace run to a single space, trims the ends, and
        expands ligature characters to plain letters.
        """
        # split() with no arguments drops empty pieces, so the join both
        # collapses and trims.
        collapsed = " ".join(text.split())

        # Expand all ligatures (ff, fi, fl, ffi, ffl, st), not only fi/fl.
        return collapsed.translate(self._LIGATURES)



In [12]:
# Build a processor with the constructor defaults (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPDFProcessor()

In [14]:
# Run the processor on the sample paper. The whole cell is guarded so that
# any failure is reported as a one-line message rather than a traceback.
try:
    smart_chunks = preprocessor.process_pdf(
        pdf_path="data/pdf_files/attention_is_all_you_need.pdf"
    )
    print(f"Processed into {len(smart_chunks)} Smart Chunks")

    # Inspect the first chunk's metadata, if any chunk was produced
    if smart_chunks:
        print("\nSample Chunk Metadata")
        for key, value in smart_chunks[0].metadata.items():
            print(f"{key}:{value}")

except Exception as exc:
    print(f"Processing Error: {exc}")

Processed into 49 Smart Chunks

Sample Chunk Metadata
producer:pdfTeX-1.40.25
creator:LaTeX with hyperref
creationdate:2024-04-10T21:11:43+00:00
author:
keywords:
moddate:2024-04-10T21:11:43+00:00
ptex.fullbanner:This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
subject:
title:
trapped:/False
source:data/pdf_files/attention_is_all_you_need.pdf
total_pages:15
page:1
page_label:1
chunk_method:smart_pdf_processor
char_count:2857
