In [1]:
#Load pdf Files
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print("Pypdf Loader")
try:
    pypdfloader=PyPDFLoader(r"data\pdf\attention.pdf")
    docs=pypdfloader.load()
    print(f"Number of pages: {len(docs)}")
    print(f"First page: {docs[0].page_content[:100]}")
    print(f"First page metadata: {docs[0].metadata}")
except Exception as e:
    print(f"Error: {e}")

Pypdf Loader
Number of pages: 15
First page: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
First page metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\pdf\\attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


In [9]:
print("PyMupdfloader")
try:
    pymudpdf_loader=PyMuPDFLoader("data/pdf/attention.pdf",
                                  extract_images=True)
    docs=pymudpdf_loader.load()
    print(f"Number of pages: {len(docs)}")
    print(f"First page: {docs[0].page_content[:100]}")
    print(f"First page metadata: {docs[0].metadata}")
except Exception as e:
    print(f"error:{e}")


PyMupdfloader
Number of pages: 15
First page: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
First page metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'data/pdf/attention.pdf', 'file_path': 'data/pdf/attention.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}


In [1]:
# Example of raw PDF extraction
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

# Apply the cleaning function

def clean_text(text):
    #Remove excessive white space
    text=" ".join(text.split())
    #fix ligatures
    text=text.replace("ﬁ","fi")
    text=text.replace("ﬂ", "fl")

    return text

cleaned_text=clean_text(raw_pdf_text)
print("BEFORE->")
print(raw_pdf_text)
print("AFTER Preprocessing->")
print(cleaned_text)


BEFORE->
Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.



    Revenue increased by 25%.

The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10

AFTER Preprocessing->
Company Financial Report The financial performance for fiscal year 2024 shows significant growth in profitability. Revenue increased by 25%. The company's efficiency improved due to workflow optimization. Page 1 of 10


In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor():
    """Advanced pdf processing with error handling"""
    def __init__(self,chunk_size=1000,chunk_overlap=100):
        self.chunk_size=chunk_size,
        self.chunk_overlap=chunk_overlap,
        self.text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[' ']
        )

    def process_pdf(self,pdf_path:str)->List[Document]:
        """Process PDF with smart chunking and metadata enhancement"""
        try:
            #load pdf
            loader=PyPDFLoader(pdf_path)
            pages=loader.load()


            ##Process each page
            processed_chunks=[]
            for page_num,page in enumerate(pages):
                cleaned_text=self._clean_text(page.page_content)
                #skip nearly empty pages
                if len(cleaned_text.strip())<50:
                    continue

                # create chunks with enhanced metadata
                chunks=self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[{
                        **page.metadata,
                        "page":page_num+1,
                        "total_pages":len(pages),
                        "chunk_method":"SmartPDFProcessor",
                        "char_count":len(cleaned_text)
                    }
                    ]
                )
                processed_chunks.extend(chunks)
            return processed_chunks

        except Exception as e:
            print(f"Pdf error:{e}")
    def _clean_text(self,text:str)->str:
        #Remove unnecessary white space
        text=" ".join(text.split())

        text=text.replace("ﬁ","fi")
        text=text.replace("ﬂ", "fl")

        return text
    


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
processor=SmartPDFProcessor()
processor

<__main__.SmartPDFProcessor at 0x1917bc15160>

In [9]:
# Preprocess the pdf
try:
    smart_chunks=processor.process_pdf("data/pdf/attention.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")
    #show enhanced mettadata
    if smart_chunks:
        print("\nSample chunk metadata")
        for key,value in smart_chunks[0].metadata.items():
            print(f"{key}: {value}")
except Exception as e:
    print(f"processed pdf Error:{e}")

Processed into 49 smart chunks

Sample chunk metadata
producer: pdfTeX-1.40.25
creator: LaTeX with hyperref
creationdate: 2024-04-10T21:11:43+00:00
author: 
keywords: 
moddate: 2024-04-10T21:11:43+00:00
ptex.fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
subject: 
title: 
trapped: /False
source: data/pdf/attention.pdf
total_pages: 15
page: 1
page_label: 1
chunk_method: SmartPDFProcessor
char_count: 2857
