### Load PDF Files

In [1]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
 ### PyPDFLoader
print("PyPDFLoader")

loader = PyPDFLoader("data/pdf/pdfreader.pdf")
pages = loader.load()
print(pages)
print(f"Number of pages: {len(pages)}")
for i, page in enumerate(pages):    
    print(f"Page {i+1} content: {page.page_content[:100]}...")  # Print first 100 characters of each page
    print(f"Page {i+1} metadata: {page.metadata}")

 ### PyMuPDFLoader

PyPDFLoader
[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2018-03-05T09:43:57+01:00', 'author': 'agimeno', 'moddate': '2018-03-12T10:24:10-04:00', 'source': 'data/pdf/pdfreader.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content="The EUROCALL Review, Volume 25, No. 2, September 2017 \n \n 18 \nResearch paper \n \nA look at advanced learners’ use of mobile devices for \nEnglish language study: Insights from interview data \nMariusz Kruk \nUniversity of Zielona Gora, Poland \n______________________________________________________________ \nmkruk @ uz.zgora.pl \n  \nAbstract \nThe paper discusses the results of a study which explored advanced learners of English \nengagement with their mobile devices to develop learning experiences that meet their \nneeds and goals as foreign language learners. The data were collected from 20 students \nby means of a semi -structured interview. The gathered data were subjected 

In [None]:
### method 2: PyMuPDFLoader (fast and accurate PDF parsing)
print("PyMuPDFLoader") 

loader = PyMuPDFLoader("data/pdf/pdfreader.pdf")
pages = loader.load()
print(pages)
print(f"Number of pages: {len(pages)}")

print(pages[0].metadata)


 ### UnstructuredPDFLoader

PyMuPDFLoader
[Document(metadata={'producer': 'Microsoft® Word 2016', 'creator': 'Microsoft® Word 2016', 'creationdate': '2018-03-05T09:43:57+01:00', 'source': 'data/pdf/pdfreader.pdf', 'file_path': 'data/pdf/pdfreader.pdf', 'total_pages': 11, 'format': 'PDF 1.6', 'title': '', 'author': 'agimeno', 'subject': '', 'keywords': '', 'moddate': '2018-03-12T10:24:10-04:00', 'trapped': '', 'modDate': "D:20180312102410-04'00'", 'creationDate': "D:20180305094357+01'00'", 'page': 0}, page_content="The EUROCALL Review, Volume 25, No. 2, September 2017 \n \n18 \nResearch paper \n \nA look at advanced learners’ use of mobile devices for \nEnglish language study: Insights from interview data \nMariusz Kruk \nUniversity of Zielona Gora, Poland \n______________________________________________________________ \nmkruk @ uz.zgora.pl \n  \nAbstract \nThe paper discusses the results of a study which explored advanced learners of English \nengagement with their mobile devices to develop learning experiences 

In [None]:
print("PyPDFLoader")
print("Simple and reliable")
print("Good for most PDFs")

## Handling Common PDF Issues
##### 1.Store text in complex ways(not just simple text)
##### 2.Can have formatting issues
##### 3.May contain scanned images (requiring OCR)
##### 4.often having extraction artifacts

In [14]:
raw_pdf_text = """ 
This is a sample PDF text.



It contains multiple lines.



Some lines are separated by double newlines.
    
    
This is to simulate paragraphs.
This is the end of the sample PDF text.

page 1 of 10
"""

#Applying Cleaning  Function

def clean_text(text):
    # Remove extra newlines and spaces
    cleaned_text = ' '.join(text.split())

    #fix ligatures
    cleaned_text = cleaned_text.replace('ﬁ', 'fi')
    cleaned_text = cleaned_text.replace('ﬂ', 'fl')
    return cleaned_text

cleaned_pdf_text = clean_text(raw_pdf_text)
print("Before Cleaned PDF Text:"+repr(raw_pdf_text))
print("After Cleaned PDF Text:"+repr(cleaned_pdf_text))


Before Cleaned PDF Text:' \nThis is a sample PDF text.\n\n\n\nIt contains multiple lines.\n\n\n\nSome lines are separated by double newlines.\n\n\nThis is to simulate paragraphs.\nThis is the end of the sample PDF text.\n\npage 1 of 10\n'
After Cleaned PDF Text:'This is a sample PDF text. It contains multiple lines. Some lines are separated by double newlines. This is to simulate paragraphs. This is the end of the sample PDF text. page 1 of 10'


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [19]:
from langchain_core.documents import Document
from typing import List
class SmartPDFProcessor:
    """Advanced PDF text processor with Error Handling."""
    def __init__(self,chunk_size=1000, chunk_overlap=200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )

    def process(self, pdf_path:str)->List[Document]:
        """Process PDF with smart chunking and metadata enhancement and return list of Document chunks."""

        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        ##processing each page
        processed_chunks=[]
        for page_num, page in enumerate(pages):
            cleaned_text=self.__clean_text(page.page_content)

            #Skip Nearly empty pages
            if len(cleaned_text.strip())<50:
                print(f"Skipping nearly empty page {page_num+1}")
                continue

            #Create Chunks with Enhanced Metadata
            chunks=self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page_number": page_num+1,
                    "total_pages": len(pages),
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                    }]
            )

            processed_chunks.extend(chunks)

            return processed_chunks

    def __clean_text(self, text:str)->str:
        """Clean text by removing extra newlines and fixing ligatures."""
        cleaned_text = ' '.join(text.split())
        cleaned_text = cleaned_text.replace('ﬁ', 'fi')
        cleaned_text = cleaned_text.replace('ﬂ', 'fl')
        return cleaned_text
            