## Load PDF files

In [None]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)
from typing import List

In [11]:
# Method 1: PyPDFLoader -- load the sample PDF and print the resulting documents

print("PyPdfloader")

try:
    pypdf_loader = PyPDFLoader("data/pdf/BikeSecurity.pdf")
    pypdf_docs = pypdf_loader.load()
    print(pypdf_docs)
except Exception as load_error:
    # Any failure is reported inline rather than raised, so the cell never aborts.
    print(f"Error : {load_error}")

PyPdfloader
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T15:23:25+08:00', 'author': 'Sergej Gričar, Christian Stipanović and Tea Baldigara', 'keywords': 'Non-Fungible Tokens; blockchain; bike security; sustainable mobility; QR codes', 'moddate': '2025-07-10T09:31:13+02:00', 'subject': 'As climate change concerns, urban congestion, and environmental degradation intensify, cities prioritise cycling as a sustainable transport option to reduce CO2 emissions and improve quality of life. However, rampant bicycle theft and poor security infrastructure often deter daily commuters and tourists from cycling. This study explores how advanced security measures can bolster sustainable urban mobility and tourism by addressing these challenges. A mixed-methods approach is utilised, incorporating primary survey data from Slovenia and secondary data on bicycle sales, imports and thefts from 2015 to 2024. Findings indicate that access to

In [14]:
# Method 2: PyMuPDFLoader -- load the same PDF, report how many entries came back,
# then print the documents themselves.
print("\n PyMuPDFLoader")

try:
    pymupdf_loader = PyMuPDFLoader("data/pdf/BikeSecurity.pdf")
    pymupdf_docs = pymupdf_loader.load()

    print(f" Loaded {len(pymupdf_docs)} pages")
    print(" Includes detailed metadata")
    print(pymupdf_docs)
except Exception as load_error:
    # Any failure is reported inline rather than raised, so the cell never aborts.
    print(f" Error: {load_error}")


 PyMuPDFLoader
 Loaded 20 pages
 Includes detailed metadata
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-10T15:23:25+08:00', 'source': 'data/pdf/BikeSecurity.pdf', 'file_path': 'data/pdf/BikeSecurity.pdf', 'total_pages': 20, 'format': 'PDF 1.7', 'title': 'Sustainable Daily Mobility and Bike Security', 'author': 'Sergej Gričar, Christian Stipanović and Tea Baldigara', 'subject': 'As climate change concerns, urban congestion, and environmental degradation intensify, cities prioritise cycling as a sustainable transport option to reduce CO2 emissions and improve quality of life. However, rampant bicycle theft and poor security infrastructure often deter daily commuters and tourists from cycling. This study explores how advanced security measures can bolster sustainable urban mobility and tourism by addressing these challenges. A mixed-methods approach is utilised, incorporating primary survey data from Slovenia and secondary 

In [15]:
# Example of raw PDF extraction: a hand-written sample containing runs of blank
# lines, indented lines, the single-character ligatures "ﬁ" and "ﬂ", and a
# trailing "Page 1 of 10" line. It is the input for the clean_text demo below.
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

# Define the cleaning function
def clean_text(text):
    """Return ``text`` with whitespace collapsed and ligatures expanded.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space and
    leading/trailing whitespace is dropped. The single-character ligatures
    "ﬁ" and "ﬂ" are then replaced by the two-letter sequences "fi" and "fl".
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with one space normalises all spacing.
    words = text.split()
    normalised = " ".join(words)

    # Expand each ligature character to its plain-letter spelling.
    for ligature, expansion in (("ﬁ", "fi"), ("ﬂ", "fl")):
        normalised = normalised.replace(ligature, expansion)

    return normalised

# Show the sample before and after cleaning; repr() makes newlines and
# indentation visible in the output.
print("BEFORE:")
print(repr(raw_pdf_text))

cleaned = clean_text(raw_pdf_text)
print("\nAFTER:")
print(repr(cleaned))

BEFORE:
"Company Financial Report\n\n\n    The ﬁnancial performance for ﬁscal year 2024\n    shows signiﬁcant growth in proﬁtability.\n\n\n\n    Revenue increased by 25%.\n\nThe company's efﬁciency improved due to workﬂow\noptimization.\n\n\nPage 1 of 10\n"

AFTER:
"Company Financial Report The financial performance for fiscal year 2024 shows significant growth in profitability. Revenue increased by 25%. The company's efficiency improved due to workflow optimization. Page 1 of 10"


In [26]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [29]:
from langchain_core.documents import Document
class SmartPDFProcessor:
    """Load a PDF, clean each page's text and split it into chunks with extra metadata.

    Attributes:
        chunk_size: Value passed as ``chunk_size`` to the text splitter.
        chunk_overlap: Value passed as ``chunk_overlap`` to the text splitter.
        text_splitter: The ``RecursiveCharacterTextSplitter`` used by ``process_pdf``.
    """

    # Pages whose cleaned text is shorter than this many characters are skipped.
    MIN_PAGE_CHARS = 50

    def __init__(self, chunk_size=1000, chunk_overlap=100):
        """Store the chunking settings and build the splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        # Plain assignments, no trailing comma: ``x = value,`` would store the
        # 1-tuple ``(value,)`` instead of the number itself.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Cleaned text contains only single spaces as whitespace, so the
            # space is the only separator offered to the splitter.
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> "List[Document]":
        """Load ``pdf_path`` and return its cleaned, chunked contents.

        Each entry returned by ``PyPDFLoader(pdf_path).load()`` is treated as one
        page. Its text is cleaned with ``_clean_text``; entries whose cleaned
        text is shorter than ``MIN_PAGE_CHARS`` are skipped; the rest are split
        with ``self.text_splitter``.

        Every chunk's metadata starts from the page's own metadata and then
        sets or overrides:
            page: 1-based position of the page in the loaded list (skipped
                pages still consume a number).
            total_pages: Number of entries the loader returned.
            chunk_method: The constant ``"smart_pdf_processor"``.
            char_count: Length of the whole cleaned page text, not of the chunk.

        Errors raised by the loader or the splitter are not caught here.

        Args:
            pdf_path: Path handed to ``PyPDFLoader``.

        Returns:
            All chunks from all kept pages, in page order.
        """
        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        # Process each page
        processed_chunks = []

        for page_num, page in enumerate(pages):
            # Clean text
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages. _clean_text already trims both ends, so
            # the length can be compared directly.
            if len(cleaned_text) < self.MIN_PAGE_CHARS:
                continue

            # Create chunks with enhanced metadata
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num + 1,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text),
                }],
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and expand "ﬁ"/"ﬂ" ligatures.

        The result has no leading or trailing whitespace.
        """
        # Remove excessive whitespace
        text = " ".join(text.split())

        # Fix ligatures
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")

        return text

In [30]:
# Build a processor with the constructor defaults (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPDFProcessor()

In [31]:
# Bare expression: the notebook displays the instance's repr as the cell output.
preprocessor

<__main__.SmartPDFProcessor at 0x1694d2510>

In [33]:
## Process the sample PDF; any failure is printed instead of raised

try:
    smart_chunks = preprocessor.process_pdf("data/pdf/BikeSecurity.pdf")
    print(f"Processed into {len(smart_chunks)} smart chunks")

    # Peek at the metadata attached to the first chunk, when there is one
    if len(smart_chunks) > 0:
        print("\nSample chunk metadata:")
        for key, value in smart_chunks[0].metadata.items():
            print(f" {key}: {value}")

except Exception as processing_error:
    print(f"Processing error: {processing_error}")

Processed into 80 smart chunks

Sample chunk metadata:
 producer: pdfTeX-1.40.25
 creator: LaTeX with hyperref
 creationdate: 2025-07-10T15:23:25+08:00
 author: Sergej Gričar, Christian Stipanović and Tea Baldigara
 keywords: Non-Fungible Tokens; blockchain; bike security; sustainable mobility; QR codes
 moddate: 2025-07-10T09:31:13+02:00
 subject: As climate change concerns, urban congestion, and environmental degradation intensify, cities prioritise cycling as a sustainable transport option to reduce CO2 emissions and improve quality of life. However, rampant bicycle theft and poor security infrastructure often deter daily commuters and tourists from cycling. This study explores how advanced security measures can bolster sustainable urban mobility and tourism by addressing these challenges. A mixed-methods approach is utilised, incorporating primary survey data from Slovenia and secondary data on bicycle sales, imports and thefts from 2015 to 2024. Findings indicate that access to se