# PDF Reading

## PyMuPDF

PyMuPDF, imported in Python as `fitz`, is a powerful library for working with PDF documents. It provides a wide range of features for tasks such as reading, writing, modifying, and extracting information from PDF files. Whether you need to extract text, images, or metadata, or perform operations like merging, splitting, or annotating pages, PyMuPDF has got you covered.

## Content

In this workshop, you will learn how to:
- Open and explore PDF documents
- Extract text and metadata
- Work with pages and images
- Modify PDF content
- Create new PDF documents

In [None]:
import fitz  # PyMuPDF
from pathlib import Path
from PIL import Image
import io

# Location of the sample PDF, relative to the current working directory
PDF_PATH = Path("..", "demo_data", "mupdf_explored.pdf")
# Last expression of the cell: shows whether the sample file is present
PDF_PATH.exists()

# Start by looking at the file metadata

## Open a PDF document

In [None]:
def open_pdf_document(pdf_path):
    """
    Open the file at ``pdf_path`` with PyMuPDF and print a short summary.

    The summary consists of a success message, the page count and the
    ``is_pdf`` flag. The opened document is returned.
    """
    document = fitz.open(pdf_path)

    # Report the basics about what was just opened
    summary = (
        "Document opened successfully!",
        f"Number of pages: {len(document)}",
        f"Is PDF: {document.is_pdf}",
    )
    for line in summary:
        print(line)

    return document

# Open the sample PDF; `doc` is reused by the cells that follow
doc = open_pdf_document(PDF_PATH)

## Explore document outline (bookmarks)

In [None]:
def show_document_outline(doc):
    """Print the table of contents of ``doc``, one line per entry.

    Each entry is indented by two spaces per nesting level below the first.
    A notice is printed instead when the document has no outline.
    """
    entries = doc.get_toc()
    if not entries:
        print("\nNo outline/bookmarks found in the document.")
        return

    for depth, heading, page_number in entries:
        padding = (depth - 1) * "  "
        print(f"{padding}{heading} (Page {page_number})")

# Print the outline of the document opened earlier
show_document_outline(doc)

# Exploring the PDF content

## Access and explore pages

In [None]:
def explore_page_properties(doc, page_num=0):
    """
    Print the geometry of one page of ``doc`` and return that page.

    ``page_num`` is the index used to look the page up in ``doc``; the
    printed heading shows it plus one.
    """
    page = doc[page_num]

    heading = f"\nPage {page_num + 1} Properties:"
    details = [
        f"Page dimensions: {page.rect}",
        f"Page rotation: {page.rotation} degrees",
        f"Media box: {page.mediabox}",
        f"Crop box: {page.cropbox}",
    ]

    print(heading)
    print("-" * 25)
    print("\n".join(details))

    return page

# Explore the first page (index 0) and keep it as `first_page` for the following cells
first_page = explore_page_properties(doc, 0)

## Extract text from a page

In [None]:
# Plain-text extraction; only the first 200 characters are shown as a preview
extracted_page = first_page.get_text()
print(extracted_page[:200], "...", sep="")

## Search for text in a page

In [None]:
def search_text_in_page(page, search_term):
    """
    Look up ``search_term`` on ``page`` and report how often it was found.

    Returns the result of ``page.search_for`` for the term. If the search or
    the report raises, the error is printed and an empty list is returned.
    """
    try:
        matches = page.search_for(search_term)
        hit_count = len(matches)
        print(f"\nFound '{search_term}' {hit_count} times on page")
    except Exception as err:
        print(f"Error searching for text: {err}")
        return []
    return matches

# Search for common words (you can modify these)
search_results = search_text_in_page(first_page, "September")
# An empty result is falsy, so the first rectangle is printed only when there is a match
if search_results:
    print(f"First occurrence rectangle: {search_results[0]}")

## Exercise: Count the number of pages containing the word "MuPdf"

In [None]:
# Placeholder: replace the ellipsis below with your solution
...

## Render page as image

In [None]:
def render_page_as_image(page, zoom=2.0):
    """
    Rasterise ``page`` and return the result as a PIL image.

    ``zoom`` is used for both arguments of the rendering matrix. The size of
    the resulting image is printed before it is returned.
    """
    # Same factor for both matrix arguments
    scale = fitz.Matrix(zoom, zoom)
    pixmap = page.get_pixmap(matrix=scale)

    # Go through in-memory PNG bytes to obtain a PIL image
    png_stream = io.BytesIO(pixmap.tobytes("png"))
    pixmap = None  # Drop the pixmap reference now that its bytes are copied out

    pil_img = Image.open(png_stream)
    width, height = pil_img.size
    print(f"Rendered page as {width}x{height} image")
    return pil_img

# Render the first page as an image
page_image = render_page_as_image(first_page, zoom=1.5)
# Last expression of the cell, so the notebook displays the image
page_image

# MODIFYING PDF CONTENT

## Add annotations to a page

In [None]:
def add_annotations(page, highlight_term="the"):
    """
    Add a highlight, a sticky note and a rectangle annotation to a page.

    Args:
        page: The PyMuPDF page to annotate; annotations are added to it
            directly.
        highlight_term: Text to search for on the page. The first match is
            highlighted in yellow; when there is no match the highlight is
            skipped. Defaults to "the", the word that used to be hard-coded.
    """
    # Highlight the first occurrence of the search term, if there is one
    text_instances = page.search_for(highlight_term)
    if text_instances:
        highlight = page.add_highlight_annot(text_instances[0])
        highlight.set_colors(stroke=[1, 1, 0])  # Yellow highlight
        highlight.update()
        print("Added highlight annotation")

    # Add a text annotation (sticky note)
    point = fitz.Point(50, 50)  # Top-left position
    text_annot = page.add_text_annot(point, "This is a text annotation!")
    text_annot.set_info(title="Tutorial", content="Added by PyMuPDF tutorial")
    text_annot.update()
    print("Added text annotation")

    # Add a rectangle annotation
    rect = fitz.Rect(100, 100, 200, 150)
    rect_annot = page.add_rect_annot(rect)
    rect_annot.set_colors(stroke=[1, 0, 0])  # Red border
    rect_annot.set_border(width=2)
    rect_annot.update()
    print("Added rectangle annotation")

# Insert text into a page
def insert_text_into_page(page, text, position, fontsize=12):
    """
    Write ``text`` in blue onto ``page`` at ``position``.

    ``position`` is indexed as ``position[0]`` and ``position[1]`` to build
    the insertion point. Nothing is done when ``page`` is falsy.
    """
    if page:
        anchor = fitz.Point(position[0], position[1])
        blue = (0, 0, 1)
        page.insert_text(anchor, text, fontsize=fontsize, color=blue)
        print(f"Inserted text at position {position}")

# Create a copy of the document for modifications
def create_modified_document():
    """Build a one-page copy of the global ``doc`` with annotations and extra text.

    Returns the new document, or ``None`` (after printing a notice) when
    ``doc`` is falsy.
    """
    if not doc:
        print("No document loaded.")
        return None

    # Start from an empty document and copy over only the first source page
    copy_doc = fitz.open()
    copy_doc.insert_pdf(doc, from_page=0, to_page=0)
    first = copy_doc[0]

    # Decorate the copied page: annotations first, then the custom text
    add_annotations(first)
    insert_text_into_page(first, "Modified with PyMuPDF!", (50, 200), fontsize=14)

    return copy_doc

# Create the modified document; it is only bound to `modified_doc` here, not saved
modified_doc = create_modified_document()

# CREATING NEW PDF DOCUMENTS

## Create a new PDF from scratch

In [None]:
def create_new_pdf():
    """
    Create a new single-page PDF document with a title, body text and shapes.

    The page is 595 x 842 (A4) and receives a centred title, a short feature
    list, a blue rectangle and a red circle.

    Returns:
        The new document. It is not saved here; saving is left to the caller.
    """
    # Local import: only this function needs it.
    from textwrap import dedent

    # Create new document
    new_doc = fitz.open()

    # Add a new page
    page = new_doc.new_page(width=595, height=842)  # A4 size

    # Add title
    title_text = "PyMuPDF Tutorial - Generated PDF"
    title_rect = fitz.Rect(50, 50, 545, 100)
    page.insert_textbox(
        title_rect,
        title_text,
        fontsize=18,
        color=(0, 0, 0),
        align=1,  # Center alignment
    )

    # Add some content. The literal is indented to match the code, so it is
    # dedented and stripped; otherwise the source indentation and the leading
    # blank line would be written into the PDF as part of the text.
    content_text = dedent(
        """
        This PDF was created using PyMuPDF (fitz) library.

        Key features demonstrated:
        • Opening and reading PDF documents
        • Extracting text and images
        • Adding annotations and modifications
        • Creating new PDF documents

        PyMuPDF is a powerful library for PDF manipulation in Python!
        """
    ).strip()

    content_rect = fitz.Rect(50, 120, 545, 400)
    page.insert_textbox(
        content_rect,
        content_text,
        fontsize=12,
        color=(0, 0, 0),
    )

    # Add a simple drawing
    # Draw a rectangle
    rect = fitz.Rect(50, 450, 250, 550)
    page.draw_rect(rect, color=(0, 0, 1), width=2)

    # Draw a circle
    center = fitz.Point(350, 500)
    page.draw_circle(center, 50, color=(1, 0, 0), width=2)

    print("Created new PDF with custom content")
    return new_doc

# Create a new PDF
new_pdf = create_new_pdf()
# Write it to "test.pdf"; the path is relative, so it depends on the current working directory
new_pdf.save("test.pdf")

## Exercise: Open the newly created PDF file and add a new page with a joke

In [None]:
# Placeholder: replace the ellipsis below with your solution
...