In [3]:
import os
import json
import fitz  # PyMuPDF

def _summarize_text_blocks(text_blocks):
    """
    Summarize the text blocks of a single page.

    Args:
    - text_blocks (list): The "blocks" list of a page's "dict" text extraction.

    Returns:
    - tuple: (line_count, font_sizes, font_changes, word_count) where
      font_sizes is the set of distinct span sizes seen on the page.
    """
    line_count = 0
    font_sizes = set()
    font_changes = 0
    word_count = 0
    previous_size = None

    for block in text_blocks:
        if block.get("type") != 0:  # Only type 0 blocks carry text lines
            continue

        lines = block["lines"]
        line_count += len(lines)

        for line in lines:
            for span in line["spans"]:
                text = span["text"]
                # Whitespace-only spans contribute neither words nor sizes
                if not text.strip():
                    continue

                size = span["size"]
                font_sizes.add(size)
                word_count += len(text.split())

                # A change is counted between consecutive non-blank spans,
                # carried across lines and blocks of the same page
                if previous_size is not None and size != previous_size:
                    font_changes += 1
                previous_size = size

    return line_count, font_sizes, font_changes, word_count


def _min_max_mean(values):
    """Return (min, max, mean) of values, or (None, None, None) if empty."""
    if not values:
        return None, None, None
    return min(values), max(values), sum(values) / len(values)


def analyze_pdf(pdf_path, expected_pages=591):
    """
    Analyze the given PDF to gather various statistics.

    Args:
    - pdf_path (str): Path to the PDF file.
    - expected_pages (int or None): Page count to check against; a warning
      is printed on mismatch. Pass None to skip the check. Defaults to 591.

    Returns:
    - stats (dict): Dictionary of extracted statistics. Font-size figures are
      computed over the distinct sizes of each page (one entry per page and
      size), not weighted by the amount of text. Pages without visible text
      count as 0 lines, 0 font changes and 0 words in the per-page figures.
    """
    font_sizes = []
    line_counts = []
    font_changes_per_page = []
    words_per_page = []
    empty_pages = 0
    # Never incremented here; the key is kept so the output schema is stable
    header_footer_only_pages = 0

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)

        if expected_pages is not None and total_pages != expected_pages:
            print(f"Warning: PDF has {total_pages} pages, but expected {expected_pages}.")

        for page_num in range(total_pages):
            page = doc.load_page(page_num)

            # Pages with no visible text are recorded as all-zero entries;
            # checking first avoids the costlier "dict" extraction for them
            if not page.get_text("text").strip():
                empty_pages += 1
                line_counts.append(0)
                font_changes_per_page.append(0)
                words_per_page.append(0)
                continue

            text_blocks = page.get_text("dict")["blocks"]
            line_count, page_sizes, font_changes, word_count = (
                _summarize_text_blocks(text_blocks)
            )

            font_sizes.extend(page_sizes)
            font_changes_per_page.append(font_changes)
            words_per_page.append(word_count)
            line_counts.append(line_count)
    finally:
        # Release the document even if extraction fails part-way through
        doc.close()

    min_size, max_size, mean_size = _min_max_mean(font_sizes)
    min_lines, max_lines, mean_lines = _min_max_mean(line_counts)
    min_changes, max_changes, mean_changes = _min_max_mean(font_changes_per_page)
    mean_words = _min_max_mean(words_per_page)[2]

    stats = {
        "total_pages": total_pages,
        "empty_pages": empty_pages,
        "header_footer_only_pages": header_footer_only_pages,
        "min_font_size": min_size,
        "max_font_size": max_size,
        "average_font_size": mean_size,
        "min_lines_on_page": min_lines,
        "max_lines_on_page": max_lines,
        "average_lines_per_page": mean_lines,
        "min_font_changes_per_page": min_changes,
        "max_font_changes_per_page": max_changes,
        "average_font_changes_per_page": mean_changes,
        "average_words_per_page": mean_words,
        "total_font_sizes_collected": len(font_sizes),
        "total_lines_collected": sum(line_counts),
        "total_words_collected": sum(words_per_page),
    }

    return stats

def save_stats_to_json(stats, pdf_path):
    """
    Write the statistics dictionary to a JSON file under ./bookStats.

    The output file is named after the PDF: its base name with the
    extension replaced by ".json". The directory is created if missing.

    Args:
    - stats (dict): Statistics dictionary to be saved.
    - pdf_path (str): Path to the original PDF file; only its file name
      is used, to derive the name of the JSON file.
    """
    target_dir = "./bookStats"

    # Strip directory and extension from the PDF path to get the book name
    pdf_file_name = os.path.basename(pdf_path)
    book_name, _extension = os.path.splitext(pdf_file_name)
    json_filename = os.path.join(target_dir, f"{book_name}.json")

    # Make sure the destination exists before opening the file for writing
    os.makedirs(target_dir, exist_ok=True)
    with open(json_filename, mode="w", encoding="utf-8") as handle:
        json.dump(stats, handle, indent=4)

    print(f"Book statistics saved to {json_filename}")

# Example usage: analyze one book and store its statistics as JSON.
# pdf_path is a relative path to the input PDF.
pdf_path = "Designing Data Intensive Applications.pdf"
# Gather the statistics dictionary for the book
stats = analyze_pdf(pdf_path)
# Write the statistics to ./bookStats/<book name>.json
save_stats_to_json(stats, pdf_path)


Book statistics saved to ./bookStats\Designing Data Intensive Applications.json


In [4]:
import fitz  # PyMuPDF
import os


def clean_pdf(pdf_path, output_path, stats, *, font_tolerance=2, margin_ratio=0.1):
    """
    Clean the PDF by removing headers, footers, and unnecessary page content.

    Each page is rebuilt as a blank page of the same size onto which only
    the text spans passing the filters are re-inserted (in Helvetica, black).
    Non-text blocks are not copied. A span is dropped when any of these hold:
    its font size is outside average_font_size +/- font_tolerance, its bbox
    reaches into the top or bottom margin band of the page, or its stripped
    text is empty, purely digits, or one of "chapter", "page", "section".

    Args:
    - pdf_path (str): Path to the original PDF file.
    - output_path (str): Path to save the cleaned PDF.
    - stats (dict): Statistics dictionary from the PDF analyzer; the keys
      "total_pages" and "average_font_size" are read.
    - font_tolerance (float): Allowed deviation in points from the average
      font size for a span to count as body text. Defaults to 2.
    - margin_ratio (float): Fraction of the page height treated as header
      band (top) and footer band (bottom). Defaults to 0.1.

    Returns:
    - None
    """
    # NOTE(review): a recorded run of this cell ended with
    # "need font file or buffer"; the cause is not visible in this file
    # and should be checked against the installed PyMuPDF build.
    ignored_words = ("chapter", "page", "section")

    average_size = stats["average_font_size"]
    if average_size is None:
        # The analyzer found no text at all, so there is no size to filter on
        min_body_font_size = float("-inf")
        max_body_font_size = float("inf")
    else:
        min_body_font_size = average_size - font_tolerance
        max_body_font_size = average_size + font_tolerance

    doc = fitz.open(pdf_path)
    try:
        cleaned_doc = fitz.open()  # New, empty PDF receiving the kept text
        try:
            # Never read past the real page count if the stats are stale
            page_count = min(stats["total_pages"], len(doc))

            for page_num in range(page_count):
                page = doc.load_page(page_num)
                page_height = page.rect.height
                header_limit = page_height * margin_ratio
                footer_limit = page_height * (1 - margin_ratio)

                cleaned_page = cleaned_doc.new_page(
                    width=page.rect.width, height=page_height
                )

                for block in page.get_text("dict")["blocks"]:
                    if block.get("type") != 0:  # Skip non-text blocks
                        continue

                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span["size"]
                            text = span["text"].strip()
                            bbox = span["bbox"]

                            # Nothing to draw for whitespace-only spans
                            if not text:
                                continue
                            if font_size < min_body_font_size or font_size > max_body_font_size:
                                continue
                            if bbox[1] < header_limit or bbox[3] > footer_limit:
                                continue
                            if text.lower() in ignored_words or text.isdigit():
                                continue

                            # insert_text expects the baseline start of the
                            # first character, not the bbox top-left corner;
                            # fall back to the bbox bottom-left if the span
                            # carries no "origin" entry
                            origin = span.get("origin")
                            if origin is None:
                                origin = (bbox[0], bbox[3])

                            cleaned_page.insert_text(
                                (origin[0], origin[1]),
                                text,
                                fontsize=font_size,
                                fontname="helv",  # Built-in Helvetica
                                color=(0, 0, 0),  # Keep the text in black
                            )

            cleaned_doc.save(output_path)
        finally:
            cleaned_doc.close()
    finally:
        # Both documents are released even when a page fails to process
        doc.close()

    print(f"Cleaned PDF saved to {output_path}")



# Example usage with the analyzer's stats: analyze the book, then write a
# cleaned copy of it.
# pdf_path is a relative path to the input PDF.
pdf_path = "Designing Data Intensive Applications.pdf"
# Destination file for the cleaned PDF
output_path = "Cleaned_Designing_Data_Intensive_Applications.pdf"
# The statistics supply the page count and average font size used by clean_pdf
stats = analyze_pdf(pdf_path)
clean_pdf(pdf_path, output_path, stats)




Exception: need font file or buffer