In [3]:
import os
import re

# Define patterns to identify chapter names
chapter_patterns = [
    r"^Chapter \d+",         # e.g., "Chapter 1"
    r"^Chapter [A-Za-z]+",   # e.g., "Chapter One"
    r"^Part \d+",            # e.g., "Part 1"
    r"^Section \d+",         # e.g., "Section 1"
    r"^\d+\.\d+",            # e.g., "1.1"
    r"^\d+$",                # e.g., "1" (sometimes used for chapter names)
]

# Function to detect chapter titles based on patterns
def is_chapter_title(text):
    for pattern in chapter_patterns:
        if re.match(pattern, text.strip()):
            return True
    return False

# Function to extract chapters from a cleaned book text file
def extract_chapters_from_cleaned_text(cleaned_text_path):
    """
    Extract chapters from the cleaned book text and save each chapter into a separate text file.

    Args:
    - cleaned_text_path (str): Path to the cleaned book text file.

    Returns:
    - None
    """
    # Read the entire cleaned text
    with open(cleaned_text_path, "r", encoding="utf-8") as file:
        book_text = file.read()

    # Split the book text into lines for easier processing
    lines = book_text.splitlines()

    # Store the chapters as separate files in a directory
    book_name = os.path.splitext(os.path.basename(cleaned_text_path))[0]
    output_dir = f"./books/{book_name}_chapters"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables for chapter extraction
    chapters = []          # List to hold chapter content
    current_chapter = []   # Accumulate lines for the current chapter
    current_title = None   # Track the current chapter title
    chapter_index = 0      # Chapter counter for filenames

    # Iterate over each line to detect chapter titles
    for line in lines:
        if is_chapter_title(line.strip()):
            # If a new chapter title is found, save the previous chapter if it exists
            if current_title and current_chapter:
                chapters.append((current_title, "\n".join(current_chapter)))

            # Start a new chapter
            current_title = line.strip()
            current_chapter = []  # Reset current chapter content
            chapter_index += 1
            print(f"New chapter detected: {current_title}")

        # Accumulate the line into the current chapter
        if current_chapter is not None:
            current_chapter.append(line)

    # Save the last chapter after iteration completes
    if current_title and current_chapter:
        chapters.append((current_title, "\n".join(current_chapter)))

    # Write each chapter to a separate file
    for index, (title, content) in enumerate(chapters, start=1):
        # Sanitize the chapter title to use as filename
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", title)  # Remove invalid filename chars
        sanitized_title = re.sub(r'\(.*?\)|\[.*?\]', "", sanitized_title).strip()  # Remove parenthetical
        sanitized_title = sanitized_title if sanitized_title else f"Chapter_{index}"
        chapter_filename = f"{output_dir}/{sanitized_title}_Chapter_{index}.txt"

        # Write the chapter to a file
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(content)

        print(f"Chapter '{title}' written to {chapter_filename}")

# Example usage
cleaned_text_path = "./cleaned_books/Designing Data Intensive Applications_cleaned.txt"
extract_chapters_from_cleaned_text(cleaned_text_path)

New chapter detected: Chapter 11. Finally, in Chapter 12 we put everything together and discuss
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Applications
New chapter detected: Chapter 1: Reliable, Scalable, and Maintainable Appl

In [4]:
import os
import re
import fitz  # PyMuPDF

# Define patterns for common page numbers and footers
page_number_patterns = [
    r'^\s*\d+\s*$',                   # Simple page number: 1, 2, 3, ...
    r'^\s*Page \d+\s*$',              # Page 1, Page 2, ...
    r'^\s*Page \d+ of \d+\s*$',       # Page 1 of 50
    r'^\s*\d+\s*/\s*\d+\s*$',         # 1/50, 2/50, ...
    r'^\s*-\s*\d+\s*-\s*$',           # - 1 -, - 2 -, ...
    r'^\s*\d+\s*\|\s*$',              # 1 | (common in OCR PDFs)
]

# Patterns for detecting likely footer content
footer_patterns = [
    r'^\d+\s*\|\s*Chapter.*$',         # e.g., "530 | Chapter 12: The Future"
    r'^\d+\s*\|\s*[A-Za-z].*$',        # e.g., "530 | Reliable, Scalable Systems"
    r'^\d+\s*-\s*.*$',                 # e.g., "530 - Chapter Summary"
]

# Function to detect and remove headers, footers, and page numbers
def detect_and_remove_headers_footers(text, page_num, total_pages):
    """
    Detect and clean up page numbers, headers, and footers in the given text.

    Args:
    - text (str): Text content of the page.
    - page_num (int): The current page number (0-based index).
    - total_pages (int): Total number of pages in the PDF.

    Returns:
    - cleaned_text (str): Text with headers, footers, and page numbers removed.
    """
    lines = text.splitlines()
    cleaned_lines = []

    # Detect headers and footers by tracking lines that repeat frequently
    header_footer_candidates = {}
    for line in lines:
        cleaned_line = line.strip()
        if cleaned_line:
            if cleaned_line in header_footer_candidates:
                header_footer_candidates[cleaned_line] += 1
            else:
                header_footer_candidates[cleaned_line] = 1

    # Determine the threshold: text appearing on more than 30% of the pages is likely a header/footer
    header_footer_threshold = max(2, int(0.3 * total_pages))

    # Identify and store all headers/footers
    headers_footers = set([line for line, count in header_footer_candidates.items() if count >= header_footer_threshold])

    # Clean the text by removing headers, footers, and page numbers
    for line in lines:
        stripped_line = line.strip()

        # Remove identified headers/footers
        if stripped_line in headers_footers:
            continue

        # Remove any page number patterns
        is_page_number = any(re.match(pattern, stripped_line) for pattern in page_number_patterns)
        if is_page_number:
            continue

        # Remove footer patterns (e.g., "530 | Chapter 12: The Future")
        is_footer = any(re.match(pattern, stripped_line) for pattern in footer_patterns)
        if is_footer:
            continue

        # If the line is neither a header/footer nor a page number, retain it
        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)

# Function to clean and process the entire PDF
def clean_pdf_text(pdf_path):
    """
    Clean up headers, footers, and page numbers from each page of the PDF and combine into a single text file.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - cleaned_book_text (str): Combined cleaned text of the entire book.
    """
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    cleaned_book_text = ""  # Store the entire book's cleaned text

    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text = page.get_text("text")

        # Clean up headers, footers, and page numbers from the text
        cleaned_text = detect_and_remove_headers_footers(text, page_num, total_pages)
        cleaned_book_text += cleaned_text + "\n\n"  # Add cleaned text with spacing between pages

    return cleaned_book_text

# Function to save the entire cleaned book to a single text file
def save_cleaned_book(cleaned_text, pdf_path):
    """
    Save the cleaned text of the entire book into a single file named `book_cleaned.txt`.

    Args:
    - cleaned_text (str): Combined cleaned text of the entire book.
    - pdf_path (str): Path to the original PDF file for naming.
    """
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = "./cleaned_books"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned text into a single file named book_cleaned.txt
    cleaned_book_filename = os.path.join(output_dir, f"{book_name}_cleaned.txt")
    with open(cleaned_book_filename, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(cleaned_text)
    print(f"Cleaned book saved as {cleaned_book_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
cleaned_book_text = clean_pdf_text(pdf_path)
save_cleaned_book(cleaned_book_text, pdf_path)


Cleaned book saved as ./cleaned_books\Designing Data Intensive Applications_cleaned.txt


In [5]:
import os
import re
import fitz  # PyMuPDF

# Define patterns for common page numbers
page_number_patterns = [
    r'^\s*\d+\s*$',                   # Simple page number: 1, 2, 3, ...
    r'^\s*Page \d+\s*$',              # Page 1, Page 2, ...
    r'^\s*Page \d+ of \d+\s*$',       # Page 1 of 50
    r'^\s*\d+\s*/\s*\d+\s*$',         # 1/50, 2/50, ...
    r'^\s*-\s*\d+\s*-\s*$',           # - 1 -, - 2 -, ...
    r'^\s*\d+\s*\|\s*$',              # 1 | (common in OCR PDFs)
]

# Helper function to detect headers and footers based on position and repetition
def detect_and_remove_headers_footers(text, page_num, total_pages, prev_footer=None):
    """
    Detect and clean up page numbers, headers, and footers in the given text using position and context.

    Args:
    - text (str): Text content of the page.
    - page_num (int): The current page number (0-based index).
    - total_pages (int): Total number of pages in the PDF.
    - prev_footer (str): Text of the previously detected footer, if any.

    Returns:
    - cleaned_text (str): Text with headers, footers, and page numbers removed.
    - footer_candidate (str): Detected footer text for this page, if found.
    """
    lines = text.splitlines()
    cleaned_lines = []
    footer_candidate = None

    # Detect the last line(s) as potential footers if they are repeated across multiple pages
    if len(lines) > 2:
        # Candidate for a footer is usually the last or second-to-last line
        footer_line = lines[-1].strip()
        second_last_line = lines[-2].strip()

        # Check if the last or second-to-last line matches a typical footer structure or repeats across pages
        is_page_number = any(re.match(pattern, footer_line) for pattern in page_number_patterns)
        if not is_page_number and prev_footer and footer_line.startswith(prev_footer.split()[0]):
            footer_candidate = footer_line

        # Similarly, check second-last line if it's not a footer already
        if not footer_candidate:
            is_page_number = any(re.match(pattern, second_last_line) for pattern in page_number_patterns)
            if not is_page_number and prev_footer and second_last_line.startswith(prev_footer.split()[0]):
                footer_candidate = second_last_line

    # Clean up headers, footers, and page numbers from the text
    for line in lines:
        stripped_line = line.strip()

        # Skip the identified footer candidate
        if footer_candidate and stripped_line == footer_candidate:
            continue

        # Remove any page number patterns
        is_page_number = any(re.match(pattern, stripped_line) for pattern in page_number_patterns)
        if is_page_number:
            continue

        # If the line is neither a header/footer nor a page number, retain it
        cleaned_lines.append(line)

    return "\n".join(cleaned_lines), footer_candidate

# Function to clean and process the entire PDF
def clean_pdf_text(pdf_path):
    """
    Clean up headers, footers, and page numbers from each page of the PDF and combine into a single text file.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - cleaned_book_text (str): Combined cleaned text of the entire book.
    """
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    cleaned_book_text = ""  # Store the entire book's cleaned text
    prev_footer = None      # Track the previously detected footer

    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text = page.get_text("text")

        # Clean up headers, footers, and page numbers from the text using context
        cleaned_text, footer_candidate = detect_and_remove_headers_footers(text, page_num, total_pages, prev_footer)
        prev_footer = footer_candidate if footer_candidate else prev_footer  # Update the footer candidate
        cleaned_book_text += cleaned_text + "\n\n"  # Add cleaned text with spacing between pages

    return cleaned_book_text

# Function to save the entire cleaned book to a single text file
def save_cleaned_book(cleaned_text, pdf_path):
    """
    Save the cleaned text of the entire book into a single file named `book_cleaned.txt`.

    Args:
    - cleaned_text (str): Combined cleaned text of the entire book.
    - pdf_path (str): Path to the original PDF file for naming.
    """
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = "./cleaned_books"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned text into a single file named book_cleaned.txt
    cleaned_book_filename = os.path.join(output_dir, f"{book_name}_cleaned.txt")
    with open(cleaned_book_filename, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(cleaned_text)
    print(f"Cleaned book saved as {cleaned_book_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
cleaned_book_text = clean_pdf_text(pdf_path)
save_cleaned_book(cleaned_book_text, pdf_path)


Cleaned book saved as ./cleaned_books\Designing Data Intensive Applications_cleaned.txt


In [6]:
import os
import re
import fitz  # PyMuPDF

# Define patterns for common page numbers and header/footer markers
page_number_patterns = [
    r'^\s*\d+\s*$',                   # Simple page number: 1, 2, 3, ...
    r'^\s*Page \d+\s*$',              # Page 1, Page 2, ...
    r'^\s*Page \d+ of \d+\s*$',       # Page 1 of 50
    r'^\s*\d+\s*/\s*\d+\s*$',         # 1/50, 2/50, ...
    r'^\s*-\s*\d+\s*-\s*$',           # - 1 -, - 2 -, ...
    r'^\s*\d+\s*\|\s*$',              # 1 | (common in OCR PDFs)
]

# Function to analyze and clean headers and footers based on font size and patterns
def detect_and_remove_headers_footers(text_blocks, avg_font_size):
    """
    Detect and clean headers, footers, and page numbers using font size and position.

    Args:
    - text_blocks (list): List of text blocks from the page, including font size.
    - avg_font_size (float): Average font size in the document to differentiate headers/footers.

    Returns:
    - cleaned_text (str): Cleaned text for the page.
    """
    cleaned_lines = []

    # Iterate through each text block and decide if it's part of the main content or a header/footer
    for block in text_blocks:
        block_text = block[4].strip()  # Extract text content
        font_size = block[3]  # Font size is stored at index 3 in the block tuple
        y_position = block[1]  # Y-coordinate of the text block (position on the page)

        # Skip lines that match known page number patterns
        if any(re.match(pattern, block_text) for pattern in page_number_patterns):
            continue

        # If the font size is significantly smaller than the average font size, treat it as a header/footer
        if font_size < 0.8 * avg_font_size:
            continue

        # Retain lines that are likely part of the main content
        cleaned_lines.append(block_text)

    return "\n".join(cleaned_lines)

# Function to clean and process the entire PDF with font size analysis
def clean_pdf_text(pdf_path):
    """
    Clean up headers, footers, and page numbers from each page of the PDF using font size and position.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - cleaned_book_text (str): Combined cleaned text of the entire book.
    """
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    font_sizes = []  # Store font sizes to compute the average later
    cleaned_book_text = ""  # Store the entire book's cleaned text

    # First pass: gather font size information from all pages
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Collect all font sizes to compute the average
        for block in text_blocks:
            font_sizes.append(block[3])  # Font size is stored at index 3 in the block tuple

    # Calculate the average font size in the document
    avg_font_size = sum(font_sizes) / len(font_sizes)

    # Second pass: clean each page based on font size and text patterns
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Clean up headers, footers, and page numbers using font size
        cleaned_text = detect_and_remove_headers_footers(text_blocks, avg_font_size)
        cleaned_book_text += cleaned_text + "\n\n"  # Add cleaned text with spacing between pages

    return cleaned_book_text

# Function to save the entire cleaned book to a single text file
def save_cleaned_book(cleaned_text, pdf_path):
    """
    Save the cleaned text of the entire book into a single file named `book_cleaned.txt`.

    Args:
    - cleaned_text (str): Combined cleaned text of the entire book.
    - pdf_path (str): Path to the original PDF file for naming.
    """
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = "./cleaned_books"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned text into a single file named book_cleaned.txt
    cleaned_book_filename = os.path.join(output_dir, f"{book_name}_cleaned.txt")
    with open(cleaned_book_filename, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(cleaned_text)
    print(f"Cleaned book saved as {cleaned_book_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
cleaned_book_text = clean_pdf_text(pdf_path)
save_cleaned_book(cleaned_book_text, pdf_path)


Cleaned book saved as ./cleaned_books\Designing Data Intensive Applications_cleaned.txt


In [7]:
import os
import re
import fitz  # PyMuPDF

# Define patterns for common page numbers and header/footer markers
page_number_patterns = [
    r'^\s*\d+\s*$',                   # Simple page number: 1, 2, 3, ...
    r'^\s*Page \d+\s*$',              # Page 1, Page 2, ...
    r'^\s*Page \d+ of \d+\s*$',       # Page 1 of 50
    r'^\s*\d+\s*/\s*\d+\s*$',         # 1/50, 2/50, ...
    r'^\s*-\s*\d+\s*-\s*$',           # - 1 -, - 2 -, ...
    r'^\s*\d+\s*\|\s*$',              # 1 | (common in OCR PDFs)
]

# Define a pattern for chapter-like footers that include page numbers
footer_pattern = re.compile(r'^\d+\s*\|\s*Chapter \d+.*')

# Function to analyze and clean headers and footers based on position and patterns
def detect_and_remove_headers_footers(text_blocks, avg_font_size):
    """
    Detect and clean headers, footers, and page numbers using font size and position.

    Args:
    - text_blocks (list): List of text blocks from the page, including font size.
    - avg_font_size (float): Average font size in the document to differentiate headers/footers.

    Returns:
    - cleaned_text (str): Cleaned text for the page.
    """
    cleaned_lines = []

    # Iterate through each text block and decide if it's part of the main content or a header/footer
    for block in text_blocks:
        block_text = block[4].strip()  # Extract text content
        font_size = block[3]  # Font size is stored at index 3 in the block tuple
        y_position = block[1]  # Y-coordinate of the text block (position on the page)

        # Skip lines that match known page number patterns or footer patterns
        if any(re.match(pattern, block_text) for pattern in page_number_patterns):
            continue
        if footer_pattern.match(block_text):
            continue

        # If the font size is significantly smaller than the average font size, treat it as a header/footer
        if font_size < 0.8 * avg_font_size:
            continue

        # Retain lines that are likely part of the main content
        cleaned_lines.append(block_text)

    return "\n".join(cleaned_lines)

# Function to clean and process the entire PDF with font size analysis
def clean_pdf_text(pdf_path):
    """
    Clean up headers, footers, and page numbers from each page of the PDF using font size and position.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - cleaned_book_text (str): Combined cleaned text of the entire book.
    """
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    font_sizes = []  # Store font sizes to compute the average later
    cleaned_book_text = ""  # Store the entire book's cleaned text

    # First pass: gather font size information from all pages
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Collect all font sizes to compute the average
        for block in text_blocks:
            font_sizes.append(block[3])  # Font size is stored at index 3 in the block tuple

    # Calculate the average font size in the document
    avg_font_size = sum(font_sizes) / len(font_sizes)

    # Second pass: clean each page based on font size and text patterns
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Clean up headers, footers, and page numbers using font size and position
        cleaned_text = detect_and_remove_headers_footers(text_blocks, avg_font_size)
        cleaned_book_text += cleaned_text + "\n\n"  # Add cleaned text with spacing between pages

    return cleaned_book_text

# Function to save the entire cleaned book to a single text file
def save_cleaned_book(cleaned_text, pdf_path):
    """
    Save the cleaned text of the entire book into a single file named `book_cleaned.txt`.

    Args:
    - cleaned_text (str): Combined cleaned text of the entire book.
    - pdf_path (str): Path to the original PDF file for naming.
    """
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = "./cleaned_books"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned text into a single file named book_cleaned.txt
    cleaned_book_filename = os.path.join(output_dir, f"{book_name}_cleaned.txt")
    with open(cleaned_book_filename, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(cleaned_text)
    print(f"Cleaned book saved as {cleaned_book_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
cleaned_book_text = clean_pdf_text(pdf_path)
save_cleaned_book(cleaned_book_text, pdf_path)


Cleaned book saved as ./cleaned_books\Designing Data Intensive Applications_cleaned.txt


In [8]:
import os
import re
import fitz  # PyMuPDF

# Define patterns for common page numbers (optional)
page_number_patterns = [
    r'^\s*\d+\s*$',                   # Simple page number: 1, 2, 3, ...
    r'^\s*Page \d+\s*$',              # Page 1, Page 2, ...
    r'^\s*Page \d+ of \d+\s*$',       # Page 1 of 50
    r'^\s*\d+\s*/\s*\d+\s*$',         # 1/50, 2/50, ...
    r'^\s*-\s*\d+\s*-\s*$',           # - 1 -, - 2 -, ...
]

# Function to clean text based purely on font size
def clean_text_by_font_size(text_blocks, avg_font_size):
    """
    Clean the text based on font size. Consider text blocks significantly larger than the average as content
    and blocks significantly smaller as headers/footers.

    Args:
    - text_blocks (list): List of text blocks from the page, including font size.
    - avg_font_size (float): Average font size in the document.

    Returns:
    - cleaned_text (str): Cleaned text for the page.
    """
    cleaned_lines = []

    # Iterate through each text block and decide if it's part of the main content or a header/footer
    for block in text_blocks:
        block_text = block[4].strip()  # Extract text content
        font_size = block[3]  # Font size is stored at index 3 in the block tuple

        # Skip lines that match known page number patterns (optional)
        if any(re.match(pattern, block_text) for pattern in page_number_patterns):
            continue

        # Retain lines that have a font size close to or larger than the average
        if font_size >= 0.8 * avg_font_size:
            cleaned_lines.append(block_text)

    return "\n".join(cleaned_lines)

# Function to clean and process the entire PDF using font size analysis
def clean_pdf_text(pdf_path):
    """
    Clean up headers, footers, and page numbers from each page of the PDF using font size analysis.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - cleaned_book_text (str): Combined cleaned text of the entire book.
    """
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    font_sizes = []  # Store font sizes to compute the average later
    cleaned_book_text = ""  # Store the entire book's cleaned text

    # First pass: gather font size information from all pages
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Collect all font sizes to compute the average
        for block in text_blocks:
            font_sizes.append(block[3])  # Font size is stored at index 3 in the block tuple

    # Calculate the average font size in the document
    avg_font_size = sum(font_sizes) / len(font_sizes)

    # Second pass: clean each page based on font size
    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Clean text using font size as the main heuristic
        cleaned_text = clean_text_by_font_size(text_blocks, avg_font_size)
        cleaned_book_text += cleaned_text + "\n\n"  # Add cleaned text with spacing between pages

    return cleaned_book_text

# Function to save the entire cleaned book to a single text file
def save_cleaned_book(cleaned_text, pdf_path):
    """
    Save the cleaned text of the entire book into a single file named `book_cleaned.txt`.

    Args:
    - cleaned_text (str): Combined cleaned text of the entire book.
    - pdf_path (str): Path to the original PDF file for naming.
    """
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = "./cleaned_books"
    os.makedirs(output_dir, exist_ok=True)

    # Save the cleaned text into a single file named book_cleaned.txt
    cleaned_book_filename = os.path.join(output_dir, f"{book_name}_cleaned.txt")
    with open(cleaned_book_filename, "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(cleaned_text)
    print(f"Cleaned book saved as {cleaned_book_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
cleaned_book_text = clean_pdf_text(pdf_path)
save_cleaned_book(cleaned_book_text, pdf_path)


Cleaned book saved as ./cleaned_books\Designing Data Intensive Applications_cleaned.txt


In [9]:
import fitz  # PyMuPDF
import os


def clean_pdf(pdf_path, output_path, stats):
    """
    Clean the PDF by removing headers, footers, and unnecessary page content.

    Args:
    - pdf_path (str): Path to the original PDF file.
    - output_path (str): Path to save the cleaned PDF.
    - stats (dict): Statistics dictionary from the PDF analyzer.

    Returns:
    - None
    """
    doc = fitz.open(pdf_path)
    total_pages = stats["total_pages"]

    # Set thresholds based on analysis to identify headers and footers
    min_body_font_size = stats["average_font_size"] - 2  # Assuming 2pt tolerance for body text
    max_body_font_size = stats["average_font_size"] + 2

    # Create a new PDF document to store cleaned content
    cleaned_doc = fitz.open()

    for page_num in range(total_pages):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("dict")["blocks"]

        # Create a new blank page in the cleaned PDF
        cleaned_page = cleaned_doc.new_page(width=page.rect.width, height=page.rect.height)

        for block in text_blocks:
            if block.get("type") != 0:  # Skip non-text blocks
                continue

            for line in block["lines"]:
                for span in line["spans"]:
                    font_size = span["size"]
                    text = span["text"].strip()
                    bbox = span["bbox"]

                    # Heuristic to remove headers and footers:
                    # 1. Exclude text with significantly different font sizes.
                    # 2. Remove text that appears in typical header/footer locations (e.g., top 10% or bottom 10% of the page).
                    # 3. Skip page numbers or common phrases found in headers/footers.
                    if (font_size < min_body_font_size or font_size > max_body_font_size or
                            bbox[1] < page.rect.height * 0.1 or bbox[3] > page.rect.height * 0.9 or
                            text.lower() in ["chapter", "page", "section"] or text.isdigit()):
                        continue

                    # If the text passes the filters, add it to the cleaned PDF page
                    cleaned_page.insert_text(
                        (bbox[0], bbox[1]),  # Position text at the original location
                        text,
                        fontsize=font_size,
                        fontname=span["font"],
                        color=(0, 0, 0)  # Keep the text in black color
                    )

    # Save the cleaned PDF
    cleaned_doc.save(output_path)
    cleaned_doc.close()
    doc.close()

    print(f"Cleaned PDF saved to {output_path}")


# Example usage with the analyzer's stats
pdf_path = "Designing Data Intensive Applications.pdf"
output_path = "Cleaned_Designing_Data_Intensive_Applications.pdf"
stats = analyze_pdf(pdf_path)
clean_pdf(pdf_path, output_path, stats)


NameError: name 'analyze_pdf' is not defined