In [2]:
# Install required libraries if not already installed
# !pip install PyPDF2

import os
from PyPDF2 import PdfReader

# Function to extract chapters and write them to separate files
def extract_chapters(pdf_path):
    # Get the book name from the file path
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    
    # Create the directory for storing chapters if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Open the PDF file
    reader = PdfReader(pdf_path)
    
    # Initialize variables to hold chapter information
    chapter_start_indices = []
    chapters = []

    # Iterate over each page in the PDF and extract text
    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if text:
            # Check for the chapter title keyword ("Chapter")
            if "Chapter" in text:
                chapter_start_indices.append(page_num)

    # Add the last page index as the end boundary for the last chapter
    chapter_start_indices.append(len(reader.pages))
    
    # Extract each chapter based on the chapter indices
    for i in range(len(chapter_start_indices) - 1):
        start_page = chapter_start_indices[i]
        end_page = chapter_start_indices[i + 1]
        
        # Aggregate the text for the chapter
        chapter_text = ""
        for j in range(start_page, end_page):
            chapter_text += reader.pages[j].extract_text()
        
        # Write each chapter to a separate file
        chapter_filename = f"{output_dir}/chapter_{i+1}.txt"
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(chapter_text)
        
        print(f"Chapter {i+1} written to {chapter_filename}")

# Input PDF file path
pdf_path = "Designing Data Intensive Applications.pdf"

# Extract chapters and write to separate files
extract_chapters(pdf_path)


Chapter 1 written to ./books/Designing Data Intensive Applications/chapter_1.txt
Chapter 2 written to ./books/Designing Data Intensive Applications/chapter_2.txt
Chapter 3 written to ./books/Designing Data Intensive Applications/chapter_3.txt
Chapter 4 written to ./books/Designing Data Intensive Applications/chapter_4.txt
Chapter 5 written to ./books/Designing Data Intensive Applications/chapter_5.txt
Chapter 6 written to ./books/Designing Data Intensive Applications/chapter_6.txt
Chapter 7 written to ./books/Designing Data Intensive Applications/chapter_7.txt
Chapter 8 written to ./books/Designing Data Intensive Applications/chapter_8.txt
Chapter 9 written to ./books/Designing Data Intensive Applications/chapter_9.txt
Chapter 10 written to ./books/Designing Data Intensive Applications/chapter_10.txt
Chapter 11 written to ./books/Designing Data Intensive Applications/chapter_11.txt
Chapter 12 written to ./books/Designing Data Intensive Applications/chapter_12.txt
Chapter 13 written to 

In [4]:
# version 2

import os
import fitz  # PyMuPDF

# Helper function to extract text and analyze styles
def analyze_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    chapters = []

    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")
        chapter_candidates = []

        # Analyze each text block
        for block in text_blocks:
            block_text = block[4].strip()
            if not block_text:
                continue

            # Extract font size and position
            font_size = block[3]  # Extract font size
            bbox = block[:4]  # bounding box (x0, y0, x1, y1)

            # Check for chapter patterns and large font sizes
            if ("chapter" in block_text.lower()) or (font_size > 12):
                chapter_candidates.append((block_text, page_num, font_size, bbox))

        # Determine the highest scoring chapter candidate (if any)
        if chapter_candidates:
            chapter_candidates.sort(key=lambda x: -x[2])  # Sort by font size descending
            chapters.append((chapter_candidates[0][1], chapter_candidates[0][0]))  # Store the page number and text

    return chapters

# Function to extract chapters and write to separate files
def extract_chapters_with_analysis(pdf_path):
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    os.makedirs(output_dir, exist_ok=True)

    chapters = analyze_pdf(pdf_path)

    doc = fitz.open(pdf_path)
    for i, (start_page, chapter_title) in enumerate(chapters):
        if i + 1 < len(chapters):
            end_page = chapters[i + 1][0]
        else:
            end_page = len(doc)

        chapter_text = ""
        for page_num in range(start_page, end_page):
            page = doc.load_page(page_num)
            chapter_text += page.get_text("text")

        chapter_filename = f"{output_dir}/chapter_{i+1}.txt"
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(chapter_text)
        print(f"Chapter {i+1} ('{chapter_title}') written to {chapter_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
extract_chapters_with_analysis(pdf_path)


Chapter 1 ('Martin Kleppmann') written to ./books/Designing Data Intensive Applications/chapter_1.txt
Chapter 2 ('Boston
Farnham
Sebastopol
Tokyo
Beijing
Boston
Farnham
Sebastopol
Tokyo
Beijing') written to ./books/Designing Data Intensive Applications/chapter_2.txt
Chapter 3 ('[LSI]') written to ./books/Designing Data Intensive Applications/chapter_3.txt
Chapter 4 ('Technology is a powerful force in our society. Data, software, and communication can
be used for bad: to entrench unfair power structures, to undermine human rights, and
to protect vested interests. But they can also be used for good: to make underrepresented
people’s voices heard, to create opportunities for everyone, and to avert disasters. This
book is dedicated to everyone working toward the good.') written to ./books/Designing Data Intensive Applications/chapter_4.txt
Chapter 5 ('Computing is pop culture. […] Pop culture holds a disdain for history. Pop culture is all
about identity and feeling like you’re participati

In [7]:
# version 3 - based on chapter names 

import os
import re
import fitz  # PyMuPDF

# Define a list of patterns to identify chapter names
chapter_patterns = [
    r"^Chapter \d+",         # e.g., "Chapter 1"
    r"^Chapter [A-Za-z]+",   # e.g., "Chapter One"
    r"^Part \d+",            # e.g., "Part 1"
    r"^Section \d+",         # e.g., "Section 1"
    r"^\d+\.\d+",            # e.g., "1.1"
    r"^\d+$",                # e.g., "1" (sometimes used for chapter names)
]

# Function to detect chapter titles based on patterns
def is_chapter_title(text):
    for pattern in chapter_patterns:
        if re.match(pattern, text.strip()):
            return True
    return False

# Function to extract chapters and write them to separate files
def extract_chapters(pdf_path):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables
    chapter_indices = []  # Store start page indices of chapters
    chapter_titles = []   # Store chapter titles
    current_chapter = None

    # Iterate over each page and detect chapter names
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        for block in text_blocks:
            block_text = block[4].strip()  # Extract the actual text from the block

            # Check if the block text matches any chapter patterns
            if is_chapter_title(block_text):
                chapter_indices.append(page_num)
                chapter_titles.append(block_text)
                print(f"Chapter detected: '{block_text}' on page {page_num + 1}")
                break  # Stop after detecting the first chapter title on a page

    # If no chapters were found, exit the function
    if not chapter_indices:
        print("No chapters detected. Exiting...")
        return

    # Add a final index to cover the last chapter to the end of the document
    chapter_indices.append(len(doc))

    # Write each chapter to a separate file
    for i in range(len(chapter_indices) - 1):
        start_page = chapter_indices[i]
        end_page = chapter_indices[i + 1]
        chapter_title = chapter_titles[i]

        # Extract text for the current chapter
        chapter_text = ""
        for j in range(start_page, end_page):
            page = doc.load_page(j)
            chapter_text += page.get_text("text")

        # Save the chapter to a separate file
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", chapter_title)  # Remove invalid filename chars
        chapter_filename = f"{output_dir}/{sanitized_title}_Chapter_{i+1}.txt"
        
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(chapter_text)

        print(f"Chapter '{chapter_title}' written to {chapter_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
extract_chapters(pdf_path)

Chapter detected: '3' on page 25
Chapter detected: '27' on page 49
Chapter detected: '69' on page 91
Chapter detected: '111' on page 133
Chapter detected: '151' on page 173
Chapter detected: '199' on page 221
Chapter detected: '221' on page 243
Chapter detected: '273' on page 295
Chapter detected: '321' on page 343
Chapter detected: 'Chapter 8 (see “Relying on Synchronized Clocks” on page 291) is another
attempt to introduce order into a disorderly world, for example to determine
which one of two writes happened later.' on page 361
Chapter detected: '389' on page 411
Chapter detected: '216.58.210.78 - - [27/Feb/2015:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
200 3377 "http://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115
Safari/537.36"' on page 413
Chapter detected: '439' on page 461
Chapter detected: '489' on page 511
Chapter detected: '232' on page 581
Chapter detected: '451' on page 582
Chapt

In [9]:
# version 4

import os
import re
import fitz  # PyMuPDF

# Define a list of patterns to identify chapter names
chapter_patterns = [
    r"^Chapter \d+",         # e.g., "Chapter 1"
    r"^Chapter [A-Za-z]+",   # e.g., "Chapter One"
    r"^Part \d+",            # e.g., "Part 1"
    r"^Section \d+",         # e.g., "Section 1"
    r"^\d+\.\d+",            # e.g., "1.1"
    r"^\d+$",                # e.g., "1" (sometimes used for chapter names)
]

# Function to detect chapter titles based on patterns
def is_chapter_title(text):
    for pattern in chapter_patterns:
        if re.match(pattern, text.strip()):
            return True
    return False

# Function to filter chapter candidates by analyzing font size and position
def is_valid_chapter_candidate(block, avg_font_size):
    block_text = block[4].strip()
    font_size = block[3]  # Font size is stored at index 3 in the block tuple
    bbox = block[:4]  # Bounding box of the text block: [x0, y0, x1, y1]
    y_position = bbox[1]  # Top y-coordinate (position on the page)

    # Conditions to consider it a valid chapter title:
    return (
        is_chapter_title(block_text) and
        font_size >= avg_font_size and  # Ensure the font size is larger than the average
        y_position < 200  # Chapter titles are usually positioned at the top
    )

# Function to extract chapters and write them to separate files
def extract_chapters(pdf_path):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables
    chapter_indices = []  # Store start page indices of chapters
    chapter_titles = []   # Store chapter titles
    current_chapter = None

    # Iterate over each page and detect chapter names
    font_sizes = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        # Gather font size statistics to calculate average
        for block in text_blocks:
            font_sizes.append(block[3])  # Font size is stored at index 3 in the block tuple

    avg_font_size = sum(font_sizes) / len(font_sizes)  # Calculate average font size

    # Re-check each page to identify chapters based on font size and patterns
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        for block in text_blocks:
            if is_valid_chapter_candidate(block, avg_font_size):
                chapter_indices.append(page_num)
                chapter_titles.append(block[4].strip())  # Store the cleaned chapter title
                print(f"Chapter detected: '{block[4].strip()}' on page {page_num + 1}")
                break  # Stop after detecting the first chapter title on a page

    # If no chapters were found, exit the function
    if not chapter_indices:
        print("No chapters detected. Exiting...")
        return

    # Add a final index to cover the last chapter to the end of the document
    chapter_indices.append(len(doc))

    # Write each chapter to a separate file
    for i in range(len(chapter_indices) - 1):
        start_page = chapter_indices[i]
        end_page = chapter_indices[i + 1]
        chapter_title = chapter_titles[i]

        # Extract text for the current chapter
        chapter_text = ""
        for j in range(start_page, end_page):
            page = doc.load_page(j)
            chapter_text += page.get_text("text")

        # Sanitize the chapter title to use as filename
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", chapter_title)  # Remove invalid filename chars
        chapter_filename = f"{output_dir}/{sanitized_title}_Chapter_{i+1}.txt"
        
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(chapter_text)

        print(f"Chapter '{chapter_title}' written to {chapter_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
extract_chapters(pdf_path)


No chapters detected. Exiting...


In [11]:
# version 5
import os
import re
import fitz  # PyMuPDF

# Define a list of patterns to identify chapter names
chapter_patterns = [
    r"^Chapter \d+",         # e.g., "Chapter 1"
    r"^Chapter [A-Za-z]+",   # e.g., "Chapter One"
    r"^Part \d+",            # e.g., "Part 1"
    r"^Section \d+",         # e.g., "Section 1"
    r"^\d+\.\d+",            # e.g., "1.1"
    r"^\d+$",                # e.g., "1" (sometimes used for chapter names)
]

# Function to detect chapter titles based on patterns
def is_chapter_title(text):
    for pattern in chapter_patterns:
        if re.match(pattern, text.strip()):
            return True
    return False

# Function to extract chapters and write them to separate files
def extract_chapters(pdf_path):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables
    chapter_indices = []  # Store start page indices of chapters
    chapter_titles = []   # Store chapter titles

    # Iterate over each page and detect chapter names
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        for block in text_blocks:
            block_text = block[4].strip()  # Extract the actual text from the block

            # Check if the block text matches any chapter patterns
            if is_chapter_title(block_text):
                chapter_indices.append(page_num)
                chapter_titles.append(block_text)
                print(f"Chapter detected: '{block_text}' on page {page_num + 1}")
                break  # Stop after detecting the first chapter title on a page

    # If no chapters were found, exit the function
    if not chapter_indices:
        print("No chapters detected. Exiting...")
        return

    # Add a final index to cover the last chapter to the end of the document
    chapter_indices.append(len(doc))

    # Write each chapter to a separate file
    for i in range(len(chapter_indices) - 1):
        start_page = chapter_indices[i]
        end_page = chapter_indices[i + 1]
        chapter_title = chapter_titles[i]

        # Extract text for the current chapter
        chapter_text = ""
        for j in range(start_page, end_page):
            page = doc.load_page(j)
            chapter_text += page.get_text("text")

        # Sanitize the chapter title to use as filename
        sanitized_title = re.sub(r'[\\/*?:"<>|]', "", chapter_title)  # Remove invalid filename chars
        chapter_filename = f"{output_dir}/{sanitized_title}_Chapter_{i+1}.txt"
        
        with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
            chapter_file.write(chapter_text)

        print(f"Chapter '{chapter_title}' written to {chapter_filename}")

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
extract_chapters(pdf_path)


Chapter detected: '3' on page 25
Chapter detected: '27' on page 49
Chapter detected: '69' on page 91
Chapter detected: '111' on page 133
Chapter detected: '151' on page 173
Chapter detected: '199' on page 221
Chapter detected: '221' on page 243
Chapter detected: '273' on page 295
Chapter detected: '321' on page 343
Chapter detected: 'Chapter 8 (see “Relying on Synchronized Clocks” on page 291) is another
attempt to introduce order into a disorderly world, for example to determine
which one of two writes happened later.' on page 361
Chapter detected: '389' on page 411
Chapter detected: '216.58.210.78 - - [27/Feb/2015:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
200 3377 "http://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115
Safari/537.36"' on page 413
Chapter detected: '439' on page 461
Chapter detected: '489' on page 511
Chapter detected: '232' on page 581
Chapter detected: '451' on page 582
Chapt

FileNotFoundError: [Errno 2] No such file or directory: './books/Designing Data Intensive Applications/Chapter 8 (see “Relying on Synchronized Clocks” on page 291) is another\nattempt to introduce order into a disorderly world, for example to determine\nwhich one of two writes happened later._Chapter_10.txt'

In [12]:
import os
import re
import fitz  # PyMuPDF

# Define a list of patterns to identify chapter names
chapter_patterns = [
    r"^Chapter \d+",         # e.g., "Chapter 1"
    r"^Chapter [A-Za-z]+",   # e.g., "Chapter One"
    r"^Part \d+",            # e.g., "Part 1"
    r"^Section \d+",         # e.g., "Section 1"
    r"^\d+\.\d+",            # e.g., "1.1"
    r"^\d+$",                # e.g., "1" (sometimes used for chapter names)
]

# Function to detect chapter titles based on patterns
def is_chapter_title(text):
    for pattern in chapter_patterns:
        if re.match(pattern, text.strip()):
            return True
    return False

# Function to sanitize and simplify chapter titles for filenames
def simplify_chapter_title(title, chapter_num):
    # Remove invalid filename characters
    sanitized_title = re.sub(r'[\\/*?:"<>|]', "", title)
    
    # Remove excessive details in parentheses or square brackets
    sanitized_title = re.sub(r'\(.*?\)|\[.*?\]', "", sanitized_title).strip()
    
    # Limit length to 50 characters for filename safety
    if len(sanitized_title) > 50:
        sanitized_title = sanitized_title[:50] + "..."
    
    # If the title is empty after sanitization, use a fallback title
    if not sanitized_title:
        return f"Chapter_{chapter_num}"
    
    return sanitized_title

# Function to extract chapters and write them to separate files
def extract_chapters(pdf_path):
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Get the book name and create output directory
    book_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"./books/{book_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Initialize variables
    chapter_indices = []  # Store start page indices of chapters
    chapter_titles = []   # Store chapter titles

    # Iterate over each page and detect chapter names
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text_blocks = page.get_text("blocks")

        for block in text_blocks:
            block_text = block[4].strip()  # Extract the actual text from the block

            # Check if the block text matches any chapter patterns
            if is_chapter_title(block_text):
                chapter_indices.append(page_num)
                chapter_titles.append(block_text)
                print(f"Chapter detected: '{block_text}' on page {page_num + 1}")
                break  # Stop after detecting the first chapter title on a page

    # If no chapters were found, exit the function
    if not chapter_indices:
        print("No chapters detected. Exiting...")
        return

    # Add a final index to cover the last chapter to the end of the document
    chapter_indices.append(len(doc))

    # Write each chapter to a separate file
    for i in range(len(chapter_indices) - 1):
        start_page = chapter_indices[i]
        end_page = chapter_indices[i + 1]
        chapter_title = chapter_titles[i]

        # Extract text for the current chapter
        chapter_text = ""
        for j in range(start_page, end_page):
            page = doc.load_page(j)
            chapter_text += page.get_text("text")

        # Skip chapters that are too short to be valid (less than 100 characters)
        if len(chapter_text) < 100:
            print(f"Skipping short chapter: '{chapter_title}'")
            continue

        # Generate a safe filename for the chapter
        sanitized_title = simplify_chapter_title(chapter_title, i+1)
        chapter_filename = f"{output_dir}/{sanitized_title}_Chapter_{i+1}.txt"
        
        # Write the chapter content to the file
        try:
            with open(chapter_filename, "w", encoding="utf-8") as chapter_file:
                chapter_file.write(chapter_text)
            print(f"Chapter '{chapter_title}' written to {chapter_filename}")
        except Exception as e:
            print(f"Error writing chapter '{chapter_title}': {e}")
            continue

# Example usage
pdf_path = "Designing Data Intensive Applications.pdf"
extract_chapters(pdf_path)


Chapter detected: '3' on page 25
Chapter detected: '27' on page 49
Chapter detected: '69' on page 91
Chapter detected: '111' on page 133
Chapter detected: '151' on page 173
Chapter detected: '199' on page 221
Chapter detected: '221' on page 243
Chapter detected: '273' on page 295
Chapter detected: '321' on page 343
Chapter detected: 'Chapter 8 (see “Relying on Synchronized Clocks” on page 291) is another
attempt to introduce order into a disorderly world, for example to determine
which one of two writes happened later.' on page 361
Chapter detected: '389' on page 411
Chapter detected: '216.58.210.78 - - [27/Feb/2015:17:55:11 +0000] "GET /css/typography.css HTTP/1.1"
200 3377 "http://martin.kleppmann.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X
10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115
Safari/537.36"' on page 413
Chapter detected: '439' on page 461
Chapter detected: '489' on page 511
Chapter detected: '232' on page 581
Chapter detected: '451' on page 582
Chapt

In [14]:
# Install required packages
!pip install PyPDF2

# Import necessary libraries
import PyPDF2
import json
import os

# Prompt user for the PDF name and book name
pdf_name = input("Enter the name of the PDF file (with extension, e.g., book.pdf): ")
book_name = input("Enter the book name to create a directory under 'books/': ")

# Define paths
output_directory = f"books/{book_name}"
json_output_file = os.path.join(output_directory, f"{book_name}_metadata.json")

# Create the directory if it does not exist
os.makedirs(output_directory, exist_ok=True)

# Initialize an empty structure to hold PDF metadata and outline
pdf_metadata = {
    "file_name": pdf_name,
    "metadata": {},
    "outline": []
}

try:
    # Open the PDF file
    with open(pdf_name, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        
        # Extract document information (metadata)
        pdf_metadata["metadata"] = reader.metadata
        
        # Extract outline (if available)
        if reader.outline:
            def parse_outline(outlines, depth=0):
                outline_structure = []
                for outline in outlines:
                    if isinstance(outline, list):
                        outline_structure.append(parse_outline(outline, depth + 1))
                    else:
                        # Capture title and the destination (page number)
                        outline_structure.append({
                            "title": outline.title,
                            "page": reader.get_destination_page_number(outline)
                        })
                return outline_structure

            pdf_metadata["outline"] = parse_outline(reader.outline)

        # Write metadata to a JSON file
        with open(json_output_file, 'w') as json_file:
            json.dump(pdf_metadata, json_file, indent=4)

        print(f"Metadata extracted and saved to {json_output_file}")

except FileNotFoundError:
    print(f"File '{pdf_name}' not found. Please check the file path and name.")
except Exception as e:
    print(f"An error occurred: {e}")

Metadata extracted and saved to books/Designing Data Intensive Applications\Designing Data Intensive Applications_metadata.json


In [None]:
Designing Data Intensive Applications.pdf

In [25]:
# Install required packages
!pip install PyPDF2

# Import necessary libraries
import PyPDF2
import os
import re

# Prompt user for the PDF name
pdf_name = input("Enter the name of the PDF file (with extension, e.g., book.pdf): ")

# Define the book name based on the PDF file name (without extension)
book_name = os.path.splitext(os.path.basename(pdf_name))[0]

# Define the output directory for the extracted chapters
output_directory = f"books/{book_name}"
os.makedirs(output_directory, exist_ok=True)

try:
    # Open the PDF file
    with open(pdf_name, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Extract outline (if available)
        outline = reader.outline

        # Helper function to extract text by page range
        def extract_text(start_page, end_page):
            """Extracts text from start_page to end_page (exclusive)."""
            text = ""
            for page_num in range(start_page, end_page):
                page = reader.pages[page_num]
                text += page.extract_text() or ""  # Handle cases where extract_text returns None
            return text

        # Helper function to clean up the text
        def clean_text(text):
            """Removes page numbers, footers, references, and common unwanted patterns."""
            # Define a pattern for footers with page numbers and chapter titles (e.g., "14 | Chapter 1: Title")
            footer_pattern_1 = re.compile(r'^\s*\d+\s*\|\s*Chapter \d+:.*$', re.MULTILINE)

            # Define a pattern for footers like "Scalability | 13"
            footer_pattern_2 = re.compile(r'^\s*[a-zA-Z\s]+\s*\|\s*\d+\s*$', re.MULTILINE)

            # Add a refined pattern to catch more variations like "Scalability | 13"
            footer_pattern_3 = re.compile(r'^[A-Za-z\s]+(\s*[\|\-]\s*)\d+\s*$', re.MULTILINE)

            # Additional refined pattern to catch inline footers (e.g., within paragraphs)
            # Match patterns like "Scalability | 13" if followed by a newline or paragraph end
            inline_footer_pattern = re.compile(r'([a-zA-Z\s]+)\s*\|\s*\d+(?=\s*[\n\r])')

            # Remove common footers using all defined patterns
            text = re.sub(footer_pattern_1, '', text)
            text = re.sub(footer_pattern_2, '', text)
            text = re.sub(footer_pattern_3, '', text)

            # Remove inline footers that appear within paragraphs
            text = re.sub(inline_footer_pattern, '', text)

            # Remove page numbers at the beginning or end of lines
            text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

            # Remove entire reference sections starting with "References" or "Bibliography"
            reference_section_pattern = re.compile(r'(References|Bibliography)\s*\n.*?(?=\n\n|$)', re.DOTALL)
            text = re.sub(reference_section_pattern, '', text)

            # Remove specific bracketed reference patterns like [1], [ 2 ], [2 ], etc.
            bracketed_reference_pattern = re.compile(r'\[\s*\d+\s*\]')
            text = re.sub(bracketed_reference_pattern, '', text)

            # Handle cases with multiple consecutive references like [1][2][3] or [ 1 ] [ 2 ] [ 3 ]
            multiple_references_pattern = re.compile(r'(\s*\[\s*\d+\s*\]){2,}')
            text = re.sub(multiple_references_pattern, '', text)

            # Normalize multiple consecutive newlines to a single newline
            text = re.sub(r'\n+', '\n', text)

            # Remove leading/trailing whitespace
            return text.strip()

        # Parse the outline into a flat list of chapters with their starting pages
        def parse_outline(outlines, reader):
            chapter_structure = []
            chapter_pattern = re.compile(r'Chapter \d+|^Ch\.\s?\d+', re.IGNORECASE)  # Patterns for typical chapter titles
            for item in outlines:
                if isinstance(item, list):
                    chapter_structure.extend(parse_outline(item, reader))
                else:
                    title = item.title.strip()
                    # Consider only valid chapter entries
                    if chapter_pattern.search(title):
                        chapter_structure.append({
                            "title": title,
                            "start_page": reader.get_destination_page_number(item)
                        })
            return chapter_structure

        # Parse the outline into a flat list of chapters with their starting pages
        chapters = parse_outline(outline, reader)

        if not chapters:
            raise ValueError("No chapters found in the outline. Ensure the PDF has a structured outline.")

        # Loop through chapters to extract content and save to text files
        for i in range(len(chapters)):
            chapter_title = chapters[i]["title"]
            start_page = chapters[i]["start_page"]
            end_page = chapters[i + 1]["start_page"] if i + 1 < len(chapters) else len(reader.pages)

            # Extract chapter text and clean it up
            chapter_text = extract_text(start_page, end_page)
            chapter_text = clean_text(chapter_text)

            # Skip empty chapters
            if not chapter_text:
                continue

            # Define chapter filename and save the content
            safe_title = re.sub(r'[^a-zA-Z0-9 ]', '', chapter_title)  # Remove special characters
            chapter_filename = f"{output_directory}/{safe_title or f'Chapter_{i + 1}'}.txt"

            # Prepend book name at the top of the chapter content
            chapter_header = f"Book name: {book_name}\n\n"
            chapter_content = chapter_header + chapter_text

            # Write to file
            with open(chapter_filename, 'w', encoding='utf-8') as chapter_file:
                chapter_file.write(chapter_content)

            print(f"Chapter '{chapter_title}' extracted to '{chapter_filename}'")

except FileNotFoundError:
    print(f"File '{pdf_name}' not found. Please check the file path and name.")
except ValueError as ve:
    print(ve)
except Exception as e:
    print(f"An error occurred: {e}")


Chapter 'Chapter 1. Reliable, Scalable, and Maintainable Applications' extracted to 'books/Designing Data Intensive Applications/Chapter 1 Reliable Scalable and Maintainable Applications.txt'
Chapter 'Chapter 2. Data Models and Query Languages' extracted to 'books/Designing Data Intensive Applications/Chapter 2 Data Models and Query Languages.txt'
Chapter 'Chapter 3. Storage and Retrieval' extracted to 'books/Designing Data Intensive Applications/Chapter 3 Storage and Retrieval.txt'
Chapter 'Chapter 4. Encoding and Evolution' extracted to 'books/Designing Data Intensive Applications/Chapter 4 Encoding and Evolution.txt'
Chapter 'Chapter 5. Replication' extracted to 'books/Designing Data Intensive Applications/Chapter 5 Replication.txt'
Chapter 'Chapter 6. Partitioning' extracted to 'books/Designing Data Intensive Applications/Chapter 6 Partitioning.txt'
Chapter 'Chapter 7. Transactions' extracted to 'books/Designing Data Intensive Applications/Chapter 7 Transactions.txt'
Chapter 'Chapt