# Gutenberg to Audio with GPT-4o

This notebook converts Project Gutenberg books to high-quality audiobooks using OpenAI's GPT-4o and TTS models. The process includes:

1. Downloading and cleaning text from Project Gutenberg
2. Analyzing the text for TTS-specific issues (abbreviations, formatting, etc.)
3. Processing the text to make it suitable for text-to-speech
4. Generating audio using OpenAI's TTS models
5. Quality control and concatenation of audio files

## Requirements
- OpenAI API key configured in your environment
- Python libraries: requests, os, re, json, math, openai, pydub
- Sufficient OpenAI API credits for text processing and audio generation

## Usage
- Set the `BOOK_ID` variable to your desired Project Gutenberg book ID
- Adjust chapter ranges with `CHAPTER_START` and `CHAPTER_STOP` variables
- Run cells sequentially to process the book

## Download and Clean Text from Project Gutenberg

This cell downloads a book from Project Gutenberg using the specified `BOOK_ID` and cleans the text for processing.

The cleaning process:
- Removes Project Gutenberg header and footer content
- Removes footnotes, page numbers, and other non-narrative elements
- Normalizes formatting, including dashes and whitespace
- Processes line breaks to create a flowing text

**Note:** Modify the `BOOK_ID` variable to download a different book. You can find book IDs in Project Gutenberg URLs.


In [9]:
import requests
import os
import re

BOOK_ID = "46468"

# Define start and end strings for cleaning
startstrings = (
    '*** START OF THE PROJECT GUTENBERG EBOOK',
    '***START OF THE PROJECT GUTENBERG EBOOK',
    '*** START OF THIS PROJECT GUTENBERG EBOOK',
    ' *** START OF THIS PROJECT GUTENBERG EBOOK',
    'START OF THIS PROJECT GUTENBERG EBOOK',
    '*** START OF PROJECT GUTENBERG EBOOK',
    '*** START OF THE PROJECT GUTENBERG ETEXT',
    '*END THE SMALL PRINT!',
    '*END*THE SMALL PRINT!',
    '**END THE SMALL PRINT!',
    '*SMALL PRINT! Ver',
    '**The Project Gutenberg Etext',
    '*****These eBooks Were Prepared By Thousands of Volunteers'
)

endstrings = (
    '*** END OF THE PROJECT GUTENBERG EBOOK',
    '***END OF THE PROJECT GUTENBERG EBOOK',
    'End of the Project Gutenberg EBook',
    '*** END OF THIS PROJECT GUTENBERG EBOOK',
    'END OF PROJECT GUTENBERG ETEXT',
    'End of The Project Gutenberg Etext',
    'End of the Project Gutenberg etext',
    'End of Project Gutenberg Etext',
    "End of Project Gutenberg's Etext",
    ' *** END OF THIS PROJECT GUTENBERG EBOOK',
    '      *** END OF THIS PROJECT GUTENBERG EBOOK',
    '       *** END OF THIS PROJECT GUTENBERG EBOOK',
    'End of this Etext',
    'End of this Project Gutenberg Etext',
    'End of the Project Gutenberg Etext',
    'End of The Project Gutenberg EBook',
    '*** START: FULL LICENSE ***'
)

url = f"https://www.gutenberg.org/cache/epub/{BOOK_ID}/pg{BOOK_ID}.txt"
response = requests.get(url)

# Clean the text by removing content before start string and after end string
def clean_gutenberg_text(text, startstrings, endstrings):
    # Find the position of the start string
    start_pos = len(text)
    for start_str in startstrings:
        pos = text.find(start_str)
        if pos != -1 and pos < start_pos:
            start_pos = pos

    # Find the position of the end string
    end_pos = -1
    for end_str in endstrings:
        pos = text.find(end_str)
        if pos != -1 and (end_pos == -1 or pos > end_pos):
            end_pos = pos

    # Extract the text between start and end positions
    if start_pos < len(text) and end_pos > start_pos:
        # Find the end of the line containing the start string
        start_line_end = text.find('\n', start_pos)
        if start_line_end != -1:
            start_pos = start_line_end + 1

        # Clean text is everything after the start string line and before the end string
        raw_text = text[start_pos:end_pos].strip()

        # Remove paragraphs starting with [digit]
        raw_text = re.sub(r'(?m)^\[\d+\].*?(?=\n\n|\Z)', '', raw_text)

        # Remove text in square brackets
        raw_text = re.sub(r'\[.*?\]', '', raw_text, flags=re.DOTALL)

        # Replace double-dashes with a single en-dash, ensuring spaces before and after
        raw_text = re.sub(r'(\S)--(\S)', r'\1 – \2', raw_text)  # No spaces on either side
        raw_text = re.sub(r'(\S)--\s', r'\1 – ', raw_text)      # No space before, space after
        raw_text = re.sub(r'\s--(\S)', r' – \1', raw_text)      # Space before, no space after
        raw_text = re.sub(r'\s--\s', r' – ', raw_text)          # Spaces on both sides

        # Remove whitespace and invisible characters at the beginning and end of each line
        # Filter out lines that contain only visible, non-alphanumeric characters
        clean_lines = []
        for line in raw_text.splitlines():
            line = line.strip()

            # Keep the line if:
            # 1. It contains at least one alphanumeric character, OR
            # 2. It's completely empty
            # Remove the line if it contains only visible, non-alphanumeric characters
            if re.search(r'[a-zA-Z0-9]', line) or not line:
                clean_lines.append(line)
            else:
                # Check if line has only visible, non-alphanumeric characters
                visible_special_chars_only = True
                for char in line:
                    if char.isalnum() or not char.isprintable():
                        visible_special_chars_only = False
                        break

                # If the line doesn't consist only of visible special characters, keep it
                if not visible_special_chars_only:
                    clean_lines.append(line)

        # Join lines with newlines
        text_with_clean_lines = '\n'.join(clean_lines)

        # Process line breaks according to requirements
        # First, normalize all line breaks to a standard form
        normalized_text = re.sub(r'\n{3,}', '\n\n\n', text_with_clean_lines)  # Temporarily convert 3+ newlines to 3

        # Replace single newlines with spaces
        normalized_text = re.sub(r'([^\n])\n([^\n])', r'\1 \2', normalized_text)

        # Replace triple newlines with double newlines
        normalized_text = re.sub(r'\n\n\n', '\n\n', normalized_text)

        return normalized_text
    else:
        # If no start or end string found, apply the same cleaning to the original text
        # Remove paragraphs starting with [digit]
        text = re.sub(r'(?m)^\[\d+\].*?(?=\n\n|\Z)', '', text)

        # Remove text in square brackets
        text = re.sub(r'\[.*?\]', '', text)

        # Replace double-dashes with a single en-dash, ensuring spaces before and after
        text = re.sub(r'(\S)--(\S)', r'\1 – \2', text)  # No spaces on either side
        text = re.sub(r'(\S)--\s', r'\1 – ', text)      # No space before, space after
        text = re.sub(r'\s--(\S)', r' – \1', text)      # Space before, no space after
        text = re.sub(r'\s--\s', r' – ', text)          # Spaces on both sides

        # add the <chapter> Tag one line before each chapter headline (Text starts with "CHAPTER")
        text = text.replace("\nCHAPTER", "<chapter>\nCHAPTER")
        # remove repeating <chapter> tags (2 or more)
        text = text.replace("<chapter><chapter>", "<chapter>")
        text = text.replace("<chapter><chapter>", "<chapter>")
        text = text.replace("<chapter><chapter>", "<chapter>")

        clean_lines = []
        for line in text.splitlines():
            line = line.strip()

            # Same logic as above
            if re.search(r'[a-zA-Z0-9]', line) or not line:
                clean_lines.append(line)
            else:
                # Check if line has only visible, non-alphanumeric characters
                visible_special_chars_only = True
                for char in line:
                    if char.isalnum() or not char.isprintable():
                        visible_special_chars_only = False
                        break

                # If the line doesn't consist only of visible special characters, keep it
                if not visible_special_chars_only:
                    clean_lines.append(line)

        text_with_clean_lines = '\n'.join(clean_lines)

        # Process line breaks
        normalized_text = re.sub(r'\n{3,}', '\n\n\n', text_with_clean_lines)
        normalized_text = re.sub(r'([^\n])\n([^\n])', r'\1 \2', normalized_text)
        normalized_text = re.sub(r'\n\n\n', '\n\n', normalized_text)

        return normalized_text


# Clean the text
cleaned_text = clean_gutenberg_text(response.text, startstrings, endstrings)

# Ensure the books directory exists
if not os.path.exists("books"):
    os.makedirs("books")
    print("Created 'books' directory")

# Save the text to a file in the "books" folder
if not os.path.exists(f"books/{BOOK_ID}"):
    os.makedirs(f"books/{BOOK_ID}")

# Save cleaned text
with open(f"books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt", "w") as f:
    f.write(cleaned_text)


## Analyze Text for TTS Preparation

This cell uses OpenAI's o3-mini model to analyze the text and identify elements that need special handling for text-to-speech:

- Abbreviations that should be spelled out
- Outdated terms or place names that need modernization
- Currency expressions and how they should be pronounced
- Special formatting that needs conversion
- Punctuation or number formatting issues
- Outdated units of measurement

The text is processed in chunks to stay within API limits, with results saved as JSON files.


In [11]:
import json
import os
import math
import requests
from openai import OpenAI

# Load the cleaned text
with open(f"books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt", "r") as f:
    cleaned_text = f.read()

# Function to split text into chunks of max_size characters
def split_text_into_chunks(text, max_size=50000):
    # If text is shorter than max_size, return it as a single chunk
    if len(text) <= max_size:
        return [text]

    chunks = []
    # Find a good splitting point (at paragraph breaks if possible)
    for i in range(0, len(text), max_size):
        if i + max_size >= len(text):
            chunks.append(text[i:])
        else:
            # Try to find a paragraph break near the max_size point
            split_point = text.rfind('\n\n', i, i + max_size)
            if split_point == -1 or split_point < i + max_size // 2:
                # If no good paragraph break, find the nearest sentence end
                split_point = text.rfind('. ', i, i + max_size)
                if split_point == -1 or split_point < i + max_size // 2:
                    # If no good sentence break either, just split at max_size
                    split_point = i + max_size
                else:
                    split_point += 2  # Include the period and space
            else:
                split_point += 2  # Include the paragraph break

            chunks.append(text[i:split_point])

    return chunks

# Function to get hints from o3-mini for a text chunk
def get_tts_hints(text_chunk, client):
    system_prompt = """You are an expert in preparing text for text-to-speech processing."""

    user_prompt = f"""I want you to analyze the following text (a chapter from a book) and give me hints on how it can be made suitable for text-to-speech.

Please create a list of hints that could cover the following aspects:
1. What abbreviations appear and how should they be spelled out?
2. What outdated terms, place names, or currencies that are probably no longer understandable today appear and how could they be translated into modern language? Only such terms should be modernized that are no longer understandable or can no longer be classified for today's readers.
3. For currency conversions: How are they pronounced?
4. What special formatting (e.g. tables, lists) appears and how should it be converted into readable text?
5. Are there any peculiarities in punctuation or number formatting that could be problematic for TTS?
6. Are there outdated units of measurement and how can they be converted into metric units?

Your hints will later be used to prepare the text uniformly for text-to-speech. Only return the hints, no explanations or comments. Only give hints on aspects that are relevant in this chapter. Leave out everything that is irrelevant for this chapter. Be brief and do not give explanations or comments. Only mention the replacements that should be made.

{text_chunk}"""

    response_format_prompt = """Please only respond with a one-dimensional list of hints in JSON format, without any additional text. It should only be a simple array of strings. Only consider those aspects that are relevant in this chapter. Be brief and do not give explanations or comments. Only mention the replacements that should be made. Very important: Only consider those aspects that are relevant in this chapter. All other aspects should not be mentioned."""

    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": response_format_prompt}
        ],
        max_completion_tokens=20000,
        reasoning_effort="high"
    )

    # Extract the JSON response
    try:
        hints = json.loads(response.choices[0].message.content)
        return hints
    except json.JSONDecodeError:
        print("Error: Could not parse JSON response")
        print(response.choices[0].message.content)
        return {"hints": []}

# Process the text in chunks and save results
def process_text_for_tts(text, book_id):
    # Create the OpenAI client
    client = OpenAI()

    # Split the text into chunks
    chunks = split_text_into_chunks(text)
    print(f"Split text into {len(chunks)} chunks")

    # Create directory for JSON files if it doesn't exist
    json_dir = f"books/{book_id}/hints"
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)

    # Process each chunk and save results
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}")
        hints = get_tts_hints(chunk, client)

        # Save hints to JSON file
        with open(f"{json_dir}/hints_{i:03d}.json", "w") as f:
            json.dump(hints, f, indent=2)

        print(f"Saved hints for chunk {i+1} to {json_dir}/hints_{i:03d}.json")

# Run the processing
process_text_for_tts(cleaned_text, BOOK_ID)

Split text into 6 chunks
Processing chunk 1/6
Saved hints for chunk 1 to books/46468/json/hints_000.json
Processing chunk 2/6
Saved hints for chunk 2 to books/46468/json/hints_001.json
Processing chunk 3/6
Saved hints for chunk 3 to books/46468/json/hints_002.json
Processing chunk 4/6
Saved hints for chunk 4 to books/46468/json/hints_003.json
Processing chunk 5/6
Saved hints for chunk 5 to books/46468/json/hints_004.json
Processing chunk 6/6
Saved hints for chunk 6 to books/46468/json/hints_005.json


## Consolidate TTS Preparation Hints

This cell combines all the hints generated from individual text chunks into a single, deduplicated list.

The consolidated hints will be used to guide the text-to-speech preparation process, ensuring consistent handling of abbreviations, terms, and formatting throughout the book.

The final list is saved as `hints.json` in the book's directory.


In [14]:
def consolidate_hints(book_id):
    """
    Consolidate all hints from individual JSON files into a single, deduplicated list.
    Then ask o3-mini to summarize and organize these hints into a simple array.
    """
    # Path to the JSON directory
    json_dir = f"books/{book_id}/hints"

    # Get all hint files
    hint_files = [f for f in os.listdir(json_dir) if f.startswith("hints_") and f.endswith(".json")]
    hint_files.sort()  # Sort to process in order

    # Collect all hints
    all_hints = []
    for file in hint_files:
        with open(os.path.join(json_dir, file), "r") as f:
            try:
                data = json.load(f)
                # Handle different possible structures
                if isinstance(data, list):
                    all_hints.extend(data)
                elif isinstance(data, dict) and "hints" in data:
                    all_hints.extend(data["hints"])
                elif any(key in data for key in
                         ["abbreviations", "terms", "currencies", "formatting", "punctuation", "units"]):
                    # If it's a structured format, flatten it
                    for category in data.values():
                        if isinstance(category, list):
                            all_hints.extend(category)
            except json.JSONDecodeError:
                print(f"Error: Could not parse JSON in {file}")

    # Remove duplicates while preserving order
    unique_hints = []
    for hint in all_hints:
        if hint not in unique_hints and hint.strip():  # Skip empty hints
            unique_hints.append(hint)

    print(f"Collected {len(all_hints)} hints, {len(unique_hints)} after deduplication")

    # Ask o3-mini to organize and summarize the hints
    client = OpenAI()

    system_prompt = """You are an expert in preparing text for text-to-speech processing."""

    user_prompt = f"""I have collected the following hints for text-to-speech preparation from different parts of a book:

{json.dumps(unique_hints, indent=2)}

Please consolidate these hints into a comprehensive, non-redundant list of replacements that apply to the entire book.

Important requirements:
1. Only include replacements that should be applied throughout the entire book
2. Do NOT include spelling corrections. Do not replace dashes (–) or apostrophes.
3. Focus only on:
   - Abbreviations and how they should be spelled out
   - Outdated terms/place names/currencies and their modern equivalents
   - How currencies should be pronounced
   - Special formatting instructions
   - Punctuation or number formatting issues
   - Outdated units of measurement and their metric conversions

Each hint should be a clear instruction for replacement, like "Replace 'Dr.' with 'Doctor'" or "Convert '£5' to 'five pounds'".
"""

    response_format_prompt = """Please respond ONLY with a one-dimensional array (simple list) of strings in JSON format.
Do NOT use an associative array or object with categories.
The response should be a flat list like:
["Replace 'Dr.' with 'Doctor'", "Replace 'Mr.' with 'Mister'", ...]

Very important: Do not include any explanations or comments outside the JSON array. The response must be valid JSON that can be parsed directly."""

    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": response_format_prompt}
        ],
        max_completion_tokens=20000,
        reasoning_effort="high"
    )

    # Extract and save the consolidated hints
    try:
        consolidated_hints = json.loads(response.choices[0].message.content)

        # Verify it's a simple list
        if not isinstance(consolidated_hints, list):
            print("Warning: Response is not a simple list. Converting to list format.")
            # If it's not a list, try to extract a list from it
            if isinstance(consolidated_hints, dict):
                temp_list = []
                for category in consolidated_hints.values():
                    if isinstance(category, list):
                        temp_list.extend(category)
                consolidated_hints = temp_list
            else:
                consolidated_hints = [str(consolidated_hints)]

        # Save consolidated hints
        with open(f"{json_dir}/hints.json", "w") as f:
            json.dump(consolidated_hints, f, indent=2)

        print(f"Saved consolidated hints to {json_dir}/hints.json")
        return consolidated_hints
    except json.JSONDecodeError:
        print("Error: Could not parse JSON response")
        print(response.choices[0].message.content)
        return {"error": "Failed to consolidate hints"}


# Run the consolidation after processing all chunks
consolidated_hints = consolidate_hints(BOOK_ID)


Collected 56 hints, 56 after deduplication
Saved consolidated hints to books/46468/hints/hints.json


## Manual Text Preparation (Required)

**IMPORTANT:** Before running the text processing cells, you must manually edit the cleaned Gutenberg text file to:

1. **Remove irrelevant content:**
   - This only requires rough cleaning at the beginning and end of the document
   - Focus on obvious text blocks that shouldn't be read aloud
   - Table of contents
   - Publishing information
   - Appendices
   - Indexes
   - Any other non-narrative content at the beginning or end


2. **Insert chapter markers:**
   - Add `<chapter>` tags on a line by themselves immediately before each chapter heading (if not already present)
   - Example:
     ```
     <chapter>
     CHAPTER I. The Beginning

     It was a dark and stormy night...
     ```
   - Do NOT place a `<chapter>` tag at the very beginning or end of the document
   - Only place tags between actual chapters

This manual step typically takes about 5 minutes and is necessary because:
1. Large books cannot be processed by language models in their entirety
2. Automated detection of chapter boundaries is unreliable across different book formats
3. Non-narrative content can negatively affect the audio quality and listening experience

**What NOT to manually edit** (these will be handled automatically):
- Footnotes and endnotes
- Abbreviations (e.g., "Dr.", "Mr.", etc.)
- Numbers that need to be spelled out
- Formatting issues like dashes or quotation marks
- Special characters or symbols
- Currency expressions

The file to edit is located at: `books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt`


## Process Text for Text-to-Speech

This cell applies the consolidated hints to prepare the text for text-to-speech conversion.

The process:
1. Splits the book into chapters and sections
2. Processes each section with o3-mini to:
   - Spell out abbreviations
   - Convert numbers to words
   - Apply specific replacements from the hints
   - Remove elements unsuitable for reading aloud

You can control which chapters to process by modifying `CHAPTER_START` and `CHAPTER_STOP` variables.

Processed text is saved in the `books/{BOOK_ID}/txt/` directory.


In [27]:
import re

CHAPTER_START = 0
CHAPTER_STOP = 99

# Load the cleaned text
with open(f"books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt", "r") as f:
    cleaned_text = f.read()

# Load the consolidated hints
with open(f"books/{BOOK_ID}/hints/hints.json", "r") as f:
    import json

    hints = json.load(f)


# Function to split text into chapters based on <chapter> tags
def split_into_chapters(text):
    # Split by <chapter> tag
    chapters = re.split(r'<chapter>', text)
    # Remove empty chapters
    chapters = [chapter.strip() for chapter in chapters if chapter.strip()]
    return chapters


# Function to split chapter into sections of max 3500 characters
def split_chapter_into_sections(chapter_text, max_chars=3500):
    sections = []
    paragraphs = chapter_text.split('\n\n')
    current_section = ""

    for paragraph in paragraphs:
        # If adding this paragraph would exceed the limit, save current section and start a new one
        if len(current_section + paragraph) > max_chars and current_section:
            sections.append(current_section.strip())
            current_section = paragraph + "\n\n"
        else:
            current_section += paragraph + "\n\n"

    # Add the last section if it's not empty
    if current_section.strip():
        sections.append(current_section.strip())

    return sections


# Function to process a section with o3-mini
def process_section_with_o3mini(section_text, hints, client):
    # Create a string with all the hints
    hints_text = "\n".join(hints)

    system_prompt = """You are an expert in preparing text for text-to-speech processing. Your task is to convert text to be suitable for reading aloud. Return ONLY the converted text without any comments, explanations, or additional formatting."""

    user_prompt = f"""I want you to convert the following text (a chapter from a book) so that it is suitable for text-to-speech.
It should be able to be read aloud unambiguously and no longer contain abbreviations.
So it should be a readable text flow. All numbers should be converted to words. All abbreviations should be spelled out. If there is a year like 1995, it should be converted to 'nineteen ninety-five'.

Additionally, please remove any elements that are not suitable for reading aloud, such as:
- Footnotes and endnotes
- Page numbers
- Bibliographic references
- Table of contents entries
- Image captions or figure references
- Any metadata or formatting instructions
- Special characters or symbols that are not commonly used in written text. For example Markdown syntax or HTML tags.

Please apply the following specific replacements and guidelines:
{hints_text}

Here is the text to convert:
{section_text}"""

    response_format_prompt = """IMPORTANT: Please respond ONLY with the converted text itself. Do not include any explanations, comments, or formatting instructions. Do not wrap the text in quotes or code blocks. Just return the plain converted text that could be read aloud."""

    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": response_format_prompt}
        ],
        max_completion_tokens=20000,
        reasoning_effort="high"
    )

    return response.choices[0].message.content


# Main processing function
def process_book_for_tts(book_id, cleaned_text, hints, chapter_start=0, chapter_stop=99):
    # Create OpenAI client
    client = OpenAI()

    # Create directory for processed sections
    output_dir = f"books/{book_id}/txt"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Split text into chapters
    all_chapters = split_into_chapters(cleaned_text)
    print(f"Found {len(all_chapters)} chapters in total")

    # Filter chapters based on start and stop indices
    chapter_stop = min(chapter_stop, len(all_chapters) - 1)  # Ensure chapter_stop doesn't exceed available chapters
    chapters_to_process = all_chapters[chapter_start:chapter_stop + 1]  # +1 because we want to include chapter_stop

    print(f"Processing chapters {chapter_start} to {chapter_stop} ({len(chapters_to_process)} chapters)")

    # Process each chapter
    for relative_idx, chapter in enumerate(chapters_to_process):
        chapter_idx = chapter_start + relative_idx
        print(f"Processing chapter {chapter_idx} ({relative_idx+1}/{len(chapters_to_process)})")

        # Split chapter into sections
        sections = split_chapter_into_sections(chapter)
        print(f"  Split chapter {chapter_idx} into {len(sections)} sections")

        # Process each section
        for section_idx, section in enumerate(sections):
            print(f"  Processing section {section_idx + 1}/{len(sections)} of chapter {chapter_idx}")

            # Process section with o3-mini
            processed_section = process_section_with_o3mini(section, hints, client)

            # Save processed section as plain text
            output_file = f"{output_dir}/clean_text_{chapter_idx:03d}_{section_idx:03d}.txt"
            with open(output_file, "w") as f:
                f.write(processed_section)

            print(f"  Saved processed section to {output_file}")


# Run the processing
process_book_for_tts(BOOK_ID, cleaned_text, hints, CHAPTER_START, CHAPTER_STOP)


Found 13 chapters in total
Processing chapters 9 to 12 (4 chapters)
Processing chapter 9 (1/4)
  Split chapter 10 into 8 sections
  Processing section 1/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_000.txt
  Processing section 2/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_001.txt
  Processing section 3/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_002.txt
  Processing section 4/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_003.txt
  Processing section 5/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_004.txt
  Processing section 6/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_005.txt
  Processing section 7/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_006.txt
  Processing section 8/8 of chapter 10
  Saved processed section to books/46468/txt/clean_text_009_007.txt
Processing cha

## Generate Narration Style Guidelines

This cell analyzes a sample of the processed text to generate narration style guidelines for the text-to-speech system.

The guidelines cover:
- Tone: Overall vocal quality
- Pacing: Reading speed and variations
- Pronunciation: Specific enunciation guidance
- Emotion: Emotional qualities to convey
- Inflection: Pitch, emphasis, and vocal variation
- Word Choice: Vocabulary considerations

These guidelines help the TTS system produce more natural and appropriate narration for the book's style and content.

In [18]:

def generate_reader_hints(book_id):
    # Create OpenAI client
    client = OpenAI()

    # Create txt directory if it doesn't exist
    output_dir = f"books/{book_id}/txt"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Find all processed text files
    txt_files = [f for f in os.listdir(output_dir) if f.startswith("clean_text_") and f.endswith(".txt")]
    txt_files.sort()

    # Determine a middle chapter
    if not txt_files:
        print("No processed text files found. Please process chapters first.")
        return

    # Extract chapter numbers from filenames
    chapter_numbers = sorted(list(set([int(f.split('_')[2]) for f in txt_files])))

    if not chapter_numbers:
        print("Could not determine chapter numbers from filenames.")
        return

    # Select a middle chapter
    middle_chapter = chapter_numbers[len(chapter_numbers) // 2]
    print(f"Selected middle chapter: {middle_chapter}")

    # Find the first section of the middle chapter
    first_section_file = None
    for f in txt_files:
        if f.startswith(f"clean_text_{middle_chapter:03d}_000"):
            first_section_file = f
            break

    if not first_section_file:
        # Try to find any section from the middle chapter
        for f in txt_files:
            if f.startswith(f"clean_text_{middle_chapter:03d}_"):
                first_section_file = f
                break

    if not first_section_file:
        print(f"No sections found for chapter {middle_chapter}.")
        return

    print(f"Using file: {first_section_file}")

    # Load the text from the file
    with open(os.path.join(output_dir, first_section_file), "r") as f:
        section_text = f.read()

    # Create prompt for o3-mini
    system_prompt = """You are an expert in providing guidance for audiobook narrators. Your task is to analyze a text sample and provide specific, detailed instructions for how it should be read aloud."""

    user_prompt = f"""Please analyze the following text sample from a book and provide detailed guidance for a narrator who will be reading this aloud to an educated adult audience.

Your guidance should cover the following aspects:
1. Tone: The overall vocal quality and character that should be used
2. Pacing: How quickly or slowly the text should be read, including any variations
3. Pronunciation: Any specific guidance on how words should be enunciated
4. Emotion: The emotional quality that should be conveyed
5. Inflection: Guidance on pitch, emphasis, and vocal variation
6. Word Choice: Any specific vocabulary considerations

Please be specific and detailed in your guidance, tailoring it to the style and content of the text. Think about what would appeal to an educated adult audience.

Here is the text sample:
{section_text}"""

    response_format_prompt = """Format your response as follows:

Tone: [detailed description]

Pacing: [detailed description]

Pronunciation: [detailed description]

Emotion: [detailed description]

Inflection: [detailed description]

Word Choice: [detailed description]

IMPORTANT: Provide only these six sections with their descriptions. Do not include any additional comments, explanations, or introductory text."""

    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "user", "content": response_format_prompt}
        ],
        max_completion_tokens=20000,
        reasoning_effort="high"
    )

    reader_hints = response.choices[0].message.content

    # Save the reader hints
    with open(f"{output_dir}/reader_hints.txt", "w") as f:
        f.write(reader_hints)

    print(f"Saved reader hints to {output_dir}/reader_hints.txt")
    return reader_hints


# Generate reader hints
reader_hints = generate_reader_hints(BOOK_ID)
print("\nReader Hints Preview:")
print("---------------------")
print(reader_hints[:500] + "..." if len(reader_hints) > 500 else reader_hints)


Selected middle chapter: 4
Using file: clean_text_004_000.txt
Saved reader hints to books/46468/txt/reader_hints.txt

Reader Hints Preview:
---------------------
Tone: Adopt a clear, authoritative tone with a measured formality that conveys the historical weight and vivid detail of the narrative. The overall quality should be confident and slightly reserved, inviting the listener into an account of exploration and cultural encounter without losing the narrative’s inherent warmth.

Pacing: Read the text at a moderate pace that allows the listener to absorb the intricate descriptions and historical nuances. Slow down during passages that describe detailed ...


## Generate Audio with OpenAI TTS

This cell converts the processed text to speech using OpenAI's TTS models.

Features:
- Uses the reader hints to guide the narration style
- Processes each section separately
- Supports different voice options (default: "coral")
- Works with both standard TTS models and GPT-4o-mini-TTS

You can control which chapters to process by modifying `CHAPTER_START` and `CHAPTER_STOP` variables.

Audio files are saved in the `books/{BOOK_ID}/audio/` directory.


In [39]:
import os
from openai import OpenAI

# Configuration variables
BOOK_ID = "46468"
CHAPTER_START = 0
CHAPTER_STOP = 99
# Available voices: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
VOICE_NAME = "coral"  # Default voice

def text_to_wav_openai_tts(text: str, book_id: str, chapter_id: int, section_id: int,
                           voice_name: str, model: str = "gpt-4o-mini-tts",
                           instructions: str = None, speed: float = 1.0, response_format: str = "mp3") -> str:
    """Convert text to speech using OpenAI TTS and save as an audio file."""
    # Remove line breaks and whitespace from the beginning and end of the text
    text = text.strip()
    text = text + " "

    # Check text length (max 4096 characters)
    if len(text) > 4096:
        print(f"Warning: Text is too long ({len(text)} characters). Truncating to 4096 characters.")
        text = text[:4093] + "..."

    # Check speed (between 0.25 and 4.0)
    if speed < 0.25 or speed > 4.0:
        print(f"Warning: Invalid speed {speed}. Setting to 1.0.")
        speed = 1.0

    # Initialize OpenAI Client
    client = OpenAI()

    # Format IDs with leading zeros
    chapter_id_str = str(chapter_id).zfill(3)
    section_id_str = str(section_id).zfill(3)

    # Create directory if it doesn't exist
    os.makedirs(f"books/{book_id}/audio", exist_ok=True)

    # Generate audio with OpenAI TTS
    try:
        # Create filename with audio format
        file_extension = response_format if response_format != "pcm" else "wav"
        filename = f"books/{book_id}/audio/gutenberg_{book_id}_{chapter_id_str}_{section_id_str}.{file_extension}"

        # Create parameters for the API
        params = {
            "model": model,
            "voice": voice_name,
            "input": text,
            "response_format": response_format,
            "speed": speed
        }

        # Add instructions if available and the model is not tts-1 or tts-1-hd
        if instructions and model not in ["tts-1", "tts-1-hd"]:
            params["instructions"] = instructions

        # Call the API
        with client.audio.speech.with_streaming_response.create(**params) as response:
            # Open the file in binary write mode
            with open(filename, 'wb') as f:
                # Write the response content to the file
                for chunk in response.iter_bytes():
                    f.write(chunk)

        return filename

    except Exception as e:
        print(f"Error generating audio with OpenAI TTS: {e}")
        return ""


# Load reader hints
reader_hints_file = f"books/{BOOK_ID}/txt/reader_hints.txt"
if os.path.exists(reader_hints_file):
    with open(reader_hints_file, "r") as f:
        reader_hints = f.read()
    print("Loaded reader hints for TTS instructions")
else:
    reader_hints = ""
    print("No reader hints found, will use default TTS settings")


# Function to process a section with TTS
def process_section_with_tts(book_id, chapter_id, section_id, voice_name=VOICE_NAME, model="gpt-4o-mini-tts"):
    # Load the section text
    section_file = f"books/{book_id}/txt/clean_text_{chapter_id:03d}_{section_id:03d}.txt"

    if not os.path.exists(section_file):
        print(f"Section file {section_file} not found")
        return

    with open(section_file, "r") as f:
        section_text = f.read()

    print(f"Processing section from chapter {chapter_id}, section {section_id}")

    # Generate audio for the entire section
    filename = text_to_wav_openai_tts(
        text=section_text,
        book_id=book_id,
        chapter_id=chapter_id,
        section_id=section_id,
        voice_name=voice_name,
        model=model,
        instructions=reader_hints if model == "gpt-4o-mini-tts" else None
    )

    if filename:
        print(f"Generated audio: {filename}")
        return filename
    else:
        print(f"Failed to generate audio for chapter {chapter_id}, section {section_id}")
        return None

def process_chapters_with_tts(book_id, chapter_start, chapter_stop, voice_name=VOICE_NAME, model="gpt-4o-mini-tts"):
    """Process all chapters within the specified range with TTS."""
    print(f"Processing chapters {chapter_start} to {chapter_stop} with TTS using voice: {voice_name}")

    # Get all available text files
    txt_dir = f"books/{book_id}/txt"
    if not os.path.exists(txt_dir):
        print(f"Text directory {txt_dir} not found")
        return

    # Find all chapter files within the range
    processed_files = 0
    for chapter_id in range(chapter_start, chapter_stop + 1):
        # Find all section files for this chapter
        section_files = [f for f in os.listdir(txt_dir)
                         if f.startswith(f"clean_text_{chapter_id:03d}_") and f.endswith(".txt")]

        if not section_files:
            print(f"No sections found for chapter {chapter_id}")
            continue

        section_files.sort()
        print(f"Found {len(section_files)} sections for chapter {chapter_id}")

        # Process each section
        for section_file in section_files:
            # Extract section ID from filename
            section_id = int(section_file.split('_')[3].split('.')[0])

            print(f"Processing chapter {chapter_id}, section {section_id}")
            result = process_section_with_tts(
                book_id=book_id,
                chapter_id=chapter_id,
                section_id=section_id,
                voice_name=voice_name,
                model=model
            )

            if result:
                processed_files += 1

    print(f"Finished processing {processed_files} sections across chapters {chapter_start} to {chapter_stop}")

# Process chapters with the configured voice
process_chapters_with_tts(BOOK_ID, CHAPTER_START, CHAPTER_STOP, voice_name=VOICE_NAME)


Loaded reader hints for TTS instructions
Processing chapters 0 to 1 with TTS
Found 1 sections for chapter 0
Processing chapter 0, section 0
Processing section from chapter 0, section 0
Fehler bei der Generierung von Audio mit OpenAI TTS: Speech.create() got an unexpected keyword argument 'temperature'
Failed to generate audio for chapter 0, section 0
Found 3 sections for chapter 1
Processing chapter 1, section 0
Processing section from chapter 1, section 0
Fehler bei der Generierung von Audio mit OpenAI TTS: Speech.create() got an unexpected keyword argument 'temperature'
Failed to generate audio for chapter 1, section 0
Processing chapter 1, section 1
Processing section from chapter 1, section 1
Fehler bei der Generierung von Audio mit OpenAI TTS: Speech.create() got an unexpected keyword argument 'temperature'
Failed to generate audio for chapter 1, section 1
Processing chapter 1, section 2
Processing section from chapter 1, section 2
Fehler bei der Generierung von Audio mit OpenAI T

## Quality Control: Detect Issues in Audio Files

This cell analyzes the generated audio files to identify potential quality issues:

- Silent segments that might indicate processing problems
- Length discrepancies between expected and actual duration
- Other anomalies that could affect listening experience

The analysis helps identify files that may need regeneration due to quality issues.


In [34]:
from audio_processing import detect_silence_in_mp3_parallel

# Run the silence detection with the book ID from the notebook
silence_results = detect_silence_in_mp3_parallel(
    book_id=BOOK_ID,
    silence_threshold=-50,
    min_silence_duration=3000,
    expected_seconds_per_1000_chars=60,
    length_tolerance=0.4,
    max_workers=4  # Adjust based on your CPU cores
)

print("\nSummary:")
print(f"Found {len(silence_results)} files with issues")

# Count files with silence issues
silence_issue_count = sum(1 for file_info in silence_results if file_info.get('has_silence_issue', False))
print(f"Files with silence issues: {silence_issue_count}")

# Count files with length issues
length_issue_count = sum(1 for file_info in silence_results if file_info.get('has_length_issue', False))
print(f"Files with length issues: {length_issue_count}")

# Detailed report
for file_info in silence_results:
    print(f"\nChapter {file_info['chapter_id']}, Section {file_info['section_id']}:")

    # Report silence issues
    if file_info.get('has_silence_issue', False):
        silent_segments = file_info['silent_segments']
        print(f"  Silence issues: {len(silent_segments)} silent segments")
        # Calculate percentage of silence
        total_silence = sum(duration for _, _, duration in silent_segments)
        silence_percentage = (total_silence / file_info['duration']) * 100 if file_info['duration'] > 0 else 0
        print(f"  Total silence: {total_silence:.2f}s ({silence_percentage:.2f}% of {file_info['duration']:.2f}s total)")

    # Report length issues
    if file_info.get('has_length_issue', False):
        print(f"  Length issue: Text length: {file_info.get('text_length', 'N/A')} chars")
        print(f"  Expected duration: {file_info.get('expected_duration', 'N/A'):.2f}s")
        print(f"  Actual duration: {file_info['duration']:.2f}s")
        print(f"  Difference: {file_info.get('duration_diff_percent', 'N/A'):.2f}%")

Found 56 MP3 files to analyze
Chapter 004, Section 000 contains 1 silent segments:
  Silent from 267.43s to 271.61s (duration: 4.18s)
Chapter 011, Section 008 contains 1 silent segments:
  Silent from 252.88s to 256.80s (duration: 3.92s)

Summary:
Found 2 files with issues
Files with silence issues: 2
Files with length issues: 0

Chapter 004, Section 000:
  Silence issues: 1 silent segments
  Total silence: 4.18s (1.54% of 271.61s total)

Chapter 011, Section 008:
  Silence issues: 1 silent segments
  Total silence: 3.92s (1.53% of 256.80s total)


## Regenerate Problematic Audio Files

This cell regenerates any audio files that were identified as having issues in the previous step.

The process:
1. Identifies files with silence or length issues
2. Regenerates those specific files using the same voice and model
3. Re-analyzes the regenerated files to confirm issues are resolved

**IMPORTANT:** Run this cell and the silence detection cell repeatedly until no more issues are found. Sometimes regenerated files may still have problems and require multiple regeneration attempts to achieve optimal quality.

This ensures all audio segments meet quality standards before final assembly.


In [37]:
def regenerate_files_with_issues(book_id, voice_name=VOICE_NAME, model="gpt-4o-mini-tts"):
    """Regenerate files that have silence or length issues."""

    if not silence_results:
        print("No files with issues detected. Nothing to regenerate.")
        return

    print(f"\nFound {len(silence_results)} files with issues. Regenerating with voice: {voice_name}")

    # Regenerate each file with issues
    for file_info in silence_results:
        # Convert chapter_id and section_id to integers if they're strings
        chapter_id = int(file_info['chapter_id']) if isinstance(file_info['chapter_id'], str) else file_info['chapter_id']
        section_id = int(file_info['section_id']) if isinstance(file_info['section_id'], str) else file_info['section_id']

        print(f"\nRegenerating Chapter {chapter_id}, Section {section_id}")

        # Report the issues
        if file_info.get('has_silence_issue', False):
            print(f"Original file had {len(file_info['silent_segments'])} silent segments")

        if file_info.get('has_length_issue', False):
            print(f"Original file had length issue: expected {file_info.get('expected_duration', 'N/A'):.2f}s, got {file_info['duration']:.2f}s")

        # Process this section with the new voice
        result = process_section_with_tts(
            book_id=book_id,
            chapter_id=chapter_id,
            section_id=section_id,
            voice_name=voice_name,
            model=model
        )

        if result:
            print(f"Successfully regenerated: {result}")
        else:
            print(f"Failed to regenerate Chapter {chapter_id}, Section {section_id}")

    print("\nFinished regenerating files with issues")

# Run the regeneration process
regenerate_files_with_issues(BOOK_ID)


# Run the silence detection with the book ID from the notebook
silence_results = detect_silence_in_mp3_parallel(
    book_id=BOOK_ID,
    silence_threshold=-50,  # Adjust this threshold as needed (-50 dB is a good starting point)
    min_silence_duration=3000,  # Minimum 3 seconds of silence (in milliseconds)
    expected_seconds_per_1000_chars=60,  # Expect 60 seconds of audio per 1000 characters
    length_tolerance=0.4,  # Allow 40% deviation from expected length
    silence_results=silence_results,
    max_workers=4  # For max_workers CPU cores

)

print("\nSummary:")
print(f"Found {len(silence_results)} files with issues")

# Count files with silence issues
silence_issue_count = sum(1 for file_info in silence_results if file_info.get('has_silence_issue', False))
print(f"Files with silence issues: {silence_issue_count}")

# Count files with length issues
length_issue_count = sum(1 for file_info in silence_results if file_info.get('has_length_issue', False))
print(f"Files with length issues: {length_issue_count}")


Found 1 files with issues. Regenerating...

Regenerating Chapter 4, Section 0
Original file had 2 silent segments
Original file had length issue: expected 234.12s, got 416.40s
Processing section from chapter 4, section 0
Generated audio: books/46468/audio/gutenberg_46468_004_000.mp3
Successfully regenerated: books/46468/audio/gutenberg_46468_004_000.mp3

Finished regenerating files with issues
Rechecking 1 previously identified files with silence issues

Summary:
Found 0 files with issues
Files with silence issues: 0
Files with length issues: 0


## Assemble Final Audiobook Chapters

This cell combines the individual audio section files into complete chapter files.

The process:
1. Groups audio files by chapter
2. Sorts sections in correct order
3. Concatenates all sections for each chapter
4. Adds a brief silence at the end of each chapter
5. Exports complete chapter files in MP3 format

The final chapter files are saved in the `books/{BOOK_ID}/chapters/` directory, ready for listening or further processing.


In [38]:
import os
from pydub import AudioSegment
import glob

def concatenate_mp3_to_chapters(book_id, add_silence_ms=1500):
    """
    Concatenate all MP3 files for each chapter into a single MP3 file.
    Add specified amount of silence at the end of each chapter.

    Args:
        book_id (str): The ID of the book
        add_silence_ms (int): Milliseconds of silence to add at the end of each chapter
    """
    # Create output directory for concatenated chapters
    output_dir = f"books/{book_id}/chapters"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Get all MP3 files in the audio directory
    audio_dir = f"books/{book_id}/audio"
    if not os.path.exists(audio_dir):
        print(f"Audio directory {audio_dir} not found")
        return

    # Group files by chapter
    chapter_files = {}
    for file_path in glob.glob(f"{audio_dir}/gutenberg_{book_id}_*.mp3"):
        # Extract chapter and section IDs from filename
        filename = os.path.basename(file_path)
        parts = filename.split('_')
        if len(parts) >= 4:
            chapter_id = parts[2]
            section_id = parts[3].split('.')[0]

            if chapter_id not in chapter_files:
                chapter_files[chapter_id] = []

            chapter_files[chapter_id].append((file_path, int(section_id)))

    # Process each chapter
    for chapter_id, files in chapter_files.items():
        print(f"Processing chapter {chapter_id}")

        # Sort files by section ID
        files.sort(key=lambda x: x[1])

        # Concatenate all sections for this chapter
        combined = AudioSegment.empty()
        for file_path, _ in files:
            print(f"  Adding {os.path.basename(file_path)}")
            audio = AudioSegment.from_mp3(file_path)
            combined += audio

        # Add silence at the end of the chapter
        silence = AudioSegment.silent(duration=add_silence_ms)
        combined += silence

        # Export the combined audio
        output_file = f"{output_dir}/chapter_{chapter_id}.mp3"
        combined.export(output_file, format="mp3")

        print(f"Created chapter file: {output_file} ({len(combined)/1000:.2f} seconds)")

    print(f"Finished concatenating {len(chapter_files)} chapters")

# Run the concatenation process
concatenate_mp3_to_chapters(BOOK_ID)

Processing chapter 003
  Adding gutenberg_46468_003_000.mp3
  Adding gutenberg_46468_003_001.mp3
  Adding gutenberg_46468_003_002.mp3
  Adding gutenberg_46468_003_003.mp3
Created chapter file: books/46468/chapters/chapter_003.mp3 (735.44 seconds)
Processing chapter 002
  Adding gutenberg_46468_002_000.mp3
  Adding gutenberg_46468_002_001.mp3
  Adding gutenberg_46468_002_002.mp3
  Adding gutenberg_46468_002_003.mp3
  Adding gutenberg_46468_002_004.mp3
Created chapter file: books/46468/chapters/chapter_002.mp3 (1134.97 seconds)
Processing chapter 012
  Adding gutenberg_46468_012_000.mp3
  Adding gutenberg_46468_012_001.mp3
  Adding gutenberg_46468_012_002.mp3
Created chapter file: books/46468/chapters/chapter_012.mp3 (549.01 seconds)
Processing chapter 011
  Adding gutenberg_46468_011_000.mp3
  Adding gutenberg_46468_011_001.mp3
  Adding gutenberg_46468_011_002.mp3
  Adding gutenberg_46468_011_003.mp3
  Adding gutenberg_46468_011_004.mp3
  Adding gutenberg_46468_011_005.mp3
  Adding gute