# Audiobook Pipeline Testing

End-to-end audiobook pipeline: PDF → section extraction → LLM narration → TTS rendering.

**Prerequisites:** `uv sync --extra mlx` (Kokoro + Chatterbox).

**What you can do here:**
- Extract sections from a PDF and inspect them
- Generate LLM-adapted narration for a section
- Render narration to audio with Kokoro or Chatterbox TTS
- Compare backends side by side
- Save rendered audio to WAV files

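Run the setup, helper, and configuration cells (everything through section 1) first, then run the remaining sections in order — later cells reuse variables defined by earlier ones.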

In [None]:
%load_ext autoreload
%autoreload 2

# 1. Setup — add src/ to path so we can import all packages
import sys
import time
from pathlib import Path

import numpy as np
from IPython.display import Audio, display

# Project packages are imported straight from the source tree: "src" is
# resolved relative to the current working directory (parents[1] is the
# grandparent of cwd). This line must run BEFORE the project imports below.
# NOTE(review): assumes the kernel's cwd is the notebook's own directory,
# two levels below the repo root — confirm if the imports fail.
sys.path.insert(0, str(Path.cwd().parents[1] / "src"))

from shared.extract import extract_sections
from shared.markdown_parser import Section
from shared.providers import (
    KokoroTTS, ChatterboxTTS,
    MLXLLM, OllamaLLM,
    get_tts_runtime,
)
from audiobook import (
    AudiobookConfig, NarrationConfig,
    adapt_narration_section,
    load_tts_model, render_section, SAMPLE_RATE,
)

# Print whatever get_tts_runtime() reports, then confirm every import above
# succeeded (this line is only reached if none of them raised).
print(f"TTS runtime: {get_tts_runtime()}")
print("Imports OK")


In [None]:
# Helper — print audio stats and embed an inline player in the cell output

def play(audio: np.ndarray, sr: int = SAMPLE_RATE, label: str = ""):
    """Print a one-line summary of *audio* and display an inline player.

    Args:
        audio: Sample array to play. ``len(audio)`` is used as the sample
            count (assumes a 1-D mono buffer — TODO confirm for
            multi-channel audio).
        sr: Sample rate in Hz; used both for the duration figure and as the
            playback rate handed to the player.
        label: Optional caption; when non-empty it is printed before the
            stats, followed by ``": "``.
    """
    n_samples = len(audio)
    seconds = n_samples / sr

    if label:
        heading = f"{label}: "
    else:
        heading = ""
    print(f"{heading}{seconds:.1f}s, {n_samples:,} samples @ {sr} Hz")

    player = Audio(audio, rate=sr)
    display(player)

print("play() helper defined")


---
# 1. Configuration

Set your LLM backend, TTS backend, and extraction parameters.

In [None]:
# 1.1 Pipeline configuration
#
# Languages: the source_lang / target_lang pair set here propagates to the
# entire pipeline:
#   - LLM prompts are automatically instructed to write in target_lang
#   - a TTS voice is auto-selected from target_lang when tts= is omitted
#
# When passing tts= explicitly, pick a voice that matches target_lang,
# e.g. target_lang="fr" needs a French voice (ff_siwis), not English (af_heart).

narration_cfg = NarrationConfig(source_lang="en", target_lang="en")

config = AudiobookConfig(
    narration=narration_cfg,
    # LLM — uncomment one
    llm=MLXLLM(model="Qwen/Qwen3-14B-MLX-4bit"),
    # llm=OllamaLLM(model="qwen3:14b", num_ctx=40960, temperature=0.3),

    # TTS — uncomment one (or omit to auto-select from target_lang)
    tts=KokoroTTS(voices=("af_heart",), speed=0.95),
    # tts=KokoroTTS(lang="fr"),  # auto-selects French voice
    # tts=ChatterboxTTS(),
)

# Extraction settings
MAX_TOC_LEVEL = 1        # 1 = chapters, 2 = sub-chapters
CONTEXT_BUDGET = 20_000  # max tokens per section
PDF_BACKEND = "pymupdf"  # "pymupdf" or "docling"

# Echo the active choices so the cell output records what was configured.
summary_lines = (
    f"LLM: {type(config.llm).__name__} / {config.llm.model}",
    f"TTS: {type(config.tts).__name__}",
    f"Language: {config.narration.source_lang} → {config.narration.target_lang}",
)
print(*summary_lines, sep="\n")


---
# 2. Extract Sections from PDF

Runs the TOC analysis and per-section markdown extraction pipeline.

In [None]:
# 2.1 Extract sections from a PDF
#     Downloads the Adam optimizer paper (Kingma & Ba, 2015) from arxiv.
#     Cached locally after first download.
#
#     Produces `raw_sections`, a sequence of (title, content) pairs that the
#     following cells index into.

source = "https://arxiv.org/pdf/1412.6980"
# source = "../../inputs/your-book.pdf"         # or use a local PDF
# source = "https://example.com/article"        # or a webpage URL

print(f"Source: {source}")
print(f"TOC level: {MAX_TOC_LEVEL}, budget: {CONTEXT_BUDGET:,} tokens\n")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-run.
t0 = time.perf_counter()
raw_sections = extract_sections(
    source,
    max_toc_level=MAX_TOC_LEVEL,
    context_budget=CONTEXT_BUDGET,
    backend=PDF_BACKEND,
)
elapsed = time.perf_counter() - t0

print(f"\nExtracted {len(raw_sections)} sections in {elapsed:.1f}s:\n")
# Loop variables live at notebook scope, so they are deliberately NOT named
# `title` / `content`: re-running this cell must not overwrite the section
# picked in 2.2, which 3.1 reads through those two names.
for number, (sec_title, sec_content) in enumerate(raw_sections, start=1):
    chars = len(sec_content)
    tokens_est = chars // 4  # rough estimate: ~4 characters per token
    print(f"  {number}. {sec_title} ({chars:,} chars, ~{tokens_est:,} tokens)")


In [None]:
# 2.2 Inspect a section
#     Selects one (title, content) pair from raw_sections and prints a preview.

section_idx = 0  # change this to inspect different sections
title, content = raw_sections[section_idx]

preview_chars = 2000  # only this many leading characters are printed
total_chars = len(content)
hidden_chars = total_chars - preview_chars

print(f"Section {section_idx + 1}: {title}")
print(f"Length: {total_chars:,} chars, ~{total_chars // 4:,} tokens")
print("=" * 60)
print(content[:preview_chars])
# Report how much text was cut off, if any.
if hidden_chars > 0:
    print(f"\n... ({hidden_chars:,} more chars)")


---
# 3. LLM Narration Adaptation

Send a section to the LLM to produce narration-ready text with pause markers.

In [None]:
# 3.1 Adapt one section to narration
#     Uses the section selected in 2.2 above (`title` / `content`).
#     Produces `narration`, the text rendered to audio in later cells.

section = Section.from_content(title, content, language=config.narration.target_lang)

print(f"Adapting: {section.title}")
print(f"  has_table={section.has_table}, has_list={section.has_list}")
print(f"  LLM: {type(config.llm).__name__} / {config.llm.model}")
print("Generating narration...\n")

# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() tracks the wall clock and can jump during a long LLM call.
t0 = time.perf_counter()
narration = adapt_narration_section(
    section,
    llm=config.llm,
    source_lang=config.narration.source_lang,
    target_lang=config.narration.target_lang,
)
elapsed = time.perf_counter() - t0

print(f"Narration generated in {elapsed:.1f}s")
print(f"Output: {len(narration):,} chars, {len(narration.split())} words")
print("=" * 60)

# Show only the start of the narration; the limit is named once so the slice
# and the "more chars" count cannot drift apart.
narration_preview_chars = 3000
print(narration[:narration_preview_chars])
if len(narration) > narration_preview_chars:
    print(f"\n... ({len(narration) - narration_preview_chars:,} more chars)")


---
# 4. TTS Rendering

Render the narration to audio with the configured TTS backend.

In [None]:
# 4.1 Load TTS model and render
#     Loads the backend chosen in `config.tts`, renders `narration` from 3.1,
#     and plays the result inline. Model load and render are timed separately.

# perf_counter() is used for both timings: it is monotonic and high-resolution,
# whereas time.time() follows the wall clock and can jump mid-measurement.
print(f"Loading {type(config.tts).__name__} model...")
t0 = time.perf_counter()
model = load_tts_model(config.tts)
print(f"Model loaded in {time.perf_counter() - t0:.1f}s")

print(f"\nRendering {len(narration.split())} words...")
t0 = time.perf_counter()
audio = render_section(narration, config.tts, model=model)
render_time = time.perf_counter() - t0

print(f"Render time: {render_time:.1f}s")
play(audio, label="Audiobook")


---
# 5. Backend Comparison

Load both TTS backends (Kokoro and Chatterbox) and render the same narration with each for comparison.

In [None]:
# 5.1 Load Kokoro + Chatterbox
#     `backends` maps a display name to its TTS config; `models` maps the same
#     name to the loaded model. Both dicts are reused by the comparison cell.
backends = {
    "Kokoro": KokoroTTS(voices=("af_heart",), speed=0.95),
    "Chatterbox": ChatterboxTTS(),
}

models = {}
for name, tts_cfg in backends.items():
    print(f"{name}: {tts_cfg}")
    # perf_counter() is monotonic, so the load time cannot be skewed by a
    # wall-clock adjustment the way a time.time() difference can.
    t0 = time.perf_counter()
    models[name] = load_tts_model(tts_cfg)
    print(f"  Loaded in {time.perf_counter() - t0:.1f}s\n")


In [None]:
# 5.2 Render the same narration with all backends
#     Fills `results` with name -> (audio, render seconds), prints a
#     comparison table, then plays each backend's output.

results = {}
for name, tts_cfg in backends.items():
    print(f"Rendering with {name}...")
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump, which would distort the comparison.
    t0 = time.perf_counter()
    audio_out = render_section(narration, tts_cfg, model=models[name])
    elapsed = time.perf_counter() - t0
    results[name] = (audio_out, elapsed)

# Stats table: a 20-wide label column followed by one 16-wide, right-aligned
# column per backend (each preceded by a single space).
header_cells = "".join(f" {name:>16}" for name in results)
print(f"\n{'Metric':<20}{header_cells}")
print("-" * 20 + f" {'-' * 16}" * len(results))

# Each row is (label, pre-formatted 16-wide cells in backend order).
# NOTE(review): the duration row divides by SAMPLE_RATE for every backend —
# confirm that all backends actually emit audio at that rate.
table_rows = (
    ("Render time (s)", [f"{secs:>16.1f}" for _, secs in results.values()]),
    (
        "Audio duration (s)",
        [f"{len(clip) / SAMPLE_RATE:>16.1f}" for clip, _ in results.values()],
    ),
    ("Samples", [f"{len(clip):>16,}" for clip, _ in results.values()]),
)
for metric, cells in table_rows:
    print(f"{metric:<20}" + "".join(f" {cell}" for cell in cells))

# Play each
for name, (audio_out, elapsed) in results.items():
    print(f"\n--- {name} ---")
    play(audio_out)
