# Podcast Pipeline Testing

End-to-end podcast pipeline: PDF → section extraction → LLM outline → LLM dialogue → TTS rendering.

**Prerequisites:** `uv sync --extra mlx` (Kokoro + Chatterbox).

**What you can do here:**
- Extract sections from a PDF and inspect them
- Generate a podcast outline from all sections
- Generate two-speaker dialogue for a section
- Render dialogue to audio with Kokoro or Chatterbox TTS
- Compare backends side by side
- Save rendered audio to WAV files

Run the first three code cells (setup, `play()` helper, and configuration) first, then run the sections in order.

In [None]:
%load_ext autoreload
%autoreload 2

# Setup — make the project's src/ directory importable, then import
# everything the rest of the notebook uses.
import sys
import time
from pathlib import Path

import numpy as np
from IPython.display import Audio, display

# Path.cwd().parents[1] is the grandparent of the current working directory,
# so this expects the notebook to be run from two levels below the directory
# that contains src/. It must run before the project imports below.
sys.path.insert(0, str(Path.cwd().parents[1] / "src"))

from shared.extract import extract_sections
from shared.providers import (
    KokoroTTS, ChatterboxTTS,
    MLXLLM, OllamaLLM,
    get_tts_runtime,
)
from podcast import (
    PodcastConfig, DialogueConfig,
    generate_outline, generate_dialogue_segment, generate_intro_outro,
    load_tts_model, render_dialogue, get_sample_rate,
)

# Print the value reported by get_tts_runtime() and confirm that all of the
# imports above succeeded.
print(f"TTS runtime: {get_tts_runtime()}")
print("Imports OK")


In [None]:
# Helper — print basic stats for a clip and embed an inline audio player

def play(audio: np.ndarray, sr: int, label: str = ""):
    """Print duration and sample-count stats for *audio*, then show a player.

    Args:
        audio: Sample array; ``len(audio)`` is used as the sample count.
        sr: Sample rate in Hz, used for the duration and passed to the player.
        label: Optional text printed before the stats, followed by ``": "``.
    """
    n_samples = len(audio)
    seconds = n_samples / sr
    heading = "" if not label else f"{label}: "
    print(f"{heading}{seconds:.1f}s, {n_samples:,} samples @ {sr} Hz")
    player = Audio(audio, rate=sr)
    display(player)

print("play() helper defined")


---
# 1. Configuration

Set your LLM backend, TTS backend, dialogue format, and extraction parameters.

In [None]:
# 1.1 Pipeline configuration
#
# Language: source_lang and target_lang propagate to the entire pipeline:
#   - LLM prompts are automatically instructed to write in target_lang
#   - TTS voices are auto-selected from target_lang if tts= is omitted
#
# If you set tts= explicitly (as the active tts= line below does), make sure
# the voices match target_lang,
# e.g. target_lang="fr" needs French voices (ff_siwis), not English (bf_emma).

config = PodcastConfig(
    dialogue=DialogueConfig(
        format="host_guest",           # "two_hosts" | "host_guest" — see docs/api_reference.md#dialogue-formats
        speaker1_name="Alex",
        speaker2_name="Sam",
        source_lang="en",
        target_lang="en",
        segment_target_words=1200,    # 1200 words / 150 wpm ≈ 8 min per segment
        words_per_minute=150,         # rate used for the duration estimates printed in section 4
    ),
    # LLM — keep exactly one of the following llm= lines active
    # llm=MLXLLM(model="Qwen/Qwen3-14B-MLX-4bit"),
    llm=OllamaLLM(model="qwen3:14b", num_ctx=40960, temperature=0.7),

    # TTS — keep one tts= line active (or omit to auto-select from target_lang)
    tts=KokoroTTS(voices=("bf_emma", "bm_george"), speeds=(1.0, 1.2)),
    # tts=KokoroTTS(lang="fr"),  # auto-selects French voices
    # tts=ChatterboxTTS(audio_prompts=("voices/host.wav", "voices/guest.wav")),
)

# Extraction settings — passed to extract_sections() in section 2
MAX_TOC_LEVEL = 1       # 1 = chapters, 2 = sub-chapters
CONTEXT_BUDGET = 20_000  # max tokens per section
PDF_BACKEND = "pymupdf"  # "pymupdf" or "docling"

# Echo the active settings so the run's output records what was used.
dlg = config.dialogue
print(f"Format: {dlg.format} ({dlg.speaker1_name} & {dlg.speaker2_name})")
print(f"LLM: {type(config.llm).__name__} / {config.llm.model}")
print(f"TTS: {type(config.tts).__name__}")
print(f"Language: {dlg.source_lang} → {dlg.target_lang}")


---
# 2. Extract Sections from PDF

Runs the TOC analysis and per-section markdown extraction pipeline.

In [None]:
# 2.1 Extract sections from a PDF
#     Default source: the Adam optimizer paper (Kingma & Ba, 2015) on arXiv.
#     Cached locally after first download.

source = "https://arxiv.org/pdf/1412.6980"
# source = "../../inputs/your-book.pdf"         # or use a local PDF
# source = "https://example.com/article"        # or a webpage URL

print(f"Source: {source}")
print(f"TOC level: {MAX_TOC_LEVEL}, budget: {CONTEXT_BUDGET:,} tokens\n")

# Extraction parameters come from the configuration cell.
extract_kwargs = {
    "max_toc_level": MAX_TOC_LEVEL,
    "context_budget": CONTEXT_BUDGET,
    "backend": PDF_BACKEND,
}

started = time.time()
raw_sections = extract_sections(source, **extract_kwargs)
elapsed = time.time() - started

# One summary line per (title, content) pair; the token figure is a rough
# estimate of one token per four characters.
print(f"\nExtracted {len(raw_sections)} sections in {elapsed:.1f}s:\n")
for number, (sec_title, sec_text) in enumerate(raw_sections, start=1):
    n_chars = len(sec_text)
    approx_tokens = n_chars // 4
    print(f"  {number}. {sec_title} ({n_chars:,} chars, ~{approx_tokens:,} tokens)")


---
# 3. LLM Outline Generation

Generate a podcast outline from all extracted sections. This guides the dialogue generation.

In [None]:
# 3.1 Generate podcast outline
#     Feeds every extracted section to the configured LLM, then prints the
#     episode title and a preview of the raw outline text.

print(f"Generating outline from {len(raw_sections)} sections...")
print(f"LLM: {type(config.llm).__name__} / {config.llm.model}\n")

started = time.time()
outline = generate_outline(raw_sections, config)
elapsed = time.time() - started

print(f"Outline generated in {elapsed:.1f}s")
print(f"Episode title: {outline.title}")
print("=" * 60)

# Show at most the first 2000 characters and report how many were left out.
outline_preview_len = 2000
outline_text = outline.raw_text
print(outline_text[:outline_preview_len])
outline_hidden = len(outline_text) - outline_preview_len
if outline_hidden > 0:
    print(f"\n... ({outline_hidden:,} more chars)")


---
# 4. LLM Dialogue Generation

Generate two-speaker dialogue for one section, using the outline for context.

In [None]:
# 4.1 Generate dialogue for one section

section_idx = 0  # index into raw_sections; change it to pick a different section
title, content = raw_sections[section_idx]

dlg = config.dialogue
print(f"Generating dialogue for: {title}")
target_minutes = dlg.segment_target_words // dlg.words_per_minute
print(f"  Target: ~{dlg.segment_target_words} words (~{target_minutes} min)")
print(f"  LLM: {type(config.llm).__name__} / {config.llm.model}\n")

# The segment is generated on its own: an empty rolling summary and an empty
# covered-topics list are passed, so no earlier-segment context is supplied.
segment_args = {
    "section_content": content,
    "section_title": title,
    "outline": outline,
    "segment_index": section_idx,
    "rolling_summary": "",
    "covered_topics": [],
    "config": config,
}

started = time.time()
result = generate_dialogue_segment(**segment_args)
elapsed = time.time() - started

dialogue = result.dialogue
word_count = len(dialogue.split())
est_minutes = word_count / dlg.words_per_minute

print(f"Dialogue generated in {elapsed:.1f}s")
print(f"Output: {len(dialogue):,} chars, {word_count} words (~{est_minutes:.1f} min)")
print(f"Topics covered: {len(result.covered_topics)}")
print("=" * 60)

# Show at most the first 3000 characters and report how many were left out.
dialogue_preview_len = 3000
print(dialogue[:dialogue_preview_len])
dialogue_hidden = len(dialogue) - dialogue_preview_len
if dialogue_hidden > 0:
    print(f"\n... ({dialogue_hidden:,} more chars)")


---
# 5. TTS Rendering

Render the dialogue to audio with the configured TTS backend.

In [None]:
# 5.1 Load TTS model and render
#     Uses the TTS backend from the configuration cell and the dialogue
#     generated in section 4.

sr = get_sample_rate(config.tts)

backend_label = type(config.tts).__name__
print(f"Loading {backend_label} model...")
load_started = time.time()
model = load_tts_model(config.tts)
load_time = time.time() - load_started
print(f"Model loaded in {load_time:.1f}s")

n_words = len(dialogue.split())
print(f"\nRendering {n_words} words...")
render_started = time.time()
audio = render_dialogue(dialogue, config.tts, model=model)
render_time = time.time() - render_started

print(f"Render time: {render_time:.1f}s")
play(audio, sr=sr, label="Podcast")


---
# 6. Backend Comparison

Load both TTS backends (Kokoro and Chatterbox) and render the same dialogue with each for comparison.

In [None]:
# 6.1 Load backends to compare
#     `backends` maps a display name to its TTS config; `models` maps the
#     same name to the loaded model.
backends = {
    "Kokoro": KokoroTTS(voices=("bf_emma", "bm_george")),
    "Chatterbox": ChatterboxTTS(),
}

models = {}
for backend_name in backends:
    backend_cfg = backends[backend_name]
    print(f"{backend_name}: {backend_cfg}")
    load_started = time.time()
    models[backend_name] = load_tts_model(backend_cfg)
    load_seconds = time.time() - load_started
    print(f"  Loaded in {load_seconds:.1f}s\n")


In [None]:
# 6.2 Render the same dialogue with all backends
#     Uses the dialogue generated in section 4.
#
#     Notebook cells share one namespace, so the loops below use
#     backend-specific variable names. Reusing `sr` or `elapsed` here would
#     overwrite the values that section 5 computed for `audio`.


def _print_row(label, cells):
    """Print one stats-table row.

    Args:
        label: Row heading, left-aligned in a 20-character column.
        cells: Iterable of already-formatted strings, each right-aligned in a
            16-character column preceded by a single space.
    """
    print(f"{label:<20}" + "".join(f" {cell:>16}" for cell in cells))


# backend name -> (rendered audio, render time in seconds, sample rate)
results = {}
for backend_name, backend_cfg in backends.items():
    backend_sr = get_sample_rate(backend_cfg)
    print(f"Rendering with {backend_name}...")
    render_started = time.time()
    audio_out = render_dialogue(dialogue, backend_cfg, model=models[backend_name])
    render_seconds = time.time() - render_started
    results[backend_name] = (audio_out, render_seconds, backend_sr)

# Stats table: one column per backend, one row per metric
print()
_print_row("Metric", results)
_print_row("-" * 20, ["-" * 16 for _ in results])
_print_row(
    "Render time (s)",
    [f"{secs:.1f}" for _clip, secs, _rate in results.values()],
)
_print_row(
    "Audio duration (s)",
    [f"{len(clip) / rate:.1f}" for clip, _secs, rate in results.values()],
)

# Play each
for backend_name, (backend_clip, _render_s, backend_rate) in results.items():
    print(f"\n--- {backend_name} ---")
    play(backend_clip, sr=backend_rate)
