# Narration Prompt Testing

Interactive workbench for testing audiobook narration prompts in isolation — without running the full pipeline.

**What you can do here:**
- Walk through `adapt_narration_section()` step by step
- Measure quality metrics (word ratio, pause density, residual artifacts)
- A/B compare language pairs or models side by side
- Test custom narration prompts with raw `llm_generate()` calls

Run the three setup cells (imports, LLM backend, sample content) first, then jump to any section.

In [None]:
%load_ext autoreload
%autoreload 2

# Setup — add src/ to path so we can import all packages
import sys
import re
import time
from pathlib import Path

sys.path.insert(0, str(Path.cwd().parent / "src"))

from shared.providers import (
    OllamaLLM, MLXLLM,
    llm_generate, ollama_preflight,
)
from shared.markdown_parser import Section
from audiobook import adapt_narration_section, NARRATION_SYSTEM_PROMPT

print("Imports OK")

In [None]:
# Choose the LLM backend used by the cells below. Keep exactly one
# `llm = ...` assignment active; comment/uncomment to switch.

## Ollama (default)
llm = OllamaLLM(model="qwen3:14b", temperature=0.7)

## MLX (Apple Silicon local)
# llm = MLXLLM(model="Qwen/Qwen3-14B-MLX-4bit", temperature=0.7)

# Only the Ollama backend gets a preflight call; any other backend is
# simply reported by class name and model.
if not isinstance(llm, OllamaLLM):
    print(f"Using: {type(llm).__name__} ({llm.model})")
else:
    ollama_preflight(llm)
    print(f"Ollama ready: {llm.model} @ {llm.url}")

In [None]:
# Sample content — auto-discover from previous runs, or use inline fallback.
# Defines sample_title and sample_content for the cells that follow.
OUTPUT_DIR = Path.cwd().parent / "output"

# Find section .md files from previous pipeline runs (sorted by path)
section_files = sorted(OUTPUT_DIR.glob("*/sections/*.md")) if OUTPUT_DIR.exists() else []

if section_files:
    # Use the first available section
    sample_path = section_files[0]
    # Decode explicitly instead of relying on the platform default encoding,
    # which can fail on or garble non-ASCII text (e.g. em dashes) on Windows.
    # NOTE(review): assumes the section files are written as UTF-8 — confirm.
    sample_content = sample_path.read_text(encoding="utf-8")
    # Derive a display title from the file name: "my_section" -> "My Section"
    sample_title = sample_path.stem.replace("_", " ").title()
    print(f"Loaded: {sample_path.relative_to(OUTPUT_DIR)}")
    print(f"Title: {sample_title}")
    print(f"Length: {len(sample_content):,} chars")
    if len(section_files) > 1:
        print(f"\n{len(section_files)} sections available — change sample_path above to try others:")
        # List at most ten candidates to keep the cell output short
        for f in section_files[:10]:
            print(f"  {f.relative_to(OUTPUT_DIR)}")
        if len(section_files) > 10:
            print(f"  ... and {len(section_files) - 10} more")
else:
    # Inline fallback with table, list, and numbers to exercise narration rules
    sample_title = "Performance Benchmarks"
    sample_content = """## Performance Benchmarks

The following table summarizes the results across all architectures tested in Q3 2024.

| Model        | Params | Accuracy | Latency (ms) | Memory (GB) |
|--------------|--------|----------|--------------|-------------|
| Baseline CNN | 25.6M  | 87.61%   | 12.3         | 2.1         |
| ResNet-50    | 25.6M  | 91.42%   | 18.7         | 3.4         |
| ViT-B/16     | 86.6M  | 93.18%   | 24.1         | 6.2         |
| MLP-Mixer    | 59.9M  | 89.73%   | 15.9         | 4.8         |

Key findings from the evaluation:

1. ViT-B/16 achieved the highest accuracy at 93.18%, outperforming the CNN baseline by 5.57 percentage points
2. The latency-accuracy tradeoff favors ResNet-50 for production deployments
3. MLP-Mixer offers a compelling middle ground — 89.73% accuracy with only 15.9ms latency
4. All models were evaluated on the ImageNet-1k validation set (50,000 images)

For details see https://arxiv.org/abs/2024.12345 and the full results at github.com/example/benchmarks.

The `torch.compile()` optimization reduced inference latency by approximately 30% across all architectures.
"""
    print("Using inline fallback content (no previous runs found)")
    print(f"Title: {sample_title}")
    print(f"Length: {len(sample_content):,} chars")

# Build Section object with structural detection from the raw markdown text.

# Table: any line carrying at least three pipe characters.
has_table = bool(re.search(r"\|.*\|.*\|", sample_content))

# List: a line starting with a bullet (-, *, +) or with a number followed by
# "." or ")", then whitespace and some item text. Requiring the punctuation
# after digits keeps prose lines that merely begin with a number
# ("2024 was ...") and horizontal rules ("---") from being flagged as lists.
has_list = bool(
    re.search(r"^[ \t]*(?:[-*+]|\d+[.)])[ \t]+\S", sample_content, re.MULTILINE)
)

sample_section = Section(
    title=sample_title,
    content=sample_content,
    has_table=has_table,
    has_list=has_list,
    language="en",
)
print(f"\nSection flags: has_table={has_table}, has_list={has_list}")

---
# 1. Narration Walkthrough

In [None]:
# 1.1 Base system prompt — report its total size and show a short preview
prompt_size = len(NARRATION_SYSTEM_PROMPT)
prompt_head = NARRATION_SYSTEM_PROMPT[:200]

# Assemble the report first, then emit it with a single print call.
report_lines = [
    "Base system prompt (NARRATION_SYSTEM_PROMPT):\n",
    f"  {prompt_size:,} chars",
    f"  First 200 chars: {prompt_head}...",
    "\nLanguage instruction is appended dynamically for non-English pairs.",
    "Any language pair the LLM can handle is supported.",
]
print("\n".join(report_lines))

---
## 1.2 Narration adaptation

In [None]:
# 1.2 Run adapt_narration_section() on sample content and time the call.
# Leaves narration_output and narration_elapsed (seconds) for the metrics cell.
source_lang = "en"
target_lang = "en"

print(f"Language pair: {source_lang} -> {target_lang}")
print(f"Input: {len(sample_section.content):,} chars, "
      f"has_table={sample_section.has_table}, has_list={sample_section.has_list}")
print("Generating...\n")

# perf_counter() is monotonic, so the measured duration cannot be skewed (or
# go negative) if the system clock is adjusted while the LLM is generating.
t0 = time.perf_counter()
narration_output = adapt_narration_section(
    sample_section, llm,
    source_lang=source_lang,
    target_lang=target_lang,
)
narration_elapsed = time.perf_counter() - t0

print(f"--- Output ({narration_elapsed:.1f}s) ---\n")
print(narration_output)

In [None]:
# 1.3 Narration quality metrics for narration_output / narration_elapsed
input_words = len(sample_section.content.split())
output_words = len(narration_output.split())
ratio = output_words / input_words if input_words else 0

# Guard the throughput figure the same way as the ratio: a zero elapsed time
# would otherwise raise ZeroDivisionError.
words_per_sec = output_words / narration_elapsed if narration_elapsed > 0 else 0.0

pause_short = narration_output.count("[PAUSE_SHORT]")
pause_medium = narration_output.count("[PAUSE_MEDIUM]")
pause_long = narration_output.count("[PAUSE_LONG]")
total_pauses = pause_short + pause_medium + pause_long

# Residual artifact checks.
# Pause markers are stripped before counting markdown characters: the "_" in
# "[PAUSE_SHORT]" etc. would otherwise be counted as leftover markdown and
# trigger the "check output" warning on any narration with a few pauses.
text_without_pauses = re.sub(r"\[PAUSE_(?:SHORT|MEDIUM|LONG)\]", "", narration_output)
residual_md = len(re.findall(r"[*_#`|]", text_without_pauses))
residual_urls = len(re.findall(r"https?://\S+", narration_output))
residual_code = len(re.findall(r"`[^`]+`", narration_output))

print("Narration Quality Metrics")
print("=" * 40)
print(f"Input words:       {input_words:>6,}")
print(f"Output words:      {output_words:>6,}")
print(f"Ratio (out/in):    {ratio:>6.2f}x")
print(f"Generation time:   {narration_elapsed:>6.1f}s")
print(f"Words/sec:         {words_per_sec:>6.1f}")
print()
print(f"Pauses: {total_pauses} total")
print(f"  [PAUSE_SHORT]:   {pause_short}")
print(f"  [PAUSE_MEDIUM]:  {pause_medium}")
print(f"  [PAUSE_LONG]:    {pause_long}")
print()
print("Residual artifacts:")
# A handful of stray markdown characters is tolerated; URLs and code spans
# are flagged as soon as one is found.
print(f"  Markdown chars:  {residual_md}" + ("  *** check output" if residual_md > 5 else ""))
print(f"  URLs:            {residual_urls}" + ("  *** should be 0" if residual_urls else ""))
print(f"  Code spans:      {residual_code}" + ("  *** should be 0" if residual_code else ""))

---
# 2. A/B Comparison

In [None]:
# 2.0 Generic A/B comparison helper

def _words_per_sec(words, seconds):
    """Return words / seconds, or 0.0 when no measurable time elapsed."""
    return words / seconds if seconds > 0 else 0.0


def _print_output(tag, label, text, max_chars):
    """Print a banner for one variant followed by its (possibly truncated) text."""
    print(f"\n{'=' * 60}")
    print(f"{tag}: {label}")
    print(f"{'=' * 60}")
    print(text[:max_chars])
    if len(text) > max_chars:
        print(f"\n... ({len(text) - max_chars:,} chars truncated)")


def compare_ab(label_a, label_b, run_a, run_b):
    """Run two callables and display a comparison summary.

    Args:
        label_a: Display name for variant A.
        label_b: Display name for variant B.
        run_a: Zero-argument callable returning (text, elapsed_seconds).
        run_b: Zero-argument callable returning (text, elapsed_seconds).

    Returns:
        None. The summary table and both outputs (truncated to 2000
        characters each) are printed to stdout.
    """
    print(f"Running A: {label_a}...")
    text_a, time_a = run_a()
    print(f"Running B: {label_b}...")
    text_b, time_b = run_b()

    words_a = len(text_a.split())
    words_b = len(text_b.split())

    # Counting the shared "[PAUSE" prefix covers every pause marker variant
    pauses_a = text_a.count("[PAUSE")
    pauses_b = text_b.count("[PAUSE")

    # A runner may report 0 seconds (e.g. a stubbed or cached result); report
    # 0.0 words/sec in that case instead of raising ZeroDivisionError.
    rate_a = _words_per_sec(words_a, time_a)
    rate_b = _words_per_sec(words_b, time_b)

    # Summary table
    print(f"\n{'Metric':<20} {'A: ' + label_a:>20} {'B: ' + label_b:>20}")
    print(f"{'-' * 20} {'-' * 20} {'-' * 20}")
    print(f"{'Words':<20} {words_a:>20,} {words_b:>20,}")
    print(f"{'Pauses':<20} {pauses_a:>20} {pauses_b:>20}")
    print(f"{'Time (s)':<20} {time_a:>20.1f} {time_b:>20.1f}")
    print(f"{'Words/sec':<20} {rate_a:>20.1f} {rate_b:>20.1f}")

    # Show truncated outputs
    max_chars = 2000
    _print_output("A", label_a, text_a, max_chars)
    _print_output("B", label_b, text_b, max_chars)

print("compare_ab() defined")

---
## 2.1 Language pairs

In [None]:
# 2.1 Compare en->en vs en->fr narration
# Both runners use perf_counter(): it is monotonic, so the reported duration
# is not affected by system clock adjustments during generation.

def run_en_en():
    """Narrate the sample section en -> en; return (text, elapsed_seconds)."""
    t0 = time.perf_counter()
    text = adapt_narration_section(sample_section, llm, source_lang="en", target_lang="en")
    return text, time.perf_counter() - t0

def run_en_fr():
    """Narrate the sample section en -> fr; return (text, elapsed_seconds)."""
    t0 = time.perf_counter()
    text = adapt_narration_section(sample_section, llm, source_lang="en", target_lang="fr")
    return text, time.perf_counter() - t0

compare_ab("en -> en", "en -> fr", run_en_en, run_en_fr)

---
## 2.2 Different models

In [None]:
# 2.2 Compare two models — edit llm_b to test your preferred alternative

## Ollama alternative
llm_b = OllamaLLM(model="qwen3:8b", temperature=0.7)

## MLX alternative
# llm_b = MLXLLM(model="Qwen/Qwen3-14B-MLX-4bit", temperature=0.7)

# Preflight for Ollama model B
if isinstance(llm_b, OllamaLLM):
    ollama_preflight(llm_b)

# Column labels: backend class name plus model, e.g. "OllamaLLM(qwen3:8b)"
label_a = f"{type(llm).__name__}({llm.model})"
label_b = f"{type(llm_b).__name__}({llm_b.model})"

# Both runners use perf_counter(): it is monotonic, so the reported duration
# is not affected by system clock adjustments during generation.

def run_model_a():
    """Narrate the sample section with `llm`; return (text, elapsed_seconds)."""
    t0 = time.perf_counter()
    text = adapt_narration_section(sample_section, llm, source_lang="en", target_lang="en")
    return text, time.perf_counter() - t0

def run_model_b():
    """Narrate the sample section with `llm_b`; return (text, elapsed_seconds)."""
    t0 = time.perf_counter()
    text = adapt_narration_section(sample_section, llm_b, source_lang="en", target_lang="en")
    return text, time.perf_counter() - t0

compare_ab(label_a, label_b, run_model_a, run_model_b)

---
# 3. Custom Prompt Testing

In [None]:
# 3.1 Custom narration prompt — edit the system prompt below and run
custom_system = """\
You are a narrator adapting written content for audiobook narration.

Rules:
1. Convert all tables into clear comparative statements
2. Expand abbreviations on first occurrence
3. Write numbers as spoken words ("twenty-five" not "25")
4. Remove all markdown formatting, URLs, and code references
5. Add [PAUSE_SHORT], [PAUSE_MEDIUM], [PAUSE_LONG] for natural pacing
6. Use an engaging, authoritative tone

Output ONLY the narration text. No commentary.
"""

# The user message carries the sample section's title and raw content
user_message = f"Section: {sample_title}\n\nContent:\n{sample_content}"

print(f"System prompt: {len(custom_system)} chars")
print(f"User message: {len(user_message)} chars")
print("Generating...\n")

# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments during generation.
t0 = time.perf_counter()
custom_narration = llm_generate(custom_system, user_message, llm)
custom_elapsed = time.perf_counter() - t0

print(f"--- Output ({custom_elapsed:.1f}s, {len(custom_narration.split())} words) ---\n")
print(custom_narration)

---
## 3.2 Free-form prompt testing

In [None]:
# 3.2 Blank slate — edit both prompts and run
system_prompt = """You are a helpful assistant."""

# NOTE: this rebinds user_message, replacing the value set in cell 3.1
user_message = """Summarize the following in 3 bullet points:

[paste your content here]
"""

print(f"System: {len(system_prompt)} chars")
print(f"User: {len(user_message)} chars")
print("Generating...\n")

# perf_counter() is monotonic, so the measured duration is not affected by
# system clock adjustments during generation.
t0 = time.perf_counter()
result = llm_generate(system_prompt, user_message, llm)
elapsed = time.perf_counter() - t0

print(f"--- Output ({elapsed:.1f}s, {len(result.split())} words) ---\n")
print(result)