# Dialogue Prompt Testing

Interactive workbench for testing podcast dialogue prompts in isolation — without running the full pipeline.

**What you can do here:**
- Walk through outline, dialogue segment, and intro/outro generation step by step
- Measure quality metrics (word count vs target, speaker balance, turn lengths)
- A/B compare `two_hosts` vs `host_guest` formats side by side
- Test custom dialogue prompts with raw `llm_generate()` calls

Run cells 0-3 (setup) first, then jump to any section.

In [None]:
# Setup — add src/ to path so we can import all packages
import sys
import re
import time
from pathlib import Path

# src/ is resolved relative to the *parent* of the current working directory,
# so the kernel must be started from a directory that sits next to src/.
# This has to run before the project imports below.
sys.path.insert(0, str(Path.cwd().parent / "src"))

# LLM backends and the generic generation helper
from shared.providers import (
    OllamaLLM, MLXLLM,
    llm_generate, ollama_preflight,
)
# Podcast config objects, result types, and generation entry points
from podcast import (
    DialogueConfig, PodcastConfig,
    PodcastOutline, DialogueSegment,
    generate_outline, generate_dialogue_segment, generate_intro_outro,
)
# Prompt templates used by the podcast package
from podcast.prompts import (
    DIALOGUE_SYSTEM_PROMPTS, OUTLINE_SYSTEM_PROMPT,
    INTRO_PROMPT, OUTRO_PROMPT,
)

# Reaching this line means every import above resolved.
print("Imports OK")

In [None]:
# Choose the LLM backend: keep exactly one `llm = ...` assignment active.

# Option 1: Ollama (default)
llm = OllamaLLM(model="qwen3:14b", temperature=0.7)

# Option 2: MLX (Apple Silicon local)
# llm = MLXLLM(model="Qwen/Qwen3-14B-MLX-4bit", temperature=0.7)

# Only an Ollama backend gets the preflight call; any other backend is
# simply reported by class name and model.
if not isinstance(llm, OllamaLLM):
    print(f"Using: {type(llm).__name__} ({llm.model})")
else:
    ollama_preflight(llm)
    print(f"Ollama ready: {llm.model} @ {llm.url}")

In [None]:
# Sample content — auto-discover from previous runs, or use inline fallback.
# Defines `sample_title` and `sample_content`, which the later cells consume.
OUTPUT_DIR = Path.cwd().parent / "output"

# Find section .md files from previous pipeline runs; sorted() keeps the
# default pick stable between executions.
section_files = sorted(OUTPUT_DIR.glob("*/sections/*.md")) if OUTPUT_DIR.exists() else []

if section_files:
    # Use the first available section
    sample_path = section_files[0]
    # Decode explicitly as UTF-8: the platform default encoding would
    # otherwise be used and can fail on non-ASCII Markdown (em-dashes etc.).
    sample_content = sample_path.read_text(encoding="utf-8")
    # Derive a readable title from the file name, e.g. "my_section" -> "My Section"
    sample_title = sample_path.stem.replace("_", " ").title()
    print(f"Loaded: {sample_path.relative_to(OUTPUT_DIR)}")
    print(f"Title: {sample_title}")
    print(f"Length: {len(sample_content):,} chars")
    if len(section_files) > 1:
        print(f"\n{len(section_files)} sections available — change sample_path above to try others:")
        # Cap the listing at 10 entries so the cell output stays short
        for f in section_files[:10]:
            print(f"  {f.relative_to(OUTPUT_DIR)}")
        if len(section_files) > 10:
            print(f"  ... and {len(section_files) - 10} more")
else:
    # Inline fallback with table, list, and numbers
    sample_title = "Performance Benchmarks"
    sample_content = """## Performance Benchmarks

The following table summarizes the results across all architectures tested in Q3 2024.

| Model        | Params | Accuracy | Latency (ms) | Memory (GB) |
|--------------|--------|----------|--------------|-------------|
| Baseline CNN | 25.6M  | 87.61%   | 12.3         | 2.1         |
| ResNet-50    | 25.6M  | 91.42%   | 18.7         | 3.4         |
| ViT-B/16     | 86.6M  | 93.18%   | 24.1         | 6.2         |
| MLP-Mixer    | 59.9M  | 89.73%   | 15.9         | 4.8         |

Key findings from the evaluation:

1. ViT-B/16 achieved the highest accuracy at 93.18%, outperforming the CNN baseline by 5.57 percentage points
2. The latency-accuracy tradeoff favors ResNet-50 for production deployments
3. MLP-Mixer offers a compelling middle ground — 89.73% accuracy with only 15.9ms latency
4. All models were evaluated on the ImageNet-1k validation set (50,000 images)

For details see https://arxiv.org/abs/2024.12345 and the full results at github.com/example/benchmarks.

The `torch.compile()` optimization reduced inference latency by approximately 30% across all architectures.
"""
    print("Using inline fallback content (no previous runs found)")
    print(f"Title: {sample_title}")
    print(f"Length: {len(sample_content):,} chars")

print("\nReady — sample_title and sample_content set")

---
# 1. Dialogue Walkthrough

In [None]:
# 1.1 Podcast settings — deliberately small targets so test runs stay fast
dialogue_config = DialogueConfig(
    format="two_hosts",
    speaker1_name="Alex",
    speaker2_name="Sam",
    source_lang="en",
    target_lang="en",
    segment_target_words=800,  # production uses 1200; smaller keeps testing quick
)
podcast_config = PodcastConfig(dialogue=dialogue_config, llm=llm)

# Echo the effective settings as a single multi-line report.
print(
    f"Format:       {dialogue_config.format}",
    f"Speakers:     {dialogue_config.speaker1_name} & {dialogue_config.speaker2_name}",
    f"Language:     {dialogue_config.source_lang} -> {dialogue_config.target_lang}",
    f"Target words: {dialogue_config.segment_target_words} per segment",
    f"LLM:          {type(llm).__name__} ({llm.model})",
    "",
    f"Available formats: {list(DIALOGUE_SYSTEM_PROMPTS.keys())}",
    sep="\n",
)

---
## 1.2 Outline generation

In [None]:
# 1.2 Generate podcast outline from sample sections
# Sections are (title, content) tuples
sample_sections = [(sample_title, sample_content)]

print(f"Input: {len(sample_sections)} section(s)")
print("Generating outline...\n")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
outline = generate_outline(sample_sections, podcast_config)
outline_elapsed = time.perf_counter() - t0

print(f"--- Outline ({outline_elapsed:.1f}s) ---")
print(f"Title: {outline.title}")
print(f"Segments: {len(outline.segments)}")
print()
print(outline.raw_text)

---
## 1.3 Dialogue segment generation

In [None]:
# 1.3 Generate dialogue for the first section
print(f"Generating dialogue for: {sample_title}")
print(f"Target: ~{dialogue_config.segment_target_words} words")
print("Generating...\n")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
segment = generate_dialogue_segment(
    section_content=sample_content,
    section_title=sample_title,
    outline=outline,
    segment_index=0,
    # This is segment 0, so there is no earlier summary or topic list to pass.
    rolling_summary="",
    covered_topics=[],
    config=podcast_config,
)
# Kept as a named value because the metrics cell reuses it for words/sec.
dialogue_elapsed = time.perf_counter() - t0

print(f"--- Dialogue ({dialogue_elapsed:.1f}s) ---\n")
print(segment.dialogue)
print("\n--- Rolling Summary ---\n")
print(segment.updated_summary)
print("\n--- Covered Topics ---\n")
for topic in segment.covered_topics:
    print(f"  - {topic}")

In [None]:
# 1.4 Dialogue quality metrics
dialogue_text = segment.dialogue
target = dialogue_config.segment_target_words
# Whitespace-split count over the raw text, so inline tags count as tokens too.
dialogue_words = len(dialogue_text.split())

# Speaker turn analysis: a turn is the text after a speaker tag, up to the
# next speaker tag or the end of the dialogue.
s1_turns = re.findall(r"\[S1\](.*?)(?=\[S[12]\]|$)", dialogue_text, re.DOTALL)
s2_turns = re.findall(r"\[S2\](.*?)(?=\[S[12]\]|$)", dialogue_text, re.DOTALL)

s1_words = [len(turn.split()) for turn in s1_turns]
s2_words = [len(turn.split()) for turn in s2_turns]
all_turn_words = s1_words + s2_words

s1_total = sum(s1_words)
s2_total = sum(s2_words)
total_speaker_words = s1_total + s2_total

# Balance is the quieter speaker's share of the more talkative one
# (1.0 means perfectly even); it falls back to 0 when no tagged turns exist.
heavier_total = max(s1_total, s2_total)
if heavier_total > 0:
    balance = min(s1_total, s2_total) / heavier_total
else:
    balance = 0
balance_verdict = "  (good)" if balance > 0.6 else "  *** imbalanced"

# Pause counts
pauses = len(re.findall(r"\[PAUSE\]", dialogue_text))

print("Dialogue Quality Metrics")
print("=" * 40)
print(f"Total words:       {dialogue_words:>6,}")
print(f"Target words:      {target:>6,}")
print(f"Hit rate:          {dialogue_words / target * 100:>5.0f}%")
print(f"Generation time:   {dialogue_elapsed:>6.1f}s")
print(f"Words/sec:         {dialogue_words / dialogue_elapsed:>6.1f}")
print()
print("Speaker Balance")
print(f"  {dialogue_config.speaker1_name} (S1): {len(s1_turns)} turns, {s1_total} words")
print(f"  {dialogue_config.speaker2_name} (S2): {len(s2_turns)} turns, {s2_total} words")
print(f"  Balance ratio:   {balance:.2f}" + balance_verdict)
print()
# Turn-length stats are only meaningful when at least one turn was found.
if all_turn_words:
    print("Turn Length (words)")
    print(f"  Average:         {sum(all_turn_words) / len(all_turn_words):>6.0f}")
    print(f"  Min:             {min(all_turn_words):>6}")
    print(f"  Max:             {max(all_turn_words):>6}")
print()
print(f"Pauses [PAUSE]:    {pauses}")

---
## 1.5 Intro & outro generation

In [None]:
# 1.5 Generate intro and outro
# Only one segment was generated in this walkthrough, so its covered topics
# are the complete topic list handed to the intro/outro prompts.
all_topics = segment.covered_topics

print("Generating intro and outro...\n")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
intro, outro = generate_intro_outro(outline, all_topics, podcast_config)
intro_outro_elapsed = time.perf_counter() - t0

print(f"--- Intro ({len(intro.split())} words) ---\n")
print(intro)
print(f"\n--- Outro ({len(outro.split())} words) ---\n")
print(outro)
print(f"\nGeneration time: {intro_outro_elapsed:.1f}s")

---
# 2. A/B Comparison

In [None]:
# 2.0 Generic A/B comparison helper

def _speaker_balance(text):
    """Return the min/max word ratio between [S1] and [S2] turns.

    Returns None when the text contains no words in tagged turns, which is
    how non-dialogue output is detected.
    """
    s1 = sum(len(t.split()) for t in re.findall(r"\[S1\](.*?)(?=\[S[12]\]|$)", text, re.DOTALL))
    s2 = sum(len(t.split()) for t in re.findall(r"\[S2\](.*?)(?=\[S[12]\]|$)", text, re.DOTALL))
    if s1 + s2 == 0:
        return None
    return min(s1, s2) / max(s1, s2)


def _words_per_sec(words, elapsed):
    """Format throughput with one decimal, or "n/a" when elapsed is not positive."""
    if elapsed > 0:
        return f"{words / elapsed:.1f}"
    return "n/a"


def _print_output(tag, label, text, max_chars):
    """Print one labelled output, cut to max_chars with a truncation note."""
    print(f"\n{'=' * 60}")
    print(f"{tag}: {label}")
    print(f"{'=' * 60}")
    print(text[:max_chars])
    if len(text) > max_chars:
        print(f"\n... ({len(text) - max_chars:,} chars truncated)")


def compare_ab(label_a, label_b, run_a, run_b):
    """Run two callables and display a comparison summary.

    Args:
        label_a: Display name for variant A.
        label_b: Display name for variant B.
        run_a: Zero-argument callable returning (text, elapsed_seconds).
        run_b: Zero-argument callable returning (text, elapsed_seconds).

    Prints a metric table (words, pauses, time, words/sec) followed by the
    first 2000 characters of each output. A speaker-balance row is added
    only when at least one output contains [S1]/[S2] tagged turns.
    Words/sec is shown as "n/a" if a callable reports zero elapsed time.

    Returns:
        None; all results are printed.
    """
    print(f"Running A: {label_a}...")
    text_a, time_a = run_a()
    print(f"Running B: {label_b}...")
    text_b, time_b = run_b()

    words_a = len(text_a.split())
    words_b = len(text_b.split())

    # Prefix match: counts every tag that starts with "[PAUSE".
    pauses_a = text_a.count("[PAUSE")
    pauses_b = text_b.count("[PAUSE")

    # Guarded so a zero elapsed time cannot raise ZeroDivisionError.
    rate_a = _words_per_sec(words_a, time_a)
    rate_b = _words_per_sec(words_b, time_b)

    # Speaker balance (auto-detected)
    bal_a = _speaker_balance(text_a)
    bal_b = _speaker_balance(text_b)

    # Summary table
    print(f"\n{'Metric':<20} {'A: ' + label_a:>20} {'B: ' + label_b:>20}")
    print(f"{'-' * 20} {'-' * 20} {'-' * 20}")
    print(f"{'Words':<20} {words_a:>20,} {words_b:>20,}")
    print(f"{'Pauses':<20} {pauses_a:>20} {pauses_b:>20}")
    print(f"{'Time (s)':<20} {time_a:>20.1f} {time_b:>20.1f}")
    print(f"{'Words/sec':<20} {rate_a:>20} {rate_b:>20}")
    if bal_a is not None or bal_b is not None:
        bal_a_str = f"{bal_a:.2f}" if bal_a is not None else "n/a"
        bal_b_str = f"{bal_b:.2f}" if bal_b is not None else "n/a"
        print(f"{'Speaker balance':<20} {bal_a_str:>20} {bal_b_str:>20}")

    # Show truncated outputs
    max_chars = 2000
    _print_output("A", label_a, text_a, max_chars)
    _print_output("B", label_b, text_b, max_chars)

print("compare_ab() defined")

---
## 2.1 Format comparison

In [None]:
# 2.1 Compare two_hosts vs host_guest podcast formats

def _run_format(dialogue_format, speaker2_name):
    """Generate an outline plus one dialogue segment for a given format.

    Both variants share everything except the format name and the second
    speaker's name, so the pipeline lives here once.

    Args:
        dialogue_format: Value passed as DialogueConfig(format=...).
        speaker2_name: Name of the second speaker.

    Returns:
        (dialogue_text, elapsed_seconds), the shape compare_ab() expects.
        The elapsed time covers both the outline and the segment call.
    """
    cfg = PodcastConfig(
        dialogue=DialogueConfig(
            format=dialogue_format,
            speaker1_name="Alex",
            speaker2_name=speaker2_name,
            segment_target_words=600,
        ),
        llm=llm,
    )
    # perf_counter() is a monotonic clock, so the measured duration cannot
    # be skewed by wall-clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    ol = generate_outline([(sample_title, sample_content)], cfg)
    seg = generate_dialogue_segment(
        sample_content, sample_title, ol, 0, "", [], cfg,
    )
    return seg.dialogue, time.perf_counter() - t0


def run_two_hosts():
    """Variant A: two co-hosts, Alex and Sam."""
    return _run_format("two_hosts", "Sam")


def run_host_guest():
    """Variant B: host Alex with guest Dr. Chen."""
    return _run_format("host_guest", "Dr. Chen")


compare_ab("two_hosts", "host_guest", run_two_hosts, run_host_guest)

---
# 3. Custom Prompt Testing

In [None]:
# 3.1 Custom dialogue prompt — edit the system prompt with {speaker1}/{speaker2} placeholders
# The prompt is an f-string, so the names are substituted as soon as this
# cell runs; change speaker1/speaker2 here and re-run to rename the speakers.
speaker1 = "Alex"
speaker2 = "Sam"

custom_dialogue_system = f"""\
You are writing a podcast dialogue between {speaker1} and {speaker2}.

Rules:
1. Use [S1] for {speaker1}'s lines and [S2] for {speaker2}'s lines
2. Keep it conversational — short turns, natural reactions
3. Cover the key points from the source material
4. Target approximately 600 words total
5. Start with [S1] and alternate naturally

Output ONLY the dialogue. No stage directions or commentary.
"""

dialogue_user_msg = f"Topic: {sample_title}\n\nSource material:\n{sample_content}"

print(f"Speakers: {speaker1} & {speaker2}")
print("Generating...\n")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
custom_dialogue = llm_generate(custom_dialogue_system, dialogue_user_msg, llm)
custom_dialogue_elapsed = time.perf_counter() - t0

# Quick stats: total whitespace-separated tokens and per-speaker tag counts
cd_words = len(custom_dialogue.split())
cd_s1 = len(re.findall(r"\[S1\]", custom_dialogue))
cd_s2 = len(re.findall(r"\[S2\]", custom_dialogue))

print(f"--- Output ({custom_dialogue_elapsed:.1f}s, {cd_words} words, {cd_s1}+{cd_s2} turns) ---\n")
print(custom_dialogue)

---
## 3.2 Free-form prompt testing

In [None]:
# 3.2 Blank slate — edit both prompts and run
system_prompt = """You are a helpful assistant."""

# Replace the bracketed placeholder with the text you want to test.
user_message = """Summarize the following in 3 bullet points:

[paste your content here]
"""

print(f"System: {len(system_prompt)} chars")
print(f"User: {len(user_message)} chars")
print("Generating...\n")

# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by wall-clock adjustments the way time.time() differences can.
t0 = time.perf_counter()
result = llm_generate(system_prompt, user_message, llm)
elapsed = time.perf_counter() - t0

print(f"--- Output ({elapsed:.1f}s, {len(result.split())} words) ---\n")
print(result)