# Lab 2: Prompt Chains & Testing Workflows

In this lab, you'll move beyond single prompts and build **multi-step prompt chains** that tackle complex tasks by breaking them into a series of connected steps. You'll also implement a prompt testing framework to compare and evaluate prompt quality systematically.

## Learning Objectives
- Build multi-step prompt chains for complex tasks
- Implement prompt testing and evaluation
- Design iterative refinement workflows

**Duration:** 45 minutes | **Difficulty:** Intermediate

## Part 1: Simple Prompt Chains

A **prompt chain** breaks a complex task into discrete steps where the output of one step feeds into the next. This mirrors how humans tackle large problems: research first, then outline, then draft.

Since we are working without a live LLM, each step uses a **transform function** that simulates realistic AI output. In production, you would replace these with actual API calls.

In [None]:
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional, Tuple
import textwrap
import re
import random
import matplotlib.pyplot as plt
import numpy as np


@dataclass
class ChainStep:
    """A single step in a prompt chain.

    Instances are created by ``PromptChain.add_step`` and consumed, in
    insertion order, by ``PromptChain.run``.
    """
    # Human-readable label; copied into the trace as ``StepResult.step_name``.
    name: str
    # Prompt text. ``PromptChain.run`` replaces every literal "{input}" in it
    # with the text flowing into this step.
    prompt_template: str
    # Receives the filled-in prompt and returns this step's output. When left
    # as None, ``PromptChain.run`` passes the filled-in prompt through as-is.
    transform_fn: Optional[Callable[[str], str]] = None


@dataclass
class StepResult:
    """Record of what happened at one step.

    ``PromptChain.run`` appends one of these per executed step.
    """
    # Name of the ChainStep that produced this record.
    step_name: str
    # The prompt after "{input}" was substituted, exactly as handed to the
    # step's transform function.
    prompt_sent: str
    # Text the step produced; it becomes the input of the following step.
    output: str


class PromptChain:
    """Ordered pipeline of prompt steps in which each output feeds the next step."""

    def __init__(self):
        # Steps in execution order, and the per-step log of the latest run.
        self._steps: List[ChainStep] = []
        self._trace: List[StepResult] = []

    def add_step(self, name: str, prompt_template: str,
                 transform_fn: Optional[Callable[[str], str]] = None) -> 'PromptChain':
        """Append a step to the end of the chain.

        Args:
            name: Label shown in the trace for this step.
            prompt_template: Prompt text; every "{input}" is replaced with
                             the text entering the step.
            transform_fn: Stand-in for the model call. It is given the
                          filled-in prompt and returns the step's output.
        Returns:
            The chain itself, so calls can be strung together.
        """
        step = ChainStep(name, prompt_template, transform_fn)
        self._steps.append(step)
        return self

    def run(self, initial_input: str) -> str:
        """Run every step in order and return the last step's output.

        The trace of any previous run is discarded first. With no steps
        registered, ``initial_input`` is returned unchanged.
        """
        self._trace = []
        text = initial_input

        for step in self._steps:
            filled = step.prompt_template.replace("{input}", text)
            # A step without a transform simply forwards its filled prompt.
            produced = step.transform_fn(filled) if step.transform_fn else filled
            record = StepResult(step_name=step.name, prompt_sent=filled, output=produced)
            self._trace.append(record)
            text = produced

        return text

    def show_trace(self) -> None:
        """Print each step of the latest run: its prompt (shortened) and output."""
        rule = "=" * 60
        for number, entry in enumerate(self._trace, start=1):
            print(rule)
            print(f"  Step {number}: {entry.step_name}")
            print(rule)
            print("  PROMPT:")
            # Prompts embed the previous output, so cap the preview at 200 chars.
            preview = textwrap.shorten(entry.prompt_sent, width=200)
            print(textwrap.indent(preview, "    "))
            print("\n  OUTPUT:")
            print(textwrap.indent(entry.output, "    "))
            print()

    def get_trace(self) -> List[StepResult]:
        """Return a shallow copy of the latest run's step records."""
        return self._trace[:]


# ── Simulated transform functions ─────────────────────────────────

def sim_extract_topics(prompt: str) -> str:
    """Return a canned "key topics" answer standing in for a model response.

    Args:
        prompt: The filled-in prompt. It is ignored; the reply is fixed.
    Returns:
        A heading line followed by five numbered topics, one per line.
    """
    topic_lines = [
        "Key Topics Identified:",
        "1. Current adoption rates and growth trajectory",
        "2. Primary use cases in clinical settings",
        "3. Regulatory landscape and compliance requirements",
        "4. Cost-benefit analysis for hospital systems",
        "5. Patient outcome improvements backed by studies",
    ]
    return "\n".join(topic_lines)

def sim_create_outline(prompt: str) -> str:
    """Return a canned article outline standing in for a model response.

    Args:
        prompt: The filled-in prompt. It is ignored; the reply is fixed.
    Returns:
        A heading line plus six top-level sections with indented sub-points,
        one entry per line.
    """
    outline_lines = [
        "Article Outline:",
        "I. Introduction - The AI revolution in healthcare",
        "   A. Hook: startling statistic on diagnostic errors",
        "   B. Thesis: AI is transforming clinical outcomes",
        "II. Adoption Landscape",
        "   A. Current rates: 38% of hospitals using some form of AI",
        "   B. Growth: projected 45% CAGR through 2028",
        "III. Clinical Use Cases",
        "   A. Diagnostic imaging (radiology, pathology)",
        "   B. Predictive patient monitoring",
        "   C. Drug discovery acceleration",
        "IV. Regulatory & Compliance",
        "   A. FDA approval pathways for AI/ML devices",
        "   B. HIPAA considerations for training data",
        "V. ROI and Outcomes",
        "   A. Cost savings from reduced misdiagnosis",
        "   B. Improved patient satisfaction scores",
        "VI. Conclusion and Recommendations",
    ]
    return "\n".join(outline_lines)

def sim_write_intro(prompt: str) -> str:
    """Return a canned two-paragraph introduction standing in for a model response.

    Args:
        prompt: The filled-in prompt. It is ignored; the reply is fixed.
    Returns:
        Two paragraphs separated by one blank line.
    """
    # Each paragraph is stored as fragments that are re-joined with single spaces.
    opening = " ".join([
        "Every year, an estimated 12 million Americans receive a misdiagnosis",
        "in outpatient settings, according to a study published in BMJ Quality",
        "& Safety. For hospital administrators grappling with this challenge,",
        "artificial intelligence offers a compelling answer. From radiology",
        "suites where deep learning algorithms flag suspicious lesions to ICU",
        "floors where predictive models identify patients at risk of sepsis",
        "hours before traditional vital signs change, AI is no longer a",
        "futuristic concept\u2014it is a clinical reality.",
    ])
    roadmap = " ".join([
        "With 38% of US hospitals already deploying some form of AI and the",
        "market growing at a projected 45% compound annual growth rate, the",
        "question is no longer whether to adopt, but how to adopt",
        "responsibly. This article examines the current landscape, highlights",
        "proven use cases, navigates the regulatory environment, and makes",
        "the financial case for AI investment in clinical settings.",
    ])
    return opening + "\n\n" + roadmap


# ── Build and run a 3-step chain ──────────────────────────────────

# Three steps, each fed by the previous step's output via the "{input}" slot.
chain = PromptChain()
chain.add_step(
    "Extract Topics",
    "Extract the 5 most important topics from the following subject area: {input}",
    transform_fn=sim_extract_topics
)
chain.add_step(
    "Create Outline",
    "Create a detailed article outline organized around these topics:\n{input}",
    transform_fn=sim_create_outline
)
chain.add_step(
    "Write Introduction",
    "Write a compelling 150-word introduction for an article with this outline:\n{input}",
    transform_fn=sim_write_intro
)

# `result` is the output of the last step; the trace shows every step.
result = chain.run("AI in healthcare")
chain.show_trace()

print("=" * 60)
print("  FINAL OUTPUT")
print("=" * 60)
# textwrap.fill() treats newlines as ordinary whitespace, so filling the whole
# text at once would merge its paragraphs into one. Wrap each paragraph on its
# own and put the blank line back between them.
print("\n\n".join(textwrap.fill(paragraph, width=72)
                  for paragraph in result.split("\n\n")))

## Part 2: Workflow Patterns

Real-world prompt workflows go beyond simple sequences. Three common patterns:

| Pattern | Description | Use Case |
|---------|-------------|----------|
| **Sequential** | Steps run one after another | Article writing, report generation |
| **Fan-out** | Same prompt applied to many inputs, results collected | Batch analysis, multi-document review |
| **Iterative** | Same prompt re-applied to progressively refine output | Editing, polishing, improving quality |

In [None]:
class WorkflowEngine:
    """Stateless runners for the sequential, fan-out and iterative patterns."""

    @staticmethod
    def sequential(steps: List[Tuple[str, Callable[[str], str]]],
                   initial_input: str) -> List[Tuple[str, str]]:
        """Pipe text through the steps, recording every intermediate output.

        Args:
            steps: (step_name, transform_function) pairs, in execution order.
            initial_input: Text handed to the first step.
        Returns:
            One (step_name, output) pair per step, in execution order.
        """
        history: List[Tuple[str, str]] = []
        for label, transform in steps:
            # The newest recorded output is the input of the next step.
            incoming = history[-1][1] if history else initial_input
            history.append((label, transform(incoming)))
        return history

    @staticmethod
    def fan_out(transform_fn: Callable[[str], str],
                inputs: List[str]) -> List[Tuple[str, str]]:
        """Run one transform independently over each input.

        Returns:
            One (input_text, output_text) pair per input, in input order.
        """
        pairs: List[Tuple[str, str]] = []
        for item in inputs:
            pairs.append((item, transform_fn(item)))
        return pairs

    @staticmethod
    def iterative(transform_fn: Callable[[str, int], str],
                  initial_input: str,
                  iterations: int = 3) -> List[Tuple[int, str]]:
        """Re-apply one transform to its own output for a number of rounds.

        Args:
            transform_fn: Called as (current_text, round_number), with round
                          numbers starting at 1; returns the refined text.
            initial_input: Text handed to the first round.
            iterations: How many rounds to run.
        Returns:
            One (round_number, output_text) pair per round.
        """
        rounds: List[Tuple[int, str]] = []
        text = initial_input
        round_number = 1
        while round_number <= iterations:
            text = transform_fn(text, round_number)
            rounds.append((round_number, text))
            round_number += 1
        return rounds


engine = WorkflowEngine()

# ── Demo 1: Sequential ─────────────────────────────────────────
print("=" * 60, "  PATTERN: Sequential", "=" * 60, sep="\n")

# Each entry pairs a step label with a lambda that fakes the model's reply.
# Only the first lambda echoes its input; the other two return fixed text.
seq_steps = [
    ("Brainstorm", lambda text: "\n".join([
        f"Ideas generated from '{text}':",
        "- Personalized learning paths using ML",
        "- Automated grading with NLP",
        "- Intelligent tutoring chatbots",
    ])),
    ("Prioritize", lambda text: "\n".join([
        "Prioritized ideas (by impact):",
        "1. [HIGH]  Personalized learning paths - addresses diverse needs",
        "2. [MED]   Intelligent tutoring chatbots - scalable support",
        "3. [LOW]   Automated grading - saves time but lower strategic value",
    ])),
    ("Action Plan", lambda text: "\n".join([
        "90-Day Action Plan:",
        "Month 1: Pilot personalized learning with 2 courses",
        "Month 2: Integrate tutoring chatbot on platform",
        "Month 3: Evaluate results, plan automated grading pilot",
    ])),
]

seq_results = engine.sequential(seq_steps, "AI in education")

# Show every intermediate output under its step label.
for name, output in seq_results:
    print(f"\n  [{name}]")
    print(textwrap.indent(output, "    "))

# ── Demo 2: Fan-out ────────────────────────────────────────────
print("\n" + "=" * 60)
print("  PATTERN: Fan-out (Sentiment Analysis)")
print("=" * 60)

reviews = [
    "Absolutely love this product! Best purchase I have made all year.",
    "It works okay but the battery life is disappointing for the price.",
    "Terrible customer service. Waited 3 weeks for a response.",
    "Good value for money. Solid build quality and fast shipping.",
    "The software crashes constantly. Returning it this week."
]

# Canned (label, confidence) per review, keyed by the review's position in
# `reviews`.
sentiment_map = {
    0: ("POSITIVE", 0.92),
    1: ("MIXED",    0.61),
    2: ("NEGATIVE", 0.15),
    3: ("POSITIVE", 0.78),
    4: ("NEGATIVE", 0.08),
}

def sim_sentiment(review: str) -> str:
    """Simulate a sentiment classifier for the demo reviews.

    Args:
        review: Review text; it is looked up by exact match in ``reviews``.
    Returns:
        "<LABEL> (confidence: NN%)". A review that is not in ``reviews`` gets
        the neutral default "NEUTRAL (confidence: 50%)".
    """
    try:
        idx = reviews.index(review)
    except ValueError:
        # Unknown text must not borrow the first review's label; None is not a
        # key of sentiment_map, so the neutral default below applies.
        idx = None
    label, conf = sentiment_map.get(idx, ("NEUTRAL", 0.50))
    return f"{label} (confidence: {conf:.0%})"

fan_results = engine.fan_out(sim_sentiment, reviews)

# Two-column table: the review padded to 55 characters, then its label.
print("\n  " + "Review (truncated)".ljust(55) + " Sentiment")
print("  " + "-" * 75)
for review, sentiment in fan_results:
    # Clip long reviews to 52 characters plus an ellipsis so they fit the column.
    if len(review) > 52:
        short = f"{review[:52]}..."
    else:
        short = review
    print("  " + short.ljust(55) + " " + sentiment)

# ── Demo 3: Iterative ──────────────────────────────────────────
print("\n" + "=" * 60, "  PATTERN: Iterative (Product Description Refinement)", "=" * 60, sep="\n")

# Three versions of the same product description, from rough to polished.
# Each one is stored as fragments that are re-joined with single spaces.
refinement_stages = [
    " ".join([
        "SmartFit Pro is a fitness tracker. It tracks steps and heart rate.",
        "It has a screen and a band. Buy it now.",
    ]),
    " ".join([
        "The SmartFit Pro fitness tracker monitors your steps, heart rate,",
        "and sleep patterns in real time. Its vibrant OLED display and",
        "comfortable silicone band make it ideal for all-day wear.",
    ]),
    " ".join([
        "Meet SmartFit Pro\u2014the fitness tracker that works as hard as you do.",
        "Track steps, heart rate, and sleep with medical-grade sensors.",
        "The brilliant OLED display delivers real-time insights at a glance,",
        "while the ultra-light silicone band ensures all-day comfort.",
        "With 7-day battery life and water resistance to 50m, SmartFit Pro",
        "keeps up no matter where your day takes you.",
    ]),
]

def sim_refine(text: str, iteration: int) -> str:
    """Return the canned description for a 1-based refinement round.

    Args:
        text: Current text. It is ignored; the stages are fixed.
        iteration: Round number. Round k returns stage k-1, and rounds past
                   the number of stages keep returning the last stage.
    """
    stage_count = len(refinement_stages)
    capped = iteration if iteration < stage_count else stage_count
    return refinement_stages[capped - 1]

iter_results = engine.iterative(sim_refine, refinement_stages[0], iterations=3)

# Print each round's text wrapped to 68 columns and indented under a header
# that also reports its word count.
for round_num, output in iter_results:
    word_count = len(output.split())
    print(f"\n  [Round {round_num}] ({word_count} words)")
    print("\n".join("    " + row for row in textwrap.wrap(output, width=68)))

## Exercise 1: Build a Content Production Chain

Create a 4-step content production chain that takes a topic and produces a polished piece of content. You need to:

1. **Research** - Extract key points from a topic
2. **Outline** - Structure the key points into sections
3. **Draft** - Write a first draft from the outline
4. **Polish** - Refine the tone and formatting

Write a `transform_fn` for each step that simulates realistic output. Run the chain on the topic "AI in healthcare" and display the trace.

In [None]:
# YOUR CODE HERE
# Build a 4-step content production chain:
# Step 1: Research - Extract key points from a topic
# Step 2: Outline - Structure the key points
# Step 3: Draft - Write first draft from outline
# Step 4: Polish - Refine tone and format

# Define your transform functions
# def research_fn(prompt: str) -> str:
#     return "..."  # Simulate extracted key points

# def outline_fn(prompt: str) -> str:
#     return "..."  # Simulate structured outline

# def draft_fn(prompt: str) -> str:
#     return "..."  # Simulate first draft

# def polish_fn(prompt: str) -> str:
#     return "..."  # Simulate polished final version

# Start from a fresh, empty chain. This rebinds the `chain` name that the
# Part 1 demo also used.
chain = PromptChain()

# Add your steps here
# chain.add_step("Research", "Identify the 5 most important aspects of: {input}",
#                transform_fn=research_fn)
# chain.add_step("Outline", "Create a structured outline from: {input}",
#                transform_fn=outline_fn)
# chain.add_step("Draft", "Write a 200-word first draft based on: {input}",
#                transform_fn=draft_fn)
# chain.add_step("Polish", "Refine this draft for a professional audience: {input}",
#                transform_fn=polish_fn)

# Run the chain
# result = chain.run("AI in healthcare")
# chain.show_trace()

## Part 3: Prompt Testing & Evaluation

Professional prompt engineers don't just write prompts and hope for the best. They **test** them against known inputs, score the outputs, and run A/B comparisons. The `PromptTester` class below provides a lightweight framework for doing exactly that.

Each test case specifies:
- An **input** that will be fed to the prompt
- **Expected keywords** the output should contain
- An **expected format**: a regular expression that should be found somewhere in the output (the search ignores case)

In [None]:
@dataclass
class TestCase:
    """A single test case for prompt evaluation.

    Consumed by ``PromptTester.evaluate``, which fills the prompt with
    ``input_text`` and scores the simulated output against the other fields.
    """
    # Label copied into the resulting TestResult and shown in reports.
    name: str
    # Text substituted for the "{input}" placeholder of the prompt template.
    input_text: str
    # Substrings looked for, case-insensitively, in the output; the share
    # that is found drives the completeness score.
    expected_keywords: List[str]
    # Searched in the output with re.IGNORECASE | re.DOTALL. An empty string
    # skips the check.
    expected_format: str  # regex pattern the output should match


@dataclass
class TestResult:
    """Scores that one prompt variant earned on one test case."""
    test_name: str
    relevance: int          # 1-5 relevance score
    completeness: int       # 1-5 completeness score
    format_compliance: int  # 1-5 format-compliance score
    consistency: int        # 1-5 consistency score

    @property
    def total(self) -> int:
        """Sum of the four dimension scores."""
        return sum(self.dimensions.values())

    @property
    def dimensions(self) -> Dict[str, int]:
        """The four scores keyed by dimension name, in declaration order."""
        names = ("relevance", "completeness", "format_compliance", "consistency")
        return {dimension: getattr(self, dimension) for dimension in names}


class PromptTester:
    """Framework for testing and comparing prompts.

    Results are kept per prompt label. Evaluating the same label again
    replaces that label's earlier results.
    """

    def __init__(self):
        self._results: Dict[str, List[TestResult]] = {}  # prompt_label -> results

    def evaluate(self, prompt_label: str,
                 prompt_template: str,
                 simulate_fn: Callable[[str], str],
                 test_cases: List[TestCase]) -> List[TestResult]:
        """Run test cases against a prompt and score the outputs.

        All four scores are simple text heuristics on a 1-5 scale; see the
        inline comments for the exact rules.

        Args:
            prompt_label: Name for this prompt variant (e.g. 'Prompt A').
            prompt_template: Prompt string with {input} placeholder.
            simulate_fn: Function that simulates AI output for the prompt.
            test_cases: List of TestCase objects.
        Returns:
            List of TestResult objects, one per test case (empty when
            ``test_cases`` is empty).
        Raises:
            re.error: If a test case's ``expected_format`` is not a valid
                regular expression.
        """
        results = []
        for tc in test_cases:
            filled = prompt_template.replace("{input}", tc.input_text)
            output = simulate_fn(filled)
            output_lower = output.lower()

            # Relevance: count the distinct whitespace-separated input tokens
            # that occur as substrings of the output. Every 20% of the tokens
            # (at least one token) adds a point on top of the base score of 1.
            input_words = set(tc.input_text.lower().split())
            overlap = sum(1 for w in input_words if w in output_lower)
            relevance = min(1 + int(overlap / max(len(input_words) * 0.2, 1)), 5)

            # Completeness: 1 plus one point per started fifth of the expected
            # keywords found, capped at 5 (so 80% or more earns the top score).
            found = sum(1 for kw in tc.expected_keywords if kw.lower() in output_lower)
            ratio = found / max(len(tc.expected_keywords), 1)
            completeness = max(1, min(int(ratio * 5) + 1, 5))

            # Format compliance: 5 when the pattern is found anywhere in the
            # output, 2 when it is not, and a neutral 3 when no pattern is given.
            if tc.expected_format:
                match = re.search(tc.expected_format, output, re.IGNORECASE | re.DOTALL)
                format_compliance = 5 if match else 2
            else:
                format_compliance = 3

            # Consistency: a length heuristic. 30-500 words scores 5, any other
            # output longer than 10 words scores 3, and shorter output scores 1.
            word_count = len(output.split())
            consistency = 5 if 30 <= word_count <= 500 else (3 if word_count > 10 else 1)

            results.append(TestResult(
                test_name=tc.name,
                relevance=relevance,
                completeness=completeness,
                format_compliance=format_compliance,
                consistency=consistency,
            ))

        self._results[prompt_label] = results
        return results

    def compare(self, label_a: str, label_b: str) -> str:
        """Compare two evaluated prompts and declare a winner.

        Prints both point totals and the winner.

        Returns:
            The winning label, "TIE" when the totals are equal, or the
            message "Both prompts must be evaluated first." when either label
            has no stored results (nothing is printed in that case).
        """
        if label_a not in self._results or label_b not in self._results:
            return "Both prompts must be evaluated first."

        total_a = sum(r.total for r in self._results[label_a])
        total_b = sum(r.total for r in self._results[label_b])

        print(f"\n  {label_a}: {total_a} pts  vs  {label_b}: {total_b} pts")
        if total_a > total_b:
            winner = label_a
        elif total_b > total_a:
            winner = label_b
        else:
            winner = "TIE"
        print(f"  Winner: {winner}\n")
        return winner

    def report(self) -> None:
        """Print a formatted evaluation report for all tested prompts.

        Each prompt gets a table with one row per test case and one column
        per scoring dimension, followed by its grand total. A prompt that was
        evaluated with no test cases gets an explicit "0/0" total line.
        """
        dims = ["relevance", "completeness", "format_compliance", "consistency"]
        for label, results in self._results.items():
            print("=" * 60)
            print(f"  Evaluation Report: {label}")
            print("=" * 60)
            header = f"  {'Test Case':<20}"
            for d in dims:
                header += f" {d[:8]:>8}"
            header += f" {'TOTAL':>7}"
            print(header)
            print("  " + "-" * (len(header) - 2))

            grand_total = 0
            for r in results:
                row = f"  {r.test_name:<20}"
                for d in dims:
                    # Build the "n/5" cell first and pad it to the header's
                    # column width so the columns line up.
                    cell = f"{r.dimensions[d]}/5"
                    row += f" {cell:>8}"
                total_cell = f"{r.total}/20"
                row += f" {total_cell:>7}"
                print(row)
                grand_total += r.total

            max_possible = len(results) * 20
            if max_possible:
                print(f"\n  Grand Total: {grand_total}/{max_possible} "
                      f"({grand_total/max_possible:.0%})")
            else:
                # No test cases: there is no percentage to compute.
                print("\n  Grand Total: 0/0 (no test cases evaluated)")
            print()


# ── Demo: comparing two customer support prompts ────────────────

# Test cases for a customer support response task
# Each row is (name, customer message, expected keywords, format regex).
test_cases = [
    TestCase(name=case_name, input_text=message,
             expected_keywords=keywords, expected_format=pattern)
    for case_name, message, keywords, pattern in [
        (
            "refund_request",
            "I want a refund for order #12345. The product arrived damaged.",
            ["apologize", "refund", "order", "damaged", "process", "timeline"],
            r"(sorry|apologize|apolog).*(refund|return)",
        ),
        (
            "feature_question",
            "Does your premium plan include API access and team management?",
            ["premium", "API", "team", "plan", "features", "included"],
            r"(premium|plan).*(API|api).*(team|manage)",
        ),
        (
            "complaint",
            "Your app has been crashing every day this week. Very frustrated.",
            ["sorry", "crash", "issue", "team", "fix", "update"],
            r"(sorry|apologize|understand).*(fix|resolv|investigat)",
        ),
    ]
]

# Prompt A: basic approach
prompt_a = "Reply to this customer message: {input}"

# Prompt B: CRAFT-enhanced. One entry per section (Context, Role, Task,
# Format, Tone); the sections are joined with single spaces.
prompt_b = " ".join([
    "Context: You are handling a support ticket for a SaaS product. "
    "The customer's satisfaction score is at risk.",
    "Role: Act as a senior customer success representative with 8 years "
    "of experience in tech support.",
    "Task: Respond to the following customer message. Acknowledge their "
    "concern, provide a specific solution or next step, and include a "
    "timeline. Message: {input}",
    "Format: Greeting, empathy statement, solution paragraph, and "
    "closing with next steps.",
    "Tone: Empathetic, professional, and solution-oriented.",
])

# Simulated outputs for Prompt A (basic - less complete)
sim_outputs_a = {
    "refund_request": (
        "Hi, we can process your refund. Please send the item back "
        "and we will handle it. Let us know if you need anything else."
    ),
    "feature_question": (
        "Yes, the premium plan has API access. You can manage your "
        "team from the dashboard. Check our pricing page for details."
    ),
    "complaint": (
        "We know about the crash issue. Our team is working on it. "
        "Try restarting the app for now."
    ),
}

# Simulated outputs for Prompt B (CRAFT - more thorough)
sim_outputs_b = {
    "refund_request": (
        "Dear Customer,\n\n"
        "I sincerely apologize that your order #12345 arrived damaged. "
        "That is not the experience we want for our customers, and I "
        "completely understand your frustration.\n\n"
        "I have initiated a full refund to your original payment method. "
        "You should see the credit within 3-5 business days. There is no "
        "need to return the damaged item. Additionally, I would like to "
        "offer you a 15% discount on your next order as a gesture of "
        "goodwill.\n\n"
        "If you have any questions, please reply here or call us at "
        "1-800-555-0199. We value your business and want to make this right.\n\n"
        "Best regards,\nCustomer Success Team"
    ),
    "feature_question": (
        "Hello!\n\n"
        "Great question about our Premium plan. Yes, both features are "
        "included:\n\n"
        "- **API Access**: Full REST API with 10,000 requests/month, "
        "webhook support, and SDKs for Python, JavaScript, and Go.\n"
        "- **Team Management**: Add up to 25 team members, assign roles "
        "(admin, editor, viewer), and manage permissions from a central "
        "dashboard.\n\n"
        "The Premium plan is $49/month when billed annually. I would be "
        "happy to set up a 14-day free trial so your team can evaluate "
        "both features. Just reply and I will send the activation link.\n\n"
        "Best,\nCustomer Success Team"
    ),
    "complaint": (
        "Hi there,\n\n"
        "I am truly sorry about the repeated crashes you have been "
        "experiencing. I understand how frustrating that must be, "
        "especially when it disrupts your daily workflow.\n\n"
        "Our engineering team has identified the issue and a fix is "
        "scheduled for release in version 4.2.1 this Thursday. In the "
        "meantime, clearing the app cache (Settings > Storage > Clear "
        "Cache) should reduce crash frequency. I have also flagged your "
        "account for priority support.\n\n"
        "I will personally follow up with you on Friday to confirm the "
        "update resolved the issue. Thank you for your patience.\n\n"
        "Warm regards,\nCustomer Success Team"
    ),
}

# Phrases (lower-case) from the customer message that identify each scenario.
# The prompts contain the customer's wording, not the scenario names, so
# matching on names alone ("feature", "question", "complaint") would send the
# feature and complaint messages to the generic fallback reply.
_SIM_TRIGGERS = {
    "refund_request": ("refund",),
    "feature_question": ("premium", "api access"),
    "complaint": ("crash",),
}


def _pick_simulated_output(prompt: str, outputs: Dict[str, str]) -> str:
    """Return the canned reply for the first scenario the prompt mentions."""
    text = prompt.lower()
    for key, output in outputs.items():
        # The words of the key itself stay valid triggers, so scenarios with
        # no entry in _SIM_TRIGGERS are still matched by their name.
        triggers = _SIM_TRIGGERS.get(key, ()) + tuple(key.split("_"))
        if any(trigger in text for trigger in triggers):
            return output
    return "Thank you for reaching out. We will look into this."


def sim_a(prompt: str) -> str:
    """Simulate the model's reply to Prompt A (the basic prompt).

    Args:
        prompt: The filled-in prompt, including the customer message.
    Returns:
        The matching entry of ``sim_outputs_a``, or a generic
        acknowledgement when no scenario is recognised.
    """
    return _pick_simulated_output(prompt, sim_outputs_a)


def sim_b(prompt: str) -> str:
    """Simulate the model's reply to Prompt B (the CRAFT prompt).

    Args:
        prompt: The filled-in prompt, including the customer message.
    Returns:
        The matching entry of ``sim_outputs_b``, or a generic
        acknowledgement when no scenario is recognised.
    """
    return _pick_simulated_output(prompt, sim_outputs_b)

# Run evaluation
tester = PromptTester()

# Score both variants against the same test cases, each with its own simulator.
tester.evaluate(
    prompt_label="Prompt A (basic)",
    prompt_template=prompt_a,
    simulate_fn=sim_a,
    test_cases=test_cases,
)
tester.evaluate(
    prompt_label="Prompt B (CRAFT)",
    prompt_template=prompt_b,
    simulate_fn=sim_b,
    test_cases=test_cases,
)

# Print the per-prompt tables, then the head-to-head result.
tester.report()
tester.compare(label_a="Prompt A (basic)", label_b="Prompt B (CRAFT)")

## Exercise 2: A/B Test Your Prompts

Pick a task (e.g., writing product descriptions, answering technical questions, summarizing documents). Create two prompt variants and three test cases, then evaluate and compare them.

**Requirements:**
- Two meaningfully different prompt variants (not just rewording)
- Three test cases with expected keywords and format patterns
- Simulated output functions for each variant
- A comparison report identifying the winner

In [None]:
# YOUR CODE HERE
# Create two versions of a prompt for the same task
# Define 3 test cases and compare them

# Placeholder templates: replace both with your own variants. These
# assignments rebind the `prompt_a` / `prompt_b` names used by the Part 3 demo.
prompt_a = "Version A: {input}"
prompt_b = "Version B: {input}"

# Define test cases
# The list starts empty; uncomment and fill in the entries below before
# evaluating, otherwise there are no results to compare.
my_test_cases = [
    # TestCase(name="test_1", input_text="...",
    #          expected_keywords=[...], expected_format=r"..."),
    # TestCase(name="test_2", input_text="...",
    #          expected_keywords=[...], expected_format=r"..."),
    # TestCase(name="test_3", input_text="...",
    #          expected_keywords=[...], expected_format=r"..."),
]

# Define simulated output functions
# def my_sim_a(prompt: str) -> str:
#     return "..."

# def my_sim_b(prompt: str) -> str:
#     return "..."

# Evaluate and compare
# my_tester = PromptTester()
# my_tester.evaluate("My Prompt A", prompt_a, my_sim_a, my_test_cases)
# my_tester.evaluate("My Prompt B", prompt_b, my_sim_b, my_test_cases)
# my_tester.report()
# my_tester.compare("My Prompt A", "My Prompt B")

## Part 4: Visualization - Chain Performance

Good visualizations help you communicate the value of prompt engineering to stakeholders. This section produces three charts:

1. **Bar chart** - Processing "time" (simulated) across chain steps
2. **Heatmap** - Test case scores across evaluation dimensions
3. **Line chart** - Quality improvement across iterative refinement rounds

In [None]:
# One row of three panels, one per chart. All data plotted below is
# hard-coded illustration data defined in this cell.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# ── Chart 1: Chain Step Processing Time ─────────────────────────
ax1 = axes[0]
steps_names = ["Extract\nTopics", "Create\nOutline", "Write\nIntro", "Polish\n& Edit"]
sim_times = [1.2, 2.1, 3.8, 2.5]  # simulated seconds
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6']

bars = ax1.bar(steps_names, sim_times, color=colors, edgecolor='white', linewidth=1.5)
# Write each duration just above its bar, centred horizontally.
for bar, t in zip(bars, sim_times):
    ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
             f"{t:.1f}s", ha='center', va='bottom', fontweight='bold', fontsize=10)

ax1.set_ylabel('Processing Time (seconds)', fontsize=11)
ax1.set_title('Chain Step Duration', fontsize=13, fontweight='bold')
# Leave one unit of headroom so the value labels stay inside the axes.
ax1.set_ylim(0, max(sim_times) + 1)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

# ── Chart 2: Test Score Heatmap ────────────────────────────────
ax2 = axes[1]

# Illustrative scores for the CRAFT prompt (Prompt B).
# NOTE: the matrix below is hard-coded; it is not read from `tester`.
dims = ["relevance", "completeness", "format_compl.", "consistency"]
test_names = ["refund_req", "feature_q", "complaint"]

# Simulated score matrix (tests x dimensions)
score_matrix = np.array([
    [5, 5, 5, 5],  # refund_request - Prompt B
    [4, 5, 5, 5],  # feature_question
    [5, 4, 5, 5],  # complaint
])

# Pin the colour scale to the full 1-5 range rather than the data's min/max.
im = ax2.imshow(score_matrix, cmap='RdYlGn', aspect='auto', vmin=1, vmax=5)
ax2.set_xticks(range(len(dims)))
ax2.set_xticklabels(dims, fontsize=9, rotation=30, ha='right')
ax2.set_yticks(range(len(test_names)))
ax2.set_yticklabels(test_names, fontsize=10)
ax2.set_title('Test Scores (CRAFT Prompt)', fontsize=13, fontweight='bold')

# Annotate cells
# Scores of 4 and above are drawn in white, presumably for contrast against
# the darker cell colours at the high end of the scale.
for i in range(len(test_names)):
    for j in range(len(dims)):
        ax2.text(j, i, str(score_matrix[i, j]),
                 ha='center', va='center', fontsize=14, fontweight='bold',
                 color='white' if score_matrix[i, j] >= 4 else 'black')

plt.colorbar(im, ax=ax2, shrink=0.8, label='Score (1-5)')

# ── Chart 3: Iterative Refinement Quality ──────────────────────
ax3 = axes[2]

# Index 0 is the initial text; 1-3 are the refinement rounds.
rounds = [0, 1, 2, 3]
quality_scores = [8, 14, 18, 22]  # simulated total quality scores out of 25
word_counts = [15, 28, 45, 62]    # simulated word counts

line1 = ax3.plot(rounds, quality_scores, 'o-', color='#27ae60',
                 linewidth=2.5, markersize=8, label='Quality Score')
ax3.set_ylabel('Quality Score (/25)', fontsize=11, color='#27ae60')
ax3.set_ylim(0, 25)
ax3.tick_params(axis='y', labelcolor='#27ae60')

# Second y-axis on the same x-axis, so word count keeps its own scale.
ax3_twin = ax3.twinx()
line2 = ax3_twin.plot(rounds, word_counts, 's--', color='#3498db',
                       linewidth=2, markersize=7, label='Word Count')
ax3_twin.set_ylabel('Word Count', fontsize=11, color='#3498db')
ax3_twin.tick_params(axis='y', labelcolor='#3498db')

ax3.set_xlabel('Refinement Round', fontsize=11)
ax3.set_xticks(rounds)
ax3.set_xticklabels(['Initial', 'Round 1', 'Round 2', 'Round 3'])
ax3.set_title('Iterative Refinement Progress', fontsize=13, fontweight='bold')

# Combined legend
# plot() returns a list of lines, so concatenating the two lists lets a
# single legend cover the series from both y-axes.
lines = line1 + line2
labels = [l.get_label() for l in lines]
ax3.legend(lines, labels, loc='upper left', fontsize=9)

ax3.spines['top'].set_visible(False)
ax3_twin.spines['top'].set_visible(False)

# y slightly above 1 places the figure title above the panels.
plt.suptitle('Prompt Chain & Testing Analytics', fontsize=16,
             fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()