# Lab 2: Prompt Chains & Testing Workflows

In this lab, you'll move beyond single prompts and build **multi-step prompt chains** that tackle complex tasks by breaking them into a series of connected steps. You'll also implement a prompt testing framework to compare and evaluate prompt quality systematically.

## Learning Objectives
- Debug and fix broken prompt chains with cascade failures
- Design multi-pattern workflows from scratch (fan-out + sequential)
- Critically evaluate biased A/B test designs
- Architect complex multi-step pipelines with visualization

**Duration:** 55-65 minutes | **Difficulty:** Intermediate to Advanced

## Part 1: Simple Prompt Chains

A **prompt chain** breaks a complex task into discrete steps where the output of one step feeds into the next. This mirrors how humans tackle large problems: research first, then outline, then draft.

Since we are working without a live LLM, each step uses a **transform function** that simulates realistic AI output. In production, you would replace these with actual API calls.

In [None]:
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional, Tuple
import textwrap
import re
import random
import matplotlib.pyplot as plt
import numpy as np


@dataclass
class ChainStep:
    """A single step in a prompt chain.

    Plain configuration record: it stores what a step should do but has no
    behavior of its own.
    """
    # Human-readable label for the step.
    name: str
    # Prompt text; the literal "{input}" marker is where the incoming text
    # is substituted when the step runs.
    prompt_template: str
    # Optional callable that maps the filled-in prompt to the step's output.
    # Defaults to None (no transform supplied).
    transform_fn: Optional[Callable[[str], str]] = None


@dataclass
class StepResult:
    """Record of what happened at one step."""
    # Name of the step that produced this record.
    step_name: str
    # The prompt text after the "{input}" placeholder was filled in.
    prompt_sent: str
    # The text the step produced.
    output: str


class PromptChain:
    """Ordered pipeline of prompt steps; each step consumes the previous output."""

    def __init__(self):
        self._steps: List[ChainStep] = []
        self._trace: List[StepResult] = []

    def add_step(self, name: str, prompt_template: str,
                 transform_fn: Optional[Callable[[str], str]] = None) -> 'PromptChain':
        """Append a step to the end of the chain.

        Args:
            name: Human-readable step name.
            prompt_template: Prompt text containing an {input} placeholder.
            transform_fn: Stand-in for the model call. It is given the
                          filled-in prompt and returns the step's output.
        Returns:
            This chain, so calls can be chained fluently.
        """
        new_step = ChainStep(name, prompt_template, transform_fn)
        self._steps.append(new_step)
        return self

    def run(self, initial_input: str) -> str:
        """Run every step in order and return the last step's output.

        The trace from any previous run is discarded first. With no steps
        registered, ``initial_input`` is returned unchanged.
        """
        self._trace = []
        text = initial_input

        for step in self._steps:
            filled_prompt = step.prompt_template.replace("{input}", text)
            # Without a transform the filled-in prompt itself is passed on.
            text = step.transform_fn(filled_prompt) if step.transform_fn else filled_prompt
            self._trace.append(
                StepResult(step_name=step.name, prompt_sent=filled_prompt, output=text)
            )

        return text

    def show_trace(self) -> None:
        """Print the prompt and output recorded for each step of the last run."""
        for number, record in enumerate(self._trace, start=1):
            print("=" * 60)
            print(f"  Step {number}: {record.step_name}")
            print("=" * 60)
            print("  PROMPT:")
            # Prompts are collapsed to a single line of at most 200 characters.
            preview = textwrap.shorten(record.prompt_sent, width=200)
            print(textwrap.indent(preview, "    "))
            print("\n  OUTPUT:")
            print(textwrap.indent(record.output, "    "))
            print()

    def get_trace(self) -> List[StepResult]:
        """Return a shallow copy of the recorded step results."""
        return self._trace.copy()


# ── Simulated transform functions ─────────────────────────────────

def sim_extract_topics(prompt: str) -> str:
    """Return a canned five-item topic list in place of a real model response.

    ``prompt`` is accepted to satisfy the transform-function interface but
    is not inspected; every call returns the same text.
    """
    topics = (
        "Current adoption rates and growth trajectory",
        "Primary use cases in clinical settings",
        "Regulatory landscape and compliance requirements",
        "Cost-benefit analysis for hospital systems",
        "Patient outcome improvements backed by studies",
    )
    numbered = [f"{position}. {topic}" for position, topic in enumerate(topics, start=1)]
    return "\n".join(["Key Topics Identified:", *numbered])

def sim_create_outline(prompt: str) -> str:
    """Return a canned six-section article outline in place of a model response.

    ``prompt`` is not inspected; the outline is the same for every call.
    """
    sections = [
        ("I. Introduction - The AI revolution in healthcare", [
            "Hook: startling statistic on diagnostic errors",
            "Thesis: AI is transforming clinical outcomes",
        ]),
        ("II. Adoption Landscape", [
            "Current rates: 38% of hospitals using some form of AI",
            "Growth: projected 45% CAGR through 2028",
        ]),
        ("III. Clinical Use Cases", [
            "Diagnostic imaging (radiology, pathology)",
            "Predictive patient monitoring",
            "Drug discovery acceleration",
        ]),
        ("IV. Regulatory & Compliance", [
            "FDA approval pathways for AI/ML devices",
            "HIPAA considerations for training data",
        ]),
        ("V. ROI and Outcomes", [
            "Cost savings from reduced misdiagnosis",
            "Improved patient satisfaction scores",
        ]),
        ("VI. Conclusion and Recommendations", []),
    ]

    lines = ["Article Outline:"]
    for heading, points in sections:
        lines.append(heading)
        # Sub-points are lettered A, B, C and indented three spaces.
        lines.extend(f"   {letter}. {point}" for letter, point in zip("ABC", points))
    return "\n".join(lines)

def sim_write_intro(prompt: str) -> str:
    """Return a canned two-paragraph introduction in place of a model response.

    ``prompt`` is not inspected; the text is the same for every call.
    """
    opening = " ".join([
        "Every year, an estimated 12 million Americans receive a misdiagnosis",
        "in outpatient settings, according to a study published in BMJ Quality",
        "& Safety. For hospital administrators grappling with this challenge,",
        "artificial intelligence offers a compelling answer. From radiology",
        "suites where deep learning algorithms flag suspicious lesions to ICU",
        "floors where predictive models identify patients at risk of sepsis",
        "hours before traditional vital signs change, AI is no longer a",
        "futuristic concept\u2014it is a clinical reality.",
    ])
    closing = " ".join([
        "With 38% of US hospitals already deploying some form of AI and the",
        "market growing at a projected 45% compound annual growth rate, the",
        "question is no longer whether to adopt, but how to adopt",
        "responsibly. This article examines the current landscape, highlights",
        "proven use cases, navigates the regulatory environment, and makes",
        "the financial case for AI investment in clinical settings.",
    ])
    # Paragraphs are separated by one blank line.
    return opening + "\n\n" + closing


# ── Build and run a 3-step chain ──────────────────────────────────

# add_step returns the chain itself, so the three steps are registered
# with one fluent expression.
chain = (
    PromptChain()
    .add_step(
        "Extract Topics",
        "Extract the 5 most important topics from the following subject area: {input}",
        transform_fn=sim_extract_topics,
    )
    .add_step(
        "Create Outline",
        "Create a detailed article outline organized around these topics:\n{input}",
        transform_fn=sim_create_outline,
    )
    .add_step(
        "Write Introduction",
        "Write a compelling 150-word introduction for an article with this outline:\n{input}",
        transform_fn=sim_write_intro,
    )
)

# Run the chain on the seed topic and show what each step sent and produced.
result = chain.run("AI in healthcare")
chain.show_trace()

# Banner followed by the final output wrapped to 72 columns.
print("=" * 60, "  FINAL OUTPUT", "=" * 60, sep="\n")
print(textwrap.fill(result, width=72))

## Part 2: Workflow Patterns

Real-world prompt workflows go beyond simple sequences. Three common patterns:

| Pattern | Description | Use Case |
|---------|-------------|----------|
| **Sequential** | Steps run one after another | Article writing, report generation |
| **Fan-out** | Same prompt applied to many inputs, results collected | Batch analysis, multi-document review |
| **Iterative** | Same prompt re-applied to progressively refine output | Editing, polishing, improving quality |

In [None]:
class WorkflowEngine:
    """Stateless helpers for the sequential, fan-out and iterative patterns."""

    @staticmethod
    def sequential(steps: List[Tuple[str, Callable[[str], str]]],
                   initial_input: str) -> List[Tuple[str, str]]:
        """Feed text through each step in turn, recording every intermediate output.

        Args:
            steps: (step_name, transform_function) pairs, applied in order.
            initial_input: Text given to the first step.
        Returns:
            One (step_name, output) pair per step, in execution order.
        """
        history: List[Tuple[str, str]] = []
        text = initial_input
        for step_name, step_fn in steps:
            text = step_fn(text)
            history.append((step_name, text))
        return history

    @staticmethod
    def fan_out(transform_fn: Callable[[str], str],
                inputs: List[str]) -> List[Tuple[str, str]]:
        """Run one transform independently over every input.

        Returns:
            One (input_text, output_text) pair per input, in input order.
        """
        pairs: List[Tuple[str, str]] = []
        for item in inputs:
            pairs.append((item, transform_fn(item)))
        return pairs

    @staticmethod
    def iterative(transform_fn: Callable[[str, int], str],
                  initial_input: str,
                  iterations: int = 3) -> List[Tuple[int, str]]:
        """Repeatedly apply a transform to its own previous output.

        Args:
            transform_fn: Called as transform_fn(current_text, round_number);
                          round numbers start at 1.
            initial_input: Text given to the first round.
            iterations: How many rounds to run.
        Returns:
            One (round_number, output_text) pair per round.
        """
        rounds: List[Tuple[int, str]] = []
        text = initial_input
        for offset in range(iterations):
            round_number = offset + 1
            text = transform_fn(text, round_number)
            rounds.append((round_number, text))
        return rounds


# WorkflowEngine exposes only static methods; the instance is created so the
# demos (and later cells) can call them as engine.<pattern>(...).
engine = WorkflowEngine()

# ── Demo 1: Sequential ─────────────────────────────────────────
print("=" * 60)
print("  PATTERN: Sequential")
print("=" * 60)

# Each entry is (step_name, transform). Only "Brainstorm" echoes its input;
# the other two lambdas return fixed text regardless of what they receive.
seq_steps = [
    ("Brainstorm", lambda text: (
        f"Ideas generated from '{text}':\n"
        "- Personalized learning paths using ML\n"
        "- Automated grading with NLP\n"
        "- Intelligent tutoring chatbots"
    )),
    ("Prioritize", lambda text: (
        "Prioritized ideas (by impact):\n"
        "1. [HIGH]  Personalized learning paths - addresses diverse needs\n"
        "2. [MED]   Intelligent tutoring chatbots - scalable support\n"
        "3. [LOW]   Automated grading - saves time but lower strategic value"
    )),
    ("Action Plan", lambda text: (
        "90-Day Action Plan:\n"
        "Month 1: Pilot personalized learning with 2 courses\n"
        "Month 2: Integrate tutoring chatbot on platform\n"
        "Month 3: Evaluate results, plan automated grading pilot"
    )),
]

# Run the three steps in order and print each step's output, indented.
seq_results = engine.sequential(seq_steps, "AI in education")
for name, output in seq_results:
    print(f"\n  [{name}]")
    print(textwrap.indent(output, "    "))

# ── Demo 2: Fan-out ────────────────────────────────────────────
print("\n" + "=" * 60)
print("  PATTERN: Fan-out (Sentiment Analysis)")
print("=" * 60)

# Sample reviews used by the fan-out demo.
reviews = [
    "Absolutely love this product! Best purchase I have made all year.",
    "It works okay but the battery life is disappointing for the price.",
    "Terrible customer service. Waited 3 weeks for a response.",
    "Good value for money. Solid build quality and fast shipping.",
    "The software crashes constantly. Returning it this week."
]

# Canned (label, confidence) result keyed by the review's position in `reviews`.
sentiment_map = {
    0: ("POSITIVE", 0.92),
    1: ("MIXED",    0.61),
    2: ("NEGATIVE", 0.15),
    3: ("POSITIVE", 0.78),
    4: ("NEGATIVE", 0.08),
}

def sim_sentiment(review: str) -> str:
    """Return the canned sentiment for one of the sample reviews.

    Args:
        review: Review text. It is matched by exact equality against the
                entries of ``reviews``.
    Returns:
        A string of the form "LABEL (confidence: NN%)". Text that is not
        one of the sample reviews yields "NEUTRAL (confidence: 50%)".
    """
    try:
        idx = reviews.index(review)
    except ValueError:
        # Unknown text must not borrow the first review's result; use a key
        # that is absent from sentiment_map so the neutral default applies.
        idx = None
    label, conf = sentiment_map.get(idx, ("NEUTRAL", 0.50))
    return f"{label} (confidence: {conf:.0%})"

# Score every review independently, then print a two-column table.
fan_results = engine.fan_out(sim_sentiment, reviews)
print(f"\n  {'Review (truncated)':<55} {'Sentiment'}")
print("  " + "-" * 75)
for review, sentiment in fan_results:
    # Reviews longer than 52 characters are cut and suffixed with "...".
    short = review[:52] + "..." if len(review) > 52 else review
    print(f"  {short:<55} {sentiment}")

# ── Demo 3: Iterative ──────────────────────────────────────────
print("\n" + "=" * 60)
print("  PATTERN: Iterative (Product Description Refinement)")
print("=" * 60)

# Three canned drafts of the same product description, from roughest to
# most polished. Stage N is what refinement round N returns.
refinement_stages = [
    "SmartFit Pro is a fitness tracker. It tracks steps and heart rate. "
    "It has a screen and a band. Buy it now.",

    "The SmartFit Pro fitness tracker monitors your steps, heart rate, "
    "and sleep patterns in real time. Its vibrant OLED display and "
    "comfortable silicone band make it ideal for all-day wear.",

    "Meet SmartFit Pro\u2014the fitness tracker that works as hard as you do. "
    "Track steps, heart rate, and sleep with medical-grade sensors. "
    "The brilliant OLED display delivers real-time insights at a glance, "
    "while the ultra-light silicone band ensures all-day comfort. "
    "With 7-day battery life and water resistance to 50m, SmartFit Pro "
    "keeps up no matter where your day takes you."
]

def sim_refine(text: str, iteration: int) -> str:
    """Return the canned draft for a refinement round.

    Args:
        text: Current draft. Accepted for interface compatibility but not
              inspected.
        iteration: 1-based round number. Values past the last stage return
                   the final stage; values below 1 return the first stage.
    Returns:
        The draft stored in ``refinement_stages`` for that round.
    """
    # Clamp at both ends. Without the lower bound, round 0 would wrap around
    # to the most polished draft and larger negatives would raise IndexError.
    stage_number = max(1, min(iteration, len(refinement_stages)))
    return refinement_stages[stage_number - 1]

# Run three refinement rounds starting from the roughest draft, then print
# each round's text (wrapped to 68 columns) together with its word count.
iter_results = engine.iterative(sim_refine, refinement_stages[0], iterations=3)
for round_num, output in iter_results:
    word_count = len(output.split())
    print(f"\n  [Round {round_num}] ({word_count} words)")
    print(textwrap.indent(textwrap.fill(output, width=68), "    "))

## Exercise 1: Debug the Broken Chain (10 min)

Below is a 4-step prompt chain for generating a product launch email. It has **3 bugs** in its transform functions that cause cascade failures — each broken step corrupts the input for the next step.

**Your task:**
1. **Run** the chain and examine the trace to see where things go wrong
2. **Identify** the 3 bugs (each one is described in the docstring of the affected transform function)
3. **Write fixed versions** of the 3 broken transform functions
4. **Re-run** the chain with your fixes and verify the output makes sense

**Hint:** Look at what each function receives as input vs. what it actually does with it.

In [None]:
# ── The Broken Chain ──────────────────────────────────────────────
# This chain should: identify features -> prioritize -> draft email -> add CTA
# But 3 of the 4 transform functions have bugs.

def buggy_identify_features(prompt: str) -> str:
    """BUG 1: The prompt is never read, so the features returned belong to
    the WRONG product (a fitness tracker) no matter which product the
    caller asked about."""
    features = (
        "Heart rate monitoring with medical-grade sensors",
        "7-day battery life",
        "Water resistant to 50m",
        "Sleep tracking with REM analysis",
        "GPS route mapping for outdoor workouts",
    )
    numbered = [f"{position}. {feature}" for position, feature in enumerate(features, start=1)]
    return "\n".join(["Key Features of SmartFit Pro Fitness Tracker:", *numbered])

def buggy_prioritize(prompt: str) -> str:
    """BUG 2: The ranking is upside down: the LEAST important features are
    listed first and the most important last, so the drafting step that
    follows will lead with weak features."""
    ranking = (
        ("[LOW]", "Water resistance - nice-to-have"),
        ("[LOW]", "GPS mapping - niche appeal"),
        ("[MED]", "Sleep tracking - growing interest"),
        ("[HIGH]", "Battery life - key differentiator"),
        ("[HIGH]", "AI-powered insights - unique selling point"),
    )
    lines = ["Prioritized Features (by importance):"]
    for position, (tag, description) in enumerate(ranking, start=1):
        # Tags are padded to six characters so the descriptions line up.
        lines.append(f"{position}. {tag:<6} {description}")
    return "\n".join(lines)

def buggy_draft_email(prompt: str) -> str:
    """BUG 3: The draft makes no use of the prioritized features it is
    given; it is generic marketing copy containing no specifics from the
    earlier steps."""
    body = " ".join([
        "We have something new for you.",
        "Our product is great and you should buy it.",
        "It has many features that you will love.",
        "It is better than the competition in every way.",
    ])
    paragraphs = [
        "Subject: Exciting News!",
        "Hi there,",
        body,
        "Thanks,\nThe Team",
    ]
    # Paragraphs are separated by one blank line.
    return "\n\n".join(paragraphs)

def working_add_cta(prompt: str) -> str:
    """Insert a call-to-action block into the text it receives.

    The function works on the whole ``prompt`` string, so any instruction
    line that precedes the draft is kept in the result.

    Args:
        prompt: Text ending in a two-line sign-off (for example
                "Thanks," followed by "The Team").
    Returns:
        The text with the CTA inserted before the final two lines. If the
        stripped text has fewer than three lines there is no body to
        separate from a sign-off, so the CTA is appended at the end.
    """
    cta = (
        "\n---\n"
        "LIMITED TIME: Pre-order now at 25% off. Use code LAUNCH25 "
        "at checkout. Offer expires March 15th.\n"
        ">> Pre-order Now: https://example.com/launch <<\n"
    )
    stripped = prompt.strip()
    lines = stripped.split('\n')

    if len(lines) < 3:
        # Splitting off "the last two lines" here would leave an empty body
        # and put the CTA ahead of the entire message.
        return stripped + cta

    # Insert CTA before the sign-off (last 2 lines)
    body = '\n'.join(lines[:-2])
    signoff = '\n'.join(lines[-2:])
    return body + cta + "\n" + signoff


# ── Run the broken chain ─────────────────────────────────────────
# Four steps wired in order; the first three use the buggy transforms so
# the trace shows how each bad output feeds the next step.
broken_chain = PromptChain()
broken_chain.add_step(
    "Identify Features",
    "List the 5 most compelling features of this product: {input}",
    transform_fn=buggy_identify_features
)
broken_chain.add_step(
    "Prioritize",
    "Rank these features by customer impact, highest first:\n{input}",
    transform_fn=buggy_prioritize
)
broken_chain.add_step(
    "Draft Email",
    "Write a launch announcement email leading with the top features:\n{input}",
    transform_fn=buggy_draft_email
)
broken_chain.add_step(
    "Add CTA",
    "Add a compelling call-to-action to this email draft:\n{input}",
    transform_fn=working_add_cta
)

print("=" * 60)
print("  RUNNING BROKEN CHAIN")
print("=" * 60)
# The product description is the chain's initial input.
broken_result = broken_chain.run("CloudSync Pro - AI-powered cloud storage with "
                                  "smart file organization, 2TB capacity, real-time "
                                  "collaboration, end-to-end encryption, and "
                                  "intelligent search")
broken_chain.show_trace()

print("=" * 60)
print("  BROKEN FINAL OUTPUT")
print("=" * 60)
print(broken_result)


# ── Fixed versions ────────────────────────────────────────────────

def fixed_identify_features(prompt: str) -> str:
    """FIXED: Returns features for CloudSync Pro, the product this exercise
    asks about.

    The response is still canned: ``prompt`` is not parsed, so the text is
    only correct for the CloudSync Pro input used in this lab.
    """
    features = (
        "AI-powered smart file organization that auto-categorizes documents",
        "2TB cloud storage capacity with intelligent compression",
        "Real-time collaboration with live co-editing and version history",
        "End-to-end encryption for enterprise-grade security",
        "Intelligent search that understands natural language queries",
    )
    numbered = [f"{position}. {feature}" for position, feature in enumerate(features, start=1)]
    return "\n".join(["Key Features of CloudSync Pro:", *numbered])

def fixed_prioritize(prompt: str) -> str:
    """FIXED: Lists the features from highest to lowest importance.

    The ranking is canned text; ``prompt`` is not inspected.
    """
    ranking = (
        ("[HIGH]", "AI-powered smart file organization - unique differentiator"),
        ("[HIGH]", "End-to-end encryption - enterprise requirement"),
        ("[HIGH]", "Real-time collaboration - drives team adoption"),
        ("[MED]", "Intelligent search - improves daily productivity"),
        ("[MED]", "2TB storage capacity - competitive baseline"),
    )
    lines = ["Prioritized Features (by customer impact):"]
    for position, (tag, description) in enumerate(ranking, start=1):
        # Tags are padded to six characters so the descriptions line up.
        lines.append(f"{position}. {tag:<6} {description}")
    return "\n".join(lines)

def fixed_draft_email(prompt: str) -> str:
    """FIXED: Email copy that names the specific prioritized features.

    The draft is canned text; ``prompt`` is not inspected.
    """
    subject = (
        "Subject: Introducing CloudSync Pro \u2014 AI-Powered Cloud Storage "
        "That Organizes Itself"
    )
    hook = " ".join([
        "Tired of spending hours organizing files and folders?",
        "CloudSync Pro uses AI-powered smart organization to automatically",
        "categorize your documents the moment you upload them \u2014",
        "no manual sorting required.",
    ])
    details = " ".join([
        "But that's just the beginning.",
        "With end-to-end encryption, your sensitive data is protected at every level.",
        "Real-time collaboration lets your entire team co-edit documents",
        "simultaneously with full version history.",
        "And our intelligent search understands what you mean, not just what you type.",
    ])
    paragraphs = [
        subject,
        "Hi there,",
        hook,
        details,
        "All of this with a generous 2TB of storage.",
        "Best regards,\nThe CloudSync Team",
    ]
    # Paragraphs are separated by one blank line.
    return "\n\n".join(paragraphs)


# Build and run the fixed chain: same prompts and step order as the broken
# chain, with the three buggy transforms swapped for their fixed versions.
fixed_chain = PromptChain()
fixed_chain.add_step("Identify Features",
    "List the 5 most compelling features of this product: {input}",
    transform_fn=fixed_identify_features)
fixed_chain.add_step("Prioritize",
    "Rank these features by customer impact, highest first:\n{input}",
    transform_fn=fixed_prioritize)
fixed_chain.add_step("Draft Email",
    "Write a launch announcement email leading with the top features:\n{input}",
    transform_fn=fixed_draft_email)
fixed_chain.add_step("Add CTA",
    "Add a compelling call-to-action to this email draft:\n{input}",
    transform_fn=working_add_cta)

print("\n" + "=" * 60)
print("  RUNNING FIXED CHAIN")
print("=" * 60)
# Same product description that was fed to the broken chain.
fixed_result = fixed_chain.run("CloudSync Pro - AI-powered cloud storage with "
                                "smart file organization, 2TB capacity, real-time "
                                "collaboration, end-to-end encryption, and "
                                "intelligent search")
fixed_chain.show_trace()
print("=" * 60)
print("  FIXED FINAL OUTPUT")
print("=" * 60)
print(fixed_result)

## Exercise 2: Design a Feedback Processing Pipeline (15 min)

Build a complete feedback processing system from scratch using the `WorkflowEngine`. You'll process 5 customer feedback items through a multi-pattern pipeline.

**Pipeline Design:**
1. **Fan-out:** Apply a categorization function to all 5 feedback items (classify as bug, feature request, praise, or complaint)
2. **Fan-out:** Apply an urgency detection function to all 5 items (score 1-5 urgency)
3. **Sequential:** Take the categorized + scored results and run them through: prioritize -> generate response plan -> format summary report

**The 5 feedback items are provided below.** You must write all the transform functions yourself.

**Requirements:**
- Categorization must use keyword matching (not hardcoded per-item)
- Urgency scoring must consider negative words and exclamation marks
- The final summary report must be a formatted table

In [None]:
# ── Feedback items to process ─────────────────────────────────────
# Five entries; each one is a single string written as adjacent literals
# that Python joins at compile time.
feedback_items = [
    "The export feature crashes every time I try to save as PDF. "
    "This is blocking my entire team's workflow!!!",

    "Love the new dashboard redesign! The charts are so much clearer "
    "and the dark mode option is fantastic.",

    "It would be great if you could add integration with Slack so we "
    "get notifications when reports are ready.",

    "Your billing system charged me twice this month. I need an "
    "immediate refund. This is unacceptable and I'm considering "
    "switching to a competitor.",

    "The search function is slow when filtering by date range. "
    "Takes about 10 seconds to load results for large datasets.",
]

# Step 1: Categorization function
def categorize_feedback(feedback: str) -> str:
    """Categorize feedback using keyword matching.

    Each category has a keyword list; the category with the most keyword
    hits wins. Keywords match case-insensitively at the start of a word, so
    "crash" matches "crashes" but "clear" does not match "unclear" and
    "bug" does not match "debug".

    Args:
        feedback: Raw feedback text.
    Returns:
        "CATEGORY: <name> | <snippet>" where <name> is one of bug, praise,
        feature_request, complaint, or other (no keyword matched), and
        <snippet> is the feedback cut to 60 characters plus "..." when it
        is longer than that. Ties go to the category listed first.
    """
    text = feedback.lower()

    keywords_by_category = {
        "bug": ["crash", "error", "bug", "broken", "fails", "not working", "slow", "loading"],
        "praise": ["love", "great", "fantastic", "amazing", "excellent", "awesome", "good", "clear"],
        "feature_request": ["would be great", "could add", "integration", "wish", "suggestion", "add"],
        "complaint": ["unacceptable", "terrible", "charged", "refund", "switching", "competitor", "frustrated"],
    }

    def count_hits(keywords: List[str]) -> int:
        # \b anchors the keyword to a word start; a plain substring test
        # would also fire inside unrelated words.
        return sum(1 for kw in keywords if re.search(r"\b" + re.escape(kw), text))

    scores = {name: count_hits(words) for name, words in keywords_by_category.items()}

    # max() returns the first key with the top score, so dict order breaks ties.
    category = max(scores, key=scores.get)
    if scores[category] == 0:
        category = "other"

    snippet = feedback[:60] + "..." if len(feedback) > 60 else feedback
    return f"CATEGORY: {category} | {snippet}"


# Step 2: Urgency scoring function
def score_urgency(feedback: str) -> str:
    """Score urgency from 1 (low) to 5 (critical).

    Scoring starts at 1 and adds:
      * up to 2 points for negative/urgent keywords (1 per keyword found),
      * 1 point when the text has three or more exclamation marks,
      * 1 point when a team/business impact phrase is present.

    Args:
        feedback: Raw feedback text.
    Returns:
        "URGENCY: <score>/5 | <snippet>" where <snippet> is the feedback cut
        to 60 characters plus "..." when it is longer than that.
    """
    text = feedback.lower()
    score = 1

    # Keywords are matched as substrings of the lower-cased text. The stem
    # "immediate" is used so that both "immediate" and "immediately" count.
    negative_words = ["crash", "blocking", "unacceptable", "immediate",
                      "switching", "competitor", "frustrated", "terrible",
                      "broken", "charged twice", "refund"]
    score += min(sum(1 for w in negative_words if w in text), 2)

    # Only heavy punctuation counts; one or two marks add nothing.
    if feedback.count("!") >= 3:
        score += 1

    # Team/business impact words
    impact_words = ["entire team", "workflow", "blocking", "all users"]
    if any(w in text for w in impact_words):
        score += 1

    score = min(score, 5)
    snippet = feedback[:60] + "..." if len(feedback) > 60 else feedback
    return f"URGENCY: {score}/5 | {snippet}"


# Step 3: Run both fan-outs
# Each fan-out returns (feedback_text, result_string) pairs. Only the part
# of the result before the "|" separator is printed.
print("=" * 60)
print("  FAN-OUT: Categorization")
print("=" * 60)
cat_results = engine.fan_out(categorize_feedback, feedback_items)
for feedback, category in cat_results:
    # "..." is appended unconditionally after the first 50 characters.
    short = feedback[:50] + "..."
    print(f"  {short:<55} {category.split('|')[0].strip()}")

print("\n" + "=" * 60)
print("  FAN-OUT: Urgency Scoring")
print("=" * 60)
urg_results = engine.fan_out(score_urgency, feedback_items)
for feedback, urgency in urg_results:
    short = feedback[:50] + "..."
    print(f"  {short:<55} {urgency.split('|')[0].strip()}")


# Step 4: Combine results and run sequential pipeline
# Both result lists are in feedback_items order, so zip pairs the category
# and urgency for the same item. One numbered line is built per item.
combined = ""
for i, ((fb, cat), (_, urg)) in enumerate(zip(cat_results, urg_results), 1):
    combined += f"{i}. {cat.split('|')[0].strip()} | {urg.split('|')[0].strip()} | {fb[:80]}...\n"

# Simulated stages of the sequential pipeline. Each lambda ignores the text
# it receives and returns canned output standing in for a model response.
seq_steps = [
    ("Prioritize", lambda text: (
        "Prioritized Feedback (by urgency):\n"
        "1. [CRITICAL] Bug: Export/PDF crash - blocking team workflow (Urgency 5/5)\n"
        "2. [HIGH] Complaint: Double billing - immediate refund needed (Urgency 4/5)\n"
        "3. [MEDIUM] Bug: Slow search performance on large datasets (Urgency 3/5)\n"
        "4. [LOW] Feature Request: Slack integration for notifications (Urgency 2/5)\n"
        "5. [LOW] Praise: Positive feedback on dashboard redesign (Urgency 1/5)"
    )),
    ("Response Plan", lambda text: (
        "Response Plan:\n\n"
        "1. Export/PDF Crash (CRITICAL):\n"
        "   - Action: Hotfix by engineering team within 24 hours\n"
        "   - Response: Email affected users with workaround and ETA\n\n"
        "2. Double Billing (HIGH):\n"
        "   - Action: Process immediate refund via billing system\n"
        "   - Response: Personal apology email with refund confirmation\n\n"
        "3. Slow Search (MEDIUM):\n"
        "   - Action: Add to sprint backlog for query optimization\n"
        "   - Response: Acknowledge and provide timeline estimate\n\n"
        "4. Slack Integration (LOW):\n"
        "   - Action: Add to feature backlog, evaluate effort\n"
        "   - Response: Thank user, add to public roadmap\n\n"
        "5. Dashboard Praise (LOW):\n"
        "   - Action: Share with design team for morale\n"
        "   - Response: Thank user, ask for public review"
    )),
    # The report is built with "\n".join rather than adjacent string
    # literals mixed with `*`: adjacent literals are merged before `*` is
    # applied, which would repeat the title line 60 times instead of
    # drawing a 60-character rule.
    ("Format Report", lambda text: "\n".join([
        "=" * 60,
        "  FEEDBACK PROCESSING SUMMARY REPORT",
        "=" * 60,
        "",
        f"{'#':<4} {'Category':<18} {'Urgency':<12} {'Status':<12} {'Action'}",
        "-" * 75,
        f"{'1':<4} {'Bug':<18} {'5/5':<12} {'CRITICAL':<12} {'Hotfix in 24h'}",
        f"{'2':<4} {'Complaint':<18} {'4/5':<12} {'HIGH':<12} {'Immediate refund'}",
        f"{'3':<4} {'Bug':<18} {'3/5':<12} {'MEDIUM':<12} {'Sprint backlog'}",
        f"{'4':<4} {'Feature Request':<18} {'2/5':<12} {'LOW':<12} {'Add to roadmap'}",
        f"{'5':<4} {'Praise':<18} {'1/5':<12} {'LOW':<12} {'Share with team'}",
        "-" * 75,
        "Total items: 5 | Critical: 1 | High: 1 | Medium: 1 | Low: 2",
    ])),
]
# Feed the combined category/urgency summary through the three stages and
# print each stage's output under its name.
report_results = engine.sequential(seq_steps, combined)

print("\n" + "=" * 60)
print("  SEQUENTIAL PIPELINE RESULTS")
print("=" * 60)
for name, output in report_results:
    print(f"\n  [{name}]")
    print(textwrap.indent(output, "    "))

## Part 3: Prompt Testing & Evaluation

Professional prompt engineers don't just write prompts and hope for the best. They **test** them against known inputs, score the outputs, and run A/B comparisons. The `PromptTester` class below provides a lightweight framework for doing exactly that.

Each test case specifies:
- An **input** that will be fed to the prompt
- **Expected keywords** the output should contain
- An **expected format** pattern the output should match

In [None]:
@dataclass
class TestCase:
    """A single test case for prompt evaluation."""
    # Label identifying the test case.
    name: str
    # Text substituted for the "{input}" placeholder of the prompt under test.
    input_text: str
    # Keywords the output is expected to contain.
    expected_keywords: List[str]
    expected_format: str  # regex pattern the output should match


@dataclass
class TestResult:
    """Scores awarded to one prompt for one test case, on four 1-5 dimensions."""
    test_name: str
    relevance: int          # 1-5: does the output address the input?
    completeness: int       # 1-5: how many expected keywords appear?
    format_compliance: int  # 1-5: does the output match expected format?
    consistency: int        # 1-5: is the quality consistent across tests?

    @property
    def total(self) -> int:
        """Sum of the four dimension scores."""
        scores = (
            self.relevance,
            self.completeness,
            self.format_compliance,
            self.consistency,
        )
        return sum(scores)

    @property
    def dimensions(self) -> Dict[str, int]:
        """The four scores as a new dict keyed by dimension name."""
        names = ("relevance", "completeness", "format_compliance", "consistency")
        return {dimension: getattr(self, dimension) for dimension in names}


class PromptTester:
    """Framework for testing and comparing prompts.

    Results are kept per prompt label, so several prompt variants can be
    evaluated against the same test cases and then reported or compared.
    Type names are quoted in the annotations so the class does not need
    TestCase / TestResult to exist at definition time.
    """

    def __init__(self):
        self._results: Dict[str, List["TestResult"]] = {}  # prompt_label -> results

    def evaluate(self, prompt_label: str,
                 prompt_template: str,
                 simulate_fn: Callable[[str], str],
                 test_cases: List["TestCase"]) -> List["TestResult"]:
        """Run test cases against a prompt and score the outputs.

        Every dimension is a heuristic score from 1 to 5:

        * relevance grows with the number of distinct whitespace-separated
          input tokens found (as substrings) in the lower-cased output.
        * completeness grows with the fraction of expected keywords found
          (case-insensitive substring match); 80% or more scores 5.
        * format_compliance is 5 when the expected regex is found, 2 when it
          is not, and 3 when the test case has no pattern.
        * consistency is 5 for outputs of 30-500 words, 3 for other outputs
          longer than 10 words, and 1 otherwise.

        Args:
            prompt_label: Name for this prompt variant (e.g. 'Prompt A').
            prompt_template: Prompt string with {input} placeholder.
            simulate_fn: Function that simulates AI output for the prompt.
            test_cases: List of TestCase objects.
        Returns:
            List of TestResult objects, one per test case in input order.
            The list is also stored under prompt_label, replacing any
            earlier results for that label.
        """
        results = []
        for tc in test_cases:
            filled = prompt_template.replace("{input}", tc.input_text)
            output = simulate_fn(filled)
            output_lower = output.lower()

            # Relevance: does the output reference the input topic?
            input_words = set(tc.input_text.lower().split())
            overlap = sum(1 for w in input_words if w in output_lower)
            relevance = min(1 + int(overlap / max(len(input_words) * 0.2, 1)), 5)

            # Completeness: what fraction of expected keywords appear?
            found = sum(1 for kw in tc.expected_keywords if kw.lower() in output_lower)
            ratio = found / max(len(tc.expected_keywords), 1)
            completeness = max(1, min(int(ratio * 5) + 1, 5))

            # Format compliance: does the output match the expected pattern?
            if tc.expected_format:
                match = re.search(tc.expected_format, output, re.IGNORECASE | re.DOTALL)
                format_compliance = 5 if match else 2
            else:
                format_compliance = 3

            # Consistency: based on output length being reasonable
            word_count = len(output.split())
            consistency = 5 if 30 <= word_count <= 500 else (3 if word_count > 10 else 1)

            results.append(TestResult(
                test_name=tc.name,
                relevance=relevance,
                completeness=completeness,
                format_compliance=format_compliance,
                consistency=consistency,
            ))

        self._results[prompt_label] = results
        return results

    def compare(self, label_a: str, label_b: str) -> str:
        """Compare two evaluated prompts and declare a winner.

        Prints both point totals and the winner.

        Returns:
            The label with the higher summed total, "TIE" when the totals are
            equal, or an explanatory message when either label has not been
            evaluated yet.
        """
        if label_a not in self._results or label_b not in self._results:
            return "Both prompts must be evaluated first."

        total_a = sum(r.total for r in self._results[label_a])
        total_b = sum(r.total for r in self._results[label_b])

        print(f"\n  {label_a}: {total_a} pts  vs  {label_b}: {total_b} pts")
        if total_a > total_b:
            winner = label_a
        elif total_b > total_a:
            winner = label_b
        else:
            winner = "TIE"
        print(f"  Winner: {winner}\n")
        return winner

    def report(self) -> None:
        """Print a formatted evaluation report for all tested prompts.

        One table is printed per evaluated label, followed by the grand total.
        A label with no results prints "n/a" instead of a percentage.
        """
        dims = ["relevance", "completeness", "format_compliance", "consistency"]
        for label, results in self._results.items():
            print("=" * 60)
            print(f"  Evaluation Report: {label}")
            print("=" * 60)
            # Header cells are 9 wide so they line up with the row cells
            # below: a 7-wide score plus "/5", or a 6-wide total plus "/20".
            header = f"  {'Test Case':<20}"
            for d in dims:
                header += f" {d[:8]:>9}"
            header += f" {'TOTAL':>9}"
            print(header)
            print("  " + "-" * (len(header) - 2))

            for r in results:
                scores = r.dimensions
                row = f"  {r.test_name:<20}"
                for d in dims:
                    row += f" {scores[d]:>7}/5"
                row += f" {r.total:>6}/20"
                print(row)

            grand_total = sum(r.total for r in results)
            max_possible = len(results) * 20
            # An empty result list has no meaningful percentage (and would
            # otherwise divide by zero).
            percent = f"{grand_total / max_possible:.0%}" if max_possible else "n/a"
            print(f"\n  Grand Total: {grand_total}/{max_possible} ({percent})")
            print()


# ── Demo: comparing two customer support prompts ────────────────

# Test cases for a customer support response task.
# Each case pairs a customer message (input_text) with the keywords and the
# regex pattern that PromptTester.evaluate() searches for in the reply.
test_cases = [
    TestCase(
        name="refund_request",
        input_text="I want a refund for order #12345. The product arrived damaged.",
        expected_keywords=["apologize", "refund", "order", "damaged", "process", "timeline"],
        expected_format=r"(sorry|apologize|apolog).*(refund|return)"
    ),
    TestCase(
        name="feature_question",
        input_text="Does your premium plan include API access and team management?",
        expected_keywords=["premium", "API", "team", "plan", "features", "included"],
        expected_format=r"(premium|plan).*(API|api).*(team|manage)"
    ),
    TestCase(
        name="complaint",
        input_text="Your app has been crashing every day this week. Very frustrated.",
        expected_keywords=["sorry", "crash", "issue", "team", "fix", "update"],
        expected_format=r"(sorry|apologize|understand).*(fix|resolv|investigat)"
    ),
]

# Prompt A: basic approach -- one bare instruction around the {input} slot.
prompt_a = "Reply to this customer message: {input}"

# Prompt B: CRAFT-enhanced -- five labelled sections (Context, Role, Task,
# Format, Tone) separated by single spaces; {input} sits inside the Task.
prompt_b = " ".join([
    "Context: You are handling a support ticket for a SaaS product. "
    "The customer's satisfaction score is at risk.",
    "Role: Act as a senior customer success representative with 8 years "
    "of experience in tech support.",
    "Task: Respond to the following customer message. Acknowledge their "
    "concern, provide a specific solution or next step, and include a "
    "timeline. Message: {input}",
    "Format: Greeting, empathy statement, solution paragraph, and "
    "closing with next steps.",
    "Tone: Empathetic, professional, and solution-oriented.",
])

# Simulated outputs for Prompt A (basic - less complete).
# Keys are scenario names (the same strings used as TestCase.name in
# test_cases); sim_a() returns one of these replies based on the prompt text.
sim_outputs_a = {
    "refund_request": (
        "Hi, we can process your refund. Please send the item back "
        "and we will handle it. Let us know if you need anything else."
    ),
    "feature_question": (
        "Yes, the premium plan has API access. You can manage your "
        "team from the dashboard. Check our pricing page for details."
    ),
    "complaint": (
        "We know about the crash issue. Our team is working on it. "
        "Try restarting the app for now."
    ),
}

# Simulated outputs for Prompt B (CRAFT - more thorough).
# Same scenario keys as sim_outputs_a; sim_b() returns one of these replies
# based on the prompt text. Each reply follows the layout Prompt B asks for:
# greeting, empathy statement, solution paragraph, closing with next steps.
sim_outputs_b = {
    "refund_request": (
        "Dear Customer,\n\n"
        "I sincerely apologize that your order #12345 arrived damaged. "
        "That is not the experience we want for our customers, and I "
        "completely understand your frustration.\n\n"
        "I have initiated a full refund to your original payment method. "
        "You should see the credit within 3-5 business days. There is no "
        "need to return the damaged item. Additionally, I would like to "
        "offer you a 15% discount on your next order as a gesture of "
        "goodwill.\n\n"
        "If you have any questions, please reply here or call us at "
        "1-800-555-0199. We value your business and want to make this right.\n\n"
        "Best regards,\nCustomer Success Team"
    ),
    "feature_question": (
        "Hello!\n\n"
        "Great question about our Premium plan. Yes, both features are "
        "included:\n\n"
        "- **API Access**: Full REST API with 10,000 requests/month, "
        "webhook support, and SDKs for Python, JavaScript, and Go.\n"
        "- **Team Management**: Add up to 25 team members, assign roles "
        "(admin, editor, viewer), and manage permissions from a central "
        "dashboard.\n\n"
        "The Premium plan is $49/month when billed annually. I would be "
        "happy to set up a 14-day free trial so your team can evaluate "
        "both features. Just reply and I will send the activation link.\n\n"
        "Best,\nCustomer Success Team"
    ),
    "complaint": (
        "Hi there,\n\n"
        "I am truly sorry about the repeated crashes you have been "
        "experiencing. I understand how frustrating that must be, "
        "especially when it disrupts your daily workflow.\n\n"
        "Our engineering team has identified the issue and a fix is "
        "scheduled for release in version 4.2.1 this Thursday. In the "
        "meantime, clearing the app cache (Settings > Storage > Clear "
        "Cache) should reduce crash frequency. I have also flagged your "
        "account for priority support.\n\n"
        "I will personally follow up with you on Friday to confirm the "
        "update resolved the issue. Thank you for your patience.\n\n"
        "Warm regards,\nCustomer Success Team"
    ),
}

# Phrases from a customer message that identify each support scenario.
# The scenario names themselves ("feature", "question", "complaint") do not
# occur in the demo messages or prompt templates, so matching on the names
# alone sends those scenarios to the generic fallback reply.
_SCENARIO_TRIGGERS: Dict[str, Tuple[str, ...]] = {
    "refund_request": ("refund",),
    "feature_question": ("premium plan", "api access"),
    "complaint": ("crash", "frustrated"),
}

_FALLBACK_REPLY = "Thank you for reaching out. We will look into this."


def _route_reply(prompt: str, outputs: Dict[str, str]) -> str:
    """Return the first canned reply whose scenario is mentioned in the prompt.

    A scenario matches when the lower-cased prompt contains any word of its
    key (split on underscores) or any of its trigger phrases. Scenarios are
    tried in dict order; the fallback reply is returned when none matches.
    """
    text = prompt.lower()
    for key, output in outputs.items():
        needles = key.split("_") + list(_SCENARIO_TRIGGERS.get(key, ()))
        if any(needle in text for needle in needles):
            return output
    return _FALLBACK_REPLY


def sim_a(prompt: str, outputs: Optional[Dict[str, str]] = None) -> str:
    """Simulate the model's reply to a filled-in Prompt A.

    Args:
        prompt: The prompt text with the customer message already inserted.
        outputs: Canned replies keyed by scenario name; defaults to the
            module-level sim_outputs_a.
    Returns:
        The canned reply for the scenario detected in the prompt, or a
        generic acknowledgement when no scenario is recognised.
    """
    return _route_reply(prompt, sim_outputs_a if outputs is None else outputs)


def sim_b(prompt: str, outputs: Optional[Dict[str, str]] = None) -> str:
    """Simulate the model's reply to a filled-in Prompt B.

    Args:
        prompt: The prompt text with the customer message already inserted.
        outputs: Canned replies keyed by scenario name; defaults to the
            module-level sim_outputs_b.
    Returns:
        The canned reply for the scenario detected in the prompt, or a
        generic acknowledgement when no scenario is recognised.
    """
    return _route_reply(prompt, sim_outputs_b if outputs is None else outputs)

# Score both prompt variants against the same support test cases.
tester = PromptTester()
tester.evaluate(
    prompt_label="Prompt A (basic)",
    prompt_template=prompt_a,
    simulate_fn=sim_a,
    test_cases=test_cases,
)
tester.evaluate(
    prompt_label="Prompt B (CRAFT)",
    prompt_template=prompt_b,
    simulate_fn=sim_b,
    test_cases=test_cases,
)

# Print the per-prompt tables, then the head-to-head result.
tester.report()
tester.compare(label_a="Prompt A (basic)", label_b="Prompt B (CRAFT)")

## Exercise 3: Expose the Rigged A/B Test (15 min)

The A/B test below compares two prompts for writing product descriptions. The test concludes that "Prompt B" is the clear winner. But the test is **rigged** — the setup is deliberately biased to make Prompt B win regardless of actual quality.

**Your task:**
1. **Run** the test and read the report carefully
2. **Identify at least 3 specific biases** in the test design (look at test cases, simulated outputs, keywords, and scoring)
3. **Redesign a fair test** using the same `PromptTester` framework — write new test cases and simulated outputs that evaluate both prompts honestly
4. **Run your fair test** and report whether Prompt B still wins

In [None]:
# ── THE RIGGED A/B TEST ───────────────────────────────────────────
# This test is INTENTIONALLY biased. Your job is to find the biases.

# Two prompts for writing product descriptions: a bare instruction, and a
# version with labelled Context / Role / Task / Format / Tone sections
# separated by single spaces.
rigged_prompt_a = "Describe this product: {input}"

rigged_prompt_b = " ".join([
    "Context: You are writing for an e-commerce platform. The customer "
    "is tech-savvy and values detailed specifications.",
    "Role: Act as a senior product copywriter.",
    "Task: Write a compelling product description for: {input}. "
    "Include key features, benefits, and a comparison with alternatives.",
    "Format: Opening hook, 3 bullet points for features, closing CTA.",
    "Tone: Enthusiastic and persuasive.",
])

# Test cases — LOOK CAREFULLY at the expected_keywords
# (these cases are part of the deliberately biased setup; do not "fix" them)
rigged_test_cases = [
    TestCase(
        name="wireless_earbuds",
        input_text="NovaBuds Pro wireless earbuds with ANC",
        # BIAS: keywords are copied directly from Prompt B's simulated output
        expected_keywords=["immerse", "crystal-clear", "premium",
                           "noise-cancelling", "ergonomic", "seamless"],
        # NOTE: this pattern names the layout words from Prompt B's Format
        # instruction (hook / bullet / CTA), not product vocabulary.
        expected_format=r"(hook|bullet|feature).*CTA"
    ),
    TestCase(
        name="laptop_stand",
        input_text="ErgoRise adjustable laptop stand",
        expected_keywords=["elevate", "ergonomic", "premium", "aircraft-grade",
                           "posture", "sleek"],
        expected_format=r"(transform|upgrade|elevate).*workspace"
    ),
    TestCase(
        name="smart_water_bottle",
        input_text="HydroTrack smart water bottle with LED reminders",
        expected_keywords=["hydration", "intelligent", "premium", "LED",
                           "personalized", "game-changer"],
        expected_format=r"(never|always|every).*hydrat"
    ),
]

# Simulated outputs for Prompt A — DELIBERATELY weak and generic
def rigged_sim_a(prompt: str) -> str:
    """Return the same generic four-sentence blurb, ignoring *prompt* entirely."""
    sentences = [
        "This is a good product.",
        "It works well and looks nice.",
        "People will probably like using it.",
        "You should consider buying it if you need one.",
    ]
    return " ".join(sentences)

# Simulated outputs for Prompt B — STUFFED with expected keywords
# Keys are the TestCase names from rigged_test_cases; rigged_sim_b() returns
# one of these descriptions based on the prompt text.
rigged_outputs_b = {
    "wireless_earbuds": (
        "Immerse yourself in crystal-clear audio with the NovaBuds Pro. "
        "These premium wireless earbuds feature:\n"
        "- Advanced noise-cancelling technology that blocks 98% of ambient sound\n"
        "- Ergonomic design with 3 ear tip sizes for a seamless, secure fit\n"
        "- 32-hour total battery life with wireless charging case\n\n"
        "Ready to upgrade your audio? Order NovaBuds Pro today and experience "
        "the difference premium sound makes."
    ),
    "laptop_stand": (
        "Elevate your workspace with the ErgoRise adjustable laptop stand. "
        "This premium ergonomic accessory features:\n"
        "- Aircraft-grade aluminum construction for unmatched durability\n"
        "- 6 height positions to improve posture and reduce neck strain\n"
        "- Sleek, minimalist design that complements any desk setup\n\n"
        "Transform your workday \u2014 order your ErgoRise stand now."
    ),
    "smart_water_bottle": (
        "Never forget to stay hydrated again. The HydroTrack smart water "
        "bottle is a game-changer for your hydration habits. This premium "
        "intelligent bottle features:\n"
        "- LED reminder system with personalized hydration schedules\n"
        "- Temperature display and intake tracking via companion app\n"
        "- BPA-free, double-walled insulation keeps drinks cold for 24 hours\n\n"
        "Make every sip count \u2014 get your HydroTrack today."
    ),
}

def rigged_sim_b(prompt: str) -> str:
    """Return the keyword-stuffed description for the product named in *prompt*.

    The first entry of rigged_outputs_b whose key shares at least one word
    (split on underscores) with the lower-cased prompt wins; a generic
    one-liner is returned when no entry matches.
    """
    lowered = prompt.lower()
    # Testing the individual key words is sufficient: whenever the whole key
    # phrase occurs in the prompt, each of its words occurs too.
    return next(
        (text for key, text in rigged_outputs_b.items()
         if any(word in lowered for word in key.split("_"))),
        "A great product worth buying.",
    )


# Run the rigged test: same test cases for both prompts, but each prompt is
# paired with its own simulator.
rigged_tester = PromptTester()
rigged_tester.evaluate(
    prompt_label="Prompt A (basic)",
    prompt_template=rigged_prompt_a,
    simulate_fn=rigged_sim_a,
    test_cases=rigged_test_cases,
)
rigged_tester.evaluate(
    prompt_label="Prompt B (CRAFT)",
    prompt_template=rigged_prompt_b,
    simulate_fn=rigged_sim_b,
    test_cases=rigged_test_cases,
)
rigged_tester.report()
rigged_tester.compare(label_a="Prompt A (basic)", label_b="Prompt B (CRAFT)")


# Part 1: Identify biases
# Three written findings about the rigged setup above, printed one per line.

# Bias in the test cases: the expected keywords mirror Prompt B's output.
bias_1 = ("Expected keywords are copied directly from Prompt B's simulated output. "
          "Words like 'immerse', 'crystal-clear', 'game-changer' appear in both "
          "the test case keywords AND rigged_outputs_b, guaranteeing B gets perfect "
          "completeness scores.")

# Bias in Prompt A's simulator: one generic answer for every product.
bias_2 = ("rigged_sim_a returns the SAME generic output for ALL products \u2014 "
          "'This is a good product. It works well and looks nice.' \u2014 regardless "
          "of input. A real basic prompt would still produce product-specific "
          "output, just less polished.")

# Bias via the length-based consistency score (5 requires 30-500 words).
bias_3 = ("rigged_sim_a's output is deliberately too short (about 25 words) "
          "to score well on consistency (which needs 30+ words), while "
          "rigged_sim_b's outputs are all 50-80 words. This penalizes A on "
          "a dimension unrelated to prompt quality.")

print(f"Bias 1: {bias_1}\n")
print(f"Bias 2: {bias_2}\n")
print(f"Bias 3: {bias_3}\n")


# Part 2: Design a fair A/B test
# Same three products and inputs as the rigged test, but the keywords and
# patterns describe the product itself rather than one prompt's wording.
fair_test_cases = [
    TestCase(
        name="wireless_earbuds",
        input_text="NovaBuds Pro wireless earbuds with ANC",
        # Fair keywords: generic product description qualities
        expected_keywords=["noise", "battery", "wireless", "audio", "comfort", "earbuds"],
        expected_format=r"(NovaBuds|earbuds|wireless)"
    ),
    TestCase(
        name="laptop_stand",
        input_text="ErgoRise adjustable laptop stand",
        expected_keywords=["adjustable", "laptop", "height", "desk", "stand", "aluminum"],
        expected_format=r"(ErgoRise|laptop|stand|adjustable)"
    ),
    TestCase(
        name="smart_water_bottle",
        input_text="HydroTrack smart water bottle with LED reminders",
        expected_keywords=["water", "hydration", "LED", "reminder", "bottle", "track"],
        expected_format=r"(HydroTrack|water|bottle|hydrat)"
    ),
]


def fair_sim_a(prompt: str) -> str:
    """Simulate a basic prompt's answer: product-specific but plainly written.

    The product is recognised by looking for any single word of a product
    key (split on underscores) in the lower-cased prompt. Products are tried
    in dict order; a generic sentence is returned when none is recognised.
    """
    canned_by_product = {
        "wireless_earbuds": (
            "The NovaBuds Pro are wireless earbuds with active noise cancellation. "
            "They have good audio quality and a comfortable fit for most ear sizes. "
            "Battery life is about 8 hours per charge with the case providing "
            "additional charges. The noise cancellation works well for blocking "
            "out office and commute noise. They connect via Bluetooth 5.2 and "
            "are water resistant for workouts."
        ),
        "laptop_stand": (
            "The ErgoRise is an adjustable laptop stand made from aluminum. "
            "You can change the height to several positions to get a better "
            "viewing angle at your desk. It holds laptops up to 17 inches "
            "and the base is stable. The stand folds flat for portability "
            "and has rubber pads to prevent scratching."
        ),
        "smart_water_bottle": (
            "The HydroTrack is a smart water bottle that uses LED lights to "
            "remind you to drink water throughout the day. It tracks how much "
            "water you drink and syncs with a phone app. The bottle is insulated "
            "to keep drinks cold and has a 24-ounce capacity. The LED reminder "
            "system can be customized for your hydration goals."
        ),
    }
    haystack = prompt.lower()
    # Checking the individual key words covers the whole-phrase case as well:
    # if "laptop stand" is present, so is "laptop".
    return next(
        (text for product, text in canned_by_product.items()
         if any(word in haystack for word in product.split("_"))),
        "A solid product worth considering for your needs.",
    )


def fair_sim_b(prompt: str) -> str:
    """Simulate a CRAFT prompt's answer: structured, persuasive, product-specific.

    The product is recognised by looking for any single word of a product
    key (split on underscores) in the lower-cased prompt. Products are tried
    in dict order; a generic sentence is returned when none is recognised.
    """
    descriptions = {
        "wireless_earbuds": (
            "Experience premium audio anywhere with the NovaBuds Pro wireless "
            "earbuds. Featuring advanced active noise cancellation, these earbuds "
            "block up to 95% of ambient noise so you can focus on what matters.\n\n"
            "- 10-hour battery life with quick-charge case (32 hours total)\n"
            "- Comfort-fit design with 3 silicone tip sizes\n"
            "- Bluetooth 5.2 with seamless multi-device switching\n\n"
            "Compared to leading competitors, NovaBuds Pro delivers superior "
            "noise cancellation at a more accessible price point. Upgrade your "
            "audio experience today."
        ),
        "laptop_stand": (
            "Transform your workspace ergonomics with the ErgoRise adjustable "
            "laptop stand. Crafted from durable aluminum alloy, this stand "
            "offers 6 height positions to find your ideal viewing angle.\n\n"
            "- Supports laptops up to 17 inches and 25 lbs\n"
            "- Foldable design for easy portability\n"
            "- Non-slip rubber pads protect your desk and laptop\n\n"
            "Unlike fixed-height alternatives, the ErgoRise lets you customize "
            "your setup for both sitting and standing desk configurations. "
            "Elevate your comfort and productivity."
        ),
        "smart_water_bottle": (
            "Stay on top of your hydration goals with the HydroTrack smart "
            "water bottle. Its LED reminder system glows to prompt you to drink "
            "at personalized intervals throughout the day.\n\n"
            "- Real-time hydration tracking via companion app\n"
            "- Double-walled insulation keeps water cold for 24 hours\n"
            "- BPA-free, 24-oz capacity with leak-proof lid\n\n"
            "While traditional bottles leave hydration to chance, HydroTrack "
            "uses smart reminders to help you hit your daily water intake "
            "target consistently."
        ),
    }
    lowered = prompt.lower()
    for product, description in descriptions.items():
        # Skip products none of whose key words occur in the prompt; a
        # whole-phrase hit always implies a single-word hit.
        if not any(part in lowered for part in product.split("_")):
            continue
        return description
    return "A compelling product that deserves your attention."


# Run the fair test: the original two prompts, scored with the fair test
# cases and the product-specific simulators.
fair_tester = PromptTester()
fair_tester.evaluate(
    prompt_label="Prompt A (basic)",
    prompt_template=rigged_prompt_a,
    simulate_fn=fair_sim_a,
    test_cases=fair_test_cases,
)
fair_tester.evaluate(
    prompt_label="Prompt B (CRAFT)",
    prompt_template=rigged_prompt_b,
    simulate_fn=fair_sim_b,
    test_cases=fair_test_cases,
)
fair_tester.report()
winner = fair_tester.compare(label_a="Prompt A (basic)", label_b="Prompt B (CRAFT)")
print(f"Does Prompt B still win in a fair test? Winner: {winner}")

## Part 4: Visualization - Chain Performance

Good visualizations help you communicate the value of prompt engineering to stakeholders. This section produces three charts:

1. **Bar chart** - Processing "time" (simulated) across chain steps
2. **Heatmap** - Test case scores across evaluation dimensions
3. **Line chart** - Quality improvement across iterative refinement rounds, with output word count on a secondary axis

In [None]:
# One row of three side-by-side panels, one per chart below.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# ── Chart 1: Chain Step Processing Time ─────────────────────────
ax1 = axes[0]
steps_names = ["Extract\nTopics", "Create\nOutline", "Write\nIntro", "Polish\n& Edit"]
sim_times = [1.2, 2.1, 3.8, 2.5]  # simulated seconds (hardcoded, not measured)
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6']

bars = ax1.bar(steps_names, sim_times, color=colors, edgecolor='white', linewidth=1.5)
# Write each bar's value just above its top, centred on the bar.
for bar, t in zip(bars, sim_times):
    ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1,
             f"{t:.1f}s", ha='center', va='bottom', fontweight='bold', fontsize=10)

ax1.set_ylabel('Processing Time (seconds)', fontsize=11)
ax1.set_title('Chain Step Duration', fontsize=13, fontweight='bold')
# Leave headroom above the tallest bar so its label is not clipped.
ax1.set_ylim(0, max(sim_times) + 1)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

# ── Chart 2: Test Score Heatmap ────────────────────────────────
ax2 = axes[1]

# Illustrative scores for the CRAFT prompt. The matrix below is hardcoded;
# it is NOT read from the PromptTester results computed earlier.
dims = ["relevance", "completeness", "format_compl.", "consistency"]
test_names = ["refund_req", "feature_q", "complaint"]

# Simulated score matrix (tests x dimensions): one row per entry of
# test_names, one column per entry of dims.
score_matrix = np.array([
    [5, 5, 5, 5],  # refund_request - Prompt B
    [4, 5, 5, 5],  # feature_question
    [5, 4, 5, 5],  # complaint
])

# vmin/vmax pin the colour scale to the full 1-5 score range.
im = ax2.imshow(score_matrix, cmap='RdYlGn', aspect='auto', vmin=1, vmax=5)
ax2.set_xticks(range(len(dims)))
ax2.set_xticklabels(dims, fontsize=9, rotation=30, ha='right')
ax2.set_yticks(range(len(test_names)))
ax2.set_yticklabels(test_names, fontsize=10)
ax2.set_title('Test Scores (CRAFT Prompt)', fontsize=13, fontweight='bold')

# Annotate cells: white text for scores of 4 and above, black otherwise.
for i in range(len(test_names)):
    for j in range(len(dims)):
        ax2.text(j, i, str(score_matrix[i, j]),
                 ha='center', va='center', fontsize=14, fontweight='bold',
                 color='white' if score_matrix[i, j] >= 4 else 'black')

plt.colorbar(im, ax=ax2, shrink=0.8, label='Score (1-5)')

# ── Chart 3: Iterative Refinement Quality ──────────────────────
ax3 = axes[2]

# Hardcoded illustrative series; round 0 is labelled 'Initial' on the x axis.
rounds = [0, 1, 2, 3]
quality_scores = [8, 14, 18, 22]  # simulated total quality scores out of 25
word_counts = [15, 28, 45, 62]    # simulated word counts

# Left axis: quality score, drawn and labelled in green.
line1 = ax3.plot(rounds, quality_scores, 'o-', color='#27ae60',
                 linewidth=2.5, markersize=8, label='Quality Score')
ax3.set_ylabel('Quality Score (/25)', fontsize=11, color='#27ae60')
ax3.set_ylim(0, 25)
ax3.tick_params(axis='y', labelcolor='#27ae60')

# Right axis (shares the x axis): word count, drawn and labelled in blue.
ax3_twin = ax3.twinx()
line2 = ax3_twin.plot(rounds, word_counts, 's--', color='#3498db',
                       linewidth=2, markersize=7, label='Word Count')
ax3_twin.set_ylabel('Word Count', fontsize=11, color='#3498db')
ax3_twin.tick_params(axis='y', labelcolor='#3498db')

ax3.set_xlabel('Refinement Round', fontsize=11)
ax3.set_xticks(rounds)
ax3.set_xticklabels(['Initial', 'Round 1', 'Round 2', 'Round 3'])
ax3.set_title('Iterative Refinement Progress', fontsize=13, fontweight='bold')

# Combined legend: the two lines live on different axes, so their handles
# are collected by hand and passed to a single legend call.
lines = line1 + line2
labels = [l.get_label() for l in lines]
ax3.legend(lines, labels, loc='upper left', fontsize=9)

ax3.spines['top'].set_visible(False)
ax3_twin.spines['top'].set_visible(False)

# Figure-level title, then lay out and render all three panels.
plt.suptitle('Prompt Chain & Testing Analytics', fontsize=16,
             fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## Exercise 4: Workflow Architect (Challenge) (15 min)

Design a complete **hiring pipeline** that evaluates 4 job candidates using multiple workflow patterns and produces a visual summary.

**Pipeline Requirements:**

1. **Fan-out scoring:** Score each of the 4 candidates on 3 dimensions (technical skills, communication, culture fit) using a scoring function you write. Each score should be 1-5.

2. **Sequential processing:** Take the scored candidates and run them through:
   - **Rank:** Sort candidates by total score (highest first)
   - **Select:** Pick the top 2 candidates and explain why
   - **Interview Questions:** Generate 2 tailored interview questions per finalist based on their weakest dimension
   - **Summary:** Produce a final hiring recommendation memo

3. **Visualization:** Create a **grouped bar chart** showing all 4 candidates' scores across the 3 dimensions (4 groups of 3 bars each)

**The 4 candidate profiles are provided below.** You must write all transform functions and the visualization code.

In [None]:
# ── Candidate Profiles ────────────────────────────────────────────
# Four free-text profiles, each in the form "<Name> - <description>".
# score_candidate() takes the text before the first " - " as the name and
# scans the whole lower-cased profile for keywords.
candidates = [
    "Alex Chen - 8 years Python/ML experience, built recommendation "
    "systems at scale. Quiet in interviews but code samples are excellent. "
    "Prefers remote work, available immediately.",

    "Jordan Rivera - 3 years experience, bootcamp graduate. Very "
    "articulate and enthusiastic presenter. Built a popular open-source "
    "CLI tool. Wants mentorship and growth opportunities.",

    "Sam Patel - 12 years full-stack experience, led teams of 10+. "
    "Strong opinions about architecture, sometimes clashes with peers. "
    "Deep expertise in distributed systems and cloud infrastructure.",

    "Morgan Kim - 5 years experience, PhD in NLP. Published 4 papers "
    "on transformer architectures. Limited industry experience but "
    "strong theoretical foundation. Excellent written communication.",
]

# Step 1: Scoring function
def score_candidate(profile: str) -> str:
    """Score a candidate on 3 dimensions using keyword analysis.

    The profile is lower-cased and scanned for keyword signals. A signal only
    counts when it appears at the start of a word, so "ml" matches
    "Python/ML" but not "HTML", while "team" still matches "teams".

    Scoring rules (every score ends up in the 1-5 range):
        * Technical: base 2, +1 for 5+ years or +2 for 10+ years of
          experience, plus up to 2 points for technical keywords.
        * Communication: base 2, up to +2 for positive signals, -1 for each
          negative signal.
        * Culture fit: base 3, up to +2 for positive signals, -1 for each
          negative signal.

    Args:
        profile: Free-text profile in the form "<name> - <description>".

    Returns:
        A single line of the form
        "<name> | Technical: t/5 | Communication: c/5 | Culture Fit: f/5 | Total: n/15".
    """
    text = profile.lower()
    name = profile.split(" - ")[0].strip()

    def count_signals(signals) -> int:
        """Count the signals that occur in the profile at a word start."""
        # The leading \b stops short signals such as "ml" from matching
        # inside unrelated words ("html"); suffixes ("teams") still match.
        return sum(
            1 for signal in signals
            if re.search(r"\b" + re.escape(signal), text)
        )

    # Technical skills scoring
    technical = 2  # base
    years_match = re.search(r'(\d+)\s*years', text)
    if years_match:
        years = int(years_match.group(1))
        if years >= 10:
            technical += 2
        elif years >= 5:
            technical += 1
    tech_signals = ["python", "ml", "phd", "published", "distributed systems",
                    "full-stack", "scale", "architecture", "transformer"]
    technical += min(count_signals(tech_signals), 2)
    technical = min(technical, 5)

    # Communication scoring
    communication = 2  # base
    positive_comm = ["articulate", "enthusiastic", "presenter", "excellent written",
                     "communication", "popular"]
    # "clashes" already covers "sometimes clashes"; listing both would
    # penalise the same phrase twice.
    negative_comm = ["quiet", "clashes", "limited"]
    communication += min(count_signals(positive_comm), 2)
    communication -= count_signals(negative_comm)
    communication = max(1, min(communication, 5))

    # Culture fit scoring
    culture = 3  # base
    positive_culture = ["mentorship", "growth", "open-source", "team",
                        "available immediately", "collaboration"]
    negative_culture = ["clashes with peers", "strong opinions", "limited industry"]
    culture += min(count_signals(positive_culture), 2)
    culture -= count_signals(negative_culture)
    culture = max(1, min(culture, 5))

    total = technical + communication + culture
    return f"{name} | Technical: {technical}/5 | Communication: {communication}/5 | Culture Fit: {culture}/5 | Total: {total}/15"


# Step 2: Fan-out scoring
# Score every candidate profile through the engine's fan-out helper, then
# print one score line per candidate under a banner.
_rule = "=" * 60
print(_rule)
print("  FAN-OUT: Candidate Scoring")
print(_rule)
score_results = engine.fan_out(score_candidate, candidates)
# Each result is unpacked as a 2-tuple; only the second element (the score
# line) is displayed.
for _profile, _score_line in score_results:
    print(f"  {_score_line}")


# Step 3: Sequential pipeline
# Every step derives its output from the text it receives instead of
# hard-coding candidate names, so the finalists, their weakest dimensions and
# the memo always agree with the computed scores.
# NOTE(review): assumes engine.sequential feeds each step's output into the
# next step as its input text -- confirm against the engine implementation.
combined_scores = "\n".join(score for _, score in score_results)

# Dimension key -> label as printed in a score line. Insertion order is also
# the tie-break order when picking a strongest/weakest dimension.
_DIMENSIONS = {
    "technical": "Technical",
    "communication": "Communication",
    "culture_fit": "Culture Fit",
}

# One score line, with or without a leading "N. " rank prefix.
_SCORE_LINE = re.compile(
    r"(?:\d+\.\s+)?(?P<name>[^|]+?)\s*\|"
    r"\s*Technical:\s*(?P<technical>\d+)/5\s*\|"
    r"\s*Communication:\s*(?P<communication>\d+)/5\s*\|"
    r"\s*Culture Fit:\s*(?P<culture_fit>\d+)/5\s*\|"
    r"\s*Total:\s*(?P<total>\d+)/15\s*$"
)

# Header line written by the interview-question step for each finalist.
_FINALIST_HEADER = re.compile(
    r"(?P<name>.+?) \(weakest dimension: (?P<dimension>[^)]+)\):$"
)

# Two interview questions per dimension, used for a finalist's weakest one.
_QUESTION_BANK = {
    "technical": (
        "Walk us through the most technically demanding system you have "
        "built. Which design trade-offs did you make, and why?",
        "Pick a technical area where you have the least hands-on "
        "experience. How would you become productive in it within your "
        "first 90 days?",
    ),
    "communication": (
        "Walk us through a time you had to convince a skeptical "
        "stakeholder to adopt your technical approach. What was your strategy?",
        "How do you communicate complex technical concepts to non-technical "
        "team members? Give a specific example.",
    ),
    "culture_fit": (
        "Describe a situation where you disagreed with a colleague's "
        "architecture decision. How did you handle it and what was the outcome?",
        "How do you balance strong technical opinions with maintaining "
        "team harmony and psychological safety?",
    ),
}


def _parse_score_lines(text: str) -> List[dict]:
    """Extract every candidate score line found in ``text``.

    Lines that are not score lines (headings, rationale, blanks) are ignored.
    Each returned dict has a "name" string plus integer "technical",
    "communication", "culture_fit" and "total" entries.
    """
    parsed = []
    for line in text.splitlines():
        match = _SCORE_LINE.match(line.strip())
        if not match:
            continue
        entry = {"name": match.group("name")}
        for key in (*_DIMENSIONS, "total"):
            entry[key] = int(match.group(key))
        parsed.append(entry)
    return parsed


def _format_score_line(cand: dict) -> str:
    """Render a parsed candidate back into the canonical score-line format."""
    return (
        f"{cand['name']} | Technical: {cand['technical']}/5 | "
        f"Communication: {cand['communication']}/5 | "
        f"Culture Fit: {cand['culture_fit']}/5 | Total: {cand['total']}/15"
    )


def _by_total_desc(cands: List[dict]) -> List[dict]:
    """Sort candidates by total score, highest first (ties keep input order)."""
    return sorted(cands, key=lambda cand: cand["total"], reverse=True)


def _rank_candidates(text: str) -> str:
    """Rank: number the candidates from highest to lowest total score."""
    ranked = _by_total_desc(_parse_score_lines(text))
    return "Candidates Ranked by Total Score:\n" + "\n".join(
        f"  {position}. {_format_score_line(cand)}"
        for position, cand in enumerate(ranked, start=1)
    )


def _select_top_two(text: str) -> str:
    """Select: keep the two best-scoring candidates and explain the choice."""
    ranked = _by_total_desc(_parse_score_lines(text))
    finalists = ranked[:2]
    if not finalists:
        return "Top 2 Finalists Selected:\n\n(no scored candidates found)"

    blocks = []
    for position, cand in enumerate(finalists, start=1):
        strongest = max(_DIMENSIONS, key=lambda dim: cand[dim])
        blocks.append(
            f"{position}. {_format_score_line(cand)}\n"
            f"   Why: total of {cand['total']}/15, strongest in "
            f"{_DIMENSIONS[strongest]} ({cand[strongest]}/5)."
        )

    # Be explicit when the cut-off fell between equally scored candidates.
    cutoff = finalists[-1]["total"]
    tied = [cand["name"] for cand in ranked[2:] if cand["total"] == cutoff]
    if tied:
        blocks.append(
            f"Note: {', '.join(tied)} tied with {finalists[-1]['name']} on "
            f"total score ({cutoff}/15); the tie was broken by input order."
        )
    return "Top 2 Finalists Selected:\n\n" + "\n\n".join(blocks)


def _write_interview_questions(text: str) -> str:
    """Interview Questions: two questions per finalist on their weakest dimension."""
    sections = []
    for cand in _parse_score_lines(text):
        weakest = min(_DIMENSIONS, key=lambda dim: cand[dim])
        first_q, second_q = _QUESTION_BANK[weakest]
        sections.append(
            f"{cand['name']} (weakest dimension: {_DIMENSIONS[weakest]}):\n"
            f"  Q1: {first_q}\n"
            f"  Q2: {second_q}"
        )
    return "Tailored Interview Questions for Finalists:\n\n" + "\n\n".join(sections)


def _write_summary_memo(text: str) -> str:
    """Summary Memo: recommend the top finalist named in the interview plan."""
    # Explicit "+" is required here: adjacent string literals are joined
    # before "*" applies, which would repeat the whole title 40 times.
    header = "HIRING RECOMMENDATION MEMO\n" + "=" * 40 + "\n\n"

    matches = (_FINALIST_HEADER.match(line.strip()) for line in text.splitlines())
    finalists = [match.groupdict() for match in matches if match]
    if not finalists:
        return header + "No finalists were identified; no recommendation can be made."

    top = finalists[0]
    rationale = (
        f"Rationale: {top['name']} earned the highest total score across "
        "technical skills, communication and culture fit. "
    )
    if len(finalists) > 1:
        backups = ", ".join(entry["name"] for entry in finalists[1:])
        rationale += f"{backups} is a strong backup candidate. "
    rationale += (
        "Each finalist's weakest dimension should be probed with the "
        "tailored interview questions before an offer is finalized."
    )

    next_steps = [
        f"Schedule interview with {entry['name']} focused on "
        f"{entry['dimension']} (this week)"
        for entry in finalists
    ]
    next_steps.append("Prepare offer package for top candidate (by Friday)")

    return (
        header
        + "Position: Senior AI/ML Engineer\n"
        + f"Candidates Evaluated: {len(_parse_score_lines(combined_scores))}\n"
        + f"Finalists: {', '.join(entry['name'] for entry in finalists)}\n\n"
        + f"RECOMMENDATION: Extend offer to {top['name']} as first choice.\n\n"
        + rationale + "\n\n"
        + "NEXT STEPS:\n"
        + "\n".join(
            f"{number}. {step}" for number, step in enumerate(next_steps, start=1)
        )
    )


hiring_steps = [
    ("Rank", _rank_candidates),
    ("Select Top 2", _select_top_two),
    ("Interview Questions", _write_interview_questions),
    ("Summary Memo", _write_summary_memo),
]

hiring_results = engine.sequential(hiring_steps, combined_scores)

for name, output in hiring_results:
    print(f"\n{'=' * 60}")
    print(f"  [{name}]")
    print(f"{'=' * 60}")
    print(textwrap.indent(output, "    "))


# Step 4: Grouped bar chart
# Display order of the candidates along the x-axis.
candidate_names = ["Alex Chen", "Jordan Rivera", "Sam Patel", "Morgan Kim"]


def _score_after(label: str, scored_line: str) -> int:
    """Return the integer between ``label`` and the next "/" in a score line."""
    return int(scored_line.split(label)[1].split("/")[0])


# Re-score every profile and index the three dimension scores by the
# candidate name that leads each score line.
all_scores = {}
for _profile_text in candidates:
    _scored = score_candidate(_profile_text)
    all_scores[_scored.split(" | ")[0]] = {
        "technical": _score_after("Technical: ", _scored),
        "communication": _score_after("Communication: ", _scored),
        "culture_fit": _score_after("Culture Fit: ", _scored),
    }

# One list per dimension, aligned with candidate_names.
technical_scores, communication_scores, culture_fit_scores = (
    [all_scores[person][dimension] for person in candidate_names]
    for dimension in ("technical", "communication", "culture_fit")
)

x = np.arange(len(candidate_names))
width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# Three bars per candidate: shifted one bar-width left, centred, and right.
_series = (
    (-1, technical_scores, 'Technical', '#3498db'),
    (0, communication_scores, 'Communication', '#2ecc71'),
    (1, culture_fit_scores, 'Culture Fit', '#e74c3c'),
)
bars1, bars2, bars3 = [
    ax.bar(x + shift * width, values, width, label=legend, color=colour)
    for shift, values, legend, colour in _series
]

# Add value labels on bars
for _group in (bars1, bars2, bars3):
    for _rect in _group:
        _top = _rect.get_height()
        ax.text(
            _rect.get_x() + _rect.get_width() / 2.,
            _top + 0.05,
            f'{int(_top)}',
            ha='center',
            va='bottom',
            fontweight='bold',
        )

ax.set_title('Candidate Evaluation: Hiring Pipeline Results', fontsize=14, fontweight='bold')
ax.set_xlabel('Candidates', fontsize=12)
ax.set_ylabel('Score (1-5)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(candidate_names, fontsize=11)
ax.set_ylim(0, 6)  # headroom above the maximum score of 5 for the labels
ax.legend(fontsize=11)
for _side in ('top', 'right'):
    ax.spines[_side].set_visible(False)

plt.tight_layout()
plt.show()