# Interactive Workflow Testing Notebook

This notebook provides interactive testing of the hiring workflow components.

Use this to:
- Test individual components (rubric generation, agent evaluation, synthesis)
- Explore the two-pass evaluation pattern
- Visualize decision packets and interview plans
- Debug workflow issues with real data

## Setup

In [None]:
import sys
# Put the parent directory first on the import path so the `src` package
# imported below can be resolved.
# NOTE(review): ".." is relative to the kernel's working directory, so this
# assumes the notebook is launched from its own folder — confirm.
sys.path.insert(0, "..")

from pathlib import Path
from pprint import pprint
import json

# Import models
from src.models.rubric import Rubric
from src.models.review import AgentReview
from src.models.memory import WorkingMemory
from src.models.packet import DecisionPacket
from src.models.interview import InterviewPlan

# Import nodes
# The underscore-prefixed HR helpers are pulled in on purpose: later cells call
# them directly to run each pass of the two-pass evaluation on its own.
from src.nodes.orchestrator import orchestrator_node
from src.nodes.hr_agent import hr_agent_node, _extract_working_memory, _evaluate_with_memory
from src.nodes.tech_agent import tech_agent_node
from src.nodes.compliance_agent import compliance_agent_node
from src.nodes.synthesis import synthesis_node

# Import workflow
from src.graph import run_hiring_workflow

# Import config
from src.config import get_settings

# The status marker was stored as mis-decoded bytes; emit the intended check mark.
print("✅ Imports successful")

## Load Sample Data

In [None]:
# All sample inputs sit in one folder; the path is relative to the notebook's
# working directory.
SAMPLE_DATA_DIR = Path("../examples/sample_data")

# Decode explicitly: without an encoding argument read_text() uses the platform
# locale, which can mis-decode non-ASCII characters on some systems.
# NOTE(review): assumes the sample files are saved as UTF-8 — confirm.

# Load job descriptions
job_backend = (SAMPLE_DATA_DIR / "job_backend_engineer.txt").read_text(encoding="utf-8")
job_ai = (SAMPLE_DATA_DIR / "job_senior_ai_engineer.txt").read_text(encoding="utf-8")

# Load resumes
resume_strong = (SAMPLE_DATA_DIR / "resume_strong_candidate.txt").read_text(encoding="utf-8")
resume_moderate = (SAMPLE_DATA_DIR / "resume_moderate_candidate.txt").read_text(encoding="utf-8")

# Character counts act as a quick sanity check that every file was found and non-empty.
print(f"Loaded {len(job_backend)} chars from backend job description")
print(f"Loaded {len(job_ai)} chars from AI engineer job description")
print(f"Loaded {len(resume_strong)} chars from strong candidate resume")
print(f"Loaded {len(resume_moderate)} chars from moderate candidate resume")

## Component Testing

### Test 1: Rubric Generation (Orchestrator)

In [None]:
# Minimal input state for the orchestrator: the AI engineer posting, the strong
# resume, and a one-line company blurb.
initial_state = dict(
    job_description=job_ai,
    resume=resume_strong,
    company_context="We're a Series B startup building AI-powered hiring tools.",
)

# The orchestrator's result carries the generated rubric under "rubric";
# later cells reuse it.
orchestrator_result = orchestrator_node(initial_state)
rubric = orchestrator_result["rubric"]

print("\n=== Generated Rubric ===")
print(f"Categories: {len(rubric.categories)}")
print("\nCategory Breakdown:")
for category in rubric.categories:
    # One summary line, a 100-character preview of the description, then the
    # number of scoring criteria, followed by a blank line.
    print(f"  - {category.name} (weight: {category.weight:.2f}, must-have: {category.is_must_have})")
    print(f"    {category.description[:100]}...")
    print(f"    Scoring levels: {len(category.scoring_criteria)}\n")

### Test 2: Working Memory Extraction (Two-Pass Pattern - Pass 1)

In [None]:
# State shared by the agent cells below: raw inputs plus the generated rubric.
state_with_rubric = dict(
    job_description=job_ai,
    resume=resume_strong,
    rubric=rubric,
)

# Pass 1: have the HR agent build its working memory from that state.
hr_memory = _extract_working_memory(state_with_rubric)

print("\n=== HR Agent Working Memory ===")
print(f"Agent Role: {hr_memory.agent_role}")
print(f"Observations: {len(hr_memory.observations)}")
print(f"Cross-references: {len(hr_memory.cross_references)}")
print(f"Ambiguities: {len(hr_memory.ambiguities)}")

# Only the first three observations are shown to keep the output short.
print("\nSample Observations:")
sample_observations = hr_memory.observations[:3]
for observation in sample_observations:
    print(f"  [{observation.observation_type.upper()}] {observation.category_name}")
    print(f"    {observation.observation}")

# The ambiguity list is printed in full, but only when there is something in it.
open_questions = hr_memory.ambiguities
if open_questions:
    print("\nAmbiguities to explore in interview:")
    for ambiguity in open_questions:
        print(f"  - {ambiguity}")

### Test 3: Agent Evaluation (Two-Pass Pattern - Pass 2)

In [None]:
# Pass 2: score the candidate using the working memory gathered in the previous cell.
hr_review = _evaluate_with_memory(state_with_rubric, hr_memory)

print("\n=== HR Agent Review ===")
print(f"Agent: {hr_review.agent_role}")
print(f"Categories Scored: {len(hr_review.category_scores)}")

print("\nScores:")
for category_score in hr_review.category_scores:
    print(f"  {category_score.category_name}: {category_score.score}/5 (confidence: {category_score.confidence})")
    # Show a 100-character preview of the first evidence item, when any exists.
    evidence_items = category_score.evidence
    if evidence_items:
        print(f"    Evidence: {evidence_items[0].text[:100]}...")
    print()

### Test 4: Full Panel Evaluation

In [None]:
# Each panel agent is run independently against the same rubric state.
hr_result = hr_agent_node(state_with_rubric)
tech_result = tech_agent_node(state_with_rubric)
compliance_result = compliance_agent_node(state_with_rubric)

# Fold the three results into one state: the first review from each agent, and
# the per-agent working-memory mappings merged in HR -> tech -> compliance order
# (a later agent's entry wins if keys collide).
agent_results = (hr_result, tech_result, compliance_result)
panel_state = {
    **state_with_rubric,
    "panel_reviews": [result["panel_reviews"][0] for result in agent_results],
    "agent_working_memory": {
        role: agent_memory
        for result in agent_results
        for role, agent_memory in result["agent_working_memory"].items()
    },
}

print("\n=== Panel Evaluation Complete ===")
print(f"Total Reviews: {len(panel_state['panel_reviews'])}")
print(f"Agents: {', '.join(panel_state['agent_working_memory'].keys())}")

# Line up every agent's score for the rubric's first category.
first_category = rubric.categories[0].name
print(f"\nScore comparison for '{first_category}':")
for review in panel_state["panel_reviews"]:
    matching_scores = [
        entry for entry in review.category_scores
        if entry.category_name == first_category
    ]
    for entry in matching_scores:
        print(f"  {review.agent_role}: {entry.score}/5")

### Test 5: Synthesis & Decision Packet

In [None]:
# Run synthesis over the combined panel state from the previous cell.
synthesis_result = synthesis_node(panel_state)

# The three outputs read from the synthesis result; interview_plan is used by
# the next cell.
decision_packet = synthesis_result["decision_packet"]
interview_plan = synthesis_result["interview_plan"]
disagreements = synthesis_result["disagreements"]

print("\n=== Decision Packet ===")
print(f"Weighted Average Score: {decision_packet.weighted_average_score:.2f}/5.0")
print(f"Recommendation: {decision_packet.recommendation}")
print(f"Confidence: {decision_packet.confidence}")

# The warning headers previously printed mis-decoded bytes; they now emit the
# intended warning sign.
if decision_packet.must_have_gaps:
    print(f"\n⚠️  Must-Have Gaps ({len(decision_packet.must_have_gaps)}):")
    for gap in decision_packet.must_have_gaps:
        print(f"  - {gap}")

if disagreements:
    print(f"\n⚠️  Disagreements ({len(disagreements)}):")
    for dis in disagreements:
        print(f"  - {dis.category_name} (delta: {dis.score_delta})")
        # Only the first 150 characters of the reason are shown here.
        print(f"    Reason: {dis.reason[:150]}...")

### Test 6: Interview Plan

In [None]:
# Headline counts for the interview plan produced by the synthesis cell.
print("\n=== Interview Plan ===")
print(f"Priority Areas: {len(interview_plan.priority_areas_to_probe)}")
print(f"Total Questions: {interview_plan.total_questions()}")

print("\nPriority Areas to Probe:")
for focus_area in interview_plan.priority_areas_to_probe:
    print(f"  - {focus_area}")

# Preview at most five questions, numbered from 1.
print("\nSample Interview Questions:")
for number, item in enumerate(interview_plan.questions[:5], start=1):
    print(f"\n{number}. [{item.suggested_interviewer}] {item.question}")
    print(f"   Why ask: {item.why_ask}")

## Full Workflow Test

In [None]:
# Run complete workflow end-to-end
print("Running full hiring workflow...\n")

final_state = run_hiring_workflow(
    job_description=job_ai,
    resume=resume_strong,
    company_context="Series B startup building AI-powered hiring tools",
)

divider = "=" * 60
print("\n" + divider)
print("         HIRING WORKFLOW RESULTS")
print(divider)

# Summary — packet and plan are reused by the export cell further down.
packet = final_state["decision_packet"]
plan = final_state["interview_plan"]

# The emoji prefixes were stored as mis-decoded bytes and printed as garbage;
# they are restored to the intended symbols here.
# NOTE(review): .upper() assumes packet.confidence is a plain string — confirm
# against the DecisionPacket model.
print(f"\n📊 Overall Score: {packet.weighted_average_score:.2f}/5.0")
print(f"📝 Recommendation: {packet.recommendation}")
print(f"🎯 Confidence: {packet.confidence.upper()}")
print(f"\n📋 Panel Reviews: {len(final_state['panel_reviews'])}")
print(f"💭 Working Memory: {len(final_state['agent_working_memory'])} agents")
print(f"⚡ Disagreements: {len(final_state['disagreements'])}")
print(f"❓ Interview Questions: {plan.total_questions()}")
print(f"🎯 Priority Areas: {len(plan.priority_areas_to_probe)}")

if packet.must_have_gaps:
    print(f"\n⚠️  Critical Gaps:")
    for gap in packet.must_have_gaps:
        print(f"   - {gap}")

print("\n" + divider)

## Visualization & Exploration

In [None]:
import pandas as pd

# Flatten the full-workflow reviews into one record per (agent, category) score.
score_data = [
    {
        "Agent": review.agent_role,
        "Category": score.category_name,
        "Score": score.score,
        "Confidence": score.confidence,
    }
    for review in final_state["panel_reviews"]
    for score in review.category_scores
]

df = pd.DataFrame(score_data)
print("\n=== Score Comparison Table ===")
if df.empty:
    # With no records the frame has no "Category"/"Agent" columns, so pivot()
    # would fail with a KeyError rather than show an empty table.
    print("No category scores to compare.")
else:
    # Rows are categories, columns are agents. pivot() raises ValueError if the
    # same agent reports a category more than once.
    print(df.pivot(index="Category", columns="Agent", values="Score"))

In [None]:
# Export results to JSON
output_dir = Path("../examples/results")
# parents=True lets the export succeed even when an intermediate folder is
# missing; exist_ok=True keeps re-runs from failing.
output_dir.mkdir(parents=True, exist_ok=True)

# Output file name -> model to serialise (decision packet and interview plan
# from the full-workflow cell).
exports = {
    "decision_packet.json": packet,
    "interview_plan.json": plan,
}

for filename, model in exports.items():
    # Pin the file encoding so the output does not depend on the platform locale.
    with open(output_dir / filename, "w", encoding="utf-8") as f:
        # default=str is a fallback for any value the JSON encoder cannot handle.
        json.dump(model.model_dump(mode="json"), f, indent=2, default=str)

# The status marker previously printed mis-decoded bytes; emit the intended check mark.
print(f"\n✅ Results exported to {output_dir}")

## Error Handling Demo

In [None]:
# Test validation errors: an empty job description is expected to be rejected
# with a ValueError.
try:
    # Try to run workflow without job description
    run_hiring_workflow(
        job_description="",  # Invalid: empty
        resume=resume_strong,
    )
except ValueError as e:
    print(f"✅ Caught expected validation error: {e}")
else:
    # Without this branch the demo would pass silently if validation were
    # missing, hiding exactly the regression it is meant to reveal.
    print("Expected a ValueError for the empty job description, but none was raised")

## Interactive Exploration

Use the cells below to explore specific aspects of the workflow:

In [None]:
# Pick one agent's working memory out of the full-workflow result.
agent_role = "hr_agent"  # Change to explore different agents
memory_by_agent = final_state["agent_working_memory"]
memory = memory_by_agent[agent_role]

print(f"\n=== {agent_role.upper()} Working Memory ===")
print(f"\nObservations ({len(memory.observations)}):")
# Every observation is listed here (no truncation), one per line.
for entry in memory.observations:
    print(f"  [{entry.observation_type}] {entry.category_name}: {entry.observation}")

In [None]:
# Print a full report for each disagreement recorded by the workflow:
# the score delta, each agent's score, the reason, and the resolution approach.
for disagreement in final_state["disagreements"]:
    print(f"\n=== Disagreement: {disagreement.category_name} ===")
    print(f"Score Delta: {disagreement.score_delta}")
    print("\nAgent Scores:")
    for agent_name, agent_score in disagreement.agent_scores.items():
        print(f"  {agent_name}: {agent_score}/5")
    print(f"\nReason: {disagreement.reason}")
    print(f"\nResolution: {disagreement.resolution_approach}")

In [None]:
# Scratch cell for comparing the moderate and strong candidates.
comparison_banner = "\nComparing candidate evaluations...\n"
print(comparison_banner)

# Nothing is computed here yet: re-run the workflow with a different resume and
# compare the outputs by hand.