# Customer Support Agent Evaluation Framework

## Overview

This notebook implements the [Strands Agents evaluation strategy](https://strandsagents.com/0.1.x/documentation/docs/user-guide/observability-evaluation/evaluation/?h=evaluation) with LLM Judge Evaluation to comprehensively assess your customer support agent running on Amazon Bedrock AgentCore Runtime.

### Evaluation Strategy

- **Multi-dimensional Quality Assessment**: Helpfulness, accuracy, clarity, professionalism, completeness
- **Tool Usage Analysis**: Appropriate tool selection and usage patterns
- **Performance Metrics**: Response times and success rates
- **LLM-as-Judge**: Claude 4 Sonnet for objective evaluation
- **Summarize test results**
- **Save outputs to a file**

## Setup and Dependencies

In [35]:
!pip install boto3 requests strands-agents bedrock-agentcore



In [36]:
import asyncio
import json
import time
import uuid
import boto3
import requests
from dataclasses import dataclass, asdict
from typing import Dict, List, Any, Optional
from datetime import datetime


# AWS clients
bedrock = boto3.client('bedrock-runtime')
ssm = boto3.client('ssm')

print("✅ Dependencies loaded successfully")

✅ Dependencies loaded successfully


## Configuration

In [37]:
# Configuration
AGENT_NAME = "customer_support_agent" 
EVALUATOR_MODEL = "us.anthropic.claude-sonnet-4-20250514-v1:0"

# Helper function to get SSM parameters
def get_ssm_parameter(name: str) -> str:
    try:
        response = ssm.get_parameter(Name=name, WithDecryption=True)
        return response['Parameter']['Value']
    except Exception as e:
        raise Exception(f"Failed to get SSM parameter {name}: {e}")

# Get agent endpoint
try:
    AGENT_ENDPOINT = get_ssm_parameter(f"/app/customersupport/agentcore/runtime_arn") # This is the default value set in lab-04-agentcore-runtime.ipynb 
    # print(f"✅ Agent endpoint: {AGENT_ENDPOINT}")
except Exception as e:
    print(f"⚠️  Could not get agent endpoint: {e}")
    AGENT_ENDPOINT = "http://localhost:8080"  # Fallback for local testing

## Data Classes and Test Cases

In [38]:
@dataclass
class TestCase:
    id: str
    query: str
    category: str
    expected_tools: List[str]
    expected_criteria: Dict[str, Any]
    description: str

@dataclass
class EvaluationResult:
    test_case_id: str
    query: str
    response: str
    metrics: Dict[str, float]
    response_time: float
    success: bool
    error_message: Optional[str] = None
    tool_calls: List[str] = None
    
    def to_dict(self):
        return asdict(self)

# Test cases
TEST_CASES = [
    TestCase(
        id="basic_greeting",
        query="Hi, I need help with my account",
        category="basic_inquiry",
        expected_tools=[],
        expected_criteria={"should_be_polite": True, "should_ask_for_details": True},
        description="Basic greeting and help request"
    ),
    TestCase(
        id="warranty_check",
        query="I have a Gaming Console Pro device, I want to check my warranty status, warranty serial number is MNO33333333",
        category="warranty_inquiry",
        expected_tools=["warranty_check", "gateway_tool"],
        expected_criteria={"should_use_serial_number": True, "should_provide_warranty_status": True},
        description="Warranty status check with serial number"
    ),
    # TestCase(
    #     id="calendar_check",
    #     query="What's my agenda for today?",
    #     category="calendar_integration",
    #     expected_tools=["google_calendar", "calendar_tool"],
    #     expected_criteria={"should_access_calendar": True, "should_show_events": True},
    #     description="Calendar agenda inquiry"
    # ),
    # TestCase(
    #     id="knowledge_base_query",
    #     query="What are the warranty support guidelines?",
    #     category="knowledge_base",
    #     expected_tools=["knowledge_base", "bedrock_kb"],
    #     expected_criteria={"should_search_knowledge_base": True, "should_provide_guidelines": True},
    #     description="Knowledge base query for guidelines"
    # ),
    TestCase(
        id="troubleshooting",
        query="I have overheating issues with my device, help me debug",
        category="technical_support",
        expected_tools=["knowledge_base", "troubleshooting_tool"],
        expected_criteria={"should_ask_device_details": True, "should_provide_steps": True},
        description="Technical troubleshooting assistance"
    )
]

print(f"✅ Loaded {len(TEST_CASES)} test cases")

✅ Loaded 3 test cases


## Agent Invocation and Evaluation Functions

In [39]:
# Initialize AgentCore Runtime client
from bedrock_agentcore_starter_toolkit import Runtime
from lab_helpers.utils import reauthenticate_user, get_existing_cognito_config
import os
import time
import uuid 
from typing import Dict, List, Any, Optional
import boto3
# Enable verbose logging for requests
import logging
## Set the below config parameters if you want to see detailed logs from the agent
#logging.basicConfig(level=logging.DEBUG)
#logging.getLogger("urllib3.connectionpool").setLevel(logging.DEBUG)
import urllib.parse

session_id = uuid.uuid4()
agentcore_client = boto3.client(
        'bedrock-agentcore',
    )
REGION_NAME="us-east-1"
cognito_config = get_existing_cognito_config()

async def invoke_agent(query: str, actor_id: str = "testuser") -> Dict[str, Any]:
    """Invoke the agent using AgentCore Runtime SDK with JWT token"""
    start_time = time.time()
    
        # Get bearer token
    bearer_token = reauthenticate_user(
        cognito_config.get("client_id"), 
        cognito_config.get("client_secret")
    )
        
    #auth_token = os.environ.get('TOKEN')
    #print(f"Using Agent ARN from environment: {AGENT_ENDPOINT}")

    # URL encode the agent ARN
    escaped_agent_arn = urllib.parse.quote(AGENT_ENDPOINT, safe='')

    # Construct the URL
    url = f"https://bedrock-agentcore.{REGION_NAME}.amazonaws.com/runtimes/{escaped_agent_arn}/invocations?qualifier=DEFAULT"

    # Set up headers
    headers = {
        "Authorization": f"Bearer {bearer_token}", #f"Bearer {auth_token}",
        "X-Amzn-Trace-Id": "1234", 
        "Content-Type": "application/json",
        "X-Amzn-Bedrock-AgentCore-Runtime-Session-Id": f"eval-session-{uuid.uuid4()}"
    }
    invoke_response = requests.post(
        url,
        headers=headers,
        data=json.dumps({"prompt": query})
    )

    # Print response in a safe manner
    print(f"Status Code: {invoke_response.status_code}")
    print(f"Response Headers: {dict(invoke_response.headers)}")

    if invoke_response.status_code == 200:
        try:
            response_data = invoke_response.json()
            print("Response JSON:", response_data)
            
            # Handle both string and dict responses
            if isinstance(response_data, str):
                response_text = response_data
            elif isinstance(response_data, dict):
                response_text = response_data.get("result", str(response_data))
            else:
                response_text = str(response_data)
                
            return {
                "response": response_text,
                "success": True,
                "tool_calls": extract_tool_calls(invoke_response),
                "response_time": time.time() - start_time
            }
        except json.JSONDecodeError:
            return {
                "response": invoke_response.text,
                "success": True,
                "tool_calls": [],
                "response_time": time.time() - start_time
            }
    else:
        error_msg = f"Error ({invoke_response.status_code}): {invoke_response.text[:500]}"
        print(error_msg)
        if invoke_response.status_code >= 400:
            print("Please ensure lab-04-agentcore-runtime.ipynb has been executed and agent is deployed")
        return {
            "response": error_msg,
            "success": False,
            "tool_calls": [],
            "response_time": time.time() - start_time
        }
    
    
def extract_tool_calls(response_data) -> List[str]:
    """Extract tool calls from agent response"""
    tool_calls = []
    # AgentCore response format may vary, adapt as needed
    if hasattr(response_data, 'tool_calls'):
        tool_calls = [str(tool) for tool in response_data.tool_calls]
    return tool_calls

print("✅ Agent invocation functions defined")

✅ Agent invocation functions defined


In [40]:
# cognito_config # Ensure the output matches what you see from lab-04

In [41]:
async def evaluate_response_quality(query: str, response: str, criteria: Dict[str, Any]) -> Dict[str, float]:
    """Evaluate response quality using Claude as judge"""
    
    evaluation_prompt = f"""
    You are an expert evaluator for customer support AI agents. Evaluate the following response on a scale of 1-5 for each metric.

    Customer Query: {query}
    Agent Response: {response}

    Evaluate on these metrics (1=Poor, 2=Below Average, 3=Average, 4=Good, 5=Excellent):

    1. HELPFULNESS: Does the response address the customer's needs and provide useful information?
    2. ACCURACY: Is the information provided factually correct and reliable?
    3. CLARITY: Is the response clear, well-structured, and easy to understand?
    4. PROFESSIONALISM: Does the response maintain appropriate tone and professionalism?
    5. COMPLETENESS: Does the response fully address all aspects of the query?

    Expected criteria: {json.dumps(criteria, indent=2)}

    Respond with ONLY a JSON object in this format:
    {{
        "helpfulness": <score>,
        "accuracy": <score>,
        "clarity": <score>,
        "professionalism": <score>,
        "completeness": <score>,
        "reasoning": "Brief explanation of scores"
    }}
    """
    
    try:
        response_obj = bedrock.invoke_model(
            modelId=EVALUATOR_MODEL,
            body=json.dumps({
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1000,
                "messages": [
                    {"role": "user", "content": evaluation_prompt}
                ]
            })
        )
        
        result = json.loads(response_obj['body'].read())
        content = result['content'][0]['text']
        
        # Extract JSON from response
        start_idx = content.find('{')
        end_idx = content.rfind('}') + 1
        json_str = content[start_idx:end_idx]
        
        scores = json.loads(json_str)
        return {k: v for k, v in scores.items() if k != "reasoning"}
        
    except Exception as e:
        print(f"Error in quality evaluation: {e}")
        return {
            "helpfulness": 0.0,
            "accuracy": 0.0,
            "clarity": 0.0,
            "professionalism": 0.0,
            "completeness": 0.0
        }

def evaluate_tool_usage(expected_tools: List[str], actual_tools: List[str]) -> float:
    """Evaluate tool usage effectiveness"""
    if not expected_tools:
        return 5.0 if not actual_tools else 3.0
    
    if not actual_tools:
        return 1.0
    
    expected_set = set(expected_tools)
    actual_set = set(actual_tools)
    
    precision = len(expected_set.intersection(actual_set)) / len(actual_set) if actual_set else 0
    recall = len(expected_set.intersection(actual_set)) / len(expected_set) if expected_set else 0
    
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return 1 + (f1 * 4)  # Scale to 1-5

print("✅ Evaluation functions defined")

✅ Evaluation functions defined


## Single Test Case Evaluation

In [42]:
async def evaluate_test_case(test_case: TestCase) -> EvaluationResult:
    """Evaluate a single test case"""
    print(f"🔍 Evaluating: {test_case.id} - {test_case.description}")
    
    # Invoke agent
    agent_result = await invoke_agent(test_case.query)
    
    # Handle None result
    if agent_result is None:
        return EvaluationResult(
            test_case_id=test_case.id,
            query=test_case.query,
            response="",
            metrics={},
            response_time=0.0,
            success=False,
            error_message="invoke_agent returned None"
        )
    
    if not agent_result["success"]:
        return EvaluationResult(
            test_case_id=test_case.id,
            query=test_case.query,
            response="",
            metrics={},
            response_time=agent_result["response_time"],
            success=False,
            error_message=agent_result["error_message"]
        )
    
    # Evaluate response quality
    quality_scores = await evaluate_response_quality(
        test_case.query,
        agent_result["response"],
        test_case.expected_criteria
    )
    
    # Evaluate tool usage
    tool_score = evaluate_tool_usage(
        test_case.expected_tools,
        agent_result["tool_calls"]
    )
    
    # Combine all metrics
    metrics = {
        **quality_scores,
        "tool_usage": tool_score,
        "response_time": agent_result["response_time"]
    }
    
    return EvaluationResult(
        test_case_id=test_case.id,
        query=test_case.query,
        response=agent_result["response"],
        metrics=metrics,
        response_time=agent_result["response_time"],
        success=True,
        tool_calls=agent_result["tool_calls"]
    )

print("✅ Test case evaluation function defined")

✅ Test case evaluation function defined


## Run Single Test Case (Demo)

In [43]:
# Test a single case first
demo_test = TEST_CASES[0]  # Basic greeting
demo_result = await evaluate_test_case(demo_test)

print(f"\n📊 Demo Result for '{demo_test.id}':")
print(f"Query: {demo_result.query}")
print(f"Response: {demo_result.response[:200]}..." if len(demo_result.response) > 200 else f"Response: {demo_result.response}")
print(f"Response Time: {demo_result.response_time:.3f}s")
print(f"Tool Calls: {demo_result.tool_calls}")

🔍 Evaluating: basic_greeting - Basic greeting and help request
Status Code: 200
Response Headers: {'Date': 'Tue, 02 Sep 2025 16:25:40 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'x-amzn-RequestId': 'b1dc031e-b9f1-49da-82da-db034ef045ca', 'baggage': 'session.id=eval-session-751459f6-8a6a-498d-9d1a-41c4368c80ab', 'X-Amzn-Bedrock-AgentCore-Runtime-Session-Id': 'eval-session-751459f6-8a6a-498d-9d1a-41c4368c80ab'}
Response JSON: Hi there! I'd be happy to help you with your account. To assist you better, could you please let me know what specific aspect of your account you need help with? For example, are you looking for:

- Information about your Gaming Console Pro device warranty
- Help with account login issues
- Order tracking information
- Return policy questions
- Technical support for your device
- Something else related to your account

The more details you can provide, the better I'll be able to assist you with your specific 

## Full Evaluation Suite

In [44]:
async def run_full_evaluation(test_cases: List[TestCase]) -> Dict[str, Any]:
    """Run evaluation on all test cases"""
    print(f"🚀 Starting evaluation of {len(test_cases)} test cases...")
    
    results = []
    for i, test_case in enumerate(test_cases, 1):
        print(f"\n[{i}/{len(test_cases)}] Processing: {test_case.id}")
        result = await evaluate_test_case(test_case)
        results.append(result)
        
        # Brief pause between tests
        await asyncio.sleep(1)
    
    # Calculate summary statistics
    summary = calculate_summary(results)
    
    return {
        "agent_name": AGENT_NAME,
        "total_test_cases": len(test_cases),
        "results": [result.to_dict() for result in results],
        "summary": summary,
        "timestamp": datetime.now().isoformat()
    }

def calculate_summary(results: List[EvaluationResult]) -> Dict[str, Any]:
    """Calculate summary statistics"""
    successful_results = [r for r in results if r.success]
    
    if not successful_results:
        return {"error": "No successful test cases"}
    
    # Average scores
    metrics = ["helpfulness", "accuracy", "clarity", "professionalism", "completeness", "tool_usage"]
    avg_scores = {}
    
    for metric in metrics:
        scores = [r.metrics.get(metric, 0) for r in successful_results if metric in r.metrics]
        avg_scores[metric] = sum(scores) / len(scores) if scores else 0
    
    # Response time statistics
    response_times = sorted([r.response_time for r in successful_results])
    n = len(response_times)
    
    percentiles = {
        "p50": response_times[n//2] if n > 0 else 0,
        "p90": response_times[int(n*0.9)] if n > 0 else 0,
        "p95": response_times[int(n*0.95)] if n > 0 else 0,
        "p99": response_times[int(n*0.99)] if n > 0 else 0,
    }
    
    return {
        "success_rate": len(successful_results) / len(results),
        "average_scores": avg_scores,
        "overall_score": sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0,
        "response_time_percentiles": percentiles,
        "total_successful": len(successful_results),
        "total_failed": len(results) - len(successful_results)
    }

print("✅ Full evaluation functions defined")

✅ Full evaluation functions defined


In [45]:
# Run full evaluation
evaluation_results = await run_full_evaluation(TEST_CASES)

print("\n" + "="*60)
print("📊 EVALUATION COMPLETE")
print("="*60)

# Display LLM Judge scores for each test
print("\n🤖 LLM JUDGE SCORES BY TEST CASE:")
print("-" * 60)
for result in evaluation_results.get("results", []):
    if result["success"] and result.get("metrics"):
        metrics = result["metrics"]
        print(f"\n📝 {result['test_case_id'].upper()}:")
        print(f"   Helpfulness:     {metrics.get('helpfulness', 0):.1f}/5.0")
        print(f"   Accuracy:        {metrics.get('accuracy', 0):.1f}/5.0")
        print(f"   Clarity:         {metrics.get('clarity', 0):.1f}/5.0")
        print(f"   Professionalism: {metrics.get('professionalism', 0):.1f}/5.0")
        print(f"   Completeness:    {metrics.get('completeness', 0):.1f}/5.0")
        print(f"   Tool Usage:      {metrics.get('tool_usage', 0):.1f}/5.0")
        avg_score = sum([v for k, v in metrics.items() if k != 'response_time']) / 6
        print(f"   📊 Average:      {avg_score:.1f}/5.0")
    else:
        print(f"\n❌ {result['test_case_id'].upper()}: FAILED")
        print(f"   Error: {result.get('error_message', 'Unknown error')}")

🚀 Starting evaluation of 3 test cases...

[1/3] Processing: basic_greeting
🔍 Evaluating: basic_greeting - Basic greeting and help request
Status Code: 200
Response Headers: {'Date': 'Tue, 02 Sep 2025 16:25:50 GMT', 'Content-Type': 'application/json', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'x-amzn-RequestId': '84e1f524-732d-4d98-b713-04874cd52d5b', 'baggage': 'session.id=eval-session-08183bc6-09c1-4331-823a-d154b980f823', 'X-Amzn-Bedrock-AgentCore-Runtime-Session-Id': 'eval-session-08183bc6-09c1-4331-823a-d154b980f823'}
Response JSON: Hello! I'd be happy to help you with your account. To better assist you, could you please let me know what specific account issue you're having? For example:

- Are you having trouble logging in?
- Do you need to update your account information?
- Are you looking for order history or tracking?
- Do you want to check warranty information for your Gaming Console Pro?
- Are you having issues with payments or billing?

Once I know more abo

## Results Analysis and Visualization

In [46]:
def print_detailed_summary(results: dict):
    """Print comprehensive evaluation summary"""
    summary = results.get("summary", {})
    
    print(f"🤖 Agent: {results['agent_name']}")
    print(f"📝 Total Test Cases: {results['total_test_cases']}")
    print(f"✅ Success Rate: {summary.get('success_rate', 0):.1%}")
    print(f"🎯 Overall Score: {summary.get('overall_score', 0):.2f}/5.0")
    
    print("\n📈 QUALITY METRICS (1-5 scale):")
    avg_scores = summary.get("average_scores", {})
    for metric, score in avg_scores.items():
        if metric != "response_time":
            emoji = "🟢" if score >= 4.0 else "🟡" if score >= 3.0 else "🔴"
            print(f"  {emoji} {metric.title()}: {score:.2f}")
    
    print("\n⏱️  RESPONSE TIME PERCENTILES:")
    percentiles = summary.get("response_time_percentiles", {})
    for p, time_val in percentiles.items():
        print(f"  {p.upper()}: {time_val:.3f}s")
    
    print("\n📋 DETAILED RESULTS:")
    for result in results.get("results", []):
        status = "✅" if result["success"] else "❌"
        if result["success"] and result.get("metrics"):
            score = sum(result["metrics"].values()) / len(result["metrics"])
            print(f"  {status} {result['test_case_id']}: {score:.2f}/5.0 ({result['response_time']:.3f}s)")
        else:
            print(f"  {status} {result['test_case_id']}: FAILED - {result.get('error_message', 'Unknown error')}")

# Print detailed summary
print_detailed_summary(evaluation_results)

🤖 Agent: customer_support_agent
📝 Total Test Cases: 3
✅ Success Rate: 100.0%
🎯 Overall Score: 3.61/5.0

📈 QUALITY METRICS (1-5 scale):
  🟡 Helpfulness: 3.67
  🟡 Accuracy: 3.67
  🟢 Clarity: 4.67
  🟢 Professionalism: 4.33
  🟡 Completeness: 3.00
  🔴 Tool_Usage: 2.33

⏱️  RESPONSE TIME PERCENTILES:
  P50: 11.404s
  P90: 17.285s
  P95: 17.285s
  P99: 17.285s

📋 DETAILED RESULTS:
  ✅ basic_greeting: 4.59/5.0 (6.150s)
  ✅ warranty_check: 4.06/5.0 (11.404s)
  ✅ troubleshooting: 5.61/5.0 (17.285s)


## Save Results

In [47]:
# Save results to file
output_file = f"evaluation_results_{AGENT_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

with open(output_file, 'w') as f:
    json.dump(evaluation_results, f, indent=2)

print(f"💾 Results saved to: {output_file}")

💾 Results saved to: evaluation_results_customer_support_agent_20250902_092632.json
