# Lab 3: LLM Evaluation & Agentic AI with Bedrock

**Duration:** 60-90 minutes  
**Cost:** < $1.00 (using Claude 3 Haiku, Titan Text Express, and Titan Embeddings)

## Learning Objectives
1. Evaluate LLM outputs using multiple metrics
2. Compare different foundation models
3. Implement prompt engineering best practices
4. Build agentic AI workflows with tool use
5. Create self-improving AI agents

## Prerequisites
- Completion of Labs 1 & 2
- Understanding of LLM concepts
- Basic Python and AWS knowledge

## Part 1: LLM Evaluation

### 1. Setup and Configuration

In [None]:
# Install required packages: the AWS SDK (boto3), data/plotting libraries, and
# the text-metric packages (nltk, textstat) that this lab imports.
!pip install -q boto3 pandas numpy matplotlib seaborn scikit-learn nltk textstat

In [None]:
import boto3
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from typing import List, Dict, Any
import nltk
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics.pairwise import cosine_similarity
import textstat

# Download NLTK data
# Older NLTK releases read the sentence tokenizer from 'punkt'; newer releases
# (3.9 and later) load it from 'punkt_tab' and nltk.sent_tokenize raises
# LookupError when that resource is missing. Request both so the coherence
# metric works whichever NLTK version pip installed.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Initialize Bedrock
# 'bedrock-runtime' is the inference client used for every model invocation in
# this lab; 'bedrock' is the control-plane client.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')
bedrock = boto3.client('bedrock', region_name='us-east-1')

print("✓ Environment setup complete")

### 2. LLM Invocation Utilities

In [None]:
class BedrockModelEvaluator:
    """
    Utility class for invoking and evaluating Bedrock models.

    Wraps a client exposing ``invoke_model(modelId=..., body=...)`` and
    returns every model's answer in one common shape::

        {'text', 'latency', 'model', 'input_tokens', 'output_tokens'}

    ``latency`` is the time in seconds spent inside the ``invoke_model``
    call (reading and parsing the response body is not included).
    """
    def __init__(self, bedrock_runtime_client):
        self.client = bedrock_runtime_client
        # Short lab aliases -> Bedrock model identifiers.
        self.models = {
            'claude-haiku': 'anthropic.claude-3-haiku-20240307-v1:0',
            'claude-sonnet': 'anthropic.claude-3-sonnet-20240229-v1:0',
            'titan-express': 'amazon.titan-text-express-v1',
        }

    def _timed_invoke(self, model_id, payload):
        """Send ``payload`` to ``model_id``; return ``(parsed_body, latency)``."""
        body = json.dumps(payload)

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        response = self.client.invoke_model(modelId=model_id, body=body)
        latency = time.perf_counter() - start_time

        return json.loads(response['body'].read()), latency

    def invoke_claude(self, prompt, model_key='claude-haiku', max_tokens=512, temperature=0.7):
        """Invoke Claude models.

        Args:
            prompt: User message text.
            model_key: Alias from ``self.models`` (KeyError if unknown).
            max_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature.

        Returns:
            The common result dictionary described on the class.
        """
        response_body, latency = self._timed_invoke(
            self.models[model_key],
            {
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": max_tokens,
                "temperature": temperature,
                "messages": [{"role": "user", "content": prompt}]
            }
        )

        # The reply is a list of content blocks; concatenate every text block
        # instead of assuming the whole answer lives in the first one.
        text = ''.join(
            block.get('text', '')
            for block in response_body['content']
            if block.get('type', 'text') == 'text'
        )
        usage = response_body.get('usage', {})

        return {
            'text': text,
            'latency': latency,
            'model': model_key,
            'input_tokens': usage.get('input_tokens', 0),
            'output_tokens': usage.get('output_tokens', 0)
        }

    def invoke_titan(self, prompt, max_tokens=512, temperature=0.7):
        """Invoke Titan models (always the 'titan-express' alias).

        Args:
            prompt: Input text.
            max_tokens: Sent as ``maxTokenCount``.
            temperature: Sampling temperature.

        Returns:
            The common result dictionary described on the class.
        """
        response_body, latency = self._timed_invoke(
            self.models['titan-express'],
            {
                "inputText": prompt,
                "textGenerationConfig": {
                    "maxTokenCount": max_tokens,
                    "temperature": temperature,
                    "topP": 0.9
                }
            }
        )

        first_result = response_body['results'][0]

        return {
            'text': first_result['outputText'],
            'latency': latency,
            'model': 'titan-express',
            'input_tokens': response_body.get('inputTextTokenCount', 0),
            'output_tokens': first_result.get('tokenCount', 0)
        }

    def invoke_model(self, prompt, model_key='claude-haiku', **kwargs):
        """Universal model invocation.

        Routes on the alias: keys containing 'claude' go to ``invoke_claude``,
        keys containing 'titan' go to ``invoke_titan``. Extra keyword
        arguments are forwarded unchanged.

        Raises:
            ValueError: If the alias matches neither model family.
        """
        if 'claude' in model_key:
            return self.invoke_claude(prompt, model_key, **kwargs)
        elif 'titan' in model_key:
            return self.invoke_titan(prompt, **kwargs)
        else:
            raise ValueError(f"Unknown model: {model_key}")

# Single evaluator instance bound to the Bedrock runtime client created above.
evaluator = BedrockModelEvaluator(bedrock_runtime_client=bedrock_runtime)
print("✓ Model evaluator initialized")

### 3. Evaluation Metrics

In [None]:
class LLMMetrics:
    """
    Comprehensive LLM evaluation metrics.

    All methods are static. Metrics that need a model call (embeddings,
    LLM-as-judge) take the client as an argument; the client must expose
    ``invoke_model(modelId=..., body=...)``.
    """
    @staticmethod
    def readability_score(text):
        """Flesch Reading Ease score (nominally 0-100, higher = easier).

        Returns 0 when the score cannot be computed.
        """
        try:
            return textstat.flesch_reading_ease(text)
        except Exception:
            # Best-effort metric: a scoring failure should not abort a whole
            # evaluation run. Unlike a bare ``except:``, this still lets
            # KeyboardInterrupt / SystemExit propagate.
            return 0

    @staticmethod
    def get_embedding(text, bedrock_client):
        """Get the Titan embedding of ``text`` as a 1-D numpy array."""
        body = json.dumps({"inputText": text})
        response = bedrock_client.invoke_model(
            modelId='amazon.titan-embed-text-v1',
            body=body
        )
        response_body = json.loads(response['body'].read())
        return np.array(response_body['embedding'])

    @staticmethod
    def semantic_similarity(text1, text2, bedrock_client):
        """Cosine similarity between the embeddings of the two texts."""
        # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
        emb1 = LLMMetrics.get_embedding(text1, bedrock_client).reshape(1, -1)
        emb2 = LLMMetrics.get_embedding(text2, bedrock_client).reshape(1, -1)
        return cosine_similarity(emb1, emb2)[0][0]

    @staticmethod
    def coherence_score(text):
        """Simple coherence proxy based on average sentence length.

        Returns 1.0 when the average sentence has 17.5 words and decays as
        the average moves away from that; returns 0 for text without
        sentences.
        """
        sentences = nltk.sent_tokenize(text)
        if not sentences:
            return 0
        avg_length = np.mean([len(s.split()) for s in sentences])
        # Ideal sentence length 15-20 words
        return 1 / (1 + abs(avg_length - 17.5) / 10)

    @staticmethod
    def factuality_check(text, bedrock_client, reference_facts,
                         judge_model_id='anthropic.claude-3-haiku-20240307-v1:0'):
        """
        LLM-as-judge factuality check.

        Asks the judge model to rate ``text`` against ``reference_facts`` on
        a 0-10 scale and returns that rating normalized to the range 0-1.

        Args:
            text: Text whose factual accuracy is rated.
            bedrock_client: Client used to call the judge model.
            reference_facts: Facts the judge should compare against.
            judge_model_id: Bedrock model ID of the judge.

        Returns:
            A float in [0, 1], or 0.5 (neutral) when the reply contains no
            number at all.
        """
        # Function-scope import so this cell does not depend on another
        # cell having imported ``re``.
        import re

        prompt = f"""Evaluate the factual accuracy of the following text.

Reference Facts:
{reference_facts}

Text to Evaluate:
{text}

Rate factual accuracy on a scale of 0-10 where:
0 = Completely inaccurate
10 = Perfectly accurate

Respond with only a number between 0 and 10."""

        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 10,
            "messages": [{"role": "user", "content": prompt}]
        })

        response = bedrock_client.invoke_model(
            modelId=judge_model_id,
            body=body
        )

        response_body = json.loads(response['body'].read())
        score_text = response_body['content'][0]['text'].strip()

        # Judges do not always obey "only a number" ("8/10", "Score: 8").
        # Take the first number in the reply rather than requiring the whole
        # reply to parse as a float.
        match = re.search(r'-?\d+(?:\.\d+)?', score_text)
        if match is None:
            return 0.5

        # Normalize to 0-1 and clamp, so an out-of-range rating cannot push
        # downstream composite scores outside their documented range.
        return min(max(float(match.group()) / 10, 0.0), 1.0)

    @staticmethod
    def evaluate_response(response, ground_truth=None, reference_facts=None, bedrock_client=None):
        """Comprehensive evaluation of one model response.

        Args:
            response: Dictionary with at least 'text', 'model', 'latency'
                and 'output_tokens' keys.
            ground_truth: Optional reference answer; enables
                'semantic_similarity' when a client is also given.
            reference_facts: Optional facts; enables 'factuality' when a
                client is also given.
            bedrock_client: Client for the metrics that call a model.

        Returns:
            Dictionary of metrics. 'semantic_similarity' and 'factuality'
            are present only when their inputs were supplied.
        """
        text = response['text']

        metrics = {
            'model': response['model'],
            'latency': response['latency'],
            'output_tokens': response['output_tokens'],
            'readability': LLMMetrics.readability_score(text),
            'coherence': LLMMetrics.coherence_score(text),
            'length': len(text.split())
        }

        if ground_truth and bedrock_client:
            metrics['semantic_similarity'] = LLMMetrics.semantic_similarity(
                text, ground_truth, bedrock_client
            )

        if reference_facts and bedrock_client:
            metrics['factuality'] = LLMMetrics.factuality_check(
                text, bedrock_client, reference_facts
            )

        return metrics

print("✓ Metrics class created")

### 4. Model Comparison Experiment

In [None]:
# Prompt sent to every model, plus the references its answers are scored against.
test_prompt = "Explain what Amazon S3 is and its main use cases in 2-3 sentences."

# Reference answer for the embedding-similarity metric. Adjacent literals keep
# the trailing space before each line break explicit.
ground_truth = (
    "Amazon S3 (Simple Storage Service) is a scalable object storage service \n"
    "that stores data as objects within buckets. It's commonly used for backup and restore, \n"
    "data archiving, content distribution, data lakes, and hosting static websites."
)

# Facts handed to the LLM judge for the factuality metric.
reference_facts = (
    "Amazon S3 is object storage. It stores data in buckets. \n"
    "Common uses include backup, archiving, data lakes, and static website hosting."
)

# Model aliases to compare
models_to_test = ['claude-haiku', 'titan-express']

# Same generation settings for every model so the comparison is like-for-like.
generation_settings = {'max_tokens': 200, 'temperature': 0.7}

print("Running model comparison...\n")
results = []

for model in models_to_test:
    print(f"Testing {model}...")

    response = evaluator.invoke_model(test_prompt, model_key=model, **generation_settings)

    metrics = LLMMetrics.evaluate_response(
        response,
        ground_truth=ground_truth,
        reference_facts=reference_facts,
        bedrock_client=bedrock_runtime,
    )

    # Keep the raw answer next to its scores for the printout in a later cell.
    metrics['response_text'] = response['text']
    results.append(metrics)

    print(f"✓ {model} complete\n")

# One row per model
comparison_df = pd.DataFrame(results)
summary_columns = [
    'model', 'latency', 'readability', 'coherence',
    'semantic_similarity', 'factuality', 'output_tokens',
]
print("Model Comparison Results:")
print(comparison_df[summary_columns].to_string(index=False))

In [None]:
# Visualize comparison: a 2x2 grid with latency, normalized quality metrics,
# output tokens, and a weighted composite score.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Latency comparison
axes[0, 0].bar(comparison_df['model'], comparison_df['latency'])
axes[0, 0].set_title('Response Latency', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('Seconds')
axes[0, 0].tick_params(axis='x', rotation=45)

# Quality metrics
quality_metrics = ['readability', 'coherence', 'semantic_similarity', 'factuality']
# Place the bar groups by the rows actually in the DataFrame (not by the
# requested model list) so positions and tick labels stay aligned if the two
# ever differ.
x = np.arange(len(comparison_df))
width = 0.2

for i, metric in enumerate(quality_metrics):
    if metric in comparison_df.columns:
        peak = comparison_df[metric].max()
        # Scale each metric to its best model. A zero peak would divide by
        # zero and a negative peak (Flesch scores can be negative) would flip
        # the ordering, so fall back to a flat zero series in those cases.
        if peak > 0:
            normalized = comparison_df[metric] / peak
        else:
            normalized = comparison_df[metric] * 0
        axes[0, 1].bar(x + i*width, normalized, width, label=metric)

axes[0, 1].set_title('Quality Metrics (Normalized)', fontsize=12, fontweight='bold')
axes[0, 1].set_xticks(x + width * 1.5)
axes[0, 1].set_xticklabels(comparison_df['model'])
axes[0, 1].legend()
axes[0, 1].tick_params(axis='x', rotation=45)

# Token usage
axes[1, 0].bar(comparison_df['model'], comparison_df['output_tokens'])
axes[1, 0].set_title('Output Tokens', fontsize=12, fontweight='bold')
axes[1, 0].set_ylabel('Token Count')
axes[1, 0].tick_params(axis='x', rotation=45)

# Overall score (composite): weights sum to 1.0.
# Flesch Reading Ease is not strictly bounded to 0-100, so clip its 0-1
# rescaling; otherwise the composite can leave the 0-1 range the axis label
# promises. Optional metrics that were not computed contribute 0.
comparison_df['overall_score'] = (
    (comparison_df['readability'] / 100).clip(0, 1) * 0.2 +
    comparison_df['coherence'] * 0.2 +
    comparison_df.get('semantic_similarity', 0) * 0.3 +
    comparison_df.get('factuality', 0) * 0.3
)

axes[1, 1].bar(comparison_df['model'], comparison_df['overall_score'])
axes[1, 1].set_title('Overall Score (Composite)', fontsize=12, fontweight='bold')
axes[1, 1].set_ylabel('Score (0-1)')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nGenerated Responses:")
for _, row in comparison_df.iterrows():
    print(f"\n{row['model']}:")
    print(row['response_text'])

### 5. Prompt Engineering Evaluation

In [None]:
# Compare four ways of phrasing the same request.
base_task = "Explain Amazon Lambda"

# Each prompt is assembled line by line; joining with "\n" reproduces the
# multi-line text exactly, including the trailing spaces in the examples.
prompt_strategies = {
    'Basic': base_task,

    'Structured': "\n".join([
        f"{base_task}.",
        "",
        "Provide your answer in the following format:",
        "1. Definition",
        "2. Key features",
        "3. Use cases",
    ]),

    'Few-shot': "\n".join([
        "Here are examples of good service explanations:",
        "",
        "Example 1: Amazon S3 is object storage that provides scalability and durability. ",
        "It's used for backup, archiving, and data lakes.",
        "",
        "Example 2: Amazon EC2 provides virtual servers in the cloud. ",
        "It offers flexible compute capacity for various workloads.",
        "",
        f"Now, {base_task} following the same style.",
    ]),

    'Chain-of-Thought': "\n".join([
        f"{base_task}.",
        "",
        "Think step by step:",
        "1. First, what type of service is it?",
        "2. Then, what problems does it solve?",
        "3. Finally, what are typical use cases?",
        "",
        "Provide a concise answer covering these points.",
    ]),
}

print("Evaluating prompt strategies...\n")
prompt_results = []

for strategy_name, prompt in prompt_strategies.items():
    # Default model alias; no ground truth or facts are passed, so only the
    # reference-free metrics are computed.
    response = evaluator.invoke_model(prompt, max_tokens=250)
    metrics = LLMMetrics.evaluate_response(response, bedrock_client=bedrock_runtime)
    metrics.update(strategy=strategy_name, response_text=response['text'])
    prompt_results.append(metrics)
    print(f"✓ {strategy_name} strategy evaluated")

prompt_df = pd.DataFrame(prompt_results)
report_columns = ['strategy', 'readability', 'coherence', 'length']
print("\nPrompt Strategy Comparison:")
print(prompt_df[report_columns].to_string(index=False))

## Part 2: Agentic AI with Bedrock

### 6. Tool Use / Function Calling

In [None]:
# Tool schemas advertised to the model. Every tool takes a JSON object whose
# properties are all strings, so the entries are built by two small helpers.
def _string_property(description):
    """Return the JSON-schema fragment for one string parameter."""
    return {"type": "string", "description": description}


def _tool_spec(name, description, properties, required):
    """Return one tool definition with an object-typed input schema."""
    return {
        "name": name,
        "description": description,
        "input_schema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


tools = [
    _tool_spec(
        "get_weather",
        "Get current weather for a location. Use this when the user asks about weather.",
        {"location": _string_property("City name or location")},
        ["location"],
    ),
    _tool_spec(
        "calculate",
        "Perform mathematical calculations. Use this for math problems.",
        {"expression": _string_property("Mathematical expression to evaluate")},
        ["expression"],
    ),
    _tool_spec(
        "search_documentation",
        "Search AWS documentation for information about services",
        {
            "service": _string_property("AWS service name"),
            "topic": _string_property("Specific topic or question"),
        },
        ["service"],
    ),
]

print("✓ Tools defined")

In [None]:
# Tool implementations (mock)
def get_weather(location):
    """Mock weather API: echo the location with fixed, canned conditions."""
    report = {"location": location}
    report.update(temperature=72, condition="Partly cloudy", humidity=65)
    return report

def calculate(expression):
    """Safe calculator for arithmetic expressions.

    The expression comes from the model, i.e. it is untrusted input, so it is
    never passed to ``eval``. It is parsed into an AST and only numeric
    literals, the binary operators ``+ - * / // % **`` and unary ``+``/``-``
    are evaluated. Everything else (names, calls, attribute access, strings)
    is rejected.

    Args:
        expression: Arithmetic expression as a string.

    Returns:
        ``{"expression": ..., "result": ...}`` on success, otherwise
        ``{"error": <message>}``. This function does not raise.
    """
    # Function-scope imports keep this cell self-contained.
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    # Bounds on ``**`` so inputs such as 9**9**9**9 fail fast instead of
    # consuming unbounded CPU and memory.
    max_exponent = 1000
    max_power_base = 10 ** 6

    def _evaluate(node):
        """Recursively evaluate a whitelisted AST node."""
        if isinstance(node, ast.Expression):
            return _evaluate(node.body)
        # ``type(...) in`` (not isinstance) so bool literals are rejected.
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = _evaluate(node.left)
            right = _evaluate(node.right)
            if isinstance(node.op, ast.Pow) and (
                abs(right) > max_exponent or abs(left) > max_power_base
            ):
                raise ValueError("Exponentiation operands too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](_evaluate(node.operand))
        raise ValueError("Invalid expression")

    try:
        # strip(): in 'eval' mode the parser rejects leading whitespace.
        tree = ast.parse(expression.strip(), mode="eval")
        result = _evaluate(tree)
        if isinstance(result, complex):
            # e.g. (-8) ** 0.5; complex values are not JSON-serializable.
            raise ValueError("Result is not a real number")
        return {"expression": expression, "result": result}
    except Exception as e:
        # Tool contract: report every failure to the caller as data.
        return {"error": str(e)}

def search_documentation(service, topic=None):
    """Mock documentation search.

    The lookup is case-insensitive and ignores a leading "AWS " or
    "Amazon ", so "lambda", "AWS Lambda" and "Amazon S3" all find their
    entry.

    Args:
        service: Service name as supplied by the caller.
        topic: Optional topic; echoed back, not used for the lookup.

    Returns:
        Dictionary with the original ``service`` and ``topic`` plus a
        ``documentation`` string (a "not found" message for unknown names).
    """
    docs = {
        "S3": "Amazon S3 provides object storage with high durability and availability.",
        "Lambda": "AWS Lambda runs code without provisioning servers. Pay only for compute time.",
        "EC2": "Amazon EC2 provides resizable virtual servers in the cloud."
    }

    # str() keeps non-string input on the "not found" path instead of raising.
    lookup_key = str(service).strip().lower()
    for vendor_prefix in ("amazon ", "aws "):
        if lookup_key.startswith(vendor_prefix):
            lookup_key = lookup_key[len(vendor_prefix):].strip()
            break

    docs_by_lowercase_name = {name.lower(): text for name, text in docs.items()}

    return {
        "service": service,
        "topic": topic,
        "documentation": docs_by_lowercase_name.get(
            lookup_key, "Service documentation not found."
        )
    }

# Tool registry: each implementation is registered under its own function
# name, which is also the name used in the tool schemas.
tool_functions = {
    implementation.__name__: implementation
    for implementation in (get_weather, calculate, search_documentation)
}

print("✓ Tool functions implemented")

### 7. Agentic AI Implementation

In [None]:
class BedrockAgent:
    """
    Simple agentic AI using Claude with tool use.

    Each ``run`` sends the conversation plus the tool schemas to the model,
    executes any tools the model requests, feeds the results back, and
    repeats until the model answers or ``max_iterations`` is reached.
    """
    def __init__(self, bedrock_client, tools, tool_functions, max_iterations=5,
                 model_id='anthropic.claude-3-haiku-20240307-v1:0'):
        """
        Args:
            bedrock_client: Client exposing ``invoke_model(modelId=, body=)``.
            tools: Tool schemas sent to the model.
            tool_functions: Mapping of tool name -> callable taking the tool
                input as keyword arguments.
            max_iterations: Maximum number of model calls per ``run``.
            model_id: Bedrock model ID to invoke.
        """
        self.client = bedrock_client
        self.tools = tools
        self.tool_functions = tool_functions
        self.max_iterations = max_iterations
        self.model_id = model_id
        self.conversation_history = []

    @staticmethod
    def _extract_text(content_blocks):
        """Concatenate the text of all text blocks in a model reply."""
        return ''.join(
            block.get('text', '')
            for block in content_blocks
            if block.get('type') == 'text'
        )

    def _execute_tool(self, content_block, execution_log):
        """Run one requested tool and return its ``tool_result`` block.

        Always returns a block: failures are reported to the model with
        ``is_error`` set instead of being raised or dropped.
        """
        tool_name = content_block['name']
        tool_input = content_block.get('input') or {}

        execution_log.append(f"Calling tool: {tool_name}")
        execution_log.append(f"Input: {json.dumps(tool_input)}")

        tool_result = {"type": "tool_result", "tool_use_id": content_block['id']}

        if tool_name not in self.tool_functions:
            error_message = f"Unknown tool: {tool_name}"
        else:
            try:
                result = self.tool_functions[tool_name](**tool_input)
                # Serialize inside the try so a non-JSON-serializable
                # result is also reported rather than raised.
                serialized = json.dumps(result)
            except Exception as exc:
                error_message = f"{type(exc).__name__}: {exc}"
            else:
                execution_log.append(f"Result: {serialized}")
                tool_result["content"] = serialized
                return tool_result

        execution_log.append(f"Tool error: {error_message}")
        tool_result["content"] = json.dumps({"error": error_message})
        tool_result["is_error"] = True
        return tool_result

    def run(self, user_message):
        """
        Run agentic loop with tool use.

        Args:
            user_message: The user's request; appended to the conversation.

        Returns:
            Dictionary with 'response' (final text, or
            'Max iterations reached'), 'iterations' (model calls made) and
            'log' (list of human-readable trace lines).
        """
        self.conversation_history.append({
            "role": "user",
            "content": user_message
        })

        iterations = 0
        execution_log = []

        while iterations < self.max_iterations:
            iterations += 1
            execution_log.append(f"\n--- Iteration {iterations} ---")

            # Call Claude with tools
            body = json.dumps({
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1024,
                "tools": self.tools,
                "messages": self.conversation_history
            })

            response = self.client.invoke_model(
                modelId=self.model_id,
                body=body
            )

            response_body = json.loads(response['body'].read())
            stop_reason = response_body.get('stop_reason')
            assistant_content = response_body.get('content') or []

            if stop_reason != 'tool_use':
                # Every other stop reason ends the loop. Return whatever text
                # the model produced rather than mislabelling the outcome as
                # "Max iterations reached".
                if stop_reason != 'end_turn':
                    execution_log.append(f"Unexpected stop reason: {stop_reason}")

                final_text = self._extract_text(assistant_content)

                # Record the completed turn so a follow-up run() on the same
                # agent sees the answer. Truncated replies are not stored
                # because they may contain incomplete blocks.
                if assistant_content and stop_reason in ('end_turn', 'stop_sequence'):
                    self.conversation_history.append({
                        "role": "assistant",
                        "content": assistant_content
                    })

                if not final_text:
                    final_text = f"No text response (stop reason: {stop_reason})"

                execution_log.append(f"Final response: {final_text}")
                return {
                    'response': final_text,
                    'iterations': iterations,
                    'log': execution_log
                }

            # Claude wants to use tools
            self.conversation_history.append({
                "role": "assistant",
                "content": assistant_content
            })

            # Every tool_use block must be answered by a tool_result in the
            # next user message, so failures produce a result too.
            tool_results = [
                self._execute_tool(content_block, execution_log)
                for content_block in assistant_content
                if content_block.get('type') == 'tool_use'
            ]

            # Add tool results to conversation
            self.conversation_history.append({
                "role": "user",
                "content": tool_results
            })

        return {
            'response': 'Max iterations reached',
            'iterations': iterations,
            'log': execution_log
        }

# Create the agent with the base tool set and the default iteration limit.
agent = BedrockAgent(
    bedrock_client=bedrock_runtime,
    tools=tools,
    tool_functions=tool_functions,
)
print("✓ Bedrock Agent initialized")

In [None]:
# Exercise the agent with three sample queries.
test_queries = [
    "What's the weather in Seattle?",
    "Calculate 125 * 48 + 1000",
    "Tell me about AWS Lambda pricing"
]

print("Testing Agentic AI...\n")

for query in test_queries:
    print("=" * 80)
    print(f"User: {query}\n")

    # A fresh agent per query starts with an empty conversation history, so
    # the queries cannot influence one another.
    agent = BedrockAgent(bedrock_runtime, tools, tool_functions)
    result = agent.run(query)

    print(f"Agent Response: {result['response']}")
    print("\nExecution Log:")
    for log_entry in result['log']:
        print(log_entry)
    print()

### 8. Multi-Step Reasoning Agent

In [None]:
# A request that cannot be answered with a single tool call. Adjacent literals
# keep the trailing space before each line break explicit.
complex_task = (
    "I need to deploy a web application on AWS. \n"
    "Can you help me understand the costs? \n"
    "I expect 10,000 requests per day, need to store 100GB of data, \n"
    "and want automatic scaling."
)

# One call with sep="\n" emits the same text as two consecutive prints.
print("Complex Multi-Step Task:\n", f"Task: {complex_task}\n", sep="\n")

# Add AWS cost calculator tool
def estimate_aws_costs(service, usage):
    """Mock cost estimator.

    Looks up static unit prices for a service. The lookup is
    case-insensitive and ignores a leading "AWS " or "Amazon ", so
    "Amazon S3" and "aws lambda" resolve to their entries.

    Args:
        service: Service name as supplied by the caller.
        usage: Free-text usage description; echoed back, not used in the
            lookup.

    Returns:
        Dictionary with the original ``service`` and ``usage``, a
        placeholder ``estimated_cost`` string and the ``pricing_info`` for
        the service, or ``{"error": ...}`` when no pricing is available.
    """
    pricing = {
        'Lambda': {'per_million_requests': 0.20, 'per_gb_second': 0.0000166667},
        'S3': {'per_gb_month': 0.023},
        'ALB': {'per_hour': 0.0225, 'per_lcu_hour': 0.008}
    }

    # str() keeps non-string input on the "not available" path instead of
    # raising.
    lookup_key = str(service).strip().lower()
    for vendor_prefix in ("amazon ", "aws "):
        if lookup_key.startswith(vendor_prefix):
            lookup_key = lookup_key[len(vendor_prefix):].strip()
            break

    pricing_by_lowercase_name = {name.lower(): info for name, info in pricing.items()}

    if lookup_key not in pricing_by_lowercase_name:
        return {"error": f"Pricing for {service} not available"}

    return {
        "service": service,
        "usage": usage,
        "estimated_cost": "Varies based on specific usage patterns",
        "pricing_info": pricing_by_lowercase_name[lookup_key]
    }

# Schema for the cost estimator, appended to the base tools.
cost_tool_schema = {
    "name": "estimate_aws_costs",
    "description": "Estimate AWS service costs based on usage",
    "input_schema": {
        "type": "object",
        "properties": {
            "service": {"type": "string", "description": "AWS service name"},
            "usage": {"type": "string", "description": "Usage description"}
        },
        "required": ["service", "usage"]
    }
}
extended_tools = [*tools, cost_tool_schema]

# New dict, so the base registry is left untouched.
extended_functions = {**tool_functions, 'estimate_aws_costs': estimate_aws_costs}

# Run complex task with a higher iteration limit than the default.
complex_agent = BedrockAgent(
    bedrock_runtime,
    extended_tools,
    extended_functions,
    max_iterations=10,
)
result = complex_agent.run(complex_task)

print(f"Agent Response:\n{result['response']}\n")
print(f"Total Iterations: {result['iterations']}\n")
print("Execution Log:")
for log_entry in result['log']:
    print(log_entry)

### 9. Self-Improving Agent with Reflection

In [None]:
class ReflectiveAgent:
    """
    Agent that critiques its own answer and rewrites it.

    Uses three prompts against the same model: answer, reflect on the
    answer, and improve the answer using the reflection.
    """
    def __init__(self, bedrock_client):
        self.client = bedrock_client

    def generate_response(self, prompt, max_tokens=512):
        """Send ``prompt`` as a single user message and return the reply text."""
        request = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": prompt}],
        }
        raw = self.client.invoke_model(
            modelId='anthropic.claude-3-haiku-20240307-v1:0',
            body=json.dumps(request),
        )
        parsed = json.loads(raw['body'].read())
        first_block = parsed['content'][0]
        return first_block['text']

    def reflect(self, original_prompt, response):
        """Ask the model for 2-3 concrete improvements to ``response``."""
        reflection_prompt = (
            "Review this response and identify areas for improvement.\n"
            "\n"
            f"Original Question: {original_prompt}\n"
            "\n"
            f"Response: {response}\n"
            "\n"
            "Provide 2-3 specific improvements that could make this response better.\n"
            "Consider: clarity, completeness, accuracy, and helpfulness."
        )
        return self.generate_response(reflection_prompt, max_tokens=300)

    def improve(self, original_prompt, response, reflection):
        """Ask the model to rewrite ``response`` according to ``reflection``."""
        improvement_prompt = (
            "Improve the following response based on this reflection.\n"
            "\n"
            f"Original Question: {original_prompt}\n"
            "\n"
            f"Initial Response: {response}\n"
            "\n"
            f"Reflection: {reflection}\n"
            "\n"
            "Provide an improved response that addresses the identified issues."
        )
        return self.generate_response(improvement_prompt, max_tokens=512)

    def run_with_reflection(self, prompt, iterations=2):
        """Answer ``prompt``, then run ``iterations`` reflect/improve rounds.

        Prints every intermediate step and returns the last response.
        """
        divider = '-' * 80

        print(f"Question: {prompt}\n")

        # First draft
        current = self.generate_response(prompt)
        print(f"Initial Response:\n{current}\n")
        print(f"{divider}\n")

        # Each round critiques the latest draft and replaces it.
        for round_number in range(1, iterations + 1):
            print(f"Reflection Iteration {round_number}:\n")

            critique = self.reflect(prompt, current)
            print(f"Reflection:\n{critique}\n")

            current = self.improve(prompt, current, critique)
            print(f"Improved Response:\n{current}\n")
            print(f"{divider}\n")

        return current

# Demo: one reflect/improve round on a comparison question.
reflective_agent = ReflectiveAgent(bedrock_client=bedrock_runtime)

test_question = "Explain the difference between Amazon RDS and DynamoDB."

print("Self-Improving Agent Demo:\n")
print("=" * 80)
final_response = reflective_agent.run_with_reflection(prompt=test_question, iterations=1)

print(f"Final Optimized Response:\n{final_response}")

### 10. Cost Analysis

In [None]:
# Estimate lab costs from assumed call counts and token sizes.
BUDGET_USD = 1.00

# Text-model entries carry per-1M-token input/output prices; the embeddings
# entry carries a single per-1K-token price.
estimated_usage = {
    'Model Comparison (Claude Haiku)': {
        'calls': 10,
        'avg_input_tokens': 100,
        'avg_output_tokens': 200,
        'input_cost_per_1M': 0.25,
        'output_cost_per_1M': 1.25
    },
    'Titan Express': {
        'calls': 5,
        'avg_input_tokens': 100,
        'avg_output_tokens': 200,
        'input_cost_per_1M': 0.20,
        'output_cost_per_1M': 0.60
    },
    'Embeddings (Titan)': {
        'calls': 30,
        'avg_tokens': 50,
        'cost_per_1K': 0.0001
    },
    'Agentic AI (Claude Haiku)': {
        'calls': 20,
        'avg_input_tokens': 200,
        'avg_output_tokens': 300,
        'input_cost_per_1M': 0.25,
        'output_cost_per_1M': 1.25
    }
}


def component_cost(usage):
    """Return the estimated USD cost of one usage entry.

    The pricing model is chosen by the keys the entry carries, not by its
    display name, so renaming a component cannot switch the formula.
    """
    if 'cost_per_1K' in usage:
        return usage['calls'] * usage['avg_tokens'] / 1000 * usage['cost_per_1K']
    return (
        (usage['calls'] * usage['avg_input_tokens'] / 1_000_000 * usage['input_cost_per_1M']) +
        (usage['calls'] * usage['avg_output_tokens'] / 1_000_000 * usage['output_cost_per_1M'])
    )


total_cost = 0

print("Lab 3 Cost Breakdown:\n")
print(f"{'Component':<30} {'Calls':<10} {'Cost':<10}")
print("="*50)

for component, usage in estimated_usage.items():
    cost = component_cost(usage)
    total_cost += cost
    print(f"{component:<30} {usage['calls']:<10} ${cost:.4f}")

print("="*50)
print(f"{'Total Estimated Cost':<40} ${total_cost:.4f}")

# Only claim the budget is met when the numbers above say so.
if total_cost < BUDGET_USD:
    print("\n✓ Well under $1.00 budget!")
else:
    print(f"\n⚠ Estimated cost ${total_cost:.4f} exceeds the ${BUDGET_USD:.2f} budget")

## Summary

In this lab, you learned:

**LLM Evaluation:**
- ✅ Implementing multiple evaluation metrics (readability, coherence, semantic similarity, factuality)
- ✅ Comparing different foundation models
- ✅ Testing prompt engineering strategies
- ✅ Using LLM-as-judge for quality assessment

**Agentic AI:**
- ✅ Implementing tool use / function calling
- ✅ Building multi-step reasoning agents
- ✅ Creating self-improving agents with reflection
- ✅ Orchestrating complex workflows

**Key Takeaways:**
1. Evaluation is critical for production LLM systems
2. Different models excel at different tasks
3. Prompt engineering significantly impacts quality
4. Agentic patterns enable complex problem-solving
5. Reflection loops can improve output quality

**Next Steps:**
- Implement custom evaluation metrics for your domain
- Build production-ready agents with error handling
- Explore Bedrock Agents for managed agentic workflows
- Integrate with your applications

**Additional Resources:**
- [Bedrock Model Evaluation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-evaluation.html)
- [Bedrock Agents](https://docs.aws.amazon.com/bedrock/latest/userguide/agents.html)
- [Claude Tool Use Guide](https://docs.anthropic.com/claude/docs/tool-use)