# LLM System Design - Practical Implementation

## Overview

This notebook demonstrates practical implementations of LLM system design patterns, including:

- **End-to-end pipelines**: Complete request processing workflows
- **Caching strategies**: Multi-level and semantic caching
- **Cost optimization**: Resource allocation and budget management
- **Orchestration**: Workflow and multi-agent coordination

In [None]:
import asyncio
import hashlib
import json
import threading
import time
from collections import Counter, defaultdict, deque
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print("Libraries imported successfully!")

## 1. End-to-End LLM Pipeline

Let's implement a complete LLM processing pipeline with validation, caching, and monitoring:

In [None]:
class LLMPipeline:
    """Complete LLM processing pipeline with caching and monitoring.

    A request flows through validation, preprocessing, cache lookup,
    (simulated) model inference, post-processing and response formatting.
    Counters are kept in ``self.metrics`` and a bounded log of successful
    requests in ``self.request_history``.

    Args:
        config: Mapping of settings. Keys read by this class are
            ``max_prompt_length`` (default 4000) and ``cache_ttl`` (seconds a
            cached output stays valid; missing or falsy means no expiry).
    """

    def __init__(self, config):
        self.config = config
        self.cache = {}
        # Expiry timestamps live beside ``self.cache`` so the cached values
        # keep exactly the shape produced by ``postprocess_output``.
        self._cache_expiry = {}
        self.metrics = {
            'total_requests': 0,
            'cache_hits': 0,
            'cache_misses': 0,
            'total_inference_time': 0,
            'errors': 0
        }
        self.request_history = deque(maxlen=1000)

    def process_request(self, request):
        """Process a complete LLM request through the pipeline.

        Args:
            request: Dict with at least a ``'prompt'`` entry; ``max_tokens``,
                ``temperature`` and ``top_p`` are optional.

        Returns:
            The formatted response dict on success, or the error dict built
            by ``handle_error`` if any stage raises.
        """
        request_id = self.generate_request_id()
        start_time = time.time()

        # Count every attempt up front: failed requests must be part of the
        # denominator, otherwise error_rate is undefined (or > 1) whenever
        # errors outnumber successes.
        self.metrics['total_requests'] += 1

        try:
            # 1. Request validation
            validated_request = self.validate_request(request)

            # 2. Preprocessing
            processed_input = self.preprocess_input(validated_request)

            # 3. Cache lookup (expired entries count as misses)
            cache_key = self.generate_cache_key(processed_input)
            cached_output = self._cache_lookup(cache_key)

            if cached_output is not None:
                self.metrics['cache_hits'] += 1
                response = self.format_response(cached_output, from_cache=True)
            else:
                self.metrics['cache_misses'] += 1

                # 4. Model inference
                inference_start = time.time()
                model_output = self.simulate_model_inference(processed_input)
                inference_time = time.time() - inference_start

                self.metrics['total_inference_time'] += inference_time

                # 5. Post-processing
                processed_output = self.postprocess_output(model_output)

                # 6. Cache storage
                self._cache_store(cache_key, processed_output)

                # 7. Response formatting
                response = self.format_response(processed_output, inference_time=inference_time)

            # 8. Logging and monitoring
            total_time = time.time() - start_time
            self.log_request(request_id, request, response, total_time)

            return response

        except Exception as e:
            # Pipeline boundary: convert any failure into an error response
            # instead of propagating it to the caller.
            self.metrics['errors'] += 1
            return self.handle_error(e, request_id)

    def _cache_lookup(self, cache_key):
        """Return the cached output for ``cache_key``, or None if absent or expired."""
        expires_at = self._cache_expiry.get(cache_key)
        if expires_at is not None and time.time() >= expires_at:
            # Drop the stale entry so the cache does not keep serving it.
            self.cache.pop(cache_key, None)
            self._cache_expiry.pop(cache_key, None)
            return None
        return self.cache.get(cache_key)

    def _cache_store(self, cache_key, output):
        """Store ``output`` and record its expiry when ``cache_ttl`` is configured."""
        self.cache[cache_key] = output
        ttl = self.config.get('cache_ttl')
        if ttl:
            self._cache_expiry[cache_key] = time.time() + ttl
        else:
            self._cache_expiry.pop(cache_key, None)

    def validate_request(self, request):
        """Validate an incoming request and fill in default parameters.

        Returns:
            A shallow copy of ``request`` with ``max_tokens``, ``temperature``
            and ``top_p`` defaulted when missing.

        Raises:
            ValueError: If ``'prompt'`` is missing or longer than the
                configured ``max_prompt_length``.
        """
        required_fields = ['prompt']

        for field in required_fields:
            if field not in request:
                raise ValueError(f"Missing required field: {field}")

        # Validate prompt length
        max_length = self.config.get('max_prompt_length', 4000)
        if len(request['prompt']) > max_length:
            raise ValueError(f"Prompt exceeds maximum length of {max_length}")

        # Set defaults on a copy so the caller's dict is left untouched
        validated = request.copy()
        validated.setdefault('max_tokens', 100)
        validated.setdefault('temperature', 0.7)
        validated.setdefault('top_p', 0.9)

        return validated

    def preprocess_input(self, request):
        """Normalise a validated request into the model input dict.

        Strips the prompt, caps ``max_tokens`` at 512 and clamps
        ``temperature`` to [0, 2] and ``top_p`` to [0, 1].
        """
        processed = {
            'prompt': request['prompt'].strip(),
            'max_tokens': min(request['max_tokens'], 512),  # Cap max tokens
            'temperature': max(0.0, min(2.0, request['temperature'])),  # Clamp temperature
            'top_p': max(0.0, min(1.0, request['top_p']))  # Clamp top_p
        }

        # Add preprocessing timestamp (not part of the cache key)
        processed['processed_at'] = datetime.now().isoformat()

        return processed

    def generate_cache_key(self, processed_input):
        """Generate the cache key (MD5 hex digest) for a processed input.

        Only prompt, temperature, max_tokens and top_p contribute to the key.
        """
        key_components = [
            processed_input['prompt'],
            str(processed_input['temperature']),
            str(processed_input['max_tokens']),
            str(processed_input['top_p'])
        ]

        key_string = '|'.join(key_components)
        return hashlib.md5(key_string.encode()).hexdigest()

    def simulate_model_inference(self, processed_input):
        """Simulate model inference (replace with actual model call).

        Sleeps for a duration that grows with prompt length (capped at 2s)
        and returns a randomly chosen canned response.
        """
        # Simulate inference time based on prompt length
        prompt_length = len(processed_input['prompt'])
        base_time = 0.1  # Base inference time
        length_factor = prompt_length / 1000  # Additional time per 1000 chars

        inference_time = base_time + length_factor
        time.sleep(min(inference_time, 2.0))  # Cap simulation time

        # Generate mock response
        response_templates = [
            "This is a helpful response to your query about {topic}.",
            "Based on your question, I can provide the following information: {info}.",
            "Here's what I understand about {topic}: {explanation}."
        ]

        template = np.random.choice(response_templates)
        mock_response = template.format(
            topic="the requested topic",
            info="relevant details",
            explanation="a comprehensive explanation"
        )

        return {
            'text': mock_response,
            'tokens_generated': len(mock_response.split()),
            'finish_reason': 'completed'
        }

    def postprocess_output(self, model_output):
        """Return a copy of the model output with cleaned text and size metadata."""
        processed = model_output.copy()

        # Clean up text
        processed['text'] = processed['text'].strip()

        # Add metadata
        processed['processed_at'] = datetime.now().isoformat()
        processed['word_count'] = len(processed['text'].split())
        processed['character_count'] = len(processed['text'])

        return processed

    def format_response(self, output, from_cache=False, inference_time=None):
        """Build the response dict (``text`` plus ``metadata``) returned to callers."""
        response = {
            'text': output['text'],
            'metadata': {
                'tokens_generated': output.get('tokens_generated', 0),
                'word_count': output.get('word_count', 0),
                'character_count': output.get('character_count', 0),
                'from_cache': from_cache,
                'inference_time': inference_time,
                'timestamp': datetime.now().isoformat()
            }
        }

        return response

    def generate_request_id(self):
        """Generate a request ID from the current time in microseconds."""
        timestamp = str(int(time.time() * 1000000))
        return f"req_{timestamp}"

    def log_request(self, request_id, request, response, total_time):
        """Append a summary of a successful request to ``request_history``."""
        log_entry = {
            'request_id': request_id,
            'timestamp': datetime.now().isoformat(),
            'prompt_length': len(request['prompt']),
            'response_length': len(response['text']),
            'total_time': total_time,
            'from_cache': response['metadata']['from_cache']
        }

        self.request_history.append(log_entry)

    def handle_error(self, error, request_id):
        """Build the error response dict for a failed request."""
        error_response = {
            'error': True,
            'message': str(error),
            'request_id': request_id,
            'timestamp': datetime.now().isoformat()
        }

        return error_response

    def get_metrics(self):
        """Return a copy of the counters plus derived rates.

        ``cache_hit_rate`` and ``error_rate`` are relative to all requests
        (successful or not); ``avg_inference_time`` is per cache miss. Derived
        keys are only present when their denominator is non-zero.
        """
        metrics = self.metrics.copy()

        if metrics['total_requests'] > 0:
            metrics['cache_hit_rate'] = metrics['cache_hits'] / metrics['total_requests']
            metrics['error_rate'] = metrics['errors'] / metrics['total_requests']

        if metrics['cache_misses'] > 0:
            metrics['avg_inference_time'] = metrics['total_inference_time'] / metrics['cache_misses']

        return metrics

# Build the pipeline settings and create the pipeline instance used by the cells below
pipeline_config = dict(
    max_prompt_length=4000,
    cache_ttl=3600,  # seconds (1 hour)
)

pipeline = LLMPipeline(config=pipeline_config)
print("LLM Pipeline initialized!")

### Testing the Pipeline

Let's test our pipeline with various requests and observe caching behavior:

In [None]:
# Sample workload; the repeated prompt exercises the cache path
test_requests = [
    {'prompt': prompt_text, 'max_tokens': token_limit}
    for prompt_text, token_limit in (
        ('What is machine learning?', 100),
        ('Explain neural networks', 150),
        ('What is machine learning?', 100),  # Duplicate for cache test
        ('How does deep learning work?', 200),
        ('What is machine learning?', 100),  # Another duplicate
    )
]

print("Processing test requests...\n")

responses = []
for i, request in enumerate(test_requests, 1):
    print(f"Request {i}: {request['prompt'][:50]}...")

    # Wall-clock time around the whole pipeline call
    start_time = time.time()
    response = pipeline.process_request(request)
    elapsed = time.time() - start_time

    responses.append(response)

    if 'error' in response:
        print(f"  Error: {response['message']}")
    else:
        print(f"  Response: {response['text'][:100]}...")
        print(f"  From cache: {response['metadata']['from_cache']}")
        print(f"  Total time: {elapsed:.3f}s")

    print()

# Summarise the counters collected by the pipeline
metrics = pipeline.get_metrics()
print("=== PIPELINE METRICS ===")
for key, value in metrics.items():
    # Floats are shown with three decimals; everything else verbatim
    rendered = f"{value:.3f}" if isinstance(value, float) else f"{value}"
    print(f"{key}: {rendered}")

## 2. Multi-Level Caching System

Let's implement a sophisticated caching system with multiple levels and semantic similarity:

In [None]:
class MultiLevelCache:
    """Multi-level cache with L1 (memory), L2 (disk simulation), L3 (database simulation).

    Each level is a dict plus a deque recording access order for LRU
    eviction. ``set`` writes an entry of the form
    ``{'value': ..., 'expires_at': ...}`` to every level; ``get`` returns that
    entry dict (not the bare value), or None on a miss. Entries whose
    ``expires_at`` has passed are purged on lookup and counted as misses.

    Args:
        l1_size: Maximum number of entries kept in L1.
        l2_size: Maximum number of entries kept in L2.
        l3_size: Maximum number of entries kept in L3.
    """

    # Sentinel distinguishing "absent/expired" from any stored value.
    _MISSING = object()

    def __init__(self, l1_size=100, l2_size=500, l3_size=2000):
        # L1: In-memory cache (fastest, smallest)
        self.l1_cache = {}
        self.l1_access_order = deque(maxlen=l1_size)
        self.l1_size = l1_size

        # L2: Simulated disk cache (medium speed, medium size)
        self.l2_cache = {}
        self.l2_access_order = deque(maxlen=l2_size)
        self.l2_size = l2_size

        # L3: Simulated database cache (slowest, largest)
        self.l3_cache = {}
        self.l3_access_order = deque(maxlen=l3_size)
        self.l3_size = l3_size

        # Cache statistics
        self.stats = {
            'l1_hits': 0, 'l1_misses': 0,
            'l2_hits': 0, 'l2_misses': 0,
            'l3_hits': 0, 'l3_misses': 0,
            'total_requests': 0
        }

    def get(self, key):
        """Look ``key`` up level by level, promoting hits towards L1.

        Returns:
            The stored entry dict (``{'value', 'expires_at'}``) or None when
            the key is absent or expired at every level.
        """
        self.stats['total_requests'] += 1

        # Try L1 cache first
        entry = self._lookup(key, 1)
        if entry is not self._MISSING:
            self.stats['l1_hits'] += 1
            self._update_access_order(key, 1)
            return entry

        self.stats['l1_misses'] += 1

        # Try L2 cache
        entry = self._lookup(key, 2)
        if entry is not self._MISSING:
            self.stats['l2_hits'] += 1
            # Refresh recency at the level that served the hit, then promote
            self._update_access_order(key, 2)
            self._promote_to_l1(key, entry)
            time.sleep(0.01)  # Simulate L2 access time
            return entry

        self.stats['l2_misses'] += 1

        # Try L3 cache
        entry = self._lookup(key, 3)
        if entry is not self._MISSING:
            self.stats['l3_hits'] += 1
            self._update_access_order(key, 3)
            # Promote to L2 and L1
            self._promote_to_l2(key, entry)
            self._promote_to_l1(key, entry)
            time.sleep(0.05)  # Simulate L3 access time
            return entry

        self.stats['l3_misses'] += 1
        return None

    def set(self, key, value, ttl=None):
        """Store ``value`` under ``key`` in all cache levels.

        Args:
            key: Cache key.
            value: Value to wrap and store.
            ttl: Lifetime in seconds; None or 0 means the entry never expires.
        """
        # Add TTL if specified
        if ttl:
            expiry_time = time.time() + ttl
            cache_value = {'value': value, 'expires_at': expiry_time}
        else:
            cache_value = {'value': value, 'expires_at': None}

        # Store in all levels
        self._set_l1(key, cache_value)
        self._set_l2(key, cache_value)
        self._set_l3(key, cache_value)

    def _level(self, level):
        """Return ``(cache_dict, access_order, capacity)`` for level 1, 2 or 3."""
        if level == 1:
            return self.l1_cache, self.l1_access_order, self.l1_size
        if level == 2:
            return self.l2_cache, self.l2_access_order, self.l2_size
        return self.l3_cache, self.l3_access_order, self.l3_size

    @staticmethod
    def _is_expired(entry):
        """Return True when ``entry`` carries an ``expires_at`` that has passed."""
        # Values written directly through the private setters may not be
        # wrapper dicts; those are treated as never expiring.
        if not isinstance(entry, dict):
            return False
        expires_at = entry.get('expires_at')
        return expires_at is not None and time.time() >= expires_at

    def _lookup(self, key, level):
        """Return the live entry for ``key`` at ``level`` or ``_MISSING``.

        Expired entries are removed from the level before reporting a miss.
        """
        cache, order, _ = self._level(level)
        if key not in cache:
            return self._MISSING

        entry = cache[key]
        if self._is_expired(entry):
            del cache[key]
            if key in order:
                order.remove(key)
            return self._MISSING
        return entry

    def _store(self, key, value, level):
        """Insert ``key`` at ``level``, evicting the least recently used entry if full."""
        cache, order, capacity = self._level(level)
        if len(cache) >= capacity and key not in cache:
            # Evict least recently used
            if order:
                lru_key = order.popleft()
                cache.pop(lru_key, None)

        cache[key] = value
        self._update_access_order(key, level)

    def _set_l1(self, key, value):
        """Set value in L1 cache with LRU eviction"""
        self._store(key, value, 1)

    def _set_l2(self, key, value):
        """Set value in L2 cache with LRU eviction"""
        self._store(key, value, 2)

    def _set_l3(self, key, value):
        """Set value in L3 cache with LRU eviction"""
        self._store(key, value, 3)

    def _promote_to_l1(self, key, value):
        """Promote value to L1 cache"""
        self._set_l1(key, value)

    def _promote_to_l2(self, key, value):
        """Promote value to L2 cache"""
        self._set_l2(key, value)

    def _update_access_order(self, key, level):
        """Mark ``key`` as most recently used at ``level`` (1, 2 or 3)."""
        if level not in (1, 2, 3):
            return
        _, order, _ = self._level(level)
        if key in order:
            order.remove(key)
        order.append(key)

    def get_stats(self):
        """Return a copy of the counters plus hit rates and current level sizes.

        Hit-rate keys are only present once at least one ``get`` was made.
        """
        stats = self.stats.copy()

        if stats['total_requests'] > 0:
            stats['l1_hit_rate'] = stats['l1_hits'] / stats['total_requests']
            stats['l2_hit_rate'] = stats['l2_hits'] / stats['total_requests']
            stats['l3_hit_rate'] = stats['l3_hits'] / stats['total_requests']
            stats['overall_hit_rate'] = (stats['l1_hits'] + stats['l2_hits'] + stats['l3_hits']) / stats['total_requests']

        stats['cache_sizes'] = {
            'l1': len(self.l1_cache),
            'l2': len(self.l2_cache),
            'l3': len(self.l3_cache)
        }

        return stats

# Create a deliberately tiny cache (small sizes for demo) so evictions and promotions show up quickly
cache = MultiLevelCache(5, 10, 20)  # L1, L2, L3 capacities
print("Multi-level cache initialized!")

### Testing Multi-Level Cache

Let's test the cache with various access patterns:

In [None]:
# Test cache with various patterns
print("Testing multi-level cache...\n")

# Store some initial data. Seven keys go into an L1 of capacity 5, so the two
# oldest keys (key1, key2) are evicted from L1 and remain only in L2/L3.
test_data = {
    'key1': 'First value',
    'key2': 'Second value',
    'key3': 'Third value',
    'key4': 'Fourth value',
    'key5': 'Fifth value',
    'key6': 'Sixth value',
    'key7': 'Seventh value'
}

print("Storing initial data...")
for key, value in test_data.items():
    cache.set(key, value)
    print(f"Stored {key}: {value}")

print("\nTesting cache retrieval patterns...")

# Test different access patterns. The descriptions state where each key is
# expected to be found given the eviction/promotion order on a fresh cache.
access_patterns = [
    ('key1', 'First access - evicted from L1 by later inserts, should be L2 hit (promoted to L1)'),
    ('key1', 'Second access - should be L1 hit'),
    ('key2', 'Access key2 - evicted from L1, should be L2 hit'),
    ('key8', 'Non-existent key - should be miss'),
    ('key3', 'Access key3 - pushed out of L1 by the promotions, should be L2 hit'),
    ('key1', 'Access key1 again - should be L1 hit'),
]

for key, description in access_patterns:
    start_time = time.time()
    result = cache.get(key)
    end_time = time.time()

    print(f"{description}:")
    print(f"  Key: {key}")
    # ``get`` returns the stored entry dict, or None on a miss
    print(f"  Result: {result['value'] if result is not None else 'Not found'}")
    print(f"  Access time: {(end_time - start_time)*1000:.2f}ms")
    print()

# Display cache statistics
stats = cache.get_stats()
print("=== CACHE STATISTICS ===")
print(f"Total requests: {stats['total_requests']}")
print(f"L1 hits: {stats['l1_hits']} (rate: {stats.get('l1_hit_rate', 0):.2%})")
print(f"L2 hits: {stats['l2_hits']} (rate: {stats.get('l2_hit_rate', 0):.2%})")
print(f"L3 hits: {stats['l3_hits']} (rate: {stats.get('l3_hit_rate', 0):.2%})")
print(f"Overall hit rate: {stats.get('overall_hit_rate', 0):.2%}")
print(f"Cache sizes: L1={stats['cache_sizes']['l1']}, L2={stats['cache_sizes']['l2']}, L3={stats['cache_sizes']['l3']}")

## 3. Cost Optimization System

Let's implement a cost-aware system that optimizes resource allocation and tracks budgets:

In [None]:
class CostOptimizationSystem:
    """Cost optimization and budget management system.

    Tracks per-user daily/monthly budgets, estimates request cost per model,
    picks a model that fits the remaining budget and quality threshold, and
    records usage for analytics. Model execution is simulated.
    """

    def __init__(self):
        self.user_budgets = {}
        self.model_costs = {
            'gpt-4': {'cost_per_token': 0.00003, 'quality_score': 0.95, 'speed': 1.0},
            'gpt-3.5': {'cost_per_token': 0.000002, 'quality_score': 0.85, 'speed': 2.0},
            'local-llama': {'cost_per_token': 0.0000001, 'quality_score': 0.75, 'speed': 0.5}
        }
        self.usage_history = defaultdict(list)

    def set_user_budget(self, user_id, daily_limit, monthly_limit):
        """Set (or replace) budget limits for a user, starting with zero usage."""
        self.user_budgets[user_id] = {
            'daily_limit': daily_limit,
            'monthly_limit': monthly_limit,
            'daily_usage': 0.0,
            'monthly_usage': 0.0,
            'last_reset': datetime.now().date()
        }

    def _refresh_budget(self, budget):
        """Zero usage counters whose period ended since ``budget['last_reset']``.

        Daily usage is reset when the calendar date changed; monthly usage is
        reset when the calendar month changed.
        """
        today = datetime.now().date()
        last_reset = budget['last_reset']
        if last_reset < today:
            if (last_reset.year, last_reset.month) != (today.year, today.month):
                budget['monthly_usage'] = 0.0
            budget['daily_usage'] = 0.0
            budget['last_reset'] = today

    def estimate_request_cost(self, prompt, max_tokens, model='gpt-3.5'):
        """Estimate the cost of a request.

        Input tokens are approximated as 1.3 tokens per whitespace-separated
        word; the estimate assumes all ``max_tokens`` output tokens are used.

        Raises:
            ValueError: If ``model`` is not in ``self.model_costs``.
        """
        if model not in self.model_costs:
            raise ValueError(f"Unknown model: {model}")

        # Estimate input tokens (rough approximation)
        input_tokens = len(prompt.split()) * 1.3  # Account for tokenization

        # Total tokens (input + output)
        total_tokens = input_tokens + max_tokens

        cost_per_token = self.model_costs[model]['cost_per_token']
        estimated_cost = total_tokens * cost_per_token

        return {
            'estimated_cost': estimated_cost,
            'input_tokens': input_tokens,
            'max_output_tokens': max_tokens,
            'total_tokens': total_tokens,
            'model': model
        }

    def select_optimal_model(self, prompt, max_tokens, user_id, quality_threshold=0.8):
        """Select a model based on remaining budget and quality requirements.

        Returns:
            ``'gpt-3.5'`` for users without a budget; otherwise the affordable
            model with the best quality-per-cost among those meeting
            ``quality_threshold``, falling back to the cheapest affordable
            model; None when no model fits the remaining budget.
        """
        if user_id not in self.user_budgets:
            return 'gpt-3.5'  # Default model

        # Goes through get_available_budget so that usage left over from a
        # previous day/month does not block today's requests.
        available_budget = self.get_available_budget(user_id)

        # Evaluate each model
        model_options = []

        for model_name, model_info in self.model_costs.items():
            cost = self.estimate_request_cost(prompt, max_tokens, model_name)['estimated_cost']
            if cost > available_budget:
                continue

            quality = model_info['quality_score']
            # Utility is quality per dollar; a zero-cost request (empty
            # prompt, zero max_tokens) would otherwise divide by zero.
            utility_score = quality / cost if cost > 0 else float('inf')

            model_options.append({
                'model': model_name,
                'cost': cost,
                'quality': quality,
                'utility_score': utility_score,
                'meets_quality_threshold': quality >= quality_threshold
            })

        if not model_options:
            return None  # No affordable options

        # Filter by quality threshold first
        quality_options = [opt for opt in model_options if opt['meets_quality_threshold']]

        if quality_options:
            # Select highest utility among quality options
            best_option = max(quality_options, key=lambda x: x['utility_score'])
        else:
            # If no options meet quality threshold, select cheapest
            best_option = min(model_options, key=lambda x: x['cost'])

        return best_option['model']

    def process_request_with_cost_optimization(self, user_id, prompt, max_tokens, quality_threshold=0.8):
        """Select a model, run the (simulated) request and charge the user.

        Returns:
            A dict with ``response``, ``model_used`` and ``cost_info``, or a
            dict with ``error`` and ``available_budget`` when no model is
            affordable.
        """
        # Select optimal model
        selected_model = self.select_optimal_model(prompt, max_tokens, user_id, quality_threshold)

        if not selected_model:
            return {
                'error': 'Insufficient budget for any available model',
                'available_budget': self.get_available_budget(user_id)
            }

        # Estimate actual cost
        cost_estimate = self.estimate_request_cost(prompt, max_tokens, selected_model)

        # Simulate processing (in practice, call actual model)
        processing_result = self.simulate_model_processing(prompt, max_tokens, selected_model)

        # Calculate actual cost based on actual tokens used
        actual_tokens = processing_result['actual_tokens']
        actual_cost = actual_tokens * self.model_costs[selected_model]['cost_per_token']

        # Deduct from budget
        self.deduct_cost(user_id, actual_cost)

        # Record usage
        self.record_usage(user_id, {
            'timestamp': datetime.now(),
            'model': selected_model,
            'prompt_length': len(prompt),
            'tokens_used': actual_tokens,
            'cost': actual_cost,
            'estimated_cost': cost_estimate['estimated_cost']
        })

        return {
            'response': processing_result['response'],
            'model_used': selected_model,
            'cost_info': {
                'estimated_cost': cost_estimate['estimated_cost'],
                'actual_cost': actual_cost,
                'tokens_used': actual_tokens,
                'remaining_budget': self.get_available_budget(user_id)
            }
        }

    def simulate_model_processing(self, prompt, max_tokens, model):
        """Simulate model processing.

        Draws a random output length scaled by the model's quality score,
        sleeps for a simulated processing time (capped at 1s) and returns the
        mock response with the token count used for billing.
        """
        # Simulate different response lengths based on model
        model_info = self.model_costs[model]

        # Higher quality models tend to give more comprehensive responses
        response_length_factor = model_info['quality_score']
        actual_output_tokens = int(max_tokens * response_length_factor * np.random.uniform(0.7, 1.0))

        input_tokens = len(prompt.split()) * 1.3
        total_tokens = input_tokens + actual_output_tokens

        # Simulate processing time
        processing_time = (total_tokens / 1000) / model_info['speed']
        time.sleep(min(processing_time, 1.0))  # Cap simulation time

        response = f"This is a {model} response to: {prompt[:50]}... (Quality: {model_info['quality_score']:.2f})"

        return {
            'response': response,
            'actual_tokens': total_tokens,
            'processing_time': processing_time
        }

    def deduct_cost(self, user_id, cost):
        """Add ``cost`` to the user's daily and monthly usage (no-op for unknown users)."""
        budget = self.user_budgets.get(user_id)
        if budget is None:
            return

        # Roll over counters first so the charge lands in the current period
        self._refresh_budget(budget)

        budget['daily_usage'] += cost
        budget['monthly_usage'] += cost

    def get_available_budget(self, user_id):
        """Return the smaller of the remaining daily and monthly budget.

        Users without a configured budget get ``float('inf')``.
        """
        if user_id not in self.user_budgets:
            return float('inf')

        budget = self.user_budgets[user_id]
        self._refresh_budget(budget)

        daily_remaining = budget['daily_limit'] - budget['daily_usage']
        monthly_remaining = budget['monthly_limit'] - budget['monthly_usage']

        return min(daily_remaining, monthly_remaining)

    def record_usage(self, user_id, usage_record):
        """Record usage for analytics"""
        self.usage_history[user_id].append(usage_record)

    def get_usage_analytics(self, user_id):
        """Aggregate the recorded usage of a user.

        Returns:
            ``{'total_requests': 0, 'total_cost': 0.0}`` when nothing was
            recorded; otherwise a dict that additionally holds
            ``avg_cost_per_request``, ``model_usage`` (a Counter of requests
            per model) and ``cost_by_model``.
        """
        # ``.get`` avoids creating an empty list in the defaultdict
        usage_records = self.usage_history.get(user_id)
        if not usage_records:
            return {'total_requests': 0, 'total_cost': 0.0}

        costs = [record['cost'] for record in usage_records]

        cost_by_model = defaultdict(float)
        for record in usage_records:
            cost_by_model[record['model']] += record['cost']

        return {
            'total_requests': len(usage_records),
            'total_cost': sum(costs),
            'avg_cost_per_request': np.mean(costs),
            'model_usage': Counter(record['model'] for record in usage_records),
            'cost_by_model': cost_by_model
        }

# Initialize cost optimization system
# (module-level instance; the testing cell below registers user budgets on it)
cost_optimizer = CostOptimizationSystem()
print("Cost optimization system initialized!")

### Testing Cost Optimization

Let's test the cost optimization system with different users and budgets:

In [None]:
# Register three users, one per spending tier
for tier_user, tier_daily, tier_monthly in (
    ('user_premium', 10.0, 200.0),
    ('user_standard', 1.0, 20.0),
    ('user_budget', 0.1, 2.0),
):
    cost_optimizer.set_user_budget(tier_user, daily_limit=tier_daily, monthly_limit=tier_monthly)

print("Set up user budgets:")
print("- Premium user: $10/day, $200/month")
print("- Standard user: $1/day, $20/month")
print("- Budget user: $0.10/day, $2/month")
print()

# Requests covering each tier, ending with a budget user asking for high quality
test_requests = [
    {
        'user_id': 'user_premium',
        'prompt': 'Write a comprehensive analysis of machine learning trends in 2024',
        'max_tokens': 500,
        'quality_threshold': 0.9
    },
    {
        'user_id': 'user_standard',
        'prompt': 'Explain the basics of neural networks',
        'max_tokens': 200,
        'quality_threshold': 0.8
    },
    {
        'user_id': 'user_budget',
        'prompt': 'What is AI?',
        'max_tokens': 100,
        'quality_threshold': 0.7
    },
    {
        'user_id': 'user_budget',
        'prompt': 'Explain quantum computing in detail',
        'max_tokens': 300,
        'quality_threshold': 0.9  # High quality requirement
    }
]

print("Processing requests with cost optimization...\n")

for i, request in enumerate(test_requests, 1):
    print(f"Request {i} from {request['user_id']}:")
    print(f"  Prompt: {request['prompt'][:60]}...")
    print(f"  Max tokens: {request['max_tokens']}")
    print(f"  Quality threshold: {request['quality_threshold']}")

    # Let the optimizer pick a model, run the request and charge the user
    result = cost_optimizer.process_request_with_cost_optimization(
        user_id=request['user_id'],
        prompt=request['prompt'],
        max_tokens=request['max_tokens'],
        quality_threshold=request['quality_threshold'],
    )

    if 'error' not in result:
        cost_info = result['cost_info']
        print(f"  Model selected: {result['model_used']}")
        print(f"  Estimated cost: ${cost_info['estimated_cost']:.4f}")
        print(f"  Actual cost: ${cost_info['actual_cost']:.4f}")
        print(f"  Tokens used: {cost_info['tokens_used']:.0f}")
        print(f"  Remaining budget: ${cost_info['remaining_budget']:.4f}")
        print(f"  Response: {result['response'][:80]}...")
    else:
        print(f"  ERROR: {result['error']}")
        print(f"  Available budget: ${result['available_budget']:.4f}")

    print()

# Per-user spending summary
print("=== USAGE ANALYTICS ===")
for user_id in ['user_premium', 'user_standard', 'user_budget']:
    analytics = cost_optimizer.get_usage_analytics(user_id)
    print(f"\n{user_id.upper()}:")
    print(f"  Total requests: {analytics['total_requests']}")
    print(f"  Total cost: ${analytics['total_cost']:.4f}")
    if analytics['total_requests'] <= 0:
        # The detailed keys only exist once at least one request was recorded
        continue
    print(f"  Average cost per request: ${analytics['avg_cost_per_request']:.4f}")
    print(f"  Model usage: {dict(analytics['model_usage'])}")
    print(f"  Cost by model: {dict(analytics['cost_by_model'])}")