# LLM Deployment and Serving

## Overview

Deploying LLMs in production requires careful consideration of performance, scalability, and reliability. This notebook covers:

- **Model Serving**: REST APIs and streaming interfaces
- **Load Balancing**: Traffic distribution and scaling strategies
- **Performance Monitoring**: Latency, throughput, and error tracking
- **Deployment Patterns**: Blue-green, canary, and A/B deployments

Let's implement practical deployment and serving solutions.

In [None]:
import asyncio
import time
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, deque
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import threading
import queue
import random

print("Libraries imported successfully!")

## 1. Model Serving Infrastructure

Let's implement a scalable model serving system:

In [None]:
@dataclass
class ModelEndpoint:
    """Configuration and live counters describing one model endpoint.

    The first five fields are required and identify/configure the endpoint;
    the remaining fields default to the state of a fresh, idle endpoint and
    are mutated while requests are served.
    """

    # --- identity / configuration (required, positional order matters) ---
    endpoint_id: str          # key under which the endpoint is registered
    model_name: str           # model served by this endpoint
    version: str              # model version label
    max_concurrent: int       # load ceiling checked by the load balancer
    timeout_seconds: int      # timeout budget; not read by the serving code visible here

    # --- runtime state ---
    current_load: int = 0             # load value compared against max_concurrent
    total_requests: int = 0           # requests attributed to this endpoint
    successful_requests: int = 0      # subset of total_requests that succeeded
    avg_latency: float = 0.0          # running mean latency in seconds (0.0 = no sample)
    status: str = "healthy"           # only 'healthy' endpoints are selectable

class ModelServingSystem:
    """Scalable model serving with load balancing and monitoring.

    Endpoints are added with :meth:`register_endpoint`. Each call to
    :meth:`serve_request` asks the load balancer for an endpoint, runs a
    simulated inference on it, and records the outcome both in the
    system-wide ``metrics`` dict and in the endpoint's own counters.
    """

    # Width (seconds) of the sliding window behind ``requests_per_second``.
    RPS_WINDOW_SECONDS = 60.0
    # Upper bound (seconds) on the simulated inference time.
    MAX_SIMULATED_SECONDS = 2.0

    def __init__(self):
        self.endpoints = {}
        self.request_queue = queue.Queue()
        self.metrics = {
            'total_requests': 0,
            'successful_requests': 0,
            'failed_requests': 0,
            'avg_latency': 0.0,
            'requests_per_second': 0.0
        }
        self.latency_history = deque(maxlen=1000)
        # Completion times of recent requests; needed to turn a request count
        # into a rate. Bounded, so the reported rate saturates at
        # maxlen / RPS_WINDOW_SECONDS.
        self.request_timestamps = deque(maxlen=1000)
        self.load_balancer = LoadBalancer()
        self.monitoring_active = False

    def register_endpoint(self, endpoint: 'ModelEndpoint'):
        """Register a new model endpoint with the system and its load balancer."""
        self.endpoints[endpoint.endpoint_id] = endpoint
        self.load_balancer.add_endpoint(endpoint)
        print(f"Registered endpoint: {endpoint.endpoint_id} ({endpoint.model_name} v{endpoint.version})")

    def serve_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Serve a single request and return a result dict.

        Args:
            request_data: Request payload; ``'prompt'`` and ``'max_tokens'``
                are read by the simulated inference.

        Returns:
            On success, the inference response extended with ``request_id``,
            ``endpoint_id``, ``latency`` and ``status='success'``. On failure,
            a dict with ``request_id``, ``error``, ``latency`` and
            ``status='failed'`` (plus ``endpoint_id`` when an endpoint had
            already been selected). Exceptions are never propagated.
        """
        start_time = time.time()
        request_id = f"req_{int(start_time * 1000000)}"
        endpoint = None

        try:
            # Select endpoint using load balancer
            endpoint = self.load_balancer.select_endpoint(request_data)

            if endpoint is None:
                # A rejection is still a failed request and must show up in
                # the error rate.
                latency = time.time() - start_time
                self._update_metrics(None, latency, success=False)
                return {
                    'request_id': request_id,
                    'error': 'No available endpoints',
                    'latency': latency,
                    'status': 'failed'
                }

            # Hold a load slot for the duration of the inference so capacity
            # filtering and load-aware strategies see in-flight work; always
            # release it, even when the inference raises.
            endpoint.current_load += 1
            try:
                response = self._process_request(endpoint, request_data, request_id)
            finally:
                endpoint.current_load -= 1

            # Update metrics
            latency = time.time() - start_time
            self._update_metrics(endpoint, latency, success=True)

            response.update({
                'request_id': request_id,
                'endpoint_id': endpoint.endpoint_id,
                'latency': latency,
                'status': 'success'
            })

            return response

        except Exception as e:
            latency = time.time() - start_time
            # Attribute the failure to the endpoint that handled the request
            # (None when selection itself failed).
            self._update_metrics(endpoint, latency, success=False)

            failure = {
                'request_id': request_id,
                'error': str(e),
                'latency': latency,
                'status': 'failed'
            }
            if endpoint is not None:
                failure['endpoint_id'] = endpoint.endpoint_id
            return failure

    def _process_request(self, endpoint: 'ModelEndpoint', request_data: Dict, request_id: str) -> Dict:
        """Simulate model inference on ``endpoint`` and return a mock response.

        The simulated duration grows with prompt length and ``max_tokens`` and
        is capped at ``MAX_SIMULATED_SECONDS``; the capped value is both slept
        and reported as ``processing_time``.
        """
        prompt = request_data.get('prompt', '')
        max_tokens = request_data.get('max_tokens', 100)

        # Simulate processing time based on request complexity
        base_time = 0.1
        complexity_factor = len(prompt) / 1000 + max_tokens / 1000
        processing_time = min(base_time + complexity_factor * 0.5, self.MAX_SIMULATED_SECONDS)

        time.sleep(processing_time)

        # Generate mock response
        response_text = f"Response from {endpoint.model_name} to: {prompt[:50]}..."

        return {
            'text': response_text,
            'model': endpoint.model_name,
            'version': endpoint.version,
            'tokens_generated': max_tokens,
            'processing_time': processing_time
        }

    def _update_metrics(self, endpoint: Optional['ModelEndpoint'], latency: float, success: bool):
        """Record one finished request in the system and endpoint metrics.

        Args:
            endpoint: Endpoint that handled the request, or ``None`` when no
                endpoint was involved.
            latency: Wall-clock duration of the request in seconds.
            success: Whether the request completed successfully.
        """
        # System metrics
        self.metrics['total_requests'] += 1

        if success:
            self.metrics['successful_requests'] += 1
        else:
            self.metrics['failed_requests'] += 1

        # Update latency (mean over the bounded history window)
        self.latency_history.append(latency)
        self.request_timestamps.append(time.time())
        self.metrics['avg_latency'] = float(np.mean(list(self.latency_history)))

        # Endpoint metrics
        if endpoint is not None:
            endpoint.total_requests += 1
            if success:
                endpoint.successful_requests += 1

            # Incremental running mean; for the first request this reduces to
            # ``latency`` itself.
            endpoint.avg_latency = (
                endpoint.avg_latency * (endpoint.total_requests - 1) + latency
            ) / endpoint.total_requests

    def get_system_metrics(self) -> Dict[str, Any]:
        """Return a snapshot of system metrics.

        The result is a copy of ``self.metrics`` extended with
        ``success_rate`` and ``error_rate`` (both ``0.0`` before any request
        has been recorded), ``requests_per_second`` (requests completed in
        the last ``RPS_WINDOW_SECONDS`` divided by that window) and
        ``endpoint_status`` (per-endpoint status, load, request count,
        success rate and average latency).
        """
        metrics = self.metrics.copy()

        total = metrics['total_requests']
        metrics['success_rate'] = metrics['successful_requests'] / total if total else 0.0
        metrics['error_rate'] = metrics['failed_requests'] / total if total else 0.0

        # Requests per second over the sliding window
        cutoff = time.time() - self.RPS_WINDOW_SECONDS
        recent_requests = sum(1 for ts in self.request_timestamps if ts >= cutoff)
        metrics['requests_per_second'] = recent_requests / self.RPS_WINDOW_SECONDS

        # Endpoint status
        metrics['endpoint_status'] = {
            eid: {
                'status': ep.status,
                'load': ep.current_load,
                'total_requests': ep.total_requests,
                'success_rate': ep.successful_requests / max(ep.total_requests, 1),
                'avg_latency': ep.avg_latency
            } for eid, ep in self.endpoints.items()
        }

        return metrics

class LoadBalancer:
    """Load balancer for distributing requests across endpoints.

    ``strategy`` must be one of the keys of ``self.strategies``
    (``'round_robin'``, ``'least_loaded'``, ``'weighted_random'``,
    ``'latency_based'``). It is looked up on every selection, so it can be
    changed by assigning to the attribute.
    """

    def __init__(self, strategy='round_robin'):
        self.strategy = strategy
        self.endpoints = []
        self.current_index = 0

        self.strategies = {
            'round_robin': self._round_robin_select,
            'least_loaded': self._least_loaded_select,
            'weighted_random': self._weighted_random_select,
            'latency_based': self._latency_based_select
        }

    def add_endpoint(self, endpoint: 'ModelEndpoint'):
        """Add endpoint to load balancer."""
        self.endpoints.append(endpoint)

    def select_endpoint(self, request_data: Dict) -> Optional['ModelEndpoint']:
        """Select an endpoint for the request using the current strategy.

        Only endpoints whose status is ``'healthy'`` and whose
        ``current_load`` is below ``max_concurrent`` are candidates.

        Returns:
            The chosen endpoint, or ``None`` when no endpoint is available.

        Raises:
            KeyError: If ``self.strategy`` is not a known strategy name.
        """
        available_endpoints = [ep for ep in self.endpoints
                               if ep.status == 'healthy' and ep.current_load < ep.max_concurrent]

        if not available_endpoints:
            return None

        return self.strategies[self.strategy](available_endpoints, request_data)

    def _round_robin_select(self, endpoints: List['ModelEndpoint'], request_data: Dict) -> 'ModelEndpoint':
        """Cycle through the candidate list using a counter kept across calls."""
        selected = endpoints[self.current_index % len(endpoints)]
        self.current_index += 1
        return selected

    def _least_loaded_select(self, endpoints: List['ModelEndpoint'], request_data: Dict) -> 'ModelEndpoint':
        """Select the endpoint with the least current load (first one wins ties)."""
        return min(endpoints, key=lambda ep: ep.current_load)

    def _weighted_random_select(self, endpoints: List['ModelEndpoint'], request_data: Dict) -> 'ModelEndpoint':
        """Pick an endpoint at random, weighted by its spare capacity.

        Spare capacity is ``max_concurrent - current_load`` clamped at zero,
        so an endpoint with no headroom is never chosen while another
        endpoint still has some. When no endpoint has headroom the choice is
        uniform.
        """
        weights = [max(ep.max_concurrent - ep.current_load, 0) for ep in endpoints]

        if sum(weights) == 0:
            return random.choice(endpoints)

        return random.choices(endpoints, weights=weights, k=1)[0]

    def _latency_based_select(self, endpoints: List['ModelEndpoint'], request_data: Dict) -> 'ModelEndpoint':
        """Select the endpoint with the lowest average latency.

        An ``avg_latency`` of 0 means the endpoint has no sample yet. Such
        endpoints sort first so each one receives traffic and obtains a
        measurement; ranking them last would send every request to whichever
        endpoint happened to be measured first.
        """
        return min(endpoints, key=lambda ep: ep.avg_latency)

print("Model serving infrastructure implemented!")

### Testing Model Serving System

Let's test our serving system with multiple endpoints and load balancing:

In [None]:
# Demo: register three endpoints, run the same five requests through every
# load-balancing strategy, then plot and print a comparison.

# Initialize serving system
serving_system = ModelServingSystem()

# Create test endpoints
endpoints = [
    ModelEndpoint('ep1', 'gpt-3.5-turbo', '1.0', max_concurrent=10, timeout_seconds=30),
    ModelEndpoint('ep2', 'llama-2-7b', '1.0', max_concurrent=8, timeout_seconds=45),
    ModelEndpoint('ep3', 'mistral-7b', '0.1', max_concurrent=12, timeout_seconds=25)
]

for endpoint in endpoints:
    serving_system.register_endpoint(endpoint)

# Test different load balancing strategies
strategies = ['round_robin', 'least_loaded', 'weighted_random', 'latency_based']
test_requests = [
    {'prompt': 'What is machine learning?', 'max_tokens': 100},
    {'prompt': 'Explain neural networks', 'max_tokens': 150},
    {'prompt': 'How does AI work?', 'max_tokens': 120},
    {'prompt': 'Define deep learning', 'max_tokens': 80},
    {'prompt': 'What are transformers?', 'max_tokens': 200}
]

strategy_results = {}

# The per-endpoint counters are reset before every strategy run, so on their
# own they only describe the last strategy. Accumulate totals across all runs
# for the final per-endpoint comparison.
endpoint_totals = {
    eid: {'requests': 0, 'successes': 0, 'latency_sum': 0.0}
    for eid in serving_system.endpoints
}

for strategy in strategies:
    print(f"\nTesting {strategy} strategy...")

    # Reset endpoint metrics
    for endpoint in serving_system.endpoints.values():
        endpoint.total_requests = 0
        endpoint.successful_requests = 0
        endpoint.avg_latency = 0.0

    serving_system.load_balancer.strategy = strategy
    serving_system.metrics = {
        'total_requests': 0, 'successful_requests': 0, 'failed_requests': 0,
        'avg_latency': 0.0, 'requests_per_second': 0.0
    }
    serving_system.latency_history.clear()

    # Process test requests
    responses = []
    for request in test_requests:
        response = serving_system.serve_request(request)
        responses.append(response)
        print(f"  Request processed: {response['status']} (latency: {response.get('latency', 0):.3f}s)")

    # Get metrics for this strategy
    metrics = serving_system.get_system_metrics()
    strategy_results[strategy] = {
        # Fall back to 0.0 in case no request was recorded at all.
        'success_rate': metrics.get('success_rate', 0.0),
        'avg_latency': metrics['avg_latency'],
        # Distribution = requests handled per endpoint. The instantaneous
        # load is 0 once every request has finished, so it cannot be used.
        'endpoint_distribution': {
            eid: ep.total_requests for eid, ep in serving_system.endpoints.items()
        }
    }

    for eid, ep in serving_system.endpoints.items():
        totals = endpoint_totals[eid]
        totals['requests'] += ep.total_requests
        totals['successes'] += ep.successful_requests
        # avg * count recovers this run's latency sum for the endpoint.
        totals['latency_sum'] += ep.avg_latency * ep.total_requests

    print(f"  Success rate: {strategy_results[strategy]['success_rate']:.1%}")
    print(f"  Average latency: {strategy_results[strategy]['avg_latency']:.3f}s")
    print(f"  Load distribution: {strategy_results[strategy]['endpoint_distribution']}")


def _plot_request_share(distribution, title):
    """Draw a pie of requests per endpoint on the current axes.

    A pie cannot be drawn from all-zero data, so a placeholder message is
    shown when no endpoint handled any request.
    """
    counts = list(distribution.values())
    if sum(counts) > 0:
        plt.pie(counts, labels=list(distribution.keys()), autopct='%1.1f%%')
    else:
        plt.text(0.5, 0.5, 'No requests served', ha='center', va='center')
        plt.axis('off')
    plt.title(title)


# Visualize results
plt.figure(figsize=(15, 10))

# Success rates by strategy
plt.subplot(2, 3, 1)
success_rates = [strategy_results[s]['success_rate'] for s in strategies]
plt.bar(strategies, success_rates, color='lightgreen')
plt.xlabel('Load Balancing Strategy')
plt.ylabel('Success Rate')
plt.title('Success Rates by Strategy')
plt.xticks(rotation=45)
plt.ylim(0, 1)

# Average latencies by strategy
plt.subplot(2, 3, 2)
latencies = [strategy_results[s]['avg_latency'] for s in strategies]
plt.bar(strategies, latencies, color='skyblue')
plt.xlabel('Load Balancing Strategy')
plt.ylabel('Average Latency (s)')
plt.title('Latency by Strategy')
plt.xticks(rotation=45)

# Load distribution for round_robin
plt.subplot(2, 3, 3)
rr_distribution = strategy_results['round_robin']['endpoint_distribution']
_plot_request_share(rr_distribution, 'Load Distribution (Round Robin)')

# Load distribution for least_loaded
plt.subplot(2, 3, 4)
ll_distribution = strategy_results['least_loaded']['endpoint_distribution']
_plot_request_share(ll_distribution, 'Load Distribution (Least Loaded)')

# Strategy comparison (grouped bar chart)
plt.subplot(2, 3, 5)
x = np.arange(len(strategies))
width = 0.35

# Normalise latency by the slowest strategy; guard against an all-zero run.
peak_latency = max(latencies, default=0)
normalized_latencies = [l / peak_latency if peak_latency > 0 else 0.0 for l in latencies]

plt.bar(x - width/2, success_rates, width, label='Success Rate', alpha=0.8)
plt.bar(x + width/2, normalized_latencies, width, label='Latency (normalized)', alpha=0.8)
plt.xlabel('Strategy')
plt.ylabel('Performance Score')
plt.title('Strategy Performance Comparison')
plt.xticks(x, strategies, rotation=45)
plt.legend()

# Endpoint performance comparison (aggregated over all strategy runs)
plt.subplot(2, 3, 6)
endpoint_names = [ep.model_name for ep in serving_system.endpoints.values()]
endpoint_latencies = [
    endpoint_totals[eid]['latency_sum'] / max(endpoint_totals[eid]['requests'], 1)
    for eid in serving_system.endpoints
]
endpoint_success_rates = [
    endpoint_totals[eid]['successes'] / max(endpoint_totals[eid]['requests'], 1)
    for eid in serving_system.endpoints
]

plt.scatter(endpoint_latencies, endpoint_success_rates, s=100, alpha=0.7)
for i, name in enumerate(endpoint_names):
    plt.annotate(name, (endpoint_latencies[i], endpoint_success_rates[i]),
                 xytext=(5, 5), textcoords='offset points')
plt.xlabel('Average Latency (s)')
plt.ylabel('Success Rate')
plt.title('Endpoint Performance')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n=== SERVING SYSTEM ANALYSIS ===")
best_strategy = min(strategies, key=lambda s: strategy_results[s]['avg_latency'])
print(f"Best latency strategy: {best_strategy} ({strategy_results[best_strategy]['avg_latency']:.3f}s)")

most_reliable = max(strategies, key=lambda s: strategy_results[s]['success_rate'])
print(f"Most reliable strategy: {most_reliable} ({strategy_results[most_reliable]['success_rate']:.1%})")

print("\nEndpoint Performance:")
for eid, endpoint in serving_system.endpoints.items():
    totals = endpoint_totals[eid]
    mean_latency = totals['latency_sum'] / max(totals['requests'], 1)
    print(f"  {endpoint.model_name}: {mean_latency:.3f}s avg latency, {totals['successes']}/{totals['requests']} success")