# 11 - Evaluation and Optimization

This notebook covers evaluation metrics and optimization techniques for language models.

## Topics Covered:
- Evaluation metrics (Perplexity, BLEU, ROUGE)
- Optimization techniques (Quantization, Pruning, Knowledge Distillation)
- Caching strategies

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple
from collections import Counter

# Seed NumPy's global RNG so the randomly generated weights and logits used in
# the demonstration are reproducible from run to run.
np.random.seed(42)

## 1. Evaluation Metrics

In [None]:
class EvaluationMetrics:
    """Language model evaluation metrics.

    Every metric is a static method, so it can be called either on the class
    or on an instance.
    """

    @staticmethod
    def perplexity(probabilities: np.ndarray) -> float:
        """Calculate perplexity from token probabilities.

        Perplexity is ``exp(-mean(log p))`` over the probabilities the model
        assigned to the observed tokens; lower is better.

        Args:
            probabilities: Array-like of per-token probabilities.

        Returns:
            The perplexity as a float.
        """
        probs = np.asarray(probabilities, dtype=float)
        # The small epsilon keeps log() finite when a probability is exactly 0.
        log_probs = np.log(probs + 1e-10)
        return float(np.exp(-np.mean(log_probs)))

    @staticmethod
    def bleu_score(reference: List[str], candidate: List[str], n: int = 4) -> float:
        """Calculate a single-reference, unsmoothed BLEU score.

        The score is the brevity penalty times the geometric mean of the
        clipped n-gram precisions for orders ``1..n``.

        Args:
            reference: Reference tokens.
            candidate: Candidate (hypothesis) tokens.
            n: Highest n-gram order to include.

        Returns:
            BLEU in ``[0, 1]``. It is 0 when the candidate is empty or when
            any n-gram order has no match (no smoothing is applied).

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        if len(candidate) == 0:
            return 0.0

        def get_ngrams(tokens: List[str], order: int) -> Counter:
            return Counter(tuple(tokens[i:i + order])
                           for i in range(len(tokens) - order + 1))

        log_precisions = []
        for order in range(1, n + 1):
            ref_ngrams = get_ngrams(reference, order)
            cand_ngrams = get_ngrams(candidate, order)

            # The denominator is the TOTAL number of candidate n-grams, not
            # the number of distinct ones; otherwise repeated n-grams inflate
            # the precision (it could even exceed 1).
            total = sum(cand_ngrams.values())

            # Each candidate n-gram is credited at most as often as it occurs
            # in the reference ("clipped" counts).
            matches = sum(min(count, ref_ngrams[ngram])
                          for ngram, count in cand_ngrams.items())

            # A zero precision (including a candidate shorter than `order`)
            # makes the geometric mean, and therefore BLEU, zero.
            if total == 0 or matches == 0:
                return 0.0
            log_precisions.append(np.log(matches / total))

        # Brevity penalty: candidates shorter than the reference are penalised.
        bp = min(1.0, float(np.exp(1 - len(reference) / len(candidate))))

        # Geometric mean of the precisions with uniform weights.
        return float(bp * np.exp(np.mean(log_precisions)))

    @staticmethod
    def rouge_l(reference: List[str], candidate: List[str]) -> Dict[str, float]:
        """Calculate ROUGE-L from the longest common subsequence (LCS).

        Args:
            reference: Reference tokens.
            candidate: Candidate tokens.

        Returns:
            A dict with ``'precision'`` (LCS / candidate length), ``'recall'``
            (LCS / reference length) and ``'f1'`` (their harmonic mean). All
            three are 0 when either sequence is empty.
        """
        # Bail out before the O(len(reference) * len(candidate)) LCS table.
        if len(reference) == 0 or len(candidate) == 0:
            return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

        def lcs_length(x: List[str], y: List[str]) -> int:
            """Longest common subsequence length via dynamic programming."""
            m, k = len(x), len(y)
            # dp[i][j] is the LCS length of x[:i] and y[:j].
            dp = [[0] * (k + 1) for _ in range(m + 1)]
            for i in range(1, m + 1):
                for j in range(1, k + 1):
                    if x[i - 1] == y[j - 1]:
                        dp[i][j] = dp[i - 1][j - 1] + 1
                    else:
                        dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
            return dp[m][k]

        lcs_len = lcs_length(reference, candidate)
        precision = lcs_len / len(candidate)
        recall = lcs_len / len(reference)

        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)

        return {'precision': precision, 'recall': recall, 'f1': f1}

class ModelOptimization:
    """Model optimization techniques (quantization, pruning, distillation).

    Every technique is a static method operating on NumPy arrays.
    """

    @staticmethod
    def quantize_weights(weights: np.ndarray, bits: int = 8) -> Tuple[np.ndarray, float, float]:
        """Simulate affine (asymmetric) quantization of a weight tensor.

        The weights are mapped to the unsigned integer grid ``[0, 2**bits - 1]``
        and immediately mapped back, so the result shows the rounding error
        that quantization would introduce.

        Args:
            weights: Weight array (any shape, at least one element).
            bits: Bit width of the integer grid; 8 and 4 are supported.

        Returns:
            ``(dequantized, scale, zero_point)`` where ``dequantized`` has the
            shape of ``weights`` and
            ``dequantized == scale * (quantized - zero_point)``.

        Raises:
            ValueError: If ``bits`` is neither 8 nor 4.
        """
        if bits == 8:
            qmin, qmax = 0, 255
        elif bits == 4:
            qmin, qmax = 0, 15
        else:
            raise ValueError(f"Unsupported bit width: {bits}")

        weights = np.asarray(weights)

        # Widen the range so that it always contains 0. Without this the zero
        # point falls outside [qmin, qmax] for tensors that are entirely
        # positive or negative, gets clipped, and the weights saturate.
        # Ranges that already straddle 0 are unaffected.
        w_min = np.minimum(weights.min(), 0)
        w_max = np.maximum(weights.max(), 0)

        scale = (w_max - w_min) / (qmax - qmin)
        if scale == 0:
            # All weights are exactly 0: any positive scale represents them
            # exactly, and this avoids dividing by zero below.
            scale = 1.0

        zero_point = np.clip(np.round(qmin - w_min / scale), qmin, qmax)

        # Quantize onto the integer grid ...
        quantized = np.clip(np.round(weights / scale + zero_point), qmin, qmax)

        # ... and map back to real values for comparison with the original.
        dequantized = scale * (quantized - zero_point)

        return dequantized, scale, zero_point

    @staticmethod
    def magnitude_pruning(weights: np.ndarray, sparsity: float) -> np.ndarray:
        """Zero out the smallest-magnitude weights.

        Args:
            weights: Weight array; it is not modified.
            sparsity: Fraction in ``[0, 1]`` used to pick the magnitude
                threshold (the ``sparsity * 100``-th percentile of ``|w|``).

        Returns:
            A copy of ``weights`` in which every entry whose magnitude is
            strictly below the threshold is set to 0.
        """
        magnitudes = np.abs(weights)
        threshold = np.percentile(magnitudes, sparsity * 100)

        pruned_weights = weights.copy()
        pruned_weights[magnitudes < threshold] = 0

        return pruned_weights

    @staticmethod
    def knowledge_distillation_loss(student_logits: np.ndarray, teacher_logits: np.ndarray,
                                  temperature: float = 3.0, alpha: float = 0.7) -> float:
        """Calculate the soft-target knowledge distillation loss.

        Args:
            student_logits: Student logits; classes are on the last axis.
            teacher_logits: Teacher logits with the same shape.
            temperature: Softmax temperature applied to both sets of logits.
            alpha: Weight applied to the soft-target term.

        Returns:
            ``alpha * temperature**2`` times the mean cross-entropy between the
            temperature-softened teacher distribution and the student
            distribution. No hard-label term is included.
        """
        # Soft targets from the teacher, log-probabilities from the student.
        teacher_probs = ModelOptimization._softmax(teacher_logits / temperature)
        student_log_probs = ModelOptimization._log_softmax(student_logits / temperature)

        # Cross-entropy H(teacher, student). It differs from KL(teacher || student)
        # only by the teacher's entropy, which does not depend on the student.
        soft_cross_entropy = -np.sum(teacher_probs * student_log_probs, axis=-1)

        # The temperature**2 factor is the usual rescaling applied to the
        # soft-target term in distillation.
        return np.mean(soft_cross_entropy) * (temperature ** 2) * alpha

    @staticmethod
    def _softmax(x: np.ndarray) -> np.ndarray:
        """Numerically stable softmax over the last axis."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    @staticmethod
    def _log_softmax(x: np.ndarray) -> np.ndarray:
        """Numerically stable log-softmax over the last axis."""
        # The max must be subtracted from BOTH terms; subtracting it only
        # inside the log-sum-exp would offset the result by max(x).
        shifted = x - np.max(x, axis=-1, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))

def _clipped_ngram_precisions(reference: List[str], candidate: List[str],
                              max_n: int = 4) -> List[float]:
    """Return the clipped n-gram precision of `candidate` for orders 1..max_n."""
    precisions = []
    for order in range(1, max_n + 1):
        ref_ngrams = Counter(tuple(reference[i:i + order])
                             for i in range(len(reference) - order + 1))
        cand_ngrams = Counter(tuple(candidate[i:i + order])
                              for i in range(len(candidate) - order + 1))

        # Divide by the TOTAL number of candidate n-grams; dividing by the
        # number of distinct n-grams overstates precision when tokens repeat.
        total = sum(cand_ngrams.values())
        if total == 0:
            precisions.append(0.0)
            continue

        matches = sum(min(count, ref_ngrams[ngram])
                      for ngram, count in cand_ngrams.items())
        precisions.append(matches / total)
    return precisions


def demonstrate_evaluation_optimization():
    """Demonstrate evaluation metrics and optimization techniques.

    Prints perplexity, BLEU and ROUGE-L for a good and a bad example, then
    quantization, pruning and distillation results on random data, and finally
    shows a 3x3 matplotlib figure followed by a textual summary.

    Several panels (pruning performance/speedup, technique comparison,
    pipeline compression, memory breakdown) plot hard-coded illustrative
    numbers rather than measurements.
    """

    print("Language Model Evaluation and Optimization:")

    # ---- Evaluation metrics -------------------------------------------------
    evaluator = EvaluationMetrics()

    # Perplexity example: confident vs. unconfident token probabilities.
    good_probs = np.array([0.8, 0.7, 0.9, 0.6, 0.8])
    bad_probs = np.array([0.3, 0.2, 0.4, 0.1, 0.3])

    good_ppl = evaluator.perplexity(good_probs)
    bad_ppl = evaluator.perplexity(bad_probs)

    print("\nPerplexity Comparison:")
    print(f"  Good model: {good_ppl:.2f}")
    print(f"  Bad model: {bad_ppl:.2f}")

    # BLEU score example
    reference = "the cat sat on the mat".split()
    good_candidate = "the cat is sitting on the mat".split()
    bad_candidate = "a dog ran in the park".split()

    good_bleu = evaluator.bleu_score(reference, good_candidate)
    bad_bleu = evaluator.bleu_score(reference, bad_candidate)

    print("\nBLEU Score Comparison:")
    print(f"  Reference: {' '.join(reference)}")
    print(f"  Good candidate: {' '.join(good_candidate)} (BLEU: {good_bleu:.3f})")
    print(f"  Bad candidate: {' '.join(bad_candidate)} (BLEU: {bad_bleu:.3f})")

    # ROUGE-L example
    rouge_good = evaluator.rouge_l(reference, good_candidate)
    rouge_bad = evaluator.rouge_l(reference, bad_candidate)

    print("\nROUGE-L Comparison:")
    print(f"  Good candidate F1: {rouge_good['f1']:.3f}")
    print(f"  Bad candidate F1: {rouge_bad['f1']:.3f}")

    # ---- Optimization techniques --------------------------------------------
    optimizer = ModelOptimization()

    # Generate sample weights
    weights = np.random.randn(100, 100) * 0.1

    # Quantization: only the dequantized tensors are needed for the MSE.
    quant_8bit, _, _ = optimizer.quantize_weights(weights, bits=8)
    quant_4bit, _, _ = optimizer.quantize_weights(weights, bits=4)

    quant_error_8 = np.mean((weights - quant_8bit) ** 2)
    quant_error_4 = np.mean((weights - quant_4bit) ** 2)

    print("\nQuantization Results:")
    print(f"  8-bit MSE: {quant_error_8:.6f}")
    print(f"  4-bit MSE: {quant_error_4:.6f}")
    print("  Memory reduction: 4x (FP32→INT8), 8x (FP32→INT4)")

    # Pruning
    sparsities = [0.5, 0.7, 0.9]

    print("\nPruning Results:")
    for sparsity in sparsities:
        pruned = optimizer.magnitude_pruning(weights, sparsity)
        actual_sparsity = np.mean(pruned == 0)
        print(f"  Target {sparsity*100}% sparse → Actual {actual_sparsity*100:.1f}% sparse")

    # Knowledge distillation on random teacher/student logits
    teacher_logits = np.random.randn(10, 1000) * 2
    student_logits = np.random.randn(10, 1000) * 1.5

    kd_loss = optimizer.knowledge_distillation_loss(student_logits, teacher_logits)
    print(f"\nKnowledge Distillation Loss: {kd_loss:.4f}")

    # ---- Visualizations (3x3 grid) ------------------------------------------
    plt.figure(figsize=(15, 12))

    # Panel 1: perplexity of a single token as a function of its probability
    plt.subplot(3, 3, 1)
    probs = np.linspace(0.1, 0.9, 20)
    perplexities = [evaluator.perplexity(np.array([p])) for p in probs]

    plt.plot(probs, perplexities, 'b-', linewidth=2)
    plt.xlabel('Token Probability')
    plt.ylabel('Perplexity')
    plt.title('Perplexity vs Token Probability')
    plt.grid(True, alpha=0.3)

    # Panel 2: quantization error analysis
    plt.subplot(3, 3, 2)

    bit_widths = [32, 16, 8, 4]
    memory_usage = [4, 2, 1, 0.5]  # Bytes per parameter

    # The 32- and 16-bit errors are illustrative placeholders; the 8- and
    # 4-bit errors are the MSEs measured above.
    errors = [0, 0.001, quant_error_8, quant_error_4]

    plt.scatter(memory_usage, errors, s=100, alpha=0.7)

    for bits, mem, err in zip(bit_widths, memory_usage, errors):
        plt.annotate(f'{bits}-bit', (mem, err),
                     xytext=(5, 5), textcoords='offset points')

    plt.xlabel('Memory per Parameter (Bytes)')
    plt.ylabel('Quantization Error (MSE)')
    plt.title('Memory vs Accuracy Trade-off')
    plt.grid(True, alpha=0.3)

    # Panel 3: pruning sparsity analysis (synthetic curves, twin y-axes)
    plt.subplot(3, 3, 3)

    sparsity_levels = np.linspace(0, 0.95, 20)

    # Simulated performance degradation and speedup
    performance = 100 * (1 - sparsity_levels) ** 0.3
    speedup = 1 / (1 - sparsity_levels * 0.8)

    ax1 = plt.gca()
    ax1.plot(sparsity_levels * 100, performance, 'r-', label='Performance', linewidth=2)
    ax1.set_xlabel('Sparsity (%)')
    ax1.set_ylabel('Performance (%)', color='r')
    ax1.tick_params(axis='y', labelcolor='r')

    ax2 = ax1.twinx()
    ax2.plot(sparsity_levels * 100, speedup, 'b-', label='Speedup', linewidth=2)
    ax2.set_ylabel('Speedup (x)', color='b')
    ax2.tick_params(axis='y', labelcolor='b')

    plt.title('Pruning: Performance vs Speedup')

    # Panel 4: BLEU score components for the good candidate
    plt.subplot(3, 3, 4)

    n_grams = ['1-gram', '2-gram', '3-gram', '4-gram']
    precisions = _clipped_ngram_precisions(reference, good_candidate, max_n=4)

    plt.bar(n_grams, precisions, alpha=0.7)
    plt.ylabel('Precision')
    plt.title('BLEU N-gram Precisions')
    plt.ylim(0, 1)

    # Panel 5: model compression comparison (illustrative numbers)
    plt.subplot(3, 3, 5)

    techniques = ['Original', 'Quantization\n(8-bit)', 'Pruning\n(50%)', 'Distillation\n(0.5x)']
    model_sizes = [100, 25, 50, 50]  # Relative sizes
    performances = [100, 98, 95, 92]  # Performance retention

    x = np.arange(len(techniques))
    width = 0.35

    plt.bar(x - width/2, model_sizes, width, label='Model Size', alpha=0.7)
    plt.bar(x + width/2, performances, width, label='Performance', alpha=0.7)

    plt.xlabel('Optimization Technique')
    plt.ylabel('Relative Score')
    plt.title('Optimization Techniques Comparison')
    plt.xticks(x, techniques, rotation=45)
    plt.legend()

    # Panel 6: knowledge distillation temperature effect
    plt.subplot(3, 3, 6)

    temperatures = np.linspace(1, 10, 20)

    # Entropy of the first teacher row's softened distribution per temperature
    teacher_entropy = []
    for temp in temperatures:
        soft_probs = optimizer._softmax(teacher_logits[0] / temp)
        entropy = -np.sum(soft_probs * np.log(soft_probs + 1e-10))
        teacher_entropy.append(entropy)

    plt.plot(temperatures, teacher_entropy, 'g-', linewidth=2)
    plt.xlabel('Temperature')
    plt.ylabel('Teacher Output Entropy')
    plt.title('Temperature Effect on Soft Targets')
    plt.grid(True, alpha=0.3)

    # Panel 7: evaluation metrics comparison
    plt.subplot(3, 3, 7)

    metrics = ['Perplexity\n(lower better)', 'BLEU\n(higher better)', 'ROUGE-L\n(higher better)']

    # Perplexity is inverted so that "taller bar = better" holds for all three.
    good_norm = [1/good_ppl, good_bleu, rouge_good['f1']]
    bad_norm = [1/bad_ppl, bad_bleu, rouge_bad['f1']]

    x = np.arange(len(metrics))
    width = 0.35

    plt.bar(x - width/2, good_norm, width, label='Good Model', alpha=0.7)
    plt.bar(x + width/2, bad_norm, width, label='Bad Model', alpha=0.7)

    plt.xlabel('Metric')
    plt.ylabel('Normalized Score')
    plt.title('Evaluation Metrics Comparison')
    plt.xticks(x, metrics)
    plt.legend()

    # Panel 8: optimization pipeline (illustrative numbers)
    plt.subplot(3, 3, 8)

    pipeline_steps = ['Original', 'Distill', 'Quantize', 'Prune']
    cumulative_compression = [1, 2, 8, 16]  # Cumulative compression ratio

    plt.plot(pipeline_steps, cumulative_compression, 'o-', linewidth=2, markersize=8)
    plt.xlabel('Optimization Step')
    plt.ylabel('Compression Ratio')
    plt.title('Optimization Pipeline')
    plt.grid(True, alpha=0.3)

    # Panel 9: memory usage breakdown (illustrative numbers)
    plt.subplot(3, 3, 9)

    components = ['Weights', 'Activations', 'Gradients', 'Optimizer\nStates']
    memory_breakdown = [40, 20, 25, 15]  # Percentage breakdown

    plt.pie(memory_breakdown, labels=components, autopct='%1.1f%%')
    plt.title('Training Memory Breakdown')

    plt.tight_layout()
    plt.show()

    # ---- Textual summary ----------------------------------------------------
    print("\nOptimization Insights:")

    print("\nQuantization:")
    print("  + 2-8x memory reduction")
    print("  + Faster inference on specialized hardware")
    print("  - Some accuracy loss")

    print("\nPruning:")
    print("  + Reduces model size and computation")
    print("  + Can maintain performance with proper fine-tuning")
    print("  - Requires sparse computation support")

    print("\nKnowledge Distillation:")
    print("  + Creates smaller models with retained performance")
    print("  + Transfers knowledge from large to small models")
    print("  - Requires teacher model and additional training")

# Run the demonstration: prints the metric and optimization results and shows
# the 3x3 figure.
demonstrate_evaluation_optimization()