# 08 - Scaling Laws and Model Size

This notebook explores the relationship between model size, compute, data, and performance in large language models.

## Topics Covered:
- Parameter count analysis
- Model depth and width trade-offs
- Context window considerations
- Compute scaling relationships
- Data scaling laws

In [None]:
import math
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np

# Seed NumPy's global RNG with a fixed value so any random draws are repeatable.
np.random.seed(42)

## 1. Parameter Count Analysis

In [None]:
class ModelSizeAnalyzer:
    """Estimate parameter counts and memory footprints of GPT-style models.

    The count uses a simplified decoder-only layout: learned token and
    position embeddings, ``layers`` identical blocks (attention, a 4x-wide
    feed-forward network and two layer norms), one final layer norm and a
    separately counted output projection. Bias terms of the linear layers
    are not counted.
    """

    # Storage cost of a single parameter, in bytes, for each precision name.
    BYTES_PER_PARAM = {
        'fp32': 4,
        'fp16': 2,
        'int8': 1,
        'int4': 0.5
    }

    def __init__(self):
        # Reference architectures. The vocabulary size lives under the key
        # 'vocab', which calculate_parameters accepts as an alias of
        # ``vocab_size`` so a config can be splatted directly with ``**``.
        self.model_configs = {
            'GPT-Small': {'layers': 12, 'd_model': 768, 'heads': 12, 'vocab': 50257},
            'GPT-Medium': {'layers': 24, 'd_model': 1024, 'heads': 16, 'vocab': 50257},
            'GPT-Large': {'layers': 36, 'd_model': 1280, 'heads': 20, 'vocab': 50257},
            'GPT-XL': {'layers': 48, 'd_model': 1600, 'heads': 25, 'vocab': 50257},
            'GPT-2': {'layers': 48, 'd_model': 1600, 'heads': 25, 'vocab': 50257},
            'GPT-3': {'layers': 96, 'd_model': 12288, 'heads': 96, 'vocab': 50257}
        }

    def calculate_parameters(self, layers: int, d_model: int, heads: int,
                             vocab_size: Optional[int] = None,
                             context_length: int = 2048, *,
                             vocab: Optional[int] = None) -> Dict[str, int]:
        """Calculate parameter count breakdown for a transformer model.

        Args:
            layers: Number of transformer blocks.
            d_model: Hidden (embedding) dimension.
            heads: Number of attention heads. Accepted so a config dict can be
                splatted in; it does not influence the count because the
                Q/K/V/output projections are sized by ``d_model`` alone.
            vocab_size: Vocabulary size.
            context_length: Number of learned position embeddings.
            vocab: Keyword-only alias for ``vocab_size``, matching the key
                used in ``model_configs``.

        Returns:
            Dict with the keys ``token_embeddings``, ``position_embeddings``,
            ``attention_params``, ``ffn_params``, ``layernorm_params``,
            ``output_params`` and ``total_params``. The attention, FFN and
            layer-norm entries are totals over all layers; the layer-norm
            entry also includes the final layer norm.

        Raises:
            TypeError: If no vocabulary size is supplied, or if ``vocab_size``
                and ``vocab`` are both given with different values.
        """
        # Resolve the vocabulary size from whichever spelling the caller used.
        if vocab_size is None:
            vocab_size = vocab
        elif vocab is not None and vocab != vocab_size:
            raise TypeError(
                "calculate_parameters() got conflicting values for "
                "'vocab_size' and its alias 'vocab'"
            )
        if vocab_size is None:
            raise TypeError(
                "calculate_parameters() missing required argument: 'vocab_size'"
            )

        # Embedding tables
        token_embeddings = vocab_size * d_model
        position_embeddings = context_length * d_model  # learned positions

        # Per-layer parameters
        # Multi-head attention: Q, K, V projections + output projection
        attention_params = 4 * d_model * d_model

        # Feed-forward network: two linear maps with a 4x hidden expansion
        d_ff = 4 * d_model
        ffn_params = d_model * d_ff + d_ff * d_model

        # Two layer norms per block, each with a scale and a bias vector
        layernorm_params = 2 * 2 * d_model

        per_layer_params = attention_params + ffn_params + layernorm_params
        transformer_params = layers * per_layer_params

        # Output projection (language modeling head), counted separately from
        # the token embedding table
        output_params = d_model * vocab_size

        # Final layer norm (scale + bias)
        final_layernorm = 2 * d_model

        total_params = (
            token_embeddings + position_embeddings + transformer_params +
            output_params + final_layernorm
        )

        return {
            'token_embeddings': token_embeddings,
            'position_embeddings': position_embeddings,
            'attention_params': layers * attention_params,
            'ffn_params': layers * ffn_params,
            'layernorm_params': layers * layernorm_params + final_layernorm,
            'output_params': output_params,
            'total_params': total_params
        }

    def analyze_scaling_trends(self) -> Dict[str, List]:
        """Analyze parameter scaling trends across model sizes.

        Returns:
            Dict of parallel lists, one entry per model in ``model_configs``
            (in insertion order): ``model_names``, ``total_params``,
            ``layers``, ``d_model``, ``attention_ratio`` and ``ffn_ratio``.
            The two ratios are fractions of that model's total parameters.
        """
        results = {
            'model_names': [],
            'total_params': [],
            'layers': [],
            'd_model': [],
            'attention_ratio': [],
            'ffn_ratio': []
        }

        for name, config in self.model_configs.items():
            # The 'vocab' key is accepted through the keyword alias.
            params = self.calculate_parameters(**config)
            total = params['total_params']

            results['model_names'].append(name)
            results['total_params'].append(total)
            results['layers'].append(config['layers'])
            results['d_model'].append(config['d_model'])
            results['attention_ratio'].append(params['attention_params'] / total)
            results['ffn_ratio'].append(params['ffn_params'] / total)

        return results

    def memory_requirements(self, total_params: int, precision: str = 'fp16') -> Dict[str, float]:
        """Calculate memory requirements for different scenarios.

        Args:
            total_params: Total number of model parameters.
            precision: One of the keys of ``BYTES_PER_PARAM``
                (``'fp32'``, ``'fp16'``, ``'int8'``, ``'int4'``).

        Returns:
            Dict with ``model_gb``, ``training_gb`` and ``inference_gb``,
            each in GiB (bytes / 1024**3).

        Raises:
            KeyError: If ``precision`` is not a known precision name.
        """
        param_bytes = self.BYTES_PER_PARAM[precision]

        # Model weights
        model_memory = total_params * param_bytes

        # Rough estimate: training is taken as 4x the weight memory
        # (weights, gradients and optimizer state).
        training_memory = model_memory * 4

        # Rough estimate: inference is taken as weights plus 20% for activations.
        inference_memory = model_memory * 1.2

        gib = 1024 ** 3
        return {
            'model_gb': model_memory / gib,
            'training_gb': training_memory / gib,
            'inference_gb': inference_memory / gib
        }

def demonstrate_parameter_analysis():
    """Print and plot a parameter-count study of GPT-style model configurations.

    Prints the parameter breakdown of a GPT-Medium-shaped model, the total
    size of every model known to ``ModelSizeAnalyzer`` and memory estimates
    for the GPT-3 entry, then draws a 3x3 grid of matplotlib panels and
    prints a list of takeaways. Returns nothing.
    """
    analyzer = ModelSizeAnalyzer()

    # GPT-Medium shape. The vocabulary size is passed as ``vocab_size``,
    # the keyword that calculate_parameters declares.
    medium_config = {'layers': 24, 'd_model': 1024, 'heads': 16, 'vocab_size': 50257}
    params = analyzer.calculate_parameters(**medium_config)

    print("Parameter Breakdown for GPT-Medium:")
    for component, count in params.items():
        percentage = (count / params['total_params']) * 100
        print(f"  {component}: {count:,} ({percentage:.1f}%)")

    # Analyze scaling trends
    trends = analyzer.analyze_scaling_trends()
    model_names = trends['model_names']

    print("\nModel Scaling Analysis:")
    for name, total in zip(model_names, trends['total_params']):
        print(f"  {name}: {total / 1e6:.1f}M parameters")

    # Memory analysis: look the model up by name instead of relying on it
    # being the last configured entry.
    gpt3_params = trends['total_params'][model_names.index('GPT-3')]
    memory_fp16 = analyzer.memory_requirements(gpt3_params, 'fp16')
    memory_fp32 = analyzer.memory_requirements(gpt3_params, 'fp32')

    print("\nGPT-3 Memory Requirements:")
    print(f"  FP16 - Model: {memory_fp16['model_gb']:.1f}GB, Training: {memory_fp16['training_gb']:.1f}GB")
    print(f"  FP32 - Model: {memory_fp32['model_gb']:.1f}GB, Training: {memory_fp32['training_gb']:.1f}GB")

    # Visualize parameter scaling
    plt.figure(figsize=(15, 12))

    # Panel 1: parameter count scaling
    plt.subplot(3, 3, 1)
    params_millions = [p / 1e6 for p in trends['total_params']]
    plt.semilogy(model_names, params_millions, 'o-', linewidth=2, markersize=8)
    plt.title('Parameter Count Scaling')
    plt.ylabel('Parameters (Millions)')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Panel 2: parameter breakdown for GPT-Medium
    plt.subplot(3, 3, 2)
    components = ['Token Emb', 'Position Emb', 'Attention', 'FFN', 'LayerNorm', 'Output']
    values = [
        params['token_embeddings'],
        params['position_embeddings'],
        params['attention_params'],
        params['ffn_params'],
        params['layernorm_params'],
        params['output_params']
    ]

    plt.pie(values, labels=components, autopct='%1.1f%%')
    plt.title('Parameter Distribution\n(GPT-Medium)')

    # Panel 3: layers vs d_model
    plt.subplot(3, 3, 3)
    # Marker area stays proportional to parameter count but is normalised to
    # the largest model and floored, so the biggest bubble does not swamp the
    # panel and the smallest ones remain visible.
    largest = max(trends['total_params'])
    bubble_sizes = [max(30.0, 2000.0 * p / largest) for p in trends['total_params']]
    plt.scatter(trends['layers'], trends['d_model'], s=bubble_sizes,
                alpha=0.7, c=range(len(model_names)), cmap='viridis')

    for i, name in enumerate(model_names):
        plt.annotate(name, (trends['layers'][i], trends['d_model'][i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.xlabel('Number of Layers')
    plt.ylabel('Model Dimension')
    plt.title('Architecture Scaling\n(Bubble size = Parameters)')
    plt.grid(True, alpha=0.3)

    # Panel 4: memory requirements per precision
    plt.subplot(3, 3, 4)
    precisions = ['fp32', 'fp16', 'int8', 'int4']
    memory_by_precision = [analyzer.memory_requirements(gpt3_params, p) for p in precisions]
    model_memory = [mem['model_gb'] for mem in memory_by_precision]
    training_memory = [mem['training_gb'] for mem in memory_by_precision]

    x = np.arange(len(precisions))
    width = 0.35

    plt.bar(x - width/2, model_memory, width, label='Model', alpha=0.7)
    plt.bar(x + width/2, training_memory, width, label='Training', alpha=0.7)

    plt.xlabel('Precision')
    plt.ylabel('Memory (GB)')
    plt.title('Memory vs Precision\n(GPT-3 Scale)')
    plt.xticks(x, precisions)
    plt.legend()
    plt.yscale('log')

    # Panel 5: attention vs FFN share of parameters
    plt.subplot(3, 3, 5)
    plt.plot(model_names, trends['attention_ratio'], 'o-', label='Attention', alpha=0.7)
    plt.plot(model_names, trends['ffn_ratio'], 's-', label='FFN', alpha=0.7)

    plt.xlabel('Model')
    plt.ylabel('Parameter Ratio')
    plt.title('Component Parameter Ratios')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 6: parameters per layer
    plt.subplot(3, 3, 6)
    params_per_layer = [p / l for p, l in zip(trends['total_params'], trends['layers'])]

    plt.scatter(trends['layers'], params_per_layer, alpha=0.7)

    for i, name in enumerate(model_names):
        plt.annotate(name, (trends['layers'][i], params_per_layer[i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.xlabel('Number of Layers')
    plt.ylabel('Parameters per Layer')
    plt.title('Parameter Efficiency')
    plt.grid(True, alpha=0.3)

    # Panel 7: context length impact
    plt.subplot(3, 3, 7)

    context_lengths = [512, 1024, 2048, 4096, 8192]
    total_params_m = []
    pos_emb_params_m = []

    for ctx_len in context_lengths:
        # Separate name so the GPT-Medium breakdown in ``params`` is not clobbered.
        ctx_params = analyzer.calculate_parameters(**medium_config, context_length=ctx_len)
        total_params_m.append(ctx_params['total_params'] / 1e6)
        pos_emb_params_m.append(ctx_params['position_embeddings'] / 1e6)

    plt.plot(context_lengths, total_params_m, 'o-', label='Total Parameters', alpha=0.7)
    plt.plot(context_lengths, pos_emb_params_m, 's-', label='Position Embeddings', alpha=0.7)

    plt.xlabel('Context Length')
    plt.ylabel('Parameters (Millions)')
    plt.title('Context Length Impact')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 8: model size vs compute
    plt.subplot(3, 3, 8)

    def estimate_flops(param_count, seq_len=2048):
        # Very rough forward-pass estimate: 2 FLOPs per parameter per token.
        return 2 * param_count * seq_len

    flops = [estimate_flops(p) / 1e12 for p in trends['total_params']]  # TFLOPs

    plt.loglog(params_millions, flops, 'o-', alpha=0.7)

    for i, name in enumerate(model_names):
        plt.annotate(name, (params_millions[i], flops[i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.xlabel('Parameters (Millions)')
    plt.ylabel('FLOPs per Forward Pass (TFLOPs)')
    plt.title('Parameters vs Compute')
    plt.grid(True, alpha=0.3)

    # Panel 9: architecture design space
    plt.subplot(3, 3, 9)

    # Show different depth/width combinations that land on a similar size.
    target_params = 350e6  # 350M parameters
    layer_options = np.arange(12, 49, 4)

    # Simplified: total_params ≈ layers * 12 * d_model^2 (attention 4*d^2 plus
    # FFN 8*d^2 per layer, embeddings ignored), solved for d_model.
    d_model_options = [
        int(np.sqrt(target_params / (layers * 12))) for layers in layer_options
    ]

    plt.plot(layer_options, d_model_options, 'o-', alpha=0.7)
    plt.xlabel('Number of Layers')
    plt.ylabel('Model Dimension')
    plt.title(f'Design Space for\n{target_params/1e6:.0f}M Parameters')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nKey Insights:")
    print("\n1. Parameter Distribution:")
    print("   - FFN parameters dominate (typically 60-70%)")
    print("   - Attention parameters are significant (20-30%)")
    print("   - Embeddings become less significant as models grow")

    print("\n2. Scaling Patterns:")
    print("   - Both depth and width contribute to capacity")
    print("   - Deeper models may be more parameter-efficient")
    print("   - Context length has linear impact on position embeddings")

    print("\n3. Memory Considerations:")
    print("   - Training requires ~4x model memory")
    print("   - Precision choice significantly impacts memory")
    print("   - Quantization can reduce memory by 2-8x")

# Run the parameter-count demo when this cell executes.
demonstrate_parameter_analysis()

## 2. Scaling Laws Implementation

In [None]:
class ScalingLaws:
    """Implementation of neural scaling laws.

    Each loss curve has the form ``A * x ** -exponent + E`` with the values
    stored in ``self.constants``. Training compute is modelled as
    ``FLOPS_PER_PARAM_TOKEN * N * D`` and the compute-optimal split fixes
    ``D = TOKENS_PER_PARAM * N``.
    """

    # FLOPs spent per parameter per training token (forward + backward).
    FLOPS_PER_PARAM_TOKEN = 6
    # Training tokens per parameter used for the compute-optimal split.
    TOKENS_PER_PARAM = 20

    def __init__(self):
        # Constants of the power-law curves used throughout the class.
        self.constants = {
            'A': 1.0,      # Scale factor
            'alpha': 0.076, # Parameter scaling exponent
            'beta': 0.095,  # Data scaling exponent
            'gamma': 0.5,   # Compute scaling exponent
            'E': 1.69       # Irreducible loss
        }

    def _power_law(self, x, exponent_key: str):
        """Evaluate ``A * x ** -constants[exponent_key] + E`` element-wise."""
        # Coerce to a float array so lists and integer inputs are accepted.
        values = np.asarray(x, dtype=float)
        scale, floor = self.constants['A'], self.constants['E']
        return scale * (values ** -self.constants[exponent_key]) + floor

    def loss_vs_parameters(self, N: np.ndarray) -> np.ndarray:
        """Loss as a function of parameter count (with fixed data).

        Args:
            N: Parameter count(s); scalar, sequence or array.

        Returns:
            ``A * N ** -alpha + E`` with the same shape as ``N``.
        """
        return self._power_law(N, 'alpha')

    def loss_vs_data(self, D: np.ndarray) -> np.ndarray:
        """Loss as a function of dataset size (with fixed parameters).

        Args:
            D: Training-token count(s); scalar, sequence or array.

        Returns:
            ``A * D ** -beta + E`` with the same shape as ``D``.
        """
        return self._power_law(D, 'beta')

    def loss_vs_compute(self, C: np.ndarray) -> np.ndarray:
        """Loss as a function of compute (FLOPs).

        Args:
            C: Compute budget(s) in FLOPs; scalar, sequence or array.

        Returns:
            ``A * C ** -gamma + E`` with the same shape as ``C``.
        """
        return self._power_law(C, 'gamma')

    def optimal_allocation(self, compute_budget: float) -> Tuple[float, float]:
        """Find optimal parameter count and data size for given compute budget.

        Solves ``C = FLOPS_PER_PARAM_TOKEN * N * D`` under the constraint
        ``D = TOKENS_PER_PARAM * N``, i.e. ``N = sqrt(C / 120)`` with the
        class defaults, so the result is consistent with
        ``compute_requirements``.

        Args:
            compute_budget: Training compute in FLOPs; must be non-negative.

        Returns:
            ``(N_optimal, D_optimal)``: parameter count and training tokens.

        Raises:
            ValueError: If ``compute_budget`` is negative (the square root
                would otherwise silently produce a complex number).
        """
        if np.any(np.asarray(compute_budget) < 0):
            raise ValueError("compute_budget must be non-negative")

        # C = 6 * N * D and D = 20 * N  =>  C = 120 * N^2
        flops_per_param_squared = self.FLOPS_PER_PARAM_TOKEN * self.TOKENS_PER_PARAM
        N_optimal = (compute_budget / flops_per_param_squared) ** 0.5
        D_optimal = self.TOKENS_PER_PARAM * N_optimal

        return N_optimal, D_optimal

    def compute_requirements(self, N: float, D: float, epochs: int = 1) -> float:
        """Estimate compute requirements for training.

        Args:
            N: Parameter count.
            D: Training tokens per epoch.
            epochs: Number of passes over the data.

        Returns:
            ``FLOPS_PER_PARAM_TOKEN * N * D * epochs`` in FLOPs.
        """
        flops_per_token = self.FLOPS_PER_PARAM_TOKEN * N
        return flops_per_token * D * epochs

    def performance_prediction(self, N: float, D: float, C: float) -> float:
        """Predict performance given parameters, data, and compute.

        The prediction is the larger of the parameter-limited and data-limited
        terms (whichever resource is the bottleneck) plus the irreducible loss.

        Args:
            N: Parameter count.
            D: Training tokens.
            C: Compute in FLOPs. Accepted for interface symmetry; it is not
                used in the formula.

        Returns:
            ``max(A * N ** -alpha, A * D ** -beta) + E``.
        """
        scale = self.constants['A']
        param_term = scale * (N ** -self.constants['alpha'])
        data_term = scale * (D ** -self.constants['beta'])

        # Take the maximum (bottleneck)
        return max(param_term, data_term) + self.constants['E']

def demonstrate_scaling_laws():
    """Print and plot the toy scaling-law curves defined by ``ScalingLaws``.

    Prints loss predictions for three example models and the compute-optimal
    parameter/token split for several budgets, draws a 3x3 grid of matplotlib
    panels, then prints a summary of takeaways. Returns nothing.
    """
    scaling = ScalingLaws()

    # Parameter ranges for analysis
    N_range = np.logspace(6, 11, 50)  # 1M to 100B parameters
    D_range = np.logspace(8, 13, 50)  # 100M to 10T tokens
    C_range = np.logspace(18, 25, 50) # 1e18 to 1e25 FLOPs

    # Calculate losses
    loss_N = scaling.loss_vs_parameters(N_range)
    loss_D = scaling.loss_vs_data(D_range)
    loss_C = scaling.loss_vs_compute(C_range)

    print("Scaling Laws Analysis:")

    # Example predictions
    example_models = [
        {'name': 'GPT-2', 'params': 1.5e9, 'data': 40e9},
        {'name': 'GPT-3', 'params': 175e9, 'data': 300e9},
        {'name': 'Hypothetical', 'params': 1e12, 'data': 2e13}
    ]

    print("\nModel Performance Predictions:")
    for model in example_models:
        N, D = model['params'], model['data']
        C = scaling.compute_requirements(N, D)
        loss = scaling.performance_prediction(N, D, C)

        print(f"  {model['name']}:")
        print(f"    Parameters: {N/1e9:.1f}B")
        print(f"    Data: {D/1e9:.1f}B tokens")
        print(f"    Compute: {C:.2e} FLOPs")
        print(f"    Predicted Loss: {loss:.3f}")

    # Optimal allocation analysis
    print("\nOptimal Resource Allocation:")
    for budget in [1e21, 1e22, 1e23, 1e24]:
        N_opt, D_opt = scaling.optimal_allocation(budget)
        print(f"  Budget {budget:.0e} FLOPs:")
        print(f"    Optimal Parameters: {N_opt/1e9:.1f}B")
        print(f"    Optimal Data: {D_opt/1e9:.1f}B tokens")

    # Visualize scaling laws
    plt.figure(figsize=(15, 12))

    # Panel 1: loss vs parameters
    plt.subplot(3, 3, 1)
    plt.loglog(N_range / 1e9, loss_N, 'b-', linewidth=2, label='Scaling Law')

    # Mark the example models on the curve
    for model in example_models:
        N = model['params']
        loss = scaling.loss_vs_parameters(np.array([N]))[0]
        plt.loglog(N / 1e9, loss, 'ro', markersize=8)
        plt.annotate(model['name'], (N / 1e9, loss), xytext=(5, 5),
                     textcoords='offset points')

    plt.xlabel('Parameters (Billions)')
    plt.ylabel('Loss')
    plt.title('Loss vs Parameters')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Panel 2: loss vs data
    plt.subplot(3, 3, 2)
    plt.loglog(D_range / 1e9, loss_D, 'g-', linewidth=2, label='Scaling Law')

    for model in example_models:
        D = model['data']
        loss = scaling.loss_vs_data(np.array([D]))[0]
        plt.loglog(D / 1e9, loss, 'ro', markersize=8)
        plt.annotate(model['name'], (D / 1e9, loss), xytext=(5, 5),
                     textcoords='offset points')

    plt.xlabel('Training Data (Billion Tokens)')
    plt.ylabel('Loss')
    plt.title('Loss vs Training Data')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Panel 3: loss vs compute
    plt.subplot(3, 3, 3)
    plt.loglog(C_range, loss_C, 'r-', linewidth=2, label='Scaling Law')

    for model in example_models:
        C = scaling.compute_requirements(model['params'], model['data'])
        loss = scaling.loss_vs_compute(np.array([C]))[0]
        plt.loglog(C, loss, 'ro', markersize=8)
        plt.annotate(model['name'], (C, loss), xytext=(5, 5),
                     textcoords='offset points')

    plt.xlabel('Compute (FLOPs)')
    plt.ylabel('Loss')
    plt.title('Loss vs Compute')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Panel 4: optimal allocation frontier
    plt.subplot(3, 3, 4)

    # Budgets shared by the frontier (panel 4) and the strategy comparison (panel 5).
    compute_budgets = np.logspace(20, 24, 20)
    optimal_splits = [scaling.optimal_allocation(C) for C in compute_budgets]
    optimal_N = [n_opt for n_opt, _ in optimal_splits]
    optimal_D = [d_opt for _, d_opt in optimal_splits]

    plt.loglog(optimal_N, optimal_D, 'b-', linewidth=2, label='Optimal Frontier')

    # Mark the example models relative to the frontier
    for model in example_models:
        N, D = model['params'], model['data']
        plt.loglog(N, D, 'ro', markersize=8)
        plt.annotate(model['name'], (N, D), xytext=(5, 5),
                     textcoords='offset points')

    plt.xlabel('Parameters')
    plt.ylabel('Training Tokens')
    plt.title('Optimal Parameter-Data Allocation')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Panel 5: how loss improves with compute under two allocation strategies
    plt.subplot(3, 3, 5)

    # Optimal allocation
    optimal_losses = [
        scaling.performance_prediction(n_opt, d_opt, C)
        for C, (n_opt, d_opt) in zip(compute_budgets, optimal_splits)
    ]

    # Suboptimal: only scaling parameters while the token count stays fixed
    fixed_data = 1e11  # Fixed 100B tokens
    param_only_losses = []
    for C in compute_budgets:
        # Largest model affordable at this budget under C = 6 * N * D
        N_max = C / (6 * fixed_data)
        param_only_losses.append(scaling.performance_prediction(N_max, fixed_data, C))

    plt.loglog(compute_budgets, optimal_losses, 'b-', linewidth=2, label='Optimal Allocation')
    plt.loglog(compute_budgets, param_only_losses, 'r--', linewidth=2, label='Parameters Only')

    plt.xlabel('Compute Budget (FLOPs)')
    plt.ylabel('Loss')
    plt.title('Allocation Strategy Comparison')
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Panel 6: simulated training progress
    plt.subplot(3, 3, 6)

    epochs = np.linspace(0.1, 3, 30)
    base_loss = 2.5

    # Different model sizes: 1B, 10B, 100B parameters
    for N in [1e9, 10e9, 100e9]:
        # Exponential decay towards the irreducible loss...
        losses = base_loss * np.exp(-epochs * 0.5) + scaling.constants['E']
        # ...scaled down for larger models
        losses = losses * (1e9 / N) ** 0.1

        plt.plot(epochs, losses, label=f'{N/1e9:.0f}B params', linewidth=2)

    plt.xlabel('Training Epochs')
    plt.ylabel('Loss')
    plt.title('Training Progress by Model Size')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 7: scaling exponents
    plt.subplot(3, 3, 7)

    exponents = ['α (Parameters)', 'β (Data)', 'γ (Compute)']
    values = [scaling.constants['alpha'], scaling.constants['beta'], scaling.constants['gamma']]

    bars = plt.bar(exponents, values, alpha=0.7)
    plt.title('Scaling Exponents')
    plt.ylabel('Exponent Value')

    # Add value labels just above each bar
    for bar, val in zip(bars, values):
        plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,
                 f'{val:.3f}', ha='center')

    # Panel 8: performance vs cost trade-off
    plt.subplot(3, 3, 8)

    configs = [
        {'name': 'Small', 'params': 1e8, 'data': 1e10},
        {'name': 'Medium', 'params': 1e9, 'data': 1e11},
        {'name': 'Large', 'params': 1e10, 'data': 1e12},
        {'name': 'XL', 'params': 1e11, 'data': 1e13}
    ]

    costs = []
    performances = []

    for config in configs:
        N, D = config['params'], config['data']
        compute = scaling.compute_requirements(N, D)

        costs.append(compute / 1e21)  # Normalize to units of 1e21 FLOPs
        performances.append(1 / scaling.performance_prediction(N, D, compute))  # Inverse loss

    plt.scatter(costs, performances, s=100, alpha=0.7)

    for config, cost, performance in zip(configs, costs, performances):
        plt.annotate(config['name'], (cost, performance),
                     xytext=(5, 5), textcoords='offset points')

    plt.xlabel('Relative Training Cost')
    plt.ylabel('Performance (1/Loss)')
    plt.title('Performance vs Cost Trade-off')
    plt.grid(True, alpha=0.3)

    # Panel 9: future projections
    plt.subplot(3, 3, 9)

    years = np.arange(2020, 2031)

    # Assume compute grows exponentially (Moore's law-like): doubles every 2 years
    compute_growth = 1e21 * (2 ** ((years - 2020) / 2))

    projected_losses = []
    for C in compute_growth:
        N_opt, D_opt = scaling.optimal_allocation(C)
        projected_losses.append(scaling.performance_prediction(N_opt, D_opt, C))

    plt.plot(years, projected_losses, 'o-', linewidth=2)
    plt.xlabel('Year')
    plt.ylabel('Projected Loss')
    plt.title('Future Performance Projections')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\nScaling Laws Insights:")

    print("\n1. Power Law Relationships:")
    print(f"   - Loss ∝ N^(-{scaling.constants['alpha']:.3f}) for parameters")
    print(f"   - Loss ∝ D^(-{scaling.constants['beta']:.3f}) for data")
    print(f"   - Loss ∝ C^(-{scaling.constants['gamma']:.3f}) for compute")

    print("\n2. Optimal Allocation (Chinchilla):")
    # The allocation used above keeps a fixed ratio D = 20 * N, so data and
    # parameters grow together; data does not grow 20x faster.
    print("   - Train on ~20 tokens per parameter; scale data and parameters together")
    print("   - Many models are undertrained (too few tokens)")
    print("   - Compute-optimal models are smaller but see more data")

    print("\n3. Practical Implications:")
    # Derived from the compute exponent: the C^-gamma term shrinks by
    # 1 - 2^-gamma per doubling; the irreducible loss E is unaffected.
    reducible_gain = (1 - 2 ** -scaling.constants['gamma']) * 100
    print(f"   - Doubling compute cuts the reducible loss by ~{reducible_gain:.0f}%")
    print("   - Data quality matters as much as quantity")
    print("   - Diminishing returns require exponential resources")

# Run the scaling-law demo when this cell executes.
demonstrate_scaling_laws()