# 09 - Model Architectures and Variants

This notebook explores different transformer architectures and their variants.

## Topics Covered:
- Decoder-only models (GPT-style)
- Encoder-only models (BERT-style)
- Encoder-decoder models (T5-style)
- Dense vs Sparse models
- Mixture of Experts (MoE)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Optional

# Seed NumPy's global RNG so the random draws made by the code below are
# reproducible from one run of the notebook to the next.
np.random.seed(42)

## 1. Model Architecture Variants

In [None]:
class ModelArchitectures:
    """Toy forward passes for the three main transformer layouts.

    Every "layer" is a stand-in: embeddings, attention scores and the output
    projection are freshly sampled from NumPy's global RNG on each call, so
    only tensor shapes and masking patterns are meaningful -- the numeric
    values are not. Token ids are used solely for their shape.
    """

    # Added to attention row sums so a fully masked row normalizes to zeros
    # instead of dividing by zero.
    _EPS = 1e-10

    def __init__(self, d_model: int = 512, vocab_size: int = 1000):
        """Store the model dimensions.

        Args:
            d_model: Size of the hidden (embedding) dimension.
            vocab_size: Number of columns in the output logits.
        """
        self.d_model = d_model
        self.vocab_size = vocab_size

    def decoder_only_forward(self, input_ids: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray:
        """Simulate a GPT-style decoder-only forward pass.

        Args:
            input_ids: Integer array of shape ``(batch, seq_len)``.
            mask: Optional attention mask, either ``(seq_len, seq_len)`` or
                ``(batch, seq_len, seq_len)``; non-zero/True means "may
                attend". When omitted a causal (lower-triangular) mask is
                used. A supplied mask replaces the causal mask.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        batch_size, seq_len = input_ids.shape

        if mask is None:
            mask = self._causal_mask(seq_len)

        # Random stand-in for the token-embedding lookup.
        embeddings = np.random.randn(batch_size, seq_len, self.d_model)
        attention_output = self._masked_attention(embeddings, mask)

        # Random stand-in for the LM head.
        return attention_output @ np.random.randn(self.d_model, self.vocab_size)

    def encoder_only_forward(self, input_ids: np.ndarray, attention_mask: Optional[np.ndarray] = None) -> np.ndarray:
        """Simulate a BERT-style encoder-only forward pass.

        Args:
            input_ids: Integer array of shape ``(batch, seq_len)``.
            attention_mask: Optional ``(batch, seq_len)`` mask over key
                positions (non-zero means "may be attended to"). ``None``
                lets every position attend to every other position.

        Returns:
            The first-token ([CLS]) representation, shape ``(batch, d_model)``.
        """
        batch_size, seq_len = input_ids.shape

        embeddings = np.random.randn(batch_size, seq_len, self.d_model)
        attention_output = self._bidirectional_attention(embeddings, attention_mask)

        # Classification head input: the representation of the first token.
        return attention_output[:, 0, :]

    def encoder_decoder_forward(self, encoder_input: np.ndarray, decoder_input: np.ndarray) -> np.ndarray:
        """Simulate a T5-style encoder-decoder forward pass.

        Args:
            encoder_input: Integer array of shape ``(batch, src_len)``.
            decoder_input: Integer array of shape ``(batch, tgt_len)``.

        Returns:
            Logits of shape ``(batch, tgt_len, vocab_size)``.
        """
        # Encoder: unmasked (bidirectional) self-attention.
        encoder_embeddings = np.random.randn(*encoder_input.shape, self.d_model)
        encoder_output = self._bidirectional_attention(encoder_embeddings, None)

        # Decoder: causal self-attention followed by cross-attention.
        decoder_embeddings = np.random.randn(*decoder_input.shape, self.d_model)
        causal_mask = self._causal_mask(decoder_input.shape[1])
        decoder_self_attn = self._masked_attention(decoder_embeddings, causal_mask)
        decoder_output = self._cross_attention(decoder_self_attn, encoder_output)

        return decoder_output @ np.random.randn(self.d_model, self.vocab_size)

    @staticmethod
    def _causal_mask(seq_len: int) -> np.ndarray:
        """Return a boolean ``(seq_len, seq_len)`` mask, True on and below the diagonal."""
        return np.tril(np.ones((seq_len, seq_len), dtype=bool))

    def _normalize_rows(self, weights: np.ndarray) -> np.ndarray:
        """Scale the last axis of ``weights`` to sum to one (zeros stay zeros)."""
        return weights / (np.sum(weights, axis=-1, keepdims=True) + self._EPS)

    def _masked_attention(self, x: np.ndarray, mask: np.ndarray) -> np.ndarray:
        """Mix the rows of ``x`` with random attention weights restricted by ``mask``.

        Args:
            x: Array of shape ``(batch, seq_len, d_model)``.
            mask: ``(seq_len, seq_len)`` mask shared by the whole batch, or a
                per-example ``(batch, seq_len, seq_len)`` mask. Entries are
                multiplied into the weights, so zero/False blocks attention.

        Returns:
            Array with the same shape as ``x``.
        """
        batch_size, seq_len, _ = x.shape

        # Random non-negative scores stand in for softmax(QK^T).
        attention_weights = np.random.rand(batch_size, seq_len, seq_len)

        mask = np.asarray(mask)
        if mask.ndim == 2:
            # Shared mask: add a leading axis so it broadcasts over the batch.
            mask = mask[None, :, :]
        attention_weights = self._normalize_rows(attention_weights * mask)

        return attention_weights @ x

    def _bidirectional_attention(self, x: np.ndarray, mask: Optional[np.ndarray]) -> np.ndarray:
        """Mix the rows of ``x`` with random weights and no causal restriction.

        Args:
            x: Array of shape ``(batch, seq_len, d_model)``.
            mask: Optional ``(batch, seq_len)`` mask over key positions; zero
                entries remove that key for every query. ``None`` disables
                masking.

        Returns:
            Array with the same shape as ``x``.
        """
        batch_size, seq_len, _ = x.shape

        attention_weights = np.random.rand(batch_size, seq_len, seq_len)

        if mask is not None:
            # Broadcast the key mask across the query axis.
            attention_weights = attention_weights * mask[:, None, :]

        return self._normalize_rows(attention_weights) @ x

    def _cross_attention(self, queries: np.ndarray, keys_values: np.ndarray) -> np.ndarray:
        """Mix ``keys_values`` rows with random weights, one row per query.

        Args:
            queries: Array of shape ``(batch, q_len, d_model)``; only its
                shape is used.
            keys_values: Array of shape ``(batch, kv_len, d_model)``.

        Returns:
            Array of shape ``(batch, q_len, d_model)``.
        """
        batch_size, q_len, _ = queries.shape
        kv_len = keys_values.shape[1]

        attention_weights = np.random.rand(batch_size, q_len, kv_len)
        # Same guarded normalization as the self-attention helpers.
        attention_weights = self._normalize_rows(attention_weights)

        return attention_weights @ keys_values

class MixtureOfExperts:
    """Sparsely gated mixture-of-experts feed-forward layer (NumPy toy version).

    A linear router scores every token against every expert. Each token is
    processed by its ``top_k`` highest-scoring experts and the expert outputs
    are blended with the renormalized router probabilities.
    """

    def __init__(self, d_model: int, num_experts: int, expert_capacity: int, top_k: int = 2):
        """Create the router and the randomly initialized experts.

        Args:
            d_model: Feature size of the tokens entering and leaving the layer.
            num_experts: Number of expert networks.
            expert_capacity: Width of each expert's hidden layer. Despite the
                name, it is not a limit on the number of tokens per expert.
            top_k: How many experts process each token.

        Raises:
            ValueError: If ``top_k`` is not between 1 and ``num_experts``.
        """
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be between 1 and num_experts ({num_experts}), got {top_k}"
            )

        self.d_model = d_model
        self.num_experts = num_experts
        self.expert_capacity = expert_capacity
        self.top_k = top_k

        # Router: one linear map from token features to per-expert scores.
        self.router_weights = np.random.randn(d_model, num_experts) * 0.1

        # Each expert is a two-layer MLP stored as its (w1, w2) weight pair.
        self.expert_weights = []
        for _ in range(num_experts):
            w1 = np.random.randn(d_model, expert_capacity) * 0.1
            w2 = np.random.randn(expert_capacity, d_model) * 0.1
            self.expert_weights.append((w1, w2))

    def forward(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Route every token to its top-k experts and blend their outputs.

        Args:
            x: Array of shape ``(batch, seq_len, d_model)``.

        Returns:
            A tuple ``(output, expert_usage)``. ``output`` has the shape and
            dtype of ``x``. ``expert_usage`` has shape ``(num_experts,)`` and
            holds, per expert, the sum of the gate weights of the tokens
            routed to it.
        """
        batch_size, seq_len, d_model = x.shape

        # Routing is per token, so collapse batch and sequence axes.
        x_flat = x.reshape(-1, d_model)

        router_logits = x_flat @ self.router_weights
        router_probs = self._softmax(router_logits)

        # argpartition leaves the k largest entries in the last k columns
        # (in arbitrary order), which is all the routing needs.
        top_k_indices = np.argpartition(router_probs, -self.top_k, axis=1)[:, -self.top_k:]
        top_k_probs = np.take_along_axis(router_probs, top_k_indices, axis=1)

        # Renormalize so each token's selected gates sum to (almost) one.
        top_k_probs = top_k_probs / (np.sum(top_k_probs, axis=1, keepdims=True) + 1e-10)

        mixed = np.zeros((x_flat.shape[0], d_model))
        expert_usage = np.zeros(self.num_experts)

        # Dispatch by expert: one batched matmul pair per expert instead of
        # one tiny matmul pair per (token, slot).
        for expert_idx, (w1, w2) in enumerate(self.expert_weights):
            token_rows, slots = np.nonzero(top_k_indices == expert_idx)
            if token_rows.size == 0:
                continue

            gates = top_k_probs[token_rows, slots]
            hidden = np.maximum(0, x_flat[token_rows] @ w1)  # ReLU
            # A token picks each expert at most once, so token_rows has no
            # duplicates and the fancy-indexed in-place add is safe.
            mixed[token_rows] += gates[:, None] * (hidden @ w2)
            expert_usage[expert_idx] = gates.sum()

        # Write into a buffer with the input's dtype, as callers expect.
        output = np.zeros_like(x_flat)
        output[...] = mixed

        return output.reshape(batch_size, seq_len, d_model), expert_usage

    def _softmax(self, x: np.ndarray) -> np.ndarray:
        """Row-wise softmax of a 2-D array, shifted by the row max for stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def _run_forward_passes(d_model: int, vocab_size: int, batch_size: int, seq_len: int) -> np.ndarray:
    """Run each architecture once, print its output shape, and return the MoE expert usage."""
    models = ModelArchitectures(d_model, vocab_size)

    # Sample inputs
    input_ids = np.random.randint(0, vocab_size, (batch_size, seq_len))
    attention_mask = np.ones((batch_size, seq_len))

    print("Model Architecture Comparison:")

    # Decoder-only (GPT-style)
    decoder_output = models.decoder_only_forward(input_ids)
    print(f"\nDecoder-only output shape: {decoder_output.shape}")
    print("  - Causal attention (can't see future tokens)")
    print("  - Autoregressive generation")
    print("  - Applications: Text generation, completion")

    # Encoder-only (BERT-style)
    encoder_output = models.encoder_only_forward(input_ids, attention_mask)
    print(f"\nEncoder-only output shape: {encoder_output.shape}")
    print("  - Bidirectional attention")
    print("  - Masked language modeling")
    print("  - Applications: Classification, understanding")

    # Encoder-decoder (T5-style); the target is deliberately shorter than the
    # source to show that the two lengths are independent.
    decoder_len = 6
    decoder_input = np.random.randint(0, vocab_size, (batch_size, decoder_len))
    enc_dec_output = models.encoder_decoder_forward(input_ids, decoder_input)
    print(f"\nEncoder-decoder output shape: {enc_dec_output.shape}")
    print("  - Encoder: bidirectional, Decoder: causal")
    print("  - Cross-attention between encoder and decoder")
    print("  - Applications: Translation, summarization")

    # Mixture of Experts
    moe = MixtureOfExperts(d_model=d_model, num_experts=8, expert_capacity=512, top_k=2)
    moe_input = np.random.randn(batch_size, seq_len, d_model)
    moe_output, expert_usage = moe.forward(moe_input)

    print("\nMixture of Experts:")
    print(f"  - Output shape: {moe_output.shape}")
    print(f"  - Expert usage: {expert_usage}")
    print("  - Sparse computation (only top-k experts active)")

    return expert_usage


def _plot_architecture_overview(expert_usage: np.ndarray, seq_len: int) -> None:
    """Draw the 2x3 comparison figure and display it with ``plt.show()``.

    Apart from ``expert_usage`` and the mask size, every plotted number is a
    hand-picked illustrative value, not a measurement.
    """
    plt.figure(figsize=(15, 10))

    # Panel 1: relative cost per architecture.
    plt.subplot(2, 3, 1)
    architectures = ['Decoder-only\n(GPT)', 'Encoder-only\n(BERT)', 'Encoder-Decoder\n(T5)']
    params = [100, 110, 150]  # Relative parameter counts
    memory = [80, 70, 120]    # Relative memory usage

    arch_positions = np.arange(len(architectures))
    cost_bar_width = 0.35
    plt.bar(arch_positions - cost_bar_width / 2, params, cost_bar_width, label='Parameters', alpha=0.7)
    plt.bar(arch_positions + cost_bar_width / 2, memory, cost_bar_width, label='Memory', alpha=0.7)
    plt.xlabel('Architecture')
    plt.ylabel('Relative Cost')
    plt.title('Architecture Comparison')
    plt.xticks(arch_positions, architectures)
    plt.legend()

    # Panel 2: causal mask (decoder-only), sized like the demo sequences.
    plt.subplot(2, 3, 2)
    causal_mask = np.triu(np.ones((seq_len, seq_len)), k=1) == 0
    plt.imshow(causal_mask, cmap='RdYlBu', aspect='auto')
    plt.title('Causal Attention\n(Decoder-only)')
    plt.xlabel('Key Position')
    plt.ylabel('Query Position')

    # Panel 3: bidirectional mask (encoder-only).
    plt.subplot(2, 3, 3)
    bidirectional_mask = np.ones((seq_len, seq_len))
    plt.imshow(bidirectional_mask, cmap='RdYlBu', aspect='auto')
    plt.title('Bidirectional Attention\n(Encoder-only)')
    plt.xlabel('Key Position')
    plt.ylabel('Query Position')

    # Panel 4: MoE expert usage from the forward pass.
    plt.subplot(2, 3, 4)
    plt.bar(range(len(expert_usage)), expert_usage, alpha=0.7)
    plt.xlabel('Expert ID')
    plt.ylabel('Usage')
    plt.title('MoE Expert Usage')

    # Panel 5: model capacity vs efficiency.
    plt.subplot(2, 3, 5)
    model_types = ['Dense\nSmall', 'Dense\nLarge', 'Sparse\nMoE']
    capacity = [50, 100, 120]     # Model capacity
    efficiency = [90, 60, 85]     # Computational efficiency

    plt.scatter(capacity, efficiency, s=[50, 200, 150], alpha=0.7)
    for model_type, cap, eff in zip(model_types, capacity, efficiency):
        plt.annotate(model_type, (cap, eff),
                     xytext=(5, 5), textcoords='offset points')
    plt.xlabel('Model Capacity')
    plt.ylabel('Computational Efficiency')
    plt.title('Capacity vs Efficiency')
    plt.grid(True, alpha=0.3)

    # Panel 6: task suitability per architecture.
    plt.subplot(2, 3, 6)
    tasks = ['Generation', 'Classification', 'Translation']
    decoder_scores = [0.9, 0.3, 0.4]
    encoder_scores = [0.2, 0.9, 0.3]
    enc_dec_scores = [0.6, 0.5, 0.9]

    task_positions = np.arange(len(tasks))
    task_bar_width = 0.25
    plt.bar(task_positions - task_bar_width, decoder_scores, task_bar_width, label='Decoder-only', alpha=0.7)
    plt.bar(task_positions, encoder_scores, task_bar_width, label='Encoder-only', alpha=0.7)
    plt.bar(task_positions + task_bar_width, enc_dec_scores, task_bar_width, label='Encoder-Decoder', alpha=0.7)
    plt.xlabel('Task Type')
    plt.ylabel('Suitability Score')
    plt.title('Task Suitability')
    plt.xticks(task_positions, tasks)
    plt.legend()

    plt.tight_layout()
    plt.show()


def _print_architecture_insights() -> None:
    """Print the pros (+) and cons (-) summary for each architecture family."""
    insights = (
        ("Decoder-only (GPT-style):", (
            "+ Excellent for generation tasks",
            "+ Simple architecture",
            "- Limited for understanding tasks",
        )),
        ("Encoder-only (BERT-style):", (
            "+ Great for classification/understanding",
            "+ Bidirectional context",
            "- Cannot generate text naturally",
        )),
        ("Encoder-Decoder (T5-style):", (
            "+ Versatile for seq2seq tasks",
            "+ Good for translation/summarization",
            "- More complex architecture",
        )),
        ("Mixture of Experts:", (
            "+ Scales parameters without proportional compute",
            "+ Specialization through expert routing",
            "- Complex training and load balancing",
        )),
    )

    print("\nArchitecture Insights:")
    for heading, points in insights:
        print(f"\n{heading}")
        for point in points:
            print(f"  {point}")


def demonstrate_architectures():
    """Demonstrate different model architectures.

    Runs one forward pass through each toy architecture and the MoE layer
    (printing the resulting shapes), shows a six-panel comparison figure, and
    prints a short pros/cons summary.
    """
    d_model, vocab_size = 256, 1000
    batch_size, seq_len = 2, 8

    expert_usage = _run_forward_passes(d_model, vocab_size, batch_size, seq_len)
    _plot_architecture_overview(expert_usage, seq_len)
    _print_architecture_insights()

# Run the demo when this cell executes: prints the comparison text and
# displays the figure.
demonstrate_architectures()