# 05 - Language Modeling

This notebook covers different approaches to language modeling that form the foundation of modern LLMs.

## Topics Covered:
- Statistical language models
- Neural language models
- Autoregressive modeling
- Masked language modeling
- Causal language modeling

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Optional
import re

# Fix NumPy's global RNG seed so random draws start from the same state on every run.
np.random.seed(42)

## 1. Statistical Language Models

In [None]:
class NGramLanguageModel:
    """N-gram statistical language model.

    Counts n-grams and their (n-1)-token contexts over a training corpus and
    estimates P(word | context) with either add-alpha ('laplace') smoothing
    or plain maximum likelihood ('mle').

    Attributes are plain counters so they can be inspected directly:
    ``ngram_counts`` maps n-gram tuples to counts, ``context_counts`` maps
    context tuples to counts, and ``vocab`` is the set of observed tokens
    (including the '<BOS>' / '<EOS>' markers).
    """

    # Words, or one of a small set of punctuation marks, as individual tokens.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b|[.,!?;]')

    def __init__(self, n: int = 3, smoothing: str = 'laplace', alpha: float = 1.0):
        """Create an untrained model.

        Args:
            n: N-gram order (1 = unigram, 2 = bigram, ...). Must be >= 1.
            smoothing: 'laplace' (add-alpha) or 'mle'.
            alpha: Pseudo-count added per vocabulary entry under 'laplace'.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n}")
        self.n = n
        self.smoothing = smoothing
        self.alpha = alpha
        self.ngram_counts = defaultdict(int)
        self.context_counts = defaultdict(int)
        self.vocab = set()

    def tokenize(self, text: str) -> List[str]:
        """Lower-case and split ``text``, padding with n-1 '<BOS>' and one '<EOS>'."""
        tokens = self._TOKEN_PATTERN.findall(text.lower())
        return ['<BOS>'] * (self.n - 1) + tokens + ['<EOS>']

    def get_ngrams(self, tokens: List[str]) -> List[Tuple[str, ...]]:
        """Return every contiguous window of ``n`` tokens as a tuple."""
        return [tuple(tokens[i:i + self.n]) for i in range(len(tokens) - self.n + 1)]

    def train(self, texts: List[str]):
        """Accumulate n-gram and context counts from ``texts``.

        May be called repeatedly; counts and vocabulary are added to, not reset.
        """
        for text in texts:
            tokens = self.tokenize(text)
            self.vocab.update(tokens)

            for ngram in self.get_ngrams(tokens):
                self.ngram_counts[ngram] += 1
                self.context_counts[ngram[:-1]] += 1

    def probability(self, word: str, context: Tuple[str, ...]) -> float:
        """Return P(word | context) under the configured smoothing.

        Args:
            word: Candidate next token.
            context: The preceding n-1 tokens.

        Returns:
            The estimated conditional probability. Under 'mle' an unseen
            context yields 0.0; an untrained 'laplace' model yields 0.0.

        Raises:
            ValueError: If ``self.smoothing`` is not 'laplace' or 'mle'.
        """
        ngram = context + (word,)
        # Use .get(): indexing a defaultdict would insert a zero-count entry
        # for every query and silently inflate the count tables.
        ngram_count = self.ngram_counts.get(ngram, 0)
        context_count = self.context_counts.get(context, 0)

        if self.smoothing == 'laplace':
            vocab_size = len(self.vocab)
            if vocab_size == 0:
                return 0.0
            denominator = context_count + self.alpha * vocab_size
            if denominator > 0:
                return (ngram_count + self.alpha) / denominator
            # alpha == 0 and unseen context: fall back to a uniform guess.
            return 1.0 / vocab_size

        if self.smoothing == 'mle':
            return ngram_count / context_count if context_count > 0 else 0.0

        raise ValueError(
            f"Unknown smoothing {self.smoothing!r}; expected 'laplace' or 'mle'"
        )

    def perplexity(self, text: str) -> float:
        """Return the per-token perplexity of ``text`` (including the '<EOS>' step).

        A small epsilon is added inside the log so zero-probability events
        produce a large finite value instead of infinity.
        """
        ngrams = self.get_ngrams(self.tokenize(text))

        log_prob_sum = 0.0
        for ngram in ngrams:
            prob = self.probability(ngram[-1], ngram[:-1])
            log_prob_sum += np.log(prob + 1e-10)

        return np.exp(-log_prob_sum / len(ngrams))

    def _fit_context(self, tokens: Tuple[str, ...]) -> Tuple[str, ...]:
        """Trim or left-pad ``tokens`` with '<BOS>' to exactly n-1 entries."""
        order = self.n - 1
        if order == 0:
            # Unigram models condition on nothing.
            return ()
        tail = tuple(tokens)[-order:]
        return ('<BOS>',) * (order - len(tail)) + tail

    def generate(self, max_length: int = 20, seed_context: Optional[Tuple[str, ...]] = None) -> str:
        """Sample text from the model.

        Args:
            max_length: Maximum number of tokens to sample.
            seed_context: Tokens to start from; defaults to n-1 '<BOS>' markers.
                Only the last n-1 seed tokens condition the first prediction.

        Returns:
            The seed tokens plus sampled tokens, joined by spaces, with the
            '<BOS>' / '<EOS>' markers removed. Sampling stops early when
            '<EOS>' is drawn or no candidate has positive probability.
        """
        if seed_context is None:
            seed = ('<BOS>',) * (self.n - 1)
        else:
            seed = tuple(seed_context)

        generated = list(seed)
        context = self._fit_context(seed)

        # '<EOS>' stays a candidate so generation can actually terminate;
        # sorting makes sampling reproducible for a fixed NumPy seed
        # (set iteration order varies between interpreter runs).
        candidates = sorted(w for w in self.vocab if w != '<BOS>')

        if candidates:
            for _ in range(max_length):
                weights = np.array(
                    [self.probability(w, context) for w in candidates], dtype=float
                )
                total = weights.sum()
                if total <= 0:
                    # Nothing can follow this context (possible under 'mle').
                    break

                next_word = str(np.random.choice(candidates, p=weights / total))
                if next_word == '<EOS>':
                    break

                generated.append(next_word)
                context = self._fit_context(context + (next_word,))

        words = [w for w in generated if w not in ('<BOS>', '<EOS>')]
        return ' '.join(words)

# Demonstrate n-gram models
def demonstrate_ngram_models():
    """Train 1- to 4-gram models on a toy corpus and summarize them.

    Prints each model's perplexity on a held-out sentence, then shows a
    2x2 figure: perplexity per order, vocabulary size, number of stored
    n-grams, and sampled text from the bigram and trigram models.
    """
    corpus = [
        "The cat sat on the mat.",
        "The dog ran in the park.",
        "A cat and a dog played together.",
        "The mat was comfortable for the cat.",
        "In the park, children played with their dog.",
        "The comfortable mat attracted the sleeping cat."
    ]
    held_out = "The cat played in the park."
    orders = [1, 2, 3, 4]

    # One Laplace-smoothed model per n-gram order.
    models = {}
    for order in orders:
        lm = NGramLanguageModel(n=order, smoothing='laplace')
        lm.train(corpus)
        models[order] = lm

    # Score the held-out sentence with each model.
    perplexities = []
    for order in orders:
        score = models[order].perplexity(held_out)
        perplexities.append(score)
        print(f"{order}-gram perplexity: {score:.2f}")

    # Table sizes are read after scoring and before sampling.
    vocab_sizes = [len(models[order].vocab) for order in orders]
    ngram_counts = [len(models[order].ngram_counts) for order in orders]

    plt.figure(figsize=(12, 8))

    # Panel 1: perplexity curve.
    plt.subplot(2, 2, 1)
    plt.plot(orders, perplexities, marker='o')
    plt.title('Perplexity vs N-gram Order')
    plt.xlabel('N-gram Order')
    plt.ylabel('Perplexity')
    plt.grid(True, alpha=0.3)

    # Panel 2: vocabulary size per model.
    plt.subplot(2, 2, 2)
    plt.bar(orders, vocab_sizes)
    plt.title('Vocabulary Size')
    plt.xlabel('N-gram Order')
    plt.ylabel('Vocabulary Size')

    # Panel 3: number of entries in each n-gram table.
    plt.subplot(2, 2, 3)
    plt.bar(orders, ngram_counts)
    plt.title('Number of N-grams')
    plt.xlabel('N-gram Order')
    plt.ylabel('N-gram Count')

    # Panel 4: text-only panel with one sample per selected order.
    plt.subplot(2, 2, 4)
    plt.text(0.1, 0.8, "Generated Samples:", fontsize=12, weight='bold')

    text_y = 0.7
    for order in (2, 3):
        sample = models[order].generate(max_length=10)
        plt.text(0.1, text_y, f"{order}-gram: {sample}", fontsize=10, wrap=True)
        text_y -= 0.15

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.axis('off')

    plt.tight_layout()
    plt.show()

    summary_lines = (
        "\nN-gram Model Analysis:",
        "- Higher order n-grams capture more context",
        "- But suffer from data sparsity",
        "- Smoothing helps with unseen n-grams",
        "- Trade-off between context and generalization",
    )
    for line in summary_lines:
        print(line)

# Run the n-gram demo: prints perplexities and displays the comparison figure.
demonstrate_ngram_models()

## 2. Neural Language Models

In [None]:
class SimpleNeuralLM:
    """Simple feedforward neural language model.

    Architecture: the ``context_size`` previous token ids are embedded,
    concatenated, passed through one tanh hidden layer, and projected to a
    softmax over the vocabulary. Parameters are plain NumPy arrays
    (``embeddings``, ``W1``, ``b1``, ``W2``, ``b2``) trained with
    full-batch gradient descent in ``train_step``.
    """

    # Words, or one of a small set of punctuation marks, as individual tokens.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b|[.,!?;]')

    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, context_size: int):
        """Allocate randomly initialized parameters.

        Args:
            vocab_size: Provisional vocabulary size; replaced by ``build_vocab``.
            embedding_dim: Width of each token embedding.
            hidden_dim: Width of the tanh hidden layer.
            context_size: Number of preceding tokens the model conditions on.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.context_size = context_size

        # Initialize parameters
        self.embeddings = np.random.randn(vocab_size, embedding_dim) * 0.1
        self.W1 = np.random.randn(context_size * embedding_dim, hidden_dim) * 0.1
        self.b1 = np.zeros(hidden_dim)
        self.W2 = np.random.randn(hidden_dim, vocab_size) * 0.1
        self.b2 = np.zeros(vocab_size)

        self.word_to_idx = {}
        self.idx_to_word = {}

    def _split(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into word / punctuation tokens."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def build_vocab(self, texts: List[str]):
        """Build the vocabulary from ``texts`` and resize vocab-dependent parameters.

        Ids 0-2 are reserved for '<UNK>', '<BOS>', '<EOS>'; remaining words are
        assigned ids in sorted order. ``embeddings``, ``W2`` and ``b2`` are
        re-initialized, so any previous training of those is discarded.
        """
        words = set()
        for text in texts:
            words.update(self._split(text))

        vocab = ['<UNK>', '<BOS>', '<EOS>'] + sorted(words)
        self.word_to_idx = {word: i for i, word in enumerate(vocab)}
        self.idx_to_word = {i: word for word, i in self.word_to_idx.items()}

        # Update vocab size
        self.vocab_size = len(vocab)

        # Reinitialize parameters with correct vocab size
        self.embeddings = np.random.randn(self.vocab_size, self.embedding_dim) * 0.1
        self.W2 = np.random.randn(self.hidden_dim, self.vocab_size) * 0.1
        self.b2 = np.zeros(self.vocab_size)

    def tokenize(self, text: str) -> List[int]:
        """Convert ``text`` to token ids, mapping unknown words to '<UNK>'."""
        return [
            self.word_to_idx.get(token, self.word_to_idx['<UNK>'])
            for token in self._split(text)
        ]

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Numerically stable softmax over the last axis."""
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

    def forward(self, context_indices: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Run the network on a batch of contexts.

        Args:
            context_indices: Integer array of shape (batch, context_size).

        Returns:
            ``(probs, hidden, embedded)`` with shapes (batch, vocab_size),
            (batch, hidden_dim) and (batch, context_size, embedding_dim).
            The intermediate activations are returned for use in backprop.
        """
        # Embedding lookup
        embedded = self.embeddings[context_indices]  # (batch, context_size, embedding_dim)

        # Flatten context embeddings
        flattened = embedded.reshape(embedded.shape[0], -1)  # (batch, context_size * embedding_dim)

        # Hidden layer
        hidden = np.tanh(flattened @ self.W1 + self.b1)  # (batch, hidden_dim)

        # Output layer
        logits = hidden @ self.W2 + self.b2  # (batch, vocab_size)

        probs = self.softmax(logits)

        return probs, hidden, embedded

    def create_training_data(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        """Turn ``texts`` into (context, target) id pairs.

        Each text is padded with ``context_size`` '<BOS>' ids and one '<EOS>'
        id, then every window of ``context_size`` ids predicts the next id.

        Returns:
            ``contexts`` of shape (num_examples, context_size) and ``targets``
            of shape (num_examples,), both integer arrays (also when empty).
        """
        bos = self.word_to_idx['<BOS>']
        eos = self.word_to_idx['<EOS>']

        contexts = []
        targets = []
        for text in texts:
            padded = [bos] * self.context_size + self.tokenize(text) + [eos]
            for i in range(len(padded) - self.context_size):
                contexts.append(padded[i:i + self.context_size])
                targets.append(padded[i + self.context_size])

        context_array = np.array(contexts, dtype=int).reshape(-1, self.context_size)
        return context_array, np.array(targets, dtype=int)

    def train_step(self, contexts: np.ndarray, targets: np.ndarray, learning_rate: float = 0.01) -> float:
        """Run one full-batch gradient-descent step on the cross-entropy loss.

        Args:
            contexts: Integer array of shape (batch, context_size).
            targets: Integer array of shape (batch,) with the next-token ids.
            learning_rate: Step size for the parameter update.

        Returns:
            The mean cross-entropy loss measured before the update.

        Raises:
            ValueError: If the batch is empty.
        """
        batch_size = contexts.shape[0]
        if batch_size == 0:
            raise ValueError("train_step requires at least one training example")

        # Forward pass
        probs, hidden, embedded = self.forward(contexts)
        flattened = embedded.reshape(batch_size, -1)

        # Compute loss (cross-entropy)
        rows = np.arange(batch_size)
        loss = -np.mean(np.log(probs[rows, targets] + 1e-10))

        # Backward pass. For softmax + cross-entropy, dL/dlogits = probs - onehot.
        d_logits = probs.copy()
        d_logits[rows, targets] -= 1.0
        d_logits /= batch_size

        d_W2 = hidden.T @ d_logits
        d_b2 = d_logits.sum(axis=0)

        # tanh'(z) = 1 - tanh(z)^2, and `hidden` already holds tanh(z).
        d_pre = (d_logits @ self.W2.T) * (1.0 - hidden ** 2)
        d_W1 = flattened.T @ d_pre
        d_b1 = d_pre.sum(axis=0)

        d_embedded = (d_pre @ self.W1.T).reshape(embedded.shape)

        # Apply updates only after every gradient has been computed from the
        # pre-update weights.
        self.W2 -= learning_rate * d_W2
        self.b2 -= learning_rate * d_b2
        self.W1 -= learning_rate * d_W1
        self.b1 -= learning_rate * d_b1
        # np.add.at accumulates correctly when the same token id appears in
        # several contexts; plain fancy-index assignment would keep only one.
        np.add.at(self.embeddings, contexts, -learning_rate * d_embedded)

        return float(loss)

    def generate(self, seed_text: str, max_length: int = 20) -> str:
        """Sample up to ``max_length`` tokens following ``seed_text``.

        The last ``context_size`` seed tokens form the initial context
        (left-padded with '<BOS>'); an empty seed starts from all '<BOS>'.
        Sampling stops early when '<EOS>' is drawn. Only the newly sampled
        words are returned, joined by spaces.
        """
        bos = self.word_to_idx['<BOS>']

        # Initialize context
        if seed_text:
            context = self.tokenize(seed_text)[-self.context_size:]
        else:
            context = [bos] * self.context_size

        # Pad if necessary
        context = [bos] * (self.context_size - len(context)) + context

        generated = []

        for _ in range(max_length):
            probs, _, _ = self.forward(np.array([context]))

            # Sample next word
            next_idx = int(np.random.choice(self.vocab_size, p=probs[0]))
            next_word = self.idx_to_word[next_idx]

            if next_word == '<EOS>':
                break

            generated.append(next_word)

            # Slide the context window forward by one token
            context = context[1:] + [next_idx]

        return ' '.join(generated)

# Compare neural vs n-gram models
def compare_neural_ngram():
    """Compare neural and n-gram language models on a toy corpus.

    Trains a trigram model and a small feedforward neural model on the same
    sentences, prints samples from both, and shows a 1x3 figure: the neural
    model's loss per epoch, a 2D view of selected word embeddings (their
    first two dimensions), and a log-scale bar chart of vocabulary size,
    training examples and parameter count.
    """
    # Training data
    training_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown fox ran through the forest.",
        "The lazy dog slept under the tree.",
        "Brown foxes are quick and clever animals.",
        "Dogs and foxes are different animals.",
        "The forest was full of trees and animals."
    ]

    # Initialize models
    ngram_model = NGramLanguageModel(n=3, smoothing='laplace')
    ngram_model.train(training_texts)

    neural_model = SimpleNeuralLM(
        vocab_size=100,  # Will be updated
        embedding_dim=16,
        hidden_dim=32,
        context_size=3
    )
    neural_model.build_vocab(training_texts)

    # Create training data for neural model
    contexts, targets = neural_model.create_training_data(training_texts)

    # Train neural model: one full-batch step per epoch
    losses = [neural_model.train_step(contexts, targets) for _ in range(100)]

    # Generate samples
    print("Model Comparison:")
    print("\nN-gram samples:")
    for i in range(3):
        sample = ngram_model.generate(max_length=15)
        print(f"  {i+1}: {sample}")

    print("\nNeural model samples:")
    for i in range(3):
        sample = neural_model.generate("", max_length=15)
        print(f"  {i+1}: {sample}")

    # Plot training loss
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 3, 1)
    plt.plot(losses)
    plt.title('Neural LM Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Cross-Entropy Loss')
    plt.grid(True, alpha=0.3)

    # Visualize embeddings (2D projection)
    plt.subplot(1, 3, 2)
    # Simple 2D projection: keep only the first two embedding dimensions
    embeddings_2d = neural_model.embeddings[:, :2]

    # Plot a few word embeddings
    words_to_plot = ['the', 'fox', 'dog', 'quick', 'brown', 'lazy']
    for word in words_to_plot:
        if word in neural_model.word_to_idx:
            idx = neural_model.word_to_idx[word]
            x, y = embeddings_2d[idx]
            plt.scatter(x, y, s=100)
            plt.annotate(word, (x, y), xytext=(5, 5), textcoords='offset points')

    plt.title('Word Embeddings (2D Projection)')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.grid(True, alpha=0.3)

    # Model comparison
    plt.subplot(1, 3, 3)

    # Total n-gram occurrences seen in training, i.e. the number of
    # (context, next word) examples. Summing counts (rather than counting
    # distinct keys) matches the 'Training Examples' axis label.
    ngram_examples = sum(ngram_model.ngram_counts.values())

    # Count every trainable array so the 'Parameters' bar includes the
    # output layer and biases, not just the embeddings and W1.
    neural_params = sum(
        param.size
        for param in (
            neural_model.embeddings,
            neural_model.W1,
            neural_model.b1,
            neural_model.W2,
            neural_model.b2,
        )
    )

    comparison_data = {
        # The n-gram model has no learned weights; a zero-height bar is not
        # drawn on the log axis.
        'N-gram': [len(ngram_model.vocab), ngram_examples, 0],
        'Neural': [neural_model.vocab_size, contexts.shape[0], neural_params]
    }

    x = np.arange(3)
    width = 0.35

    plt.bar(x - width/2, comparison_data['N-gram'], width, label='N-gram', alpha=0.7)
    plt.bar(x + width/2, comparison_data['Neural'], width, label='Neural', alpha=0.7)

    plt.title('Model Comparison')
    plt.xlabel('Metric')
    plt.ylabel('Count')
    plt.xticks(x, ['Vocab Size', 'Training Examples', 'Parameters'])
    plt.legend()
    plt.yscale('log')

    plt.tight_layout()
    plt.show()

    print("\nKey Differences:")
    print("N-gram Models:")
    print("  + Simple and interpretable")
    print("  + Fast training and inference")
    print("  - Discrete, sparse representations")
    print("  - Limited context window")

    print("\nNeural Models:")
    print("  + Dense, learned representations")
    print("  + Can capture semantic similarities")
    print("  + Scalable to larger contexts")
    print("  - More complex training")
    print("  - Requires more data")

# Run the comparison demo: prints samples from both models and displays the figure.
compare_neural_ngram()

## 3. Autoregressive Modeling

In [None]:
class AutoregressiveModel:
    """Autoregressive language model demonstration.

    A deliberately minimal stand-in for a decoder-only transformer: token and
    learned positional embeddings are summed, passed through tanh, and
    projected to vocabulary logits. Each position is processed independently
    (there are no attention layers), so no position can see later tokens.
    Parameters are randomly initialized and never trained.
    """

    # Words, or one of a small set of punctuation marks, as individual tokens.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b|[.,!?;]')

    def __init__(self, vocab_size: int, d_model: int = 64, max_seq_len: int = 100):
        """Allocate randomly initialized parameters.

        Args:
            vocab_size: Provisional vocabulary size; replaced by ``build_vocab``.
            d_model: Embedding / hidden width.
            max_seq_len: Number of positions the positional table covers.
        """
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_seq_len = max_seq_len

        # Simple transformer-like parameters
        self.token_embedding = np.random.randn(vocab_size, d_model) * 0.1
        self.pos_embedding = np.random.randn(max_seq_len, d_model) * 0.1
        self.output_projection = np.random.randn(d_model, vocab_size) * 0.1

        self.word_to_idx = {}
        self.idx_to_word = {}

    def build_vocab(self, texts: List[str]):
        """Build the vocabulary from ``texts`` and resize vocab-dependent parameters.

        Ids 0-2 are reserved for '<PAD>', '<BOS>', '<EOS>'; remaining words get
        ids in sorted order. Token embeddings and the output projection are
        re-initialized to the new vocabulary size.
        """
        words = set()
        for text in texts:
            words.update(self._TOKEN_PATTERN.findall(text.lower()))

        vocab = ['<PAD>', '<BOS>', '<EOS>'] + sorted(words)
        self.word_to_idx = {word: i for i, word in enumerate(vocab)}
        self.idx_to_word = {i: word for word, i in self.word_to_idx.items()}

        self.vocab_size = len(vocab)
        # Reinitialize with correct vocab size
        self.token_embedding = np.random.randn(self.vocab_size, self.d_model) * 0.1
        self.output_projection = np.random.randn(self.d_model, self.vocab_size) * 0.1

    def create_causal_mask(self, seq_len: int) -> np.ndarray:
        """Return a (seq_len, seq_len) boolean mask, True where attention is allowed.

        Row i (query) may attend to columns 0..i (keys), i.e. the lower
        triangle including the diagonal.
        """
        mask = np.triu(np.ones((seq_len, seq_len)), k=1)
        return mask == 0  # True where attention is allowed

    def forward(self, input_ids: np.ndarray, use_cache: bool = False) -> np.ndarray:
        """Compute next-token logits for every position (simplified).

        Args:
            input_ids: Integer array of shape (batch, seq_len).
            use_cache: Accepted for interface compatibility; currently unused.

        Returns:
            Logits of shape (batch, seq_len, vocab_size).

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding table.
        """
        batch_size, seq_len = input_ids.shape

        max_positions = self.pos_embedding.shape[0]
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum of {max_positions} positions"
            )

        # Token embeddings
        token_embeds = self.token_embedding[input_ids]  # (batch, seq_len, d_model)

        # Positional embeddings
        pos_embeds = self.pos_embedding[:seq_len]  # (seq_len, d_model)

        # Combine embeddings
        hidden_states = token_embeds + pos_embeds[None, :, :]  # (batch, seq_len, d_model)

        # Simple transformation (in real model, this would be transformer layers)
        hidden_states = np.tanh(hidden_states)

        # Output projection
        logits = hidden_states @ self.output_projection  # (batch, seq_len, vocab_size)

        return logits

    def generate_autoregressive(self, prompt: str, max_length: int = 20,
                              temperature: float = 1.0) -> str:
        """Generate text one token at a time, feeding each sample back as input.

        Args:
            prompt: Seed text. Unknown words map to id 0 ('<PAD>'). A prompt
                with no tokens (empty or whitespace) starts from '<BOS>'.
            max_length: Maximum number of tokens to sample.
            temperature: Softmax temperature; higher is more random. A value
                of 0 selects the most likely token at each step (greedy).

        Returns:
            The prompt tokens followed by the sampled tokens, joined by
            spaces. Sampling stops early when '<EOS>' is drawn.

        Raises:
            ValueError: If ``temperature`` is negative.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be non-negative, got {temperature}")

        # Tokenize prompt; fall back to <BOS> so the model always has at
        # least one position to predict from.
        tokens = self._TOKEN_PATTERN.findall(prompt.lower()) if prompt else []
        if tokens:
            input_ids = [self.word_to_idx.get(token, 0) for token in tokens]
        else:
            input_ids = [self.word_to_idx['<BOS>']]

        generated_ids = input_ids.copy()
        eos_id = self.word_to_idx.get('<EOS>', -1)
        window = self.pos_embedding.shape[0]

        for _ in range(max_length):
            # Feed only the most recent tokens that fit the positional table,
            # so long prompts / generations slide rather than crash.
            current_input = np.array([generated_ids[-window:]])

            # Forward pass
            logits = self.forward(current_input)

            # Get logits for next token (last position)
            next_token_logits = logits[0, -1, :]

            if temperature == 0:
                # Greedy decoding: the zero-temperature limit of softmax sampling.
                next_token_id = int(np.argmax(next_token_logits))
            else:
                scaled = next_token_logits / temperature
                probs = np.exp(scaled - np.max(scaled))
                probs = probs / np.sum(probs)
                next_token_id = int(np.random.choice(self.vocab_size, p=probs))

            # Check for EOS
            if next_token_id == eos_id:
                break

            generated_ids.append(next_token_id)

        # Convert back to text
        generated_tokens = [self.idx_to_word.get(idx, '<UNK>') for idx in generated_ids]
        return ' '.join(generated_tokens)

def demonstrate_autoregressive():
    """Illustrate autoregressive generation with a six-panel figure.

    Panels: the causal attention mask, the growing input at each generation
    step, the effect of temperature on a softmax, sampled continuations from
    an untrained model, greedy vs. top-k sampling, and the chain-rule
    factorization of a joint probability. A textual summary is printed last.
    """

    def _softmax(scores):
        # Shift by the max before exponentiating for numerical stability.
        shifted = np.exp(scores - np.max(scores))
        return shifted / np.sum(shifted)

    # Sample data
    corpus = [
        "The sun rises in the east.",
        "Birds fly in the sky.",
        "The ocean is deep and blue.",
        "Mountains are tall and majestic.",
        "Flowers bloom in spring."
    ]

    # Untrained model; only its vocabulary comes from the corpus.
    model = AutoregressiveModel(vocab_size=100)
    model.build_vocab(corpus)

    plt.figure(figsize=(15, 10))

    # --- Panel 1: causal mask for an 8-token sequence ---
    plt.subplot(2, 3, 1)
    plt.imshow(model.create_causal_mask(8), cmap='RdYlBu', aspect='auto')
    plt.title('Causal Attention Mask')
    plt.xlabel('Key Position')
    plt.ylabel('Query Position')
    plt.colorbar()

    # --- Panel 2: token ids visible at each generation step ---
    plt.subplot(2, 3, 2)
    steps = ['the', 'sun', 'rises', 'in', 'the']
    step_ids = [model.word_to_idx.get(token, 0) for token in steps]
    # Row k holds the first k+1 ids, right-padded with 0 to a fixed width.
    step_rows = [
        step_ids[:count] + [0] * (len(steps) - count)
        for count in range(1, len(steps) + 1)
    ]
    plt.imshow(np.array(step_rows), cmap='viridis', aspect='auto')
    plt.title('Autoregressive Generation Steps')
    plt.xlabel('Token Position')
    plt.ylabel('Generation Step')
    plt.colorbar()

    # --- Panel 3: one softmax curve per temperature ---
    plt.subplot(2, 3, 3)
    base_logits = np.array([2.0, 1.5, 1.0, 0.5, 0.2])
    for temp in [0.5, 1.0, 1.5, 2.0]:
        plt.plot(_softmax(base_logits / temp), label=f'T={temp}', marker='o')

    plt.title('Temperature Effect on Sampling')
    plt.xlabel('Token Index')
    plt.ylabel('Probability')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # --- Panel 4: sampled continuations, rendered as text ---
    plt.subplot(2, 3, 4)
    plt.text(0.1, 0.9, "Autoregressive Generation Examples:", fontsize=12, weight='bold')

    text_y = 0.8
    for prompt in ["the", "birds", "ocean"]:
        generated = model.generate_autoregressive(prompt, max_length=8, temperature=1.0)
        plt.text(0.1, text_y, f"'{prompt}' → {generated}", fontsize=10)
        text_y -= 0.15

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.axis('off')

    # --- Panel 5: original vs. greedy vs. top-k distributions ---
    plt.subplot(2, 3, 5)
    full_dist = _softmax(np.array([3.0, 2.0, 1.0, 0.5, 0.1, -0.5, -1.0]))

    # Greedy: all mass on the single most likely token.
    greedy = np.zeros_like(full_dist)
    greedy[np.argmax(full_dist)] = 1.0

    # Top-k (k=3): keep the three most likely tokens and renormalize.
    kept = np.argsort(full_dist)[-3:]
    top_k = np.zeros_like(full_dist)
    top_k[kept] = full_dist[kept]
    top_k = top_k / np.sum(top_k)

    positions = np.arange(len(full_dist))
    width = 0.25

    plt.bar(positions - width, full_dist, width, label='Original', alpha=0.7)
    plt.bar(positions, greedy, width, label='Greedy', alpha=0.7)
    plt.bar(positions + width, top_k, width, label='Top-k (k=3)', alpha=0.7)

    plt.title('Sampling Strategies')
    plt.xlabel('Token Index')
    plt.ylabel('Probability')
    plt.legend()

    # --- Panel 6: P(w1,w2,w3) = P(w1) * P(w2|w1) * P(w3|w1,w2) ---
    plt.subplot(2, 3, 6)
    tokens = "the sun rises".split()

    # Illustrative values for P(the), P(sun|the), P(rises|the,sun)
    conditional = [0.8, 0.6, 0.7]
    joint = np.cumprod(conditional)
    token_axis = range(1, len(tokens) + 1)

    plt.plot(token_axis, conditional, 'o-', label='Conditional P(wi|w<i)', linewidth=2)
    plt.plot(token_axis, joint, 's-', label='Joint P(w1...wi)', linewidth=2)

    plt.title('Autoregressive Factorization')
    plt.xlabel('Token Position')
    plt.ylabel('Probability')
    plt.xticks(token_axis, tokens)
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    summary_lines = (
        "Autoregressive Language Modeling:",
        "\nKey Properties:",
        "- Factorizes joint probability: P(w1...wn) = ∏P(wi|w<i)",
        "- Uses causal masking to prevent future information leakage",
        "- Generates text left-to-right, one token at a time",
        "- Temperature controls randomness in sampling",
        "\nAdvantages:",
        "- Natural for text generation tasks",
        "- Can generate variable-length sequences",
        "- Straightforward training objective",
        "\nApplications:",
        "- GPT family models",
        "- Text completion",
        "- Creative writing",
        "- Code generation",
    )
    for line in summary_lines:
        print(line)

# Run the autoregressive demo: displays the six-panel figure and prints the summary.
demonstrate_autoregressive()