# Advanced Sequence Models: Hands-on Practice
## Based on Deep Learning for Natural Language Processing

**Author**: Generated from Lecture 12  
**Topics**: BiRNN, Seq2Seq, Attention Mechanism, Teacher Forcing  
**Duration**: 2-3 hours hands-on practice

---

### Learning Objectives:
1. Understand and implement Bidirectional RNNs
2. Build Sequence-to-Sequence models with Encoder-Decoder architecture
3. Master Teacher Forcing training strategy
4. Implement Attention Mechanism from scratch
5. Apply practical techniques: batching, masking, and optimization

### Prerequisites:
- Basic understanding of RNNs/LSTMs
- Python programming skills
- Familiarity with PyTorch or TensorFlow


## Part 1: Environment Setup and Imports

In [None]:
# Import necessary libraries
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# Both NumPy's and PyTorch's global generators are seeded: the notebook draws
# synthetic data from NumPy and model weights / test tensors from PyTorch.
np.random.seed(42)
torch.manual_seed(42)

print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
# NOTE(review): `device` is defined here, but the cells shown in this part of the
# notebook never move models or tensors to it, so they run on the default device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# Helper functions for visualization
def plot_attention_weights(attention_weights, input_labels, output_labels, title="Attention Heatmap"):
    """Build a Plotly heatmap figure for a matrix of attention weights.

    Args:
        attention_weights: 2-D array-like used as the heatmap's ``z`` values.
        input_labels: Labels placed along the x axis.
        output_labels: Labels placed along the y axis.
        title: Title shown above the figure.

    Returns:
        The configured ``go.Figure``. It is not shown here; the caller decides
        when to call ``.show()``.
    """
    heatmap = go.Heatmap(
        z=attention_weights,
        x=input_labels,
        y=output_labels,
        colorscale='Blues',
        showscale=True,
    )
    layout_options = dict(
        title=title,
        xaxis_title="Input Sequence",
        yaxis_title="Output Sequence",
        width=700,
        height=500,
    )

    figure = go.Figure(data=heatmap)
    figure.update_layout(**layout_options)
    return figure

def visualize_hidden_states(hidden_states, labels=None):
    """Plot hidden-state activations as a Plotly line chart.

    Args:
        hidden_states: 2-D array-like; it is transposed before plotting, so
            each column of the transposed data becomes one line.
        labels: Optional sequence of x-axis tick labels. Tick positions are
            ``0 .. len(labels) - 1``. ``None`` or an empty sequence leaves the
            default ticks in place.

    Returns:
        The ``plotly`` figure (not shown).
    """
    fig = px.line(hidden_states.T,
                  title="Hidden State Evolution Over Time",
                  labels={'index': 'Hidden Unit', 'value': 'Activation'})
    # Explicit None/length check so array-like labels do not hit an ambiguous
    # truth-value error.
    if labels is not None and len(labels) > 0:
        # Figures expose `update_xaxes` (plural); `update_xaxis` does not exist
        # and raised AttributeError whenever labels were supplied.
        fig.update_xaxes(ticktext=list(labels), tickvals=list(range(len(labels))))
    return fig

## Part 2: Bidirectional RNNs - Core Concepts

### Exercise 1: Understanding Bidirectional Processing

**Concept**: Bidirectional RNNs process sequences in both forward and backward directions, capturing context from both past and future tokens. This is crucial for tasks where the entire context matters.

**Key Formula**:
- Forward: $\overrightarrow{h}_t = \text{RNN}(x_t, \overrightarrow{h}_{t-1})$
- Backward: $\overleftarrow{h}_t = \text{RNN}(x_t, \overleftarrow{h}_{t+1})$
- Output: $y_t = [\overrightarrow{h}_t; \overleftarrow{h}_t]$ (concatenation)


In [None]:
# Exercise 1: Implement a Bidirectional RNN from scratch
class SimpleBiRNN(nn.Module):
    """Bidirectional RNN assembled from two independent one-directional RNNs.

    One ``nn.RNN`` reads the sequence left-to-right, the other reads a
    time-reversed copy. Their per-step states are concatenated and projected
    by a linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # One RNN per direction (both batch-first).
        self.rnn_forward = nn.RNN(input_size, hidden_size, batch_first=True)
        self.rnn_backward = nn.RNN(input_size, hidden_size, batch_first=True)

        # The projection sees both directions side by side, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Run both directions over ``x`` (batch, seq_len, input_size).

        Returns:
            A pair ``(output, out_bi)``: the projected per-step outputs
            (batch, seq_len, output_size) and the concatenated hidden states
            (batch, seq_len, 2 * hidden_size).
        """
        left_to_right, _ = self.rnn_forward(x)

        # Feed the time-reversed sequence, then flip the result back so that
        # index t in both halves refers to the same input position.
        reversed_states, _ = self.rnn_backward(torch.flip(x, dims=[1]))
        right_to_left = torch.flip(reversed_states, dims=[1])

        out_bi = torch.cat([left_to_right, right_to_left], dim=-1)
        return self.fc(out_bi), out_bi

# Create synthetic sequence data for demonstration
def create_sequence_data(n_samples=100, seq_length=10, n_features=5):
    """Generate a synthetic binary-classification dataset of random sequences.

    Inputs are standard-normal float32 values drawn from NumPy's global RNG.
    A sample's label is 1.0 when the sum of all features at the even time
    steps (0, 2, 4, ...) is positive, otherwise 0.0.

    Returns:
        ``(X, y)`` as torch tensors with shapes
        (n_samples, seq_length, n_features) and (n_samples,).
    """
    features = np.random.randn(n_samples, seq_length, n_features).astype(np.float32)
    even_step_total = features[:, ::2, :].sum(axis=(1, 2))
    labels = (even_step_total > 0).astype(np.float32)
    return torch.tensor(features), torch.tensor(labels)

# Generate data and create model
# The two calls draw from NumPy's global RNG in this order, so swapping them
# changes which samples end up in the train and test sets.
X_train, y_train = create_sequence_data(200, 10, 5)
X_test, y_test = create_sequence_data(50, 10, 5)

# input_size=5 matches the n_features used above; output_size=1 gives a single logit per step.
model_birnn = SimpleBiRNN(input_size=5, hidden_size=16, output_size=1)
print("BiRNN Model Architecture:")
print(model_birnn)
print(f"\nTotal parameters: {sum(p.numel() for p in model_birnn.parameters())}")

In [None]:
# Train the BiRNN model
# The model emits raw logits, so BCEWithLogitsLoss is used and no sigmoid is
# applied before the loss.
optimizer = optim.Adam(model_birnn.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Training loop
losses = []  # one loss value per epoch, plotted below
model_birnn.train()

for epoch in range(50):
    # Full-batch training: each epoch is a single optimizer step over all of X_train.
    optimizer.zero_grad()
    output, hidden_states = model_birnn(X_train)
    # Only the single logit at the last time step is used as the sequence-level
    # prediction. At that position the backward RNN has processed just the
    # final token, while the forward RNN has seen the whole sequence.
    loss = criterion(output[:, -1, 0], y_train)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Visualize training progress
# The pyplot calls below rely on matplotlib's implicit "current axes" state,
# so their order matters.
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(losses)
plt.title("BiRNN Training Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.grid(True)

# Analyze hidden states
model_birnn.eval()
with torch.no_grad():
    _, hidden_bi = model_birnn(X_test[:5])
    
plt.subplot(1, 2, 2)
# Transposed so that rows are hidden units and columns are time steps (first test sample only).
plt.imshow(hidden_bi[0].numpy().T, aspect='auto', cmap='coolwarm')
plt.colorbar()
plt.title("Bidirectional Hidden States (Sample 1)")
plt.xlabel("Time Step")
plt.ylabel("Hidden Unit")
plt.tight_layout()
plt.show()

# Accuracy thresholds sigmoid(logit) at 0.5 on the last time step. Note that
# this forward pass runs outside torch.no_grad().
print(f"\nFinal test accuracy: {((torch.sigmoid(model_birnn(X_test)[0][:, -1, 0]) > 0.5) == y_test).float().mean():.3f}")

### 🎯 Your Turn: BiRNN Analysis
1. Modify the `SimpleBiRNN` to use LSTM instead of vanilla RNN
2. Compare the performance difference
3. Visualize how forward and backward states capture different patterns
4. Try different sequence lengths and observe the impact


## Part 3: Sequence-to-Sequence Architecture

### Exercise 2: Building an Encoder-Decoder Model

**Concept**: Seq2Seq models consist of an encoder that compresses input sequences into a context vector, and a decoder that generates output sequences from this context.

**Architecture**:
- Encoder: Processes input sequence → Context vector
- Context Vector: Fixed-size representation of entire input
- Decoder: Generates output sequence from context


In [None]:
# Exercise 2: Complete Seq2Seq Implementation
class Encoder(nn.Module):
    """LSTM encoder that summarises a source sequence into its final states."""

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Batch-first LSTM; the gated cell is used for better gradient flow.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch, seq_len, input_size).

        Returns:
            ``(hidden, cell)`` — the LSTM's final hidden and cell states, which
            act as the context handed to the decoder. The per-step outputs are
            discarded.
        """
        _, final_state = self.lstm(x)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """LSTM decoder that advances from given states and projects to output size."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder from the given states.

        Args:
            x: Decoder input, (batch, 1, input_size) for a single time step.
            hidden: Hidden state to start from.
            cell: Cell state to start from.

        Returns:
            ``(prediction, hidden, cell)``: the projected LSTM output and the
            updated hidden and cell states.
        """
        state_in = (hidden, cell)
        lstm_out, state_out = self.lstm(x, state_in)
        hidden, cell = state_out
        return self.fc(lstm_out), hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder step by step.

    The encoder's final (hidden, cell) states initialise the decoder, which
    is then run ``target_len`` times starting from an all-zero start token.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target_len, teacher_forcing_ratio=0.5):
        """Decode ``target_len`` steps conditioned on ``source``.

        Args:
            source: Tensor of shape (batch, src_len, feature_dim).
            target_len: Number of decoder steps to run (must be >= 1, since
                the step outputs are concatenated at the end).
            teacher_forcing_ratio: Per-step probability of feeding a slice of
                ``source`` instead of the decoder's own prediction. It only
                has an effect while the module is in training mode.

        Returns:
            Tensor of stacked decoder predictions, (batch, target_len, out_dim).

        Notes:
            * The "teacher" signal is taken from ``source`` (step ``t + 1``,
              clamped to the last source step); this method takes no separate
              target sequence.
            * When the prediction is fed back, the decoder's output width must
              equal its input width.
            * One NumPy random draw is consumed per step, in both training and
              eval mode.
        """
        batch_size = source.size(0)
        last_source_step = source.size(1) - 1
        outputs = []

        # Encode the source sequence
        hidden, cell = self.encoder(source)

        # All-zero start token. `new_zeros` inherits the device and dtype of
        # `source`; a plain `torch.zeros` always lands on the default device
        # in the default dtype and fails for CUDA or double-precision models.
        decoder_input = source.new_zeros(batch_size, 1, source.size(2))

        for t in range(target_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs.append(output)

            # Teacher forcing: feed a source slice or the model's own prediction.
            use_teacher_forcing = np.random.random() < teacher_forcing_ratio
            if use_teacher_forcing and self.training:
                step = min(t + 1, last_source_step)
                decoder_input = source[:, step:step + 1, :]
            else:
                decoder_input = output

        return torch.cat(outputs, dim=1)

# Create and test the Seq2Seq model
input_dim = 10
hidden_dim = 32
output_dim = 10  # same as input_dim so the decoder's prediction can be fed back as its next input
seq_len = 15

encoder = Encoder(input_dim, hidden_dim)
decoder = Decoder(input_dim, hidden_dim, output_dim)
seq2seq_model = Seq2Seq(encoder, decoder)

print("Seq2Seq Model Components:")
print(f"Encoder parameters: {sum(p.numel() for p in encoder.parameters())}")
print(f"Decoder parameters: {sum(p.numel() for p in decoder.parameters())}")
print(f"Total parameters: {sum(p.numel() for p in seq2seq_model.parameters())}")

# Test with dummy data
# The model is still in its default training mode here, so the default
# teacher_forcing_ratio of 0.5 is active and the output depends on NumPy's RNG.
test_input = torch.randn(2, seq_len, input_dim)
test_output = seq2seq_model(test_input, target_len=seq_len)
print(f"\nInput shape: {test_input.shape}")
print(f"Output shape: {test_output.shape}")

In [None]:
# Visualize the information bottleneck problem
def visualize_bottleneck():
    """Build a two-panel Plotly figure illustrating the context-vector bottleneck.

    The left panel plots an illustrative "original information" size
    (512 units per sequence position) against the fixed context size of 256
    for several sequence lengths. The right panel shows their ratio as bars.
    All numbers are synthetic; no model is run.

    Returns:
        The assembled figure (not shown).
    """
    hidden_size = 256
    seq_lengths = [5, 10, 20, 50, 100]

    # Illustrative quantities: the input grows linearly with length, the context does not.
    info_original = [512 * length for length in seq_lengths]
    info_compressed = [hidden_size for _ in seq_lengths]
    compression_ratio = [full / fixed for full, fixed in zip(info_original, info_compressed)]

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Information Compression", "Context Vector Capacity"),
    )

    # Left panel: the two information curves.
    line_specs = [
        (info_original, 'Original Information', dict(color='blue', width=2)),
        (info_compressed, 'Context Vector', dict(color='red', width=2, dash='dash')),
    ]
    for values, trace_name, line_style in line_specs:
        fig.add_trace(
            go.Scatter(x=seq_lengths, y=values,
                       mode='lines+markers', name=trace_name,
                       line=line_style),
            row=1, col=1,
        )

    # Right panel: compression ratio per sequence length.
    fig.add_trace(
        go.Bar(x=seq_lengths, y=compression_ratio,
               marker_color='orange',
               name='Compression Ratio'),
        row=1, col=2,
    )

    fig.update_layout(height=400, showlegend=True,
                      title_text="Seq2Seq Information Bottleneck Problem")
    y_titles = ("Information (dims)", "Compression Ratio")
    for col, y_title in enumerate(y_titles, start=1):
        fig.update_xaxes(title_text="Sequence Length", row=1, col=col)
        fig.update_yaxes(title_text=y_title, row=1, col=col)

    return fig

# Render the synthetic bottleneck illustration and print the takeaway.
fig_bottleneck = visualize_bottleneck()
fig_bottleneck.show()

print("🔍 Key Insight: As sequence length increases, the compression ratio grows dramatically.")
print("This is why attention mechanism was invented - to avoid this bottleneck!")

## Part 4: Teacher Forcing Strategy

### Exercise 3: Comparing Teacher Forcing vs Autoregressive Training

**Concept**: Teacher Forcing feeds ground-truth outputs during training, while autoregressive mode uses the model's own predictions. This creates a trade-off between training speed and inference performance.


In [None]:
# Exercise 3: Teacher Forcing Experiment
class TeacherForcingExperiment:
    """Train one model under several teacher-forcing strategies and compare losses.

    Supported strategies:
        * ``'always'``    - teacher-forcing ratio 1.0
        * ``'never'``     - ratio 0.0
        * ``'scheduled'`` - linear decay from 1.0 to 0.0 over ``max_epochs``
        * anything else   - fixed ratio 0.5

    Attributes:
        train_losses: dict mapping strategy name to the per-epoch losses
            recorded by the most recent ``compare_strategies`` run.
    """

    def __init__(self, model, criterion, optimizer):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.train_losses = {'always': [], 'never': [], 'scheduled': []}

    @staticmethod
    def _teacher_forcing_ratio(strategy, epoch, max_epochs):
        """Map a strategy name and epoch to a teacher-forcing ratio."""
        if strategy == 'always':
            return 1.0
        if strategy == 'never':
            return 0.0
        if strategy == 'scheduled':
            # Linear decay from 1.0 to 0.0
            return max(0, 1.0 - epoch / max_epochs)
        return 0.5

    def _reset_for_new_run(self):
        """Re-initialise model weights and drop accumulated optimizer state."""
        for layer in self.model.modules():
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()
        # Without this, per-parameter optimizer statistics (e.g. Adam moments)
        # from the previous strategy would leak into the next one and bias
        # the comparison.
        self.optimizer.state.clear()

    def train_epoch(self, data_loader, strategy='always', epoch=0, max_epochs=100):
        """Train for one pass over ``data_loader``.

        Args:
            data_loader: Iterable of ``(inputs, targets)`` batches that also
                supports ``len()``. ``targets`` holds class indices with the
                sequence on dim 1.
            strategy: Teacher-forcing strategy name (see class docstring).
            epoch: Current epoch index, used by the ``'scheduled'`` strategy.
            max_epochs: Epoch count over which ``'scheduled'`` decays to 0.

        Returns:
            Mean batch loss for the epoch.
        """
        # Make sure the model is in training mode. A model left in eval mode
        # (e.g. after an evaluation cell) may ignore the teacher-forcing ratio.
        self.model.train()
        tf_ratio = self._teacher_forcing_ratio(strategy, epoch, max_epochs)
        epoch_loss = 0

        for inputs, targets in data_loader:
            self.optimizer.zero_grad()

            # Forward pass: the model is called as model(inputs, target_len, tf_ratio).
            outputs = self.model(inputs, targets.size(1), tf_ratio)
            # Flatten (batch, seq, classes) / (batch, seq) for the criterion.
            loss = self.criterion(outputs.reshape(-1, outputs.size(-1)),
                                  targets.reshape(-1))

            # Backward pass, with gradient-norm clipping at 1.0.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()

            epoch_loss += loss.item()

        return epoch_loss / len(data_loader)

    def compare_strategies(self, data_loader, n_epochs=50):
        """Train from scratch under each strategy and collect the loss curves.

        The model's parameters and the optimizer's state are reset before
        every strategy, so each run starts fresh.

        Returns:
            dict mapping ``'always'``, ``'never'`` and ``'scheduled'`` to a
            list of per-epoch mean losses. The same curves are stored in
            ``self.train_losses``.
        """
        strategies = ['always', 'never', 'scheduled']
        results = {}

        for strategy in strategies:
            print(f"\nTraining with {strategy} teacher forcing...")
            self._reset_for_new_run()

            strategy_losses = []
            for epoch in range(n_epochs):
                loss = self.train_epoch(data_loader, strategy, epoch, n_epochs)
                strategy_losses.append(loss)
                if epoch % 10 == 0:
                    print(f"  Epoch {epoch}: Loss = {loss:.4f}")

            results[strategy] = strategy_losses
            self.train_losses[strategy] = list(strategy_losses)

        return results

# Create synthetic sequence data for teacher forcing experiment
class SyntheticSeqDataset(Dataset):
    """Synthetic "reverse the sequence" dataset.

    Each sample is a random sequence of token ids in ``[0, vocab_size)``
    paired with the same sequence reversed.

    Args:
        n_samples: Number of (sequence, target) pairs to generate.
        seq_len: Length of every sequence.
        vocab_size: Number of distinct token ids, and the width of the
            one-hot encoding returned by ``__getitem__``.
    """

    def __init__(self, n_samples=1000, seq_len=10, vocab_size=20):
        # Remembered so the one-hot width follows the vocabulary size rather
        # than a hard-coded 20.
        self.vocab_size = vocab_size
        self.data = []
        for _ in range(n_samples):
            # Create sequences with simple pattern (reverse sequence)
            seq = torch.randint(0, vocab_size, (seq_len,))
            target = torch.flip(seq, [0])
            self.data.append((seq, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(one_hot_sequence, target_ids)`` for sample ``idx``.

        The input is a float tensor of shape (seq_len, vocab_size); the
        target is an integer tensor of class indices, shape (seq_len,).
        """
        seq, target = self.data[idx]
        seq_onehot = F.one_hot(seq, num_classes=self.vocab_size).float()
        return seq_onehot, target

# Setup experiment
dataset = SyntheticSeqDataset(n_samples=500, seq_len=8, vocab_size=20)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Create a new seq2seq model for this experiment
# Decoder input and output widths are both 20, so a prediction can be fed back
# as the next decoder input when teacher forcing is off.
encoder_tf = Encoder(20, 64)
decoder_tf = Decoder(20, 64, 20)
model_tf = Seq2Seq(encoder_tf, decoder_tf)

# NOTE: these rebind the notebook-level `criterion` and `optimizer` that the
# BiRNN training cell defined earlier.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_tf.parameters(), lr=0.001)

# Run experiment
# Only the experiment object is built here; `compare_strategies` is not called in this cell.
experiment = TeacherForcingExperiment(model_tf, criterion, optimizer)
print("Starting Teacher Forcing Comparison Experiment...")
print("=" * 50)

In [None]:
# Visualize teacher forcing comparison results
# Note: This is a simulation for demonstration
# The curves below are synthetic (exponential decay plus Gaussian noise); they
# are not produced by the `experiment` object.
np.random.seed(42)  # reseeds NumPy's global RNG so the simulated noise is repeatable
n_epochs = 50
epochs = np.arange(n_epochs)

# Simulate training curves for different strategies
# Each curve is amplitude * exp(-epoch / time_constant) + floor + noise.
always_tf = 2.5 * np.exp(-epochs/10) + 0.1 + np.random.normal(0, 0.05, n_epochs)
never_tf = 2.5 * np.exp(-epochs/20) + 0.3 + np.random.normal(0, 0.08, n_epochs)
scheduled_tf = 2.5 * np.exp(-epochs/15) + 0.15 + np.random.normal(0, 0.06, n_epochs)

fig = go.Figure()

fig.add_trace(go.Scatter(x=epochs, y=always_tf, mode='lines',
                         name='Always (100% TF)', line=dict(color='green', width=2)))
fig.add_trace(go.Scatter(x=epochs, y=never_tf, mode='lines',
                         name='Never (0% TF)', line=dict(color='red', width=2)))
fig.add_trace(go.Scatter(x=epochs, y=scheduled_tf, mode='lines',
                         name='Scheduled (Linear Decay)', line=dict(color='blue', width=2)))

fig.update_layout(title='Teacher Forcing Strategy Comparison',
                 xaxis_title='Epoch',
                 yaxis_title='Training Loss',
                 hovermode='x unified',
                 width=800, height=400)
fig.show()

print("\n📊 Analysis:")
print("- Always TF: Fastest convergence but potential exposure bias")
print("- Never TF: Slower training, but better matches inference")
print("- Scheduled: Best of both worlds - good compromise")

### 🎯 Your Turn: Teacher Forcing Analysis
1. Implement exponential decay for teacher forcing ratio instead of linear
2. Try inverse sigmoid decay: `tf_ratio = k / (k + exp(epoch/k))` where k is a constant
3. Measure the difference between training and inference performance for each strategy
4. Plot the teacher forcing ratio over time for different schedules


## Part 5: Attention Mechanism Implementation

### Exercise 4: Building Attention from Scratch

**Concept**: Attention allows the decoder to focus on different parts of the input sequence at each decoding step, solving the information bottleneck problem.

**Key Formulas**:
1. Score: $e_{t,i} = \text{score}(s_t, h_i)$
2. Attention weights: $\alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_j \exp(e_{t,j})}$
3. Context: $c_t = \sum_i \alpha_{t,i} \cdot h_i$


In [None]:
# Exercise 4: Complete Attention Implementation
class BahdanauAttention(nn.Module):
    """Bahdanau (additive) attention.

    Scores each encoder position ``i`` against the decoder state ``s`` with
    ``v_a^T tanh(W_a h_i + U_a s)``, normalises the scores with a softmax over
    the sequence, and returns the weighted sum of the encoder outputs.
    """

    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.hidden_size = hidden_size

        # Learnable parameters for attention
        self.W_a = nn.Linear(hidden_size, hidden_size)  # applied to encoder outputs
        self.U_a = nn.Linear(hidden_size, hidden_size)  # applied to the decoder state
        self.v_a = nn.Linear(hidden_size, 1)            # collapses each position to a scalar score

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Compute the context vector and attention weights.

        Args:
            decoder_hidden: (batch, hidden_size)
            encoder_outputs: (batch, seq_len, hidden_size)
            mask: Optional (batch, seq_len) tensor, truthy for positions that
                may be attended to and falsy for padding. Masked positions get
                zero weight. A row with no valid position produces NaN weights.

        Returns:
            ``(context, attention_weights)`` with shapes (batch, hidden_size)
            and (batch, seq_len). The weights sum to 1 along the sequence.
        """
        # Project the decoder state once and let broadcasting spread it over
        # the sequence, rather than expanding it to seq_len copies first.
        query = self.U_a(decoder_hidden).unsqueeze(1)   # (batch, 1, hidden)
        keys = self.W_a(encoder_outputs)                # (batch, seq_len, hidden)

        # Calculate attention scores
        scores = self.v_a(torch.tanh(keys + query)).squeeze(-1)

        # Exclude padded positions before normalising.
        if mask is not None:
            scores = scores.masked_fill(~mask.bool(), float('-inf'))

        # Apply softmax to get attention weights
        attention_weights = F.softmax(scores, dim=1)

        # Context vector: (batch, 1, seq_len) @ (batch, seq_len, hidden) -> (batch, hidden)
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, attention_weights

class AttentionDecoder(nn.Module):
    """Single-step LSTM decoder that attends over encoder outputs.

    Each call embeds the previous token, computes an attention context from
    the top-layer hidden state, feeds ``[embedding; context]`` to the LSTM and
    predicts from ``[lstm_output; context]``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Token ids index a table of `output_size` rows with `input_size`-wide vectors.
        self.embedding = nn.Embedding(output_size, input_size)
        self.attention = BahdanauAttention(hidden_size)
        # The LSTM consumes the embedding and the context vector side by side.
        self.lstm = nn.LSTM(input_size + hidden_size, hidden_size, batch_first=True)
        # The classifier sees the LSTM output concatenated with the context.
        self.out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            input_token: Integer token ids, one per batch element.
            hidden: LSTM hidden state; its last layer is the attention query.
            cell: LSTM cell state.
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.

        Returns:
            ``(prediction, hidden, cell, attention_weights)``.
        """
        token_vec = self.embedding(input_token)

        # Attention is queried with the pre-step hidden state of the last layer.
        context, attention_weights = self.attention(hidden[-1], encoder_outputs)

        # Both pieces get a length-1 time axis before being joined on the feature axis.
        step_input = torch.cat((token_vec.unsqueeze(1), context.unsqueeze(1)), dim=2)
        lstm_out, (hidden, cell) = self.lstm(step_input, (hidden, cell))

        features = torch.cat((lstm_out.squeeze(1), context), dim=1)
        prediction = self.out(features)

        return prediction, hidden, cell, attention_weights

# Test attention mechanism
# NOTE: rebinds the notebook-level `hidden_size`, `seq_len` and `batch_size`.
hidden_size = 128
seq_len = 10
batch_size = 2

attention = BahdanauAttention(hidden_size)
# Random stand-ins for a decoder state and a sequence of encoder outputs.
decoder_hidden = torch.randn(batch_size, hidden_size)
encoder_outputs = torch.randn(batch_size, seq_len, hidden_size)

context, weights = attention(decoder_hidden, encoder_outputs)

print(f"Encoder outputs shape: {encoder_outputs.shape}")
print(f"Decoder hidden shape: {decoder_hidden.shape}")
print(f"Context vector shape: {context.shape}")
print(f"Attention weights shape: {weights.shape}")
print(f"\nAttention weights sum: {weights.sum(dim=1)}")  # Should be 1.0 for each batch

In [None]:
# Visualize attention weights over time
def demonstrate_attention_alignment():
    """Build an annotated heatmap of a hand-written attention alignment.

    The weights are illustrative values for a toy English-to-French example
    ("I love AI"); no model is involved. Each row is one target token and
    sums to 1 across the source tokens.

    Returns:
        The Plotly figure (not shown).
    """
    source_tokens = ['I', 'love', 'AI', '<PAD>', '<PAD>']
    target_tokens = ['Je', 'aime', "l'IA", '<EOS>']

    # One row per target token, one column per source token.
    alignment_rows = [
        [0.8, 0.1, 0.05, 0.03, 0.02],    # Je -> I
        [0.1, 0.75, 0.1, 0.03, 0.02],    # aime -> love
        [0.05, 0.1, 0.8, 0.03, 0.02],    # l'IA -> AI
        [0.02, 0.02, 0.06, 0.45, 0.45],  # <EOS> -> padding
    ]
    attention_matrix = np.array(alignment_rows)

    # Heatmap with the rounded weight printed inside each cell.
    heatmap = go.Heatmap(
        z=attention_matrix,
        x=source_tokens,
        y=target_tokens,
        colorscale='Blues',
        text=np.round(attention_matrix, 2),
        texttemplate='%{text}',
        showscale=True,
    )
    layout_options = dict(
        title='Attention Alignment in Translation',
        xaxis_title='Source (English)',
        yaxis_title='Target (French)',
        width=600,
        height=400,
    )

    fig = go.Figure(data=heatmap)
    fig.update_layout(**layout_options)
    return fig

# Render the hand-written alignment example and print the takeaways.
attention_fig = demonstrate_attention_alignment()
attention_fig.show()

print("\n🎯 Key Observations:")
print("1. Attention creates soft alignment between source and target")
print("2. Each output token can attend to all input tokens")
print("3. The model learns which inputs are most relevant for each output")

In [None]:
# Compare different attention mechanisms
class LuongAttention(nn.Module):
    """Luong (multiplicative) attention with three scoring variants.

    Methods:
        * ``'dot'``     - ``score = h_s . h_t`` (no parameters)
        * ``'general'`` - ``score = (W_a h_s) . h_t``
        * ``'concat'``  - ``score = v_a^T tanh(W_a [h_s; h_t])``

    Raises:
        ValueError: If ``method`` is not one of the names above.
    """

    _METHODS = ('dot', 'general', 'concat')

    def __init__(self, hidden_size, method='dot'):
        super(LuongAttention, self).__init__()
        # Fail fast: an unknown method used to be accepted here and only
        # surfaced later in forward() as an unbound `scores` variable.
        if method not in self._METHODS:
            raise ValueError(
                f"Unknown attention method {method!r}; expected one of {self._METHODS}"
            )
        self.method = method
        self.hidden_size = hidden_size

        if method == 'general':
            self.W_a = nn.Linear(hidden_size, hidden_size, bias=False)
        elif method == 'concat':
            self.W_a = nn.Linear(hidden_size * 2, hidden_size)
            self.v_a = nn.Linear(hidden_size, 1)

    def forward(self, decoder_hidden, encoder_outputs, mask=None):
        """Compute the context vector and attention weights.

        Args:
            decoder_hidden: (batch, hidden_size)
            encoder_outputs: (batch, seq_len, hidden_size)
            mask: Optional (batch, seq_len) tensor, truthy for positions that
                may be attended to and falsy for padding. Masked positions get
                zero weight. A row with no valid position produces NaN weights.

        Returns:
            ``(context, attention_weights)`` with shapes (batch, hidden_size)
            and (batch, seq_len).
        """
        if self.method == 'dot':
            # Simple dot product
            scores = torch.bmm(encoder_outputs, decoder_hidden.unsqueeze(2)).squeeze(2)
        elif self.method == 'general':
            # General: h_t^T W_a h_s
            scores = torch.bmm(self.W_a(encoder_outputs), decoder_hidden.unsqueeze(2)).squeeze(2)
        else:
            # Concat method (similar to Bahdanau): pair every encoder state
            # with a copy of the decoder state.
            seq_len = encoder_outputs.size(1)
            decoder_hidden_exp = decoder_hidden.unsqueeze(1).expand(-1, seq_len, -1)
            concat = torch.cat([encoder_outputs, decoder_hidden_exp], dim=2)
            scores = self.v_a(torch.tanh(self.W_a(concat))).squeeze(2)

        # Exclude padded positions before normalising.
        if mask is not None:
            scores = scores.masked_fill(~mask.bool(), float('-inf'))

        # Apply softmax
        attention_weights = F.softmax(scores, dim=1)

        # Calculate context
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)

        return context, attention_weights

# Compare different attention types
# NOTE: rebinds the notebook-level `hidden_size`, `batch_size`, `seq_len`,
# `attention`, `decoder_hidden`, `encoder_outputs`, `context` and `weights`.
methods = ['dot', 'general', 'concat']
hidden_size = 64
batch_size = 4
seq_len = 8

print("Comparing Attention Mechanisms:\n" + "="*40)

for method in methods:
    # Each variant is freshly initialised and given its own random inputs, so
    # the printed statistics come from untrained weights on different data.
    attention = LuongAttention(hidden_size, method=method)
    decoder_hidden = torch.randn(batch_size, hidden_size)
    encoder_outputs = torch.randn(batch_size, seq_len, hidden_size)
    
    context, weights = attention(decoder_hidden, encoder_outputs)
    
    print(f"\n{method.upper()} Attention:")
    print(f"  Parameters: {sum(p.numel() for p in attention.parameters())}")
    print(f"  Context shape: {context.shape}")
    print(f"  Weights variance: {weights.var():.4f}")
    # Largest weight per batch row, averaged over the batch.
    print(f"  Max attention: {weights.max(dim=1)[0].mean():.4f}")

## Part 6: Practical Implementation Techniques

### Exercise 5: Batching and Masking

**Concept**: Efficient training requires batching sequences of different lengths and properly masking padded positions.


In [None]:
# Exercise 5: Implement batching with proper masking
class PaddedBatch:
    """Static helpers for padding variable-length sequences and building masks."""

    @staticmethod
    def create_padding_mask(lengths, max_len=None):
        """Create a boolean padding mask for variable-length sequences.

        Args:
            lengths: Sequence (or 1-D tensor) of valid lengths, one per row.
            max_len: Mask width. Defaults to the largest entry of ``lengths``.

        Returns:
            Bool tensor of shape (len(lengths), max_len); ``True`` marks a
            valid position, ``False`` marks padding.
        """
        lengths_t = torch.as_tensor(lengths, dtype=torch.long)
        if max_len is None:
            max_len = int(lengths_t.max())

        # Position j of row i is valid exactly when j < lengths[i]. One
        # broadcast comparison replaces the per-row Python loop.
        positions = torch.arange(max_len, device=lengths_t.device)
        return positions.unsqueeze(0) < lengths_t.unsqueeze(1)

    @staticmethod
    def create_look_ahead_mask(seq_len):
        """Create a causal mask for autoregressive generation.

        Returns:
            Bool tensor (seq_len, seq_len). Entry [q, k] is ``True`` when
            query position q may attend to key position k, i.e. k <= q.
        """
        # Ones strictly above the diagonal mark future positions; inverting
        # that gives the allowed (lower-triangular) region.
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
        return mask == 0

    @staticmethod
    def pad_sequences(sequences, pad_value=0):
        """Right-pad sequences to a common length.

        Args:
            sequences: Non-empty list of 1-D sequences (lists or tensors).
            pad_value: Value written into the padded tail.

        Returns:
            ``(padded, lengths)``: a (n_sequences, max_len) tensor and the
            list of original lengths.

        Raises:
            ValueError: If ``sequences`` is empty.
        """
        tensors = [torch.as_tensor(seq) for seq in sequences]
        lengths = [len(seq) for seq in sequences]
        max_len = max(lengths)

        # Choose a dtype wide enough for the pad value and every non-empty
        # sequence. Allocating from the pad value alone (int64 for the default
        # 0) silently truncated float sequences to integers.
        dtype = torch.as_tensor(pad_value).dtype
        for tensor in tensors:
            if tensor.numel() > 0:
                dtype = torch.promote_types(dtype, tensor.dtype)

        padded = torch.full((len(sequences), max_len), pad_value, dtype=dtype)
        for i, tensor in enumerate(tensors):
            padded[i, :lengths[i]] = tensor

        return padded, lengths

# Demonstrate padding and masking
# Four integer sequences of lengths 3, 5, 2 and 4.
sequences = [
    [1, 2, 3],
    [4, 5, 6, 7, 8],
    [9, 10],
    [11, 12, 13, 14]
]

padded_seqs, seq_lengths = PaddedBatch.pad_sequences(sequences)
padding_mask = PaddedBatch.create_padding_mask(seq_lengths)
# The causal mask is square, sized by the padded sequence length.
lookahead_mask = PaddedBatch.create_look_ahead_mask(padded_seqs.size(1))

print("Original sequences:")
for i, seq in enumerate(sequences):
    print(f"  Seq {i+1}: {seq}")

print(f"\nPadded batch shape: {padded_seqs.shape}")
print(f"Padded sequences:\n{padded_seqs}")

# Masks are boolean; they are printed as 0/1 integers for readability.
print(f"\nPadding mask shape: {padding_mask.shape}")
print(f"Padding mask:\n{padding_mask.int()}")

print(f"\nLook-ahead mask shape: {lookahead_mask.shape}")
print(f"Look-ahead mask:\n{lookahead_mask.int()}")

In [None]:
# Visualize different mask types
# Uses `padded_seqs`, `padding_mask` and `lookahead_mask` from the previous cell.
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=('Padded Sequences', 'Padding Mask', 'Look-ahead Mask')
)

# Padded sequences heatmap
fig.add_trace(
    go.Heatmap(z=padded_seqs.numpy(), colorscale='Viridis', showscale=False),
    row=1, col=1
)

# Padding mask
# Boolean masks are converted to 0/1 integers before plotting.
fig.add_trace(
    go.Heatmap(z=padding_mask.int().numpy(), colorscale='RdBu', showscale=False),
    row=1, col=2
)

# Look-ahead mask
fig.add_trace(
    go.Heatmap(z=lookahead_mask.int().numpy(), colorscale='RdBu', showscale=False),
    row=1, col=3
)

fig.update_layout(
    title_text="Batching and Masking Visualization",
    height=350,
    showlegend=False
)

fig.update_xaxes(title_text="Position", row=1, col=1)
fig.update_xaxes(title_text="Position", row=1, col=2)
fig.update_xaxes(title_text="Key Position", row=1, col=3)

fig.update_yaxes(title_text="Batch", row=1, col=1)
fig.update_yaxes(title_text="Batch", row=1, col=2)
fig.update_yaxes(title_text="Query Position", row=1, col=3)

fig.show()

print("\n💡 Masking Best Practices:")
print("1. Always mask padding tokens in loss calculation")
print("2. Use look-ahead mask for autoregressive models")
print("3. Combine masks when needed: combined = padding_mask & lookahead_mask")
print("4. Apply masks before softmax: scores.masked_fill_(~mask, -inf)")

## Part 7: Advanced Topics

### Exercise 6: Scheduled Sampling Implementation

**Concept**: Gradually transition from teacher forcing to student forcing during training to reduce exposure bias.


In [None]:
# Exercise 6: Implement scheduled sampling
class ScheduledSampler:
    """Scheduling strategies for the teacher-forcing probability.

    Every method maps an epoch index to a probability of using teacher
    forcing during that epoch.
    """

    @staticmethod
    def _progress(epoch, max_epochs):
        """Fraction of training completed, clamped to [0, 1]."""
        return min(max(epoch / max_epochs, 0.0), 1.0)

    @staticmethod
    def linear_schedule(epoch, max_epochs, initial=1.0, final=0.0):
        """Linear interpolation from ``initial`` (epoch 0) to ``final``.

        Progress is clamped, so epochs beyond ``max_epochs`` stay at ``final``
        instead of overshooting (previously this could go negative).
        ``max_epochs`` must be non-zero.
        """
        progress = ScheduledSampler._progress(epoch, max_epochs)
        return initial + (final - initial) * progress

    @staticmethod
    def exponential_schedule(epoch, decay_rate=0.99, min_rate=0.0):
        """Exponential decay ``decay_rate ** epoch``, floored at ``min_rate``."""
        return max(min_rate, decay_rate ** epoch)

    @staticmethod
    def inverse_sigmoid_schedule(epoch, k=5):
        """Inverse sigmoid decay ``k / (k + exp(epoch / k))``.

        Starts at ``k / (k + 1)`` for epoch 0 and decays towards 0. A larger
        ``k`` keeps the probability high for longer.
        """
        return k / (k + np.exp(epoch / k))

    @staticmethod
    def cosine_schedule(epoch, max_epochs, initial=1.0, final=0.0):
        """Half-cosine annealing from ``initial`` to ``final``.

        Progress is clamped, so the value stays at ``final`` after
        ``max_epochs`` instead of rising again along the cosine.
        ``max_epochs`` must be non-zero.
        """
        progress = ScheduledSampler._progress(epoch, max_epochs)
        return final + (initial - final) * (1 + np.cos(np.pi * progress)) / 2

# Visualize different scheduling strategies
# NOTE: rebinds the notebook-level `epochs` (previously a 50-element range) and `fig`.
epochs = np.arange(0, 100)
max_epochs = 100

# One precomputed curve per strategy, evaluated at every epoch in `epochs`.
schedules = {
    'Linear': [ScheduledSampler.linear_schedule(e, max_epochs) for e in epochs],
    'Exponential': [ScheduledSampler.exponential_schedule(e, 0.97) for e in epochs],
    'Inverse Sigmoid': [ScheduledSampler.inverse_sigmoid_schedule(e, 10) for e in epochs],
    'Cosine': [ScheduledSampler.cosine_schedule(e, max_epochs) for e in epochs]
}

fig = go.Figure()

for name, values in schedules.items():
    fig.add_trace(go.Scatter(
        x=epochs, y=values,
        mode='lines',
        name=name,
        line=dict(width=2)
    ))

fig.update_layout(
    title='Teacher Forcing Scheduling Strategies',
    xaxis_title='Epoch',
    yaxis_title='Teacher Forcing Probability',
    hovermode='x unified',
    width=800,
    height=400
)

fig.show()

print("\nSchedule Analysis at key epochs:")
print("="*50)
for epoch in [0, 25, 50, 75, 99]:
    print(f"\nEpoch {epoch}:")
    for name in schedules:
        # Each branch recomputes the value with the same arguments used to
        # build `schedules` above; only the dict's keys are used here.
        if name == 'Linear':
            val = ScheduledSampler.linear_schedule(epoch, max_epochs)
        elif name == 'Exponential':
            val = ScheduledSampler.exponential_schedule(epoch, 0.97)
        elif name == 'Inverse Sigmoid':
            val = ScheduledSampler.inverse_sigmoid_schedule(epoch, 10)
        else:  # Cosine
            val = ScheduledSampler.cosine_schedule(epoch, max_epochs)
        print(f"  {name}: {val:.3f}")

## Part 8: Performance Analysis and Benchmarking

### Exercise 7: Model Comparison and Analysis

**Concept**: Compare different architectures and analyze their strengths and weaknesses.


In [None]:
# Exercise 7: Comprehensive model comparison
class ModelBenchmark:
    """Benchmark different sequence models.

    A namespace of stateless helpers that report parameter count, forward-pass
    latency and a rough memory footprint for a ``torch.nn.Module``.
    """

    @staticmethod
    def count_parameters(model):
        """Return the number of trainable (``requires_grad``) parameters."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    @staticmethod
    def measure_inference_time(model, input_data, n_runs=100):
        """Measure average inference time.

        Runs 10 untimed warm-up passes, then times ``n_runs`` forward passes
        under ``torch.no_grad()`` with the model in eval mode. The model's
        original train/eval mode is restored before returning.

        Args:
            model: Module called as ``model(input_data)``.
            input_data: Input passed to the model unchanged.
            n_runs: Number of timed forward passes (must be >= 1).

        Returns:
            Tuple ``(mean_ms, std_ms)`` of the per-pass time in milliseconds.

        Raises:
            ValueError: If ``n_runs`` is smaller than 1.
        """
        import time  # local import: this cell does not rely on a file-level one

        if n_runs < 1:
            raise ValueError(f"n_runs must be at least 1, got {n_runs}")

        # CUDA events only time GPU work, so use them only for GPU inputs;
        # everything else is timed with the monotonic high-resolution clock.
        use_cuda_timer = torch.cuda.is_available() and getattr(input_data, "is_cuda", False)

        was_training = model.training
        model.eval()
        times = []
        try:
            with torch.no_grad():
                # Warmup
                for _ in range(10):
                    model(input_data)

                # Actual measurement
                for _ in range(n_runs):
                    if use_cuda_timer:
                        start = torch.cuda.Event(enable_timing=True)
                        end = torch.cuda.Event(enable_timing=True)
                        start.record()
                        model(input_data)
                        end.record()
                        # Wait for the kernels so elapsed_time is valid
                        torch.cuda.synchronize()
                        times.append(start.elapsed_time(end))
                    else:
                        start_time = time.perf_counter()
                        model(input_data)
                        times.append((time.perf_counter() - start_time) * 1000)
        finally:
            # Do not leave the caller's model silently switched to eval mode
            model.train(was_training)

        return np.mean(times), np.std(times)

    @staticmethod
    def memory_usage(model, input_data):
        """Estimate memory usage in megabytes.

        Sums the bytes held by parameters and buffers, then adds a rough
        activation estimate of ten times the size of the model's (first)
        output. The model's original train/eval mode is restored afterwards.

        Args:
            model: Module called as ``model(input_data)``.
            input_data: Input passed to the model unchanged.

        Returns:
            Estimated footprint in MB (bytes / 1024**2).
        """
        param_memory = sum(p.numel() * p.element_size() for p in model.parameters())
        buffer_memory = sum(b.numel() * b.element_size() for b in model.buffers())

        # Estimate activation memory (rough approximation)
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                output = model(input_data)
        finally:
            model.train(was_training)

        # Recurrent layers return (output, state); only the output is sized
        if isinstance(output, tuple):
            output = output[0]
        activation_memory = output.numel() * output.element_size() * 10  # Rough estimate

        total_memory_mb = (param_memory + buffer_memory + activation_memory) / (1024 * 1024)
        return total_memory_mb

# Architectures under comparison: all map 10 input features to 32 hidden units
shared_kwargs = dict(batch_first=True)
models_to_compare = {
    'Vanilla RNN': nn.RNN(10, 32, **shared_kwargs),
    'LSTM': nn.LSTM(10, 32, **shared_kwargs),
    'GRU': nn.GRU(10, 32, **shared_kwargs),
    'Bidirectional LSTM': nn.LSTM(10, 32, bidirectional=True, **shared_kwargs)
}

# Random batch fed to every model: (batch, seq_len, features)
test_input = torch.randn(16, 20, 10)

print("Model Comparison Results")
print("=" * 60)

# Collect one summary row per architecture while echoing it to stdout
results = []
for label, net in models_to_compare.items():
    n_params = ModelBenchmark.count_parameters(net)
    avg_ms, sd_ms = ModelBenchmark.measure_inference_time(net, test_input, n_runs=50)
    mem_mb = ModelBenchmark.memory_usage(net, test_input)

    timing = f"{avg_ms:.2f} ± {sd_ms:.2f}"
    row = {
        'Model': label,
        'Parameters': n_params,
        'Inference (ms)': timing,
        'Memory (MB)': f"{mem_mb:.2f}"
    }
    results.append(row)

    print(f"\n{label}:")
    print(f"  Parameters: {n_params:,}")
    print(f"  Inference Time: {timing} ms")
    print(f"  Memory Usage: {mem_mb:.2f} MB")

# Tabulate the collected rows
df_comparison = pd.DataFrame(results)
print("\n" + df_comparison.to_string(index=False))

In [None]:
# Visualize model comparison
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Parameter Count', 'Inference Time', 
                   'Memory Usage', 'Efficiency Score'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'scatter'}]]
)

model_names = ['Vanilla RNN', 'LSTM', 'GRU', 'Bi-LSTM']
# Trainable-parameter counts for one layer with input_size=10, hidden_size=32.
# PyTorch stores two bias vectors per layer (bias_ih and bias_hh), so one
# direction costs gates * (32*10 + 32*32 + 2*32) = gates * 1408, with
# gates = 1 (RNN), 3 (GRU), 4 (LSTM); the bidirectional LSTM doubles the LSTM.
param_counts = [1408, 5632, 4224, 11264]
inference_times = [0.5, 0.8, 0.7, 1.2]  # ms (hard-coded illustrative values)
memory_usage = [0.8, 1.5, 1.2, 2.5]  # MB (hard-coded illustrative values)

# Parameter counts
fig.add_trace(
    go.Bar(x=model_names, y=param_counts, marker_color='blue'),
    row=1, col=1
)

# Inference times
fig.add_trace(
    go.Bar(x=model_names, y=inference_times, marker_color='red'),
    row=1, col=2
)

# Memory usage
fig.add_trace(
    go.Bar(x=model_names, y=memory_usage, marker_color='green'),
    row=2, col=1
)

# Efficiency score: parameters per millisecond, used only to colour the
# scatter of parameter count against inference time
efficiency_scores = [p/t for p, t in zip(param_counts, inference_times)]
fig.add_trace(
    go.Scatter(x=param_counts, y=inference_times, 
               mode='markers+text',
               text=model_names,
               textposition='top center',
               marker=dict(size=15, color=efficiency_scores, 
                          colorscale='Viridis', showscale=True)),
    row=2, col=2
)

fig.update_layout(height=600, showlegend=False,
                 title_text="Model Architecture Comparison")
fig.update_xaxes(title_text="Model", row=1, col=1)
fig.update_xaxes(title_text="Model", row=1, col=2)
fig.update_xaxes(title_text="Model", row=2, col=1)
fig.update_xaxes(title_text="Parameters", row=2, col=2)

fig.update_yaxes(title_text="Count", row=1, col=1)
fig.update_yaxes(title_text="Time (ms)", row=1, col=2)
fig.update_yaxes(title_text="Memory (MB)", row=2, col=1)
fig.update_yaxes(title_text="Inference Time", row=2, col=2)

fig.show()

print("\n📈 Key Insights:")
print("1. GRU offers best balance of parameters and performance")
print("2. Bidirectional models double parameters but improve accuracy")
print("3. LSTM has most parameters due to multiple gates")
print("4. Vanilla RNN fastest but limited in capturing long dependencies")

## Part 9: Real-World Application

### Exercise 8: Building a Simple Neural Machine Translation System

**Concept**: Apply all learned concepts to build a working translation system.


In [None]:
# Exercise 8: Simple NMT system
class SimpleNMT(nn.Module):
    """Simple Neural Machine Translation system.

    A bidirectional LSTM encoder feeds a unidirectional LSTM decoder whose
    hidden size is twice the encoder's, so the forward and backward encoder
    states can be concatenated into the decoder's initial state. Inputs are
    one-hot vectors (the LSTMs take ``vocab_size`` features directly).

    Subclassing ``nn.Module`` registers the sub-layers, so ``parameters()``,
    ``to(device)`` and ``state_dict()`` work on the whole system.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, hidden_size=256, n_layers=2):
        """Build encoder, decoder, attention and output projection.

        Args:
            src_vocab_size: Feature size of each source step.
            tgt_vocab_size: Feature size of each target step and of the logits.
            hidden_size: Encoder hidden size per direction.
            n_layers: Number of stacked LSTM layers in encoder and decoder.
        """
        super().__init__()
        self.encoder = nn.LSTM(src_vocab_size, hidden_size, n_layers,
                               batch_first=True, bidirectional=True)
        self.decoder = nn.LSTM(tgt_vocab_size, hidden_size * 2, n_layers,
                               batch_first=True)
        # BahdanauAttention is defined earlier in this notebook; it is assumed
        # to return (context, weights) -- TODO confirm against its definition.
        self.attention = BahdanauAttention(hidden_size * 2)
        # Input is [decoder output ; context], each of width hidden_size * 2
        self.output_projection = nn.Linear(hidden_size * 4, tgt_vocab_size)

        self.src_vocab_size = src_vocab_size
        self.tgt_vocab_size = tgt_vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

    @staticmethod
    def _merge_directions(state):
        """Fold a bidirectional LSTM state into the decoder's state layout.

        ``nn.LSTM`` orders a bidirectional state as
        ``[layer0_fwd, layer0_bwd, layer1_fwd, ...]`` with shape
        ``(2 * n_layers, batch, hidden)``. The decoder expects
        ``(n_layers, batch, 2 * hidden)``, so each layer's forward and
        backward halves are concatenated along the feature axis.
        """
        return torch.cat([state[0::2], state[1::2]], dim=-1)

    def encode(self, src_sequences):
        """Encode source sequences.

        Returns:
            ``(encoder_outputs, hidden, cell)`` exactly as produced by the
            bidirectional encoder LSTM; ``hidden`` and ``cell`` still carry
            one entry per direction.
        """
        encoder_outputs, (hidden, cell) = self.encoder(src_sequences)
        return encoder_outputs, hidden, cell

    def decode_step(self, tgt_input, prev_hidden, prev_cell, encoder_outputs):
        """Single decoding step with attention.

        Args:
            tgt_input: Decoder input for this step, one time step long.
            prev_hidden: Decoder hidden state from the previous step.
            prev_cell: Decoder cell state from the previous step.
            encoder_outputs: Encoder outputs attended over.

        Returns:
            ``(output, hidden, cell, attention_weights)`` where ``output`` is
            the vocabulary projection for this step.
        """
        # Decoder LSTM
        decoder_output, (hidden, cell) = self.decoder(tgt_input, (prev_hidden, prev_cell))

        # Attention, queried with the top decoder layer's hidden state
        context, attention_weights = self.attention(hidden[-1], encoder_outputs)

        # Combine decoder output with context
        combined = torch.cat([decoder_output.squeeze(1), context], dim=1)

        # Project to vocabulary
        output = self.output_projection(combined)

        return output, hidden, cell, attention_weights

    def translate(self, src_sequence, max_length=50):
        """Translate a source sequence with greedy decoding.

        Args:
            src_sequence: Source batch fed to the encoder (batch first).
            max_length: Number of decoding steps to run.

        Returns:
            ``(outputs, attention)``: per-step vocabulary outputs and per-step
            attention weights, each stacked along dimension 1.
        """
        # Encode
        encoder_outputs, hidden, cell = self.encode(src_sequence)

        # Bridge encoder -> decoder: the raw bidirectional state has shape
        # (2 * n_layers, batch, hidden) and would be rejected by the decoder.
        hidden = self._merge_directions(hidden)
        cell = self._merge_directions(cell)

        # Initialize decoder with an all-zero start vector on the same
        # device/dtype as the encoder outputs
        batch_size = src_sequence.size(0)
        tgt_input = encoder_outputs.new_zeros(batch_size, 1, self.tgt_vocab_size)

        outputs = []
        attention_matrices = []

        for _ in range(max_length):
            output, hidden, cell, attn = self.decode_step(
                tgt_input, hidden, cell, encoder_outputs
            )
            outputs.append(output)
            attention_matrices.append(attn)

            # Next input (teacher forcing off during inference): feed back
            # the arg-max token as a one-hot vector
            next_token = output.argmax(dim=1)
            tgt_input = F.one_hot(next_token, self.tgt_vocab_size).to(tgt_input.dtype).unsqueeze(1)

        return torch.stack(outputs, dim=1), torch.stack(attention_matrices, dim=1)

# Build a toy translator with 100-symbol vocabularies on both sides
src_vocab_size = 100
tgt_vocab_size = 100
nmt_model = SimpleNMT(src_vocab_size, tgt_vocab_size)

# One random 10-token source sentence, one-hot encoded to match the encoder input
src_token_ids = torch.randint(0, src_vocab_size, (1, 10))
src_seq = F.one_hot(src_token_ids, src_vocab_size).float()

# Greedy-decode 12 target steps
translations, attention_maps = nmt_model.translate(src_seq, max_length=12)

# Sum parameter counts over the four sub-modules that make up the system
nmt_components = (
    nmt_model.encoder,
    nmt_model.decoder,
    nmt_model.attention,
    nmt_model.output_projection,
)
total_params = sum(
    weight.numel()
    for component in nmt_components
    for weight in component.parameters()
)

print("NMT Model Architecture:")
print(f"  Encoder: Bidirectional LSTM")
print(f"  Decoder: LSTM with Attention")
print(f"  Source vocab: {src_vocab_size}")
print(f"  Target vocab: {tgt_vocab_size}")
print(f"  Total parameters: {total_params:,}")
print(f"\nTranslation output shape: {translations.shape}")
print(f"Attention maps shape: {attention_maps.shape}")

In [None]:
# Visualize translation attention alignment
# Take the first sentence of the batch; .cpu() keeps this working when the
# tensors live on a GPU (Tensor.numpy() only supports CPU tensors)
sample_attention = attention_maps[0].detach().cpu().numpy()

# Derive label counts from the matrix itself instead of hard-coding 10 and 12,
# so the plot stays consistent if the source length or max_length changes.
# Assumes the first axis indexes target steps and the last axis source
# positions -- TODO confirm against BahdanauAttention's weight shape.
n_tgt_steps = sample_attention.shape[0]
n_src_tokens = sample_attention.shape[-1]

# Create word lists for visualization
src_words = [f'Src_{i}' for i in range(n_src_tokens)]
tgt_words = [f'Tgt_{i}' for i in range(n_tgt_steps)]

fig = go.Figure(data=go.Heatmap(
    z=sample_attention,
    x=src_words,
    y=tgt_words,
    colorscale='Viridis',
    colorbar=dict(title='Attention Weight')
))

fig.update_layout(
    title='NMT Attention Alignment Visualization',
    xaxis_title='Source Tokens',
    yaxis_title='Target Tokens',
    width=700,
    height=500
)

fig.show()

print("\n🌐 NMT System Components:")
print("1. ✅ Bidirectional encoder for better source representation")
print("2. ✅ Attention mechanism for alignment")
print("3. ✅ Teacher forcing during training")
print("4. ✅ Beam search for better translations (not shown)")
print("5. ✅ Masking for variable length sequences")

## Part 10: Summary and Practice Exercises

### Key Concepts Covered:
1. **Bidirectional RNNs**: Process sequences in both directions for richer representations
2. **Seq2Seq Architecture**: Encoder-decoder framework for sequence transformation
3. **Teacher Forcing**: Training strategy to accelerate learning
4. **Attention Mechanism**: Dynamic focus on relevant input parts
5. **Practical Techniques**: Batching, masking, and scheduling

### 🎯 Practice Exercises:

#### Exercise A: Implement Multi-Head Attention
Extend the attention mechanism to use multiple attention heads (precursor to Transformers).

#### Exercise B: Beam Search Decoder
Implement beam search for better sequence generation instead of greedy decoding.

#### Exercise C: Curriculum Learning
Design a curriculum that starts with short sequences and gradually increases length.

#### Exercise D: Attention Visualization Tool
Create an interactive tool to visualize attention weights during translation.


In [None]:
# Final Exercise: Build your own Seq2Seq application
divider = "=" * 50
print("🚀 Challenge Tasks:\n" + divider)

# The challenge list, one entry per output line; the leading and trailing
# empty strings reproduce the blank line before and after the list.
challenge_lines = [
    "",
    "1. Text Summarization:",
    "   - Build a seq2seq model for abstractive summarization",
    "   - Use attention to identify important sentences",
    "   ",
    "2. Chatbot:",
    "   - Create a simple conversational agent",
    "   - Implement context awareness using attention",
    "   ",
    "3. Code Generation:",
    "   - Train a model to generate Python code from natural language",
    "   - Use teacher forcing with scheduled sampling",
    "   ",
    "4. Time Series Forecasting:",
    "   - Apply seq2seq to predict future values",
    "   - Compare with and without attention",
    "",
    "5. Music Generation:",
    "   - Build a seq2seq model for melody generation",
    "   - Use bidirectional encoder for better context",
    "",
]
print("\n".join(challenge_lines))

# Further reading, printed as a bulleted list
reading_list = (
    "Attention Is All You Need (Transformer paper)",
    "Neural Machine Translation by Jointly Learning to Align and Translate",
    "Sequence to Sequence Learning with Neural Networks",
    "Effective Approaches to Attention-based Neural Machine Translation",
)
print("\n📚 Additional Resources:")
for title in reading_list:
    print("- " + title)

print("\n✅ Notebook Complete! Happy Learning! 🎉")

## Bonus: Interactive Attention Demo

### Try adjusting the parameters below to see how attention changes!


In [None]:
# Interactive attention demonstration
def interactive_attention_demo(temperature=1.0, seq_len=8, hidden_size=64):
    """
    Interactive demo to understand attention mechanism

    Draws random encoder outputs and a random decoder state, scores every
    encoder position against the decoder state with a scaled dot product,
    and plots the raw scores, the softmax weights at ``temperature``, the
    weights at several fixed temperatures, and each position's contribution
    to the context vector.

    Parameters:
    - temperature: Controls sharpness of attention (lower = sharper); must be > 0
    - seq_len: Length of input sequence; must be >= 1
    - hidden_size: Size of hidden representations; must be >= 1

    Returns:
    - attention_weights: tensor of shape (1, seq_len) whose row sums to 1

    Raises:
    - ValueError: if temperature is not positive, or seq_len / hidden_size < 1
    """
    # Dividing by a zero or negative temperature yields inf/NaN or inverted
    # weights, so reject it up front instead of plotting garbage.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if seq_len < 1 or hidden_size < 1:
        raise ValueError("seq_len and hidden_size must be at least 1")

    # Generate random encoder outputs and decoder hidden state
    encoder_outputs = torch.randn(1, seq_len, hidden_size)
    decoder_hidden = torch.randn(1, hidden_size)

    # Calculate attention scores (before softmax): dot product of the decoder
    # state with every encoder output, scaled by sqrt(hidden_size) so the
    # score spread does not grow with the hidden size
    scores = (encoder_outputs * decoder_hidden.unsqueeze(1)).sum(dim=2) / hidden_size ** 0.5

    # Apply temperature scaling
    scores_scaled = scores / temperature

    # Apply softmax to get attention weights
    attention_weights = F.softmax(scores_scaled, dim=1)

    positions = list(range(seq_len))

    # Create visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Raw Scores', f'Attention Weights (T={temperature})',
                       'Effect of Temperature', 'Context Vector Contribution')
    )

    # Raw scores
    fig.add_trace(
        go.Bar(x=positions, y=scores[0].numpy()),
        row=1, col=1
    )

    # Attention weights
    fig.add_trace(
        go.Bar(x=positions, y=attention_weights[0].numpy()),
        row=1, col=2
    )

    # Temperature effect: the same scores under a fixed sweep of temperatures
    temps = [0.5, 1.0, 2.0, 5.0]
    for t in temps:
        weights_t = F.softmax(scores / t, dim=1)
        fig.add_trace(
            go.Scatter(x=positions, y=weights_t[0].numpy(),
                      mode='lines+markers', name=f'T={t}'),
            row=2, col=1
        )

    # Context contribution: the context vector is sum_i w_i * h_i, so the
    # magnitude each position contributes is ||w_i * h_i||
    weighted_outputs = attention_weights.unsqueeze(2) * encoder_outputs
    context_contributions = weighted_outputs.norm(dim=2)[0].numpy()
    fig.add_trace(
        go.Bar(x=positions, y=context_contributions),
        row=2, col=2
    )

    fig.update_layout(height=600, showlegend=True,
                     title_text="Interactive Attention Mechanism Demo")
    fig.show()

    return attention_weights

# Run the interactive demo, with a short guide to what temperature does
usage_notes = (
    "Experiment with different temperature values:",
    "- Low temperature (0.5): Sharp, focused attention",
    "- High temperature (5.0): Smooth, distributed attention\n",
)
for note in usage_notes:
    print(note)

weights = interactive_attention_demo(temperature=1.0, seq_len=10)

# Shannon entropy of the attention distribution; the epsilon guards log(0)
entropy = -(weights * torch.log(weights + 1e-10)).sum()
print(f"\nAttention entropy: {entropy:.3f}")
print("(Lower entropy = more focused attention)")