# TransformerKit Demo

This notebook demonstrates the key features of TransformerKit, including:
- Basic model creation and usage
- Training on a simple task
- Attention weight visualization
- Model architecture exploration

## 1. Setup & Installation

First, ensure TransformerKit is installed:

```bash
pip install transformerkit
```

Or, for development, install it in editable mode from the repository root:

```bash
pip install -e .
```

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

from transformerkit import (
    create_transformer,
    TransformerConfig,
    count_parameters,
    create_padding_mask,
    create_target_mask,
    greedy_decode,
)
from transformerkit.visualization import (
    plot_attention_heatmap,
    plot_multihead_attention,
)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device selection: prefer CUDA, then Apple's MPS backend, then fall back to CPU.
# `torch.backends.mps` does not exist on older PyTorch builds, so it is looked
# up defensively instead of being accessed as an attribute directly (which
# would raise AttributeError before the CPU fallback could be reached).
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA (NVIDIA GPU)")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
    print("Using MPS (Apple Silicon GPU)")
else:
    device = torch.device('cpu')
    print("Using CPU")

## 2. Basic Usage

In [None]:
# Hyperparameters for a deliberately small demonstration model
config = TransformerConfig(
    vocab_size=1000,
    max_seq_length=50,
    d_model=128,
    d_ff=256,
    n_heads=4,
    n_layers=2,
    dropout=0.1,
)

# Build the network from the config, then move its parameters to the chosen device
model = create_transformer(config)
model = model.to(device)

print(f"Model created with {count_parameters(model):,} parameters")
print(
    f"Configuration: {config.n_layers} layers, "
    f"{config.n_heads} heads, d_model={config.d_model}"
)

In [None]:
# Simple forward pass on random token ids.
# The upper bound comes from the config (rather than a hard-coded 1000) so the
# cell keeps producing in-vocabulary ids when `vocab_size` is changed above.
# The lower bound of 3 skips ids 0-2, which the rest of this notebook uses as
# padding (0), start-of-sequence (1) and end-of-sequence (2).
src = torch.randint(3, config.vocab_size, (2, 10)).to(device)  # (batch_size, src_len)
tgt = torch.randint(3, config.vocab_size, (2, 8)).to(device)   # (batch_size, tgt_len)

# Forward pass. This cell only inspects shapes, so no autograd graph is needed.
with torch.no_grad():
    output = model(src, tgt)

print("Input shapes:")
print(f"  Source: {src.shape}")
print(f"  Target: {tgt.shape}")
print(f"\nOutput shape: {output.shape}")
print(f"  Expected: (batch_size={src.shape[0]}, seq_len={tgt.shape[1]}, vocab_size={config.vocab_size})")

## 3. Training Demo

Let's train a small model on a copy task to demonstrate the training loop.

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class SimpleCopyDataset(Dataset):
    """Synthetic dataset for a sequence copy task.

    Each item is a random integer sequence together with the two shifted
    versions a teacher-forced decoder needs:

    * ``src``        -- the raw sequence, shape ``(seq_len,)``
    * ``tgt_input``  -- ``[sos_idx] + sequence``, shape ``(seq_len + 1,)``
    * ``tgt_output`` -- ``sequence + [eos_idx]``, shape ``(seq_len + 1,)``

    Args:
        num_samples: Number of sequences to generate.
        seq_len: Length of every generated sequence.
        vocab_size: Exclusive upper bound for sampled token ids. Ids are drawn
            from ``[3, vocab_size)``; ids 0-2 are never sampled.
        sos_idx: Token id prepended to the decoder input (default 1).
        eos_idx: Token id appended to the decoder target (default 2).

    Raises:
        ValueError: If ``vocab_size`` leaves no ids to sample from.

    Note:
        Sequences are drawn from NumPy's global random state, so call
        ``np.random.seed`` beforehand for reproducible data.
    """

    def __init__(self, num_samples=1000, seq_len=8, vocab_size=100, sos_idx=1, eos_idx=2):
        if vocab_size <= 3:
            raise ValueError(
                f"vocab_size must be greater than 3 (ids 0-2 are reserved), got {vocab_size}"
            )
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.data = np.random.randint(3, vocab_size, size=(num_samples, seq_len))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Convert the row once and reuse it for all three returned tensors.
        seq = torch.tensor(self.data[idx], dtype=torch.long)
        sos = torch.tensor([self.sos_idx], dtype=torch.long)
        eos = torch.tensor([self.eos_idx], dtype=torch.long)

        tgt_input = torch.cat([sos, seq])   # decoder sees SOS followed by the sequence
        tgt_output = torch.cat([seq, eos])  # decoder must emit the sequence followed by EOS
        return seq, tgt_input, tgt_output

# Build the copy-task training set and wrap it in a shuffling mini-batch loader
train_dataset = SimpleCopyDataset(
    vocab_size=100,
    seq_len=8,
    num_samples=500,
)
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=32,
)

print(f"Created training dataset with {len(train_dataset)} samples")

In [None]:
# Training loop: teacher-forced cross-entropy with Adam and gradient clipping
model.train()
criterion = nn.CrossEntropyLoss(ignore_index=0)  # targets equal to 0 do not contribute to the loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
losses = []  # one mean loss value per epoch, consumed by the plotting cell

print(f"Training for {num_epochs} epochs...\n")

for epoch in range(1, num_epochs + 1):
    running_loss = 0.0

    for batch in train_loader:
        # Move the whole (src, tgt_input, tgt_output) triple to the training device
        src, tgt_input, tgt_output = (tensor.to(device) for tensor in batch)

        # Masks built with pad index 0
        src_mask = create_padding_mask(src, pad_idx=0)
        tgt_mask = create_target_mask(tgt_input, pad_idx=0)

        # Forward pass
        logits = model(src, tgt_input, src_mask, tgt_mask)

        # Flatten every dimension except the last so the loss sees one row per token
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_targets = tgt_output.reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        # Backward pass with the gradient norm clipped to 1.0
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    losses.append(avg_loss)
    print(f"Epoch {epoch}/{num_epochs}, Loss: {avg_loss:.4f}")

print("\nTraining complete!")

In [None]:
# Plot the per-epoch mean training loss using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 5))
epoch_numbers = range(1, num_epochs + 1)
ax.plot(epoch_numbers, losses, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.set_title('Training Loss Over Time', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 4. Attention Visualization

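Now let's visualize attention weights. To keep the demo self-contained, the cells below build *synthetic* attention weights shaped like the near-diagonal target-to-source alignment a copy task is expected to produce; they are not extracted from the trained model. In a real analysis you would capture the weights from the model during a forward pass and plot them the same way.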

In [None]:
# Switch to evaluation mode and take the first training example for visualization
model.eval()
first_example = train_dataset[0]
sample_src, sample_tgt_input, sample_tgt_output = first_example

# Add a leading batch dimension of size 1 and move the inputs to the model's device
sample_src = sample_src[None].to(device)
sample_tgt_input = sample_tgt_input[None].to(device)

source_tokens = sample_src.squeeze().cpu().numpy()
target_tokens = sample_tgt_output.cpu().numpy()
print(f"Sample source sequence: {source_tokens}")
print(f"Sample target sequence: {target_tokens}")

In [None]:
# Create synthetic attention weights for demonstration
# In a real scenario, you'd extract these from the model during forward pass
tgt_len = sample_tgt_input.shape[1]
src_len = sample_src.shape[1]
n_heads = config.n_heads

# Simulate attention pattern (diagonal for copy task).
# The decoder input carries an extra start token, so tgt_len can exceed
# src_len. Every target row must receive some mass: a row left at all zeros
# would turn into NaN when normalized (0 / 0). Target positions past the end
# of the source are therefore pinned to the last source position.
attention_weights = torch.zeros(tgt_len, src_len)
for i in range(tgt_len):
    j = min(i, src_len - 1)  # source position this target step focuses on
    attention_weights[i, j] = 0.8
    if j > 0:
        attention_weights[i, j - 1] = 0.15
    if j < src_len - 1:
        attention_weights[i, j + 1] = 0.05

# Normalize each target row into a probability distribution over source positions
attention_weights = attention_weights / attention_weights.sum(dim=1, keepdim=True)

# Sanity checks: no NaN/inf, and every row sums to 1
assert torch.isfinite(attention_weights).all(), "attention weights contain NaN/inf"
assert torch.allclose(attention_weights.sum(dim=1), torch.ones(tgt_len)), "rows must sum to 1"

print(f"Attention weights shape: {attention_weights.shape}")

In [None]:
# Plot single attention heatmap.
# The matrix is (target positions x source positions), i.e. decoder-to-encoder
# attention, so the figure is titled as cross-attention rather than self-attention.
fig, ax = plot_attention_heatmap(
    attention_weights,
    title="Cross-Attention Pattern (Copy Task)",
    figsize=(8, 6)
)
plt.show()

In [None]:
# Simulate multi-head attention: each head is the base pattern plus independent
# Gaussian noise (std 0.1).
# Non-finite entries in the base pattern are zeroed first so that a single bad
# row cannot poison every head with NaN.
base_weights = torch.nan_to_num(attention_weights, nan=0.0)
multihead_attention = torch.stack([
    base_weights + torch.randn(tgt_len, src_len) * 0.1
    for _ in range(n_heads)
])

# Ensure valid probabilities: drop negative mass, then renormalize each row.
multihead_attention = torch.clamp(multihead_attention, min=0)
row_sums = multihead_attention.sum(dim=2, keepdim=True)
# A row whose entries were all clamped to zero would divide 0 / 0; such rows
# fall back to a uniform distribution instead of becoming NaN.
uniform_rows = torch.full_like(multihead_attention, 1.0 / max(src_len, 1))
multihead_attention = torch.where(
    row_sums > 0,
    multihead_attention / row_sums.clamp_min(1e-12),
    uniform_rows,
)

fig = plot_multihead_attention(
    multihead_attention,
    n_heads=n_heads,
    layer_idx=0,
    figsize=(15, 8)
)
plt.show()

## 5. Model Architecture Overview

In [None]:
# Text summary of the configured architecture, printed section by section
banner = "=" * 60

print(banner)
print("Transformer Architecture Overview")
print(banner)

# Hyperparameters read directly from the config object
print("\nConfiguration:")
print("\n".join([
    f"  Model dimension (d_model): {config.d_model}",
    f"  Number of attention heads: {config.n_heads}",
    f"  Dimension per head: {config.d_model // config.n_heads}",
    f"  Number of layers: {config.n_layers}",
    f"  Feed-forward dimension: {config.d_ff}",
    f"  Dropout rate: {config.dropout}",
    f"  Vocabulary size: {config.vocab_size}",
    f"  Maximum sequence length: {config.max_seq_length}",
]))

print(f"\nTotal parameters: {count_parameters(model):,}")

# Layer-by-layer outline of encoder, decoder and output head
print("\nModel Structure:")
print("\n".join([
    "  Encoder:",
    "    - Token Embedding",
    "    - Positional Encoding",
    f"    - {config.n_layers}x Encoder Layers:",
    f"        - Multi-Head Self-Attention ({config.n_heads} heads)",
    "        - Add & Norm",
    f"        - Feed-Forward Network (d_ff={config.d_ff})",
    "        - Add & Norm",
    "\n  Decoder:",
    "    - Token Embedding",
    "    - Positional Encoding",
    f"    - {config.n_layers}x Decoder Layers:",
    f"        - Masked Multi-Head Self-Attention ({config.n_heads} heads)",
    "        - Add & Norm",
    f"        - Multi-Head Cross-Attention ({config.n_heads} heads)",
    "        - Add & Norm",
    f"        - Feed-Forward Network (d_ff={config.d_ff})",
    "        - Add & Norm",
    "\n  Output:",
    "    - Linear projection to vocabulary (d_model → vocab_size)",
]))
print(banner)

## 6. Sequence Generation

In [None]:
# Test sequence generation with greedy decoding.
# Put the model in evaluation mode here rather than relying on an earlier cell
# having done so: if the training cell was re-run last, dropout would still be
# active and decoding would be non-deterministic.
model.eval()

test_src = torch.tensor([[5, 10, 15, 20, 25, 30]]).to(device)
src_mask = create_padding_mask(test_src, pad_idx=0)

# Decoding is inference-only, so gradient tracking is switched off.
# start_idx=1 / end_idx=2 match the start/end token ids used by the copy dataset.
with torch.no_grad():
    generated = greedy_decode(
        model,
        test_src,
        src_mask,
        max_len=10,
        start_idx=1,
        end_idx=2
    )

print(f"Source sequence: {test_src.squeeze().cpu().numpy()}")
print(f"Generated sequence: {generated[0].cpu().numpy()}")
print(f"Length: {len(generated[0])}")

## Conclusion

This notebook demonstrated:
- ✅ Creating and configuring a Transformer model
- ✅ Training on a simple copy task
- ✅ Visualizing attention weights
- ✅ Understanding the model architecture
- ✅ Generating sequences with the trained model

### Next Steps

- Try different model configurations
- Train on real-world tasks (translation, text generation)
- Experiment with different decoding strategies (beam search)
- Analyze attention patterns for different tasks

For more information, visit: https://github.com/charansoma3001/transformerkit