# 03. Micro LSTM

This notebook implements a minimal LSTM for character-level text generation.

## Experiment Overview
- **Goal**: Generate text using a minimal LSTM
- **Model**: Single-layer LSTM with embedding layer
- **Features**: Character-level text generation, training on simple sequences
- **Learning**: Understanding recurrent neural networks and sequence modeling

## What You'll Learn
- Building LSTM architectures
- Character-level text processing
- Sequence generation and sampling
- Training recurrent networks


In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import sys
import os

# Add scripts directory to path
sys.path.append('../scripts')
from utils import get_device, set_seed

# Fix the random seed through the project helper so runs are repeatable
set_seed(42)

# Ask the project helper which device to run on
device = get_device()
print(f"Using device: {device}")

# Small in-notebook corpus used as the training text
text = """
The quick brown fox jumps over the lazy dog.
Machine learning is fascinating and powerful.
Neural networks can learn complex patterns.
Deep learning has revolutionized AI.
Artificial intelligence is the future.
"""

# Vocabulary: every distinct character of the corpus in sorted order,
# with lookup tables in both directions (character <-> integer id)
chars = sorted(set(text))
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: i for i, ch in idx_to_char.items()}
vocab_size = len(chars)

print(f"Vocabulary size: {vocab_size}")
print(f"Characters: {chars}")
print(f"Text length: {len(text)}")


In [None]:
# Define the Micro LSTM model
class MicroLSTM(nn.Module):
    """Minimal character-level model: embedding -> single LSTM layer -> linear.

    Args:
        vocab_size: Number of distinct token ids; also the width of the
            output layer.
        hidden_size: Width of the LSTM hidden and cell states.
        embedding_size: Width of the token embedding vectors.
    """

    def __init__(self, vocab_size, hidden_size=64, embedding_size=32):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size

        # Token id -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # batch_first=True: inputs and outputs are laid out (batch, seq, feature)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)

        # Hidden state -> one logit per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """Return per-position logits and the updated LSTM state.

        Args:
            x: Integer token ids, expected as (batch, seq_len) because the
                LSTM is built with ``batch_first=True``.
            hidden: Optional ``(h, c)`` state tuple to continue from. ``None``
                is passed straight to ``nn.LSTM``, which then starts from
                zeros.

        Returns:
            ``(logits, hidden)`` where ``logits`` holds one ``vocab_size``
            vector per input position and ``hidden`` is the new ``(h, c)``.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        output = self.fc(lstm_out)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` state pair for ``batch_size`` sequences.

        The tensors are created on the same device and with the same dtype as
        the model's own weights, so the method does not rely on any global
        ``device`` variable and stays correct after ``model.to(...)``.
        """
        reference = self.fc.weight
        shape = (self.lstm.num_layers, batch_size, self.hidden_size)
        return (
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
        )

# Build the network, then move it onto the selected device
model = MicroLSTM(vocab_size)
model = model.to(device)

# Show the layer structure followed by the parameter count and an estimated
# in-memory size (4 bytes per parameter, i.e. assuming 32-bit weights)
print("Model Architecture:", model, sep="\n")
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Model size: {sum(p.numel() for p in model.parameters()) * 4 / 1024 / 1024:.2f} MB")


In [None]:
# Prepare training data
def create_sequences(text, seq_length=20, mapping=None):
    """Slice ``text`` into fixed-length input windows with next-character targets.

    Window ``i`` covers ``text[i:i + seq_length]`` and its target is the
    character immediately after it, ``text[i + seq_length]``.

    Args:
        text: Source string to slice.
        seq_length: Number of characters in each input window.
        mapping: Optional character -> integer id table. When omitted, the
            notebook-level ``char_to_idx`` is used.

    Returns:
        ``(inputs, targets)`` as int64 tensors of shape
        ``(num_windows, seq_length)`` and ``(num_windows,)``. When ``text`` is
        not longer than ``seq_length`` both tensors are empty but keep that
        dtype and layout.

    Raises:
        ValueError: If ``seq_length`` is negative.
        KeyError: If ``text`` contains a character missing from the table.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    table = char_to_idx if mapping is None else mapping

    # Encode once; every window is then just a slice of the encoded list
    encoded = [table[ch] for ch in text]
    num_windows = max(len(encoded) - seq_length, 0)

    sequences = [encoded[i:i + seq_length] for i in range(num_windows)]
    targets = encoded[seq_length:]

    # Explicit dtype and reshape keep the result well-formed even when there
    # are no windows (a bare torch.tensor([]) would be float32 of shape (0,))
    inputs = torch.tensor(sequences, dtype=torch.long).reshape(num_windows, seq_length)
    return inputs, torch.tensor(targets, dtype=torch.long)

# Slice the corpus into fixed-length windows with next-character targets
seq_length = 20
X, y = create_sequences(text, seq_length=seq_length)

# Summary of the resulting dataset
print(f"Number of sequences: {len(X)}")
print(f"Sequence length: {seq_length}")
print(f"Input shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Decode the first window and its target back into characters as a sanity check
print(f"\nExample sequence:")
print(f"Input:  {''.join(idx_to_char[token] for token in X[0].tolist())}")
print(f"Target: {idx_to_char[y[0].item()]}")


In [None]:
# Train the LSTM
def train_lstm(model, X, y, epochs=100, lr=0.01, batch_size=32, target_device=None):
    """Train ``model`` to predict the token that follows each input sequence.

    Args:
        model: Module whose call returns ``(logits, state)`` with ``logits``
            laid out as (batch, seq_len, num_classes).
        X: Input tensor; the first dimension indexes training samples.
        y: Class-index targets, one per row of ``X``.
        epochs: Number of full passes over the data.
        lr: Learning rate for the Adam optimizer.
        batch_size: Number of samples per mini-batch; the final batch of an
            epoch may be smaller.
        target_device: Device to train on. When omitted, the notebook-level
            ``device`` is used.

    Returns:
        A list with the mean mini-batch loss of each epoch.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``X`` is empty, or
            ``X`` and ``y`` have different lengths.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(X) == 0:
        raise ValueError("X contains no training sequences")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    run_device = device if target_device is None else target_device

    model.to(run_device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    losses = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0

        # Mini-batch training
        for start in range(0, len(X), batch_size):
            batch_X = X[start:start + batch_size].to(run_device)
            batch_y = y[start:start + batch_size].to(run_device)

            optimizer.zero_grad()

            # There is one target per sequence, so only the logits of the
            # last time step are scored; flattening every time step would
            # give batch*seq_len rows against only batch targets.
            output, _ = model(batch_X)
            loss = criterion(output[:, -1, :], batch_y)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Divide by the batches actually run, including a trailing partial one
        avg_loss = total_loss / num_batches
        losses.append(avg_loss)

        if (epoch + 1) % 20 == 0:
            print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    return losses

# Train the model
print("Starting LSTM training...")
losses = train_lstm(model, X, y, epochs=100, lr=0.01)

# Plot the per-epoch training loss
plt.figure(figsize=(10, 4))
plt.plot(losses)
plt.title('LSTM Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image
plot_path = '../results/plots/lstm_training.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Text generation function
def generate_text(model, start_text, length=100, temperature=1.0):
    """Sample a continuation of ``start_text`` one character at a time.

    The prompt is run through the model to build up the recurrent state; each
    new character is then drawn from the softmax of the latest logits and fed
    back in as the next input.

    Args:
        model: Trained model exposing ``init_hidden(batch_size)`` and
            returning ``(logits, hidden)`` when called.
        start_text: Non-empty prompt; every character must be a key of the
            notebook-level ``char_to_idx``.
        length: Number of characters to generate after the prompt.
        temperature: Positive divisor applied to the logits before softmax;
            values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        ``start_text`` followed by ``length`` sampled characters.

    Raises:
        ValueError: If ``start_text`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_text`` contains a character that is not in the
            vocabulary.
    """
    # Without a prompt there are no logits to sample the first character from
    if not start_text:
        raise ValueError("start_text must contain at least one character")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()

    # Convert the prompt to ids; sampled ids are appended to the same list
    generated = [char_to_idx[ch] for ch in start_text]

    with torch.no_grad():
        hidden = model.init_hidden(1)

        # Run the whole prompt in a single pass (batch of one sequence)
        prompt = torch.tensor([generated], device=device)
        output, hidden = model(prompt, hidden)

        for _ in range(length):
            # Distribution over the next character from the latest position
            probs = F.softmax(output[0, -1, :] / temperature, dim=0)

            next_char_idx = torch.multinomial(probs, 1).item()
            generated.append(next_char_idx)

            # Feed the sampled character back in, carrying the state forward
            input_tensor = torch.tensor([[next_char_idx]], device=device)
            output, hidden = model(input_tensor, hidden)

    return ''.join(idx_to_char[idx] for idx in generated)

# Sample from the same prompt at several temperatures to compare how
# conservative or adventurous the output becomes
print("Generating text with different temperatures:", "=" * 50, sep="\n")

start_text = "The quick brown"
for temp in (0.5, 1.0, 1.5):
    generated = generate_text(model, start_text, length=50, temperature=temp)
    print(f"Temperature {temp}: {generated}", end="\n\n")

# Draw several independent samples from one prompt at a fixed temperature
print("Multiple generation samples:", "=" * 50, sep="\n")
for i in range(3):
    generated = generate_text(model, "Machine learning", length=30, temperature=1.0)
    print(f"Sample {i+1}: {generated}")
