# MLPs in PyTorch

This notebook demonstrates Multi-Layer Perceptron (MLP) implementation and training.

## Table of Contents
1. [From Equation to Code](#from-equation-to-code)
2. [Training MLP on Synthetic Data](#training-mlp-on-synthetic-data)
3. [Common Gotchas](#common-gotchas)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Seed both RNGs (PyTorch and NumPy) so repeated runs produce the same numbers
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Pick the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## From Equation to Code

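Mathematical form: $\mathbf{h}_1 = \sigma(\mathbf{W}_1 \mathbf{x} + \mathbf{b}_1)$, $\mathbf{h}_2 = \sigma(\mathbf{W}_2 \mathbf{h}_1 + \mathbf{b}_2)$, and so on for deeper layers. Here $\sigma$ is the activation function (ReLU in the code below); the final layer has no activation, so the network outputs raw scores: $\mathbf{y} = \mathbf{W}_L \mathbf{h}_{L-1} + \mathbf{b}_L$.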

In [None]:
class SimpleMLP(nn.Module):
    """Configurable MLP: ``nn.Linear`` layers with ReLU between them.

    The output layer has no activation, so the network returns raw scores.

    Args:
        input_size: Number of input features.
        hidden_size: Width of every hidden layer.
        output_size: Number of output units.
        num_layers: Total number of ``nn.Linear`` layers, output layer
            included. ``1`` yields a single linear map from input to output.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()

        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")

        layers = []
        in_features = input_size

        # num_layers - 1 hidden blocks (Linear + ReLU); the first one maps
        # from the input width, the rest from hidden_size to hidden_size.
        for _ in range(num_layers - 1):
            layers.extend([nn.Linear(in_features, hidden_size), nn.ReLU()])
            in_features = hidden_size

        # Output layer (no activation - it is applied later, e.g. by the loss)
        layers.append(nn.Linear(in_features, output_size))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x`` and return the raw outputs."""
        return self.network(x)

# Manual implementation to show the math explicitly
class ExplicitMLP(nn.Module):
    """Three-layer MLP written out one layer at a time to mirror the equations.

    Two ReLU hidden layers of width ``hidden_size`` are followed by a linear
    output layer with no activation.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # Each nn.Linear owns one weight matrix W and its bias vector b
        self.W1 = nn.Linear(input_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.W3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Compute W3 * relu(W2 * relu(W1 * x + b1) + b2) + b3."""
        # First hidden layer: affine map, then ReLU
        z1 = self.W1(x)
        h1 = F.relu(z1)

        # Second hidden layer: same pattern applied to h1
        z2 = self.W2(h1)
        h2 = F.relu(z2)

        # Output layer is purely affine (no activation)
        return self.W3(h2)

# Instantiate the same 3-layer architecture through both classes
input_size = 10
hidden_size = 64
output_size = 3
model1 = SimpleMLP(input_size, hidden_size, output_size, num_layers=3)
model2 = ExplicitMLP(input_size, hidden_size, output_size)

# Report the number of parameters of each model
print(f"SimpleMLP parameters: {sum(p.numel() for p in model1.parameters()):,}")
print(f"ExplicitMLP parameters: {sum(p.numel() for p in model2.parameters()):,}")

# Feed one random batch (5 samples) through both models
x = torch.randn(5, input_size)
out1, out2 = model1(x), model2(x)

print(f"\nInput shape: {x.shape}")
print(f"Output shape (both models): {out1.shape}, {out2.shape}")

## Training MLP on Synthetic Data

In [None]:
# Generate synthetic classification data
def generate_synthetic_data(n_samples=1000, n_features=20, n_classes=3):
    """Generate a synthetic classification dataset with a non-linear boundary.

    Features are standard-normal. A score is computed per sample from a random
    linear projection plus an interaction term (feature 0 * feature 1) and a
    quadratic term (feature 2 squared); the scores are then cut into
    ``n_classes`` bins at evenly spaced quantiles, one bin per class.

    Note: the function reseeds PyTorch's global RNG with 42 on every call, so
    it always returns the same data for the same arguments.

    Args:
        n_samples: Number of samples to generate.
        n_features: Number of features per sample (at least 3, because the
            score uses features 0, 1 and 2).
        n_classes: Number of classes (at least 1).

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_samples, n_features)`` and
        ``y`` is a ``torch.long`` tensor of labels in ``[0, n_classes)``.

    Raises:
        ValueError: If ``n_classes < 1`` or ``n_features < 3``.
    """
    if n_classes < 1:
        raise ValueError(f"n_classes must be >= 1, got {n_classes}")
    if n_features < 3:
        raise ValueError(f"n_features must be >= 3, got {n_features}")

    torch.manual_seed(42)

    # Generate features
    X = torch.randn(n_samples, n_features)

    # Random linear projection as the base of the decision score
    weights = torch.randn(n_features, 1)
    linear_combo = (X @ weights).squeeze(1)

    # Add some non-linearity
    scores = linear_combo + 0.5 * (X[:, 0] * X[:, 1]) + 0.3 * (X[:, 2] ** 2)

    # Convert to class labels: label k means "above the k-th quantile cut".
    # Thresholds are increasing, so later assignments overwrite earlier ones.
    y = torch.zeros(n_samples, dtype=torch.long)
    if n_classes > 1:
        cut_points = torch.tensor([k / n_classes for k in range(1, n_classes)])
        thresholds = torch.quantile(scores, cut_points)
        for label, threshold in enumerate(thresholds, start=1):
            y[scores > threshold] = label

    return X, y

# Build the dataset: 2000 samples, 20 features, 3 classes
X, y = generate_synthetic_data(n_samples=2000, n_features=20, n_classes=3)
print(f"Data shape: {X.shape}, Labels shape: {y.shape}")
print(f"Class distribution: {torch.bincount(y)}")

# First 80% of the samples are used for training, the rest for validation
train_size = int(0.8 * len(X))
X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size:], y[train_size:]

print(f"Train size: {len(X_train)}, Val size: {len(X_val)}")

# 4-layer MLP, cross-entropy loss, Adam optimizer
model = SimpleMLP(input_size=20, hidden_size=128, output_size=3, num_layers=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_epoch(model, X, y, optimizer, criterion, batch_size=64):
    """Train ``model`` for one pass over ``(X, y)`` using sequential mini-batches.

    Args:
        model: Module to train; it is switched to train mode.
        X: Input tensor, indexed by sample along dimension 0.
        y: Target class indices, aligned with ``X``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, batch_y)``.
        batch_size: Maximum number of samples per batch; the last batch may
            be smaller.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)``: the loss averaged over the
        batches that were actually run, and the fraction of samples whose
        arg-max prediction matched the target.

    Raises:
        ValueError: If ``X`` is empty.
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("train_epoch requires at least one sample")

    model.train()
    total_loss = 0.0
    correct = 0
    num_batches = 0

    # Simple batching (normally you'd use DataLoader)
    for start in range(0, n_samples, batch_size):
        batch_X = X[start:start + batch_size]
        batch_y = y[start:start + batch_size]

        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Count batches explicitly: len(X) // batch_size misses a trailing
        # partial batch and is 0 when the data is smaller than one batch.
        num_batches += 1

        # Calculate accuracy (detach: predictions need no gradient)
        predicted = outputs.detach().argmax(dim=1)
        correct += (predicted == batch_y).sum().item()

    return total_loss / num_batches, correct / n_samples

def evaluate(model, X, y, criterion=None):
    """Compute loss and accuracy of ``model`` on ``(X, y)`` without gradients.

    Args:
        model: Module to evaluate; it is switched to eval mode.
        X: Input tensor, evaluated in a single forward pass.
        y: Target class indices, aligned with ``X``.
        criterion: Loss called as ``criterion(outputs, y)``. Defaults to
            ``nn.CrossEntropyLoss()`` so the result never depends on whatever
            a module-level ``criterion`` variable currently holds.

    Returns:
        Tuple ``(loss, accuracy)`` as Python floats.
    """
    if criterion is None:
        criterion = nn.CrossEntropyLoss()

    model.eval()
    with torch.no_grad():
        outputs = model(X)
        loss = criterion(outputs, y)
        # Predicted class = index of the largest output per sample
        predicted = outputs.argmax(dim=1)
        accuracy = (predicted == y).sum().item() / len(y)
    return loss.item(), accuracy

# Per-epoch metric history (one entry appended per epoch)
train_losses, train_accs = [], []
val_losses, val_accs = [], []

num_epochs = 50
for epoch in range(num_epochs):
    # One optimisation pass over the training split, then score the held-out split
    train_loss, train_acc = train_epoch(model, X_train, y_train, optimizer, criterion)
    val_loss, val_acc = evaluate(model, X_val, y_val)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)

    # Only every 10th epoch is reported
    if epoch % 10 != 0:
        continue
    print(f"Epoch {epoch:2d}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

print(f"\nFinal validation accuracy: {val_accs[-1]:.4f}")

In [None]:
# Draw the loss and accuracy histories side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One row per panel: axis, train series + label, val series + label, y-label, title
panels = [
    (ax1, train_losses, 'Train Loss', val_losses, 'Val Loss',
     'Loss', 'Training and Validation Loss'),
    (ax2, train_accs, 'Train Acc', val_accs, 'Val Acc',
     'Accuracy', 'Training and Validation Accuracy'),
]

for ax, train_series, train_label, val_series, val_label, y_label, title in panels:
    ax.plot(train_series, label=train_label, color='blue')
    ax.plot(val_series, label=val_label, color='red')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

print("Training completed successfully!")

## Common Gotchas

In [None]:
print("=== COMMON MLP GOTCHAS ===")

# Gotcha 1: forgetting model.train() / model.eval().
# Dropout is only active in train mode, so the mode changes the output.
print("\n1. Train/Eval Mode:")
dropout_layers = [
    nn.Linear(10, 64),
    nn.ReLU(),
    nn.Dropout(0.5),  # drop probability 0.5
    nn.Linear(64, 1),
]
model_with_dropout = nn.Sequential(*dropout_layers)

x = torch.randn(1, 10)

# Same input, two modes (train() and eval() both return the module itself)
out_train = model_with_dropout.train()(x)
out_eval = model_with_dropout.eval()(x)

print(f"Output in train mode: {out_train.item():.4f}")
print(f"Output in eval mode: {out_eval.item():.4f}")
print("Notice how outputs can be different due to dropout!")

# Gotcha 2: picking a loss that does not match the task.
print("\n2. Loss Function Selection:")
# - classification        -> CrossEntropyLoss
# - regression            -> MSELoss or L1Loss
# - binary classification -> BCEWithLogitsLoss

ce_loss = nn.CrossEntropyLoss()
mse_loss = nn.MSELoss()

logits = torch.randn(3, 5)  # one row of 5 class scores per sample (3 samples)
targets = torch.tensor([0, 2, 4])  # one class index per sample

print(f"Correct (CrossEntropy): {ce_loss(logits, targets).item():.4f}")

# Wrong tool for this job: calling mse_loss(logits, targets) would try to
# compare (3, 5) scores element-wise with (3,) class indices, and those
# shapes do not line up, so it is deliberately not executed here.
print("MSE for classification would cause shape errors!")

# 3. Gradient explosion/vanishing
print("\n3. Gradient Issues:")

# Very deep network without proper initialization:
# 20 Linear + Sigmoid blocks followed by a final Linear layer.
deep_model = nn.Sequential()
for i in range(20):  # 20 layers!
    deep_model.add_module(f'linear_{i}', nn.Linear(64, 64))
    deep_model.add_module(f'sigmoid_{i}', nn.Sigmoid())  # Sigmoid can cause vanishing gradients
deep_model.add_module('final', nn.Linear(64, 1))

# Random regression batch for a single backward pass.
# Dedicated names are used on purpose: reusing `x`, `y` and `criterion` here
# would overwrite the dataset labels and the cross-entropy loss that the
# training cell defines under those names.
deep_inputs = torch.randn(10, 64)
deep_targets = torch.randn(10, 1)
mse_criterion = nn.MSELoss()

output = deep_model(deep_inputs)
loss = mse_criterion(output, deep_targets)
loss.backward()

# Check gradients in first vs last layer
first_layer_grad = deep_model[0].weight.grad
last_layer_grad = deep_model[-1].weight.grad

if first_layer_grad is not None and last_layer_grad is not None:
    print(f"First layer gradient norm: {first_layer_grad.norm().item():.8f}")
    print(f"Last layer gradient norm: {last_layer_grad.norm().item():.8f}")
    print("Notice how gradients can vanish in deep networks with sigmoid!")

# Gotchas 4 (learning rate) and 5 (batch size) are rules of thumb only,
# so they are printed as plain text, one line per message.
rule_of_thumb_lines = (
    # 4. Learning rate too high/low
    "\n4. Learning Rate Effects:",
    "Too high: Loss explodes or oscillates",
    "Too low: Very slow convergence",
    "Rule of thumb: Start with 1e-3 for Adam, 1e-2 for SGD",
    # 5. Batch size effects
    "\n5. Batch Size Considerations:",
    "Small batches: Noisy gradients, may help with generalization",
    "Large batches: Smooth gradients, faster training, may overfit",
    "Typical range: 16-256 for most problems",
    "\n=== END OF GOTCHAS ===",
)
for line in rule_of_thumb_lines:
    print(line)