# PyTorch Autograd: Automatic Differentiation

## 🎯 Introduction

Welcome to the magic of automatic differentiation! Autograd is PyTorch's secret weapon that makes neural network training possible. Without it, we'd be stuck computing gradients by hand for every parameter in our models - imagine doing that for GPT-3's 175 billion parameters!

### 🧠 What You'll Learn

This notebook will teach you:
- **Gradient computation**: How PyTorch automatically computes derivatives
- **Training integration**: How autograd powers neural network optimization
- **Memory management**: When to use `torch.no_grad()` and `.detach()`
- **Gradient clipping**: Preventing exploding gradients in deep networks
- **Advanced patterns**: Higher-order derivatives and optimization tricks

### 🎓 Prerequisites

- Basic calculus (derivatives, chain rule)
- Understanding of tensors from the previous notebook
- Basic knowledge of neural network training concepts

### 🚀 Why Autograd Matters

Autograd is revolutionary because:
- **Eliminates manual differentiation**: No more error-prone hand calculations
- **Handles any computation graph**: Works with any composition of differentiable tensor operations
- **Efficient backpropagation**: Gradients for all parameters are computed in one backward pass over only the operations that were actually executed
- **Dynamic graphs**: Change your network structure on-the-fly

---

## 📚 Table of Contents

1. **[Basic Gradient Computation](#basic-gradient-computation)** - Understanding the fundamentals
2. **[Optimizer Integration](#optimizer-integration)** - How gradients drive learning
3. **[No-Grad Context and Detach](#no-grad-context-and-detach)** - Memory optimization techniques
4. **[Gradient Clipping Demo](#gradient-clipping-demo)** - Preventing training instabilities, followed by a higher-order derivatives example

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# Seed PyTorch's default random number generator.
torch.manual_seed(42)

print(f"PyTorch version: {torch.__version__}")

# Pick the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Basic Gradient Computation

In [None]:
# Single-variable case
#   f(x)  = x^2 + 2x + 1
#   f'(x) = 2x + 2

# requires_grad=True asks autograd to track operations on this tensor.
x = torch.tensor(3.0, requires_grad=True)
print(f"x = {x.item()}, requires_grad = {x.requires_grad}")

# Forward pass: evaluate the polynomial at x.
y = x.pow(2) + 2 * x + 1
print(f"y = f(x) = x^2 + 2x + 1 = {y.item()}")

# Backward pass: fills x.grad with dy/dx.
y.backward()

for report in (
    f"dy/dx = {x.grad.item()}",
    f"Analytical derivative at x=3: 2*3 + 2 = {2*3 + 2}",
    f"Match: {abs(x.grad.item() - (2*3 + 2)) < 1e-6}",
):
    print(report)

# Two-variable case
#   f(x, y) = x^2*y + x*y^2
print("\n=== Multi-variable Example ===")
x, y = (torch.tensor(v, requires_grad=True) for v in (1.0, 2.0))

z = x.pow(2) * y + x * y.pow(2)
print(f"f(1,2) = 1^2*2 + 1*2^2 = {z.item()}")

# One backward call produces the partial derivative for each input.
z.backward()

print(f"∂f/∂x = {x.grad.item()} (should be 2xy + y^2 = 2*1*2 + 2^2 = {2*1*2 + 2**2})")
print(f"∂f/∂y = {y.grad.item()} (should be x^2 + 2xy = 1^2 + 2*1*2 = {1**2 + 2*1*2})")

In [None]:
# Show that autograd's result agrees with the chain rule worked by hand.
print("=== Chain Rule Demo ===")

x = torch.tensor(2.0, requires_grad=True)

# Build f(g(h(x))) one stage at a time:
#   h(x) = x^2    ->  h'(x) = 2x
#   g(h) = h + 1  ->  g'(h) = 1
#   f(g) = g^3    ->  f'(g) = 3g^2
h = x.pow(2)
g = h + 1
f = g.pow(3)

for stage in (
    f"x = {x.item()}",
    f"h(x) = x^2 = {h.item()}",
    f"g(h) = h + 1 = {g.item()}",
    f"f(g) = g^3 = {f.item()}",
):
    print(stage)

# Backpropagate from the outermost value down to x.
f.backward()

print(f"\nComputed gradient: df/dx = {x.grad.item()}")

# By hand: df/dx = (df/dg) * (dg/dh) * (dh/dx)
#                = 3*(x^2 + 1)^2 * 1 * 2x
#                = 6x*(x^2 + 1)^2
x_val = x.item()
manual_grad = 6 * x_val * (x_val**2 + 1)**2
print(f"Manual chain rule: df/dx = 6x(x^2 + 1)^2 = {manual_grad}")
print(f"Match: {abs(x.grad.item() - manual_grad) < 1e-6}")

## Optimizer Integration

In [None]:
# Simple optimization example: minimize f(x) = (x - 5)^2
print("=== Optimization Example ===")
print("Minimizing f(x) = (x - 5)^2 using gradient descent")

# Starting point: x is the only value handed to the optimizer.
x = torch.tensor(0.0, requires_grad=True)
optimizer = optim.SGD([x], lr=0.1)

# Each record is (step, x used in the forward pass, loss at that x).
history = []

for step in range(20):
    # Zero gradients (important!): backward() adds to .grad rather than
    # overwriting it.
    optimizer.zero_grad()

    # Forward pass
    loss = (x - 5)**2

    # Snapshot x together with the loss computed from it. optimizer.step()
    # below updates x in place, so reading x afterwards would pair the
    # *next* iterate with *this* step's loss.
    x_value, loss_value = x.item(), loss.item()

    # Backward pass
    loss.backward()

    # Update parameters
    optimizer.step()

    # Track progress
    history.append((step, x_value, loss_value))

    if step % 5 == 0:
        print(f"Step {step:2d}: x = {x_value:.4f}, loss = {loss_value:.4f}")

# x now holds the value after the final update.
print(f"\nFinal result: x = {x.item():.4f} (target: 5.0)")
print(f"Error: {abs(x.item() - 5.0):.6f}")

In [None]:
# Split the recorded (step, x, loss) tuples into three parallel series.
steps, x_vals, losses = zip(*history)

# Two side-by-side panels: parameter trajectory (left) and loss (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: x per step, with a dashed reference line at the target value.
ax1.plot(steps, x_vals, 'b-o', markersize=4)
ax1.axhline(y=5.0, color='r', linestyle='--', label='Target (x=5)')
ax1.set(xlabel='Step', ylabel='x value', title='Parameter Convergence')
ax1.grid(True)
ax1.legend()

# Right panel: loss per step on a logarithmic y-axis.
ax2.plot(steps, losses, 'r-o', markersize=4)
ax2.set(xlabel='Step', ylabel='Loss', title='Loss Convergence')
ax2.set_yscale('log')
ax2.grid(True)

plt.tight_layout()
plt.show()

print("Optimization completed successfully!")

## No-Grad Context and Detach

In [None]:
print("=== No-Grad Context ===")

x = torch.tensor(2.0, requires_grad=True)

# Default behaviour: an op on a tracked tensor yields a tracked result.
y1 = x.pow(2)
print(f"y1 = x^2, requires_grad = {y1.requires_grad}")

# The same op inside torch.no_grad() is not recorded.
with torch.no_grad():
    y2 = x.pow(2)
print(f"y2 = x^2 (in no_grad), requires_grad = {y2.requires_grad}")

# .detach() gives a tensor that no longer requires grad, so ops on it
# are not recorded either.
x_detached = x.detach()
y3 = x_detached.pow(2)
print(f"y3 = x.detach()^2, requires_grad = {y3.requires_grad}")

# Practical example: comparing a training-style and an eval-style forward pass
print("\n=== Practical Example: Model Evaluation ===")

model = nn.Linear(5, 1)
x_data = torch.randn(10, 5)

# Training-style forward pass: the output stays attached to the graph.
model.train()
output_train = model(x_data)
print(f"Training mode - output requires_grad: {output_train.requires_grad}")

# Evaluation-style forward pass: no graph is recorded under no_grad.
model.eval()
with torch.no_grad():
    output_eval = model(x_data)
print(f"Eval mode - output requires_grad: {output_eval.requires_grad}")

print("\nUsing no_grad during evaluation:")
for benefit in (
    "✓ Saves memory (no gradient computation graph)",
    "✓ Faster inference",
    "✓ Prevents accidental gradient updates",
):
    print(benefit)

## Gradient Clipping Demo

In [None]:
print("=== Gradient Clipping Demo ===")

# A deliberately badly-initialised model, used to provoke large gradients.
class SimpleModel(nn.Module):
    """One 2-input, 1-output linear layer with all weights 10 and zero bias."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 1)
        # Overwrite the default initialisation: large constant weights are
        # what make the gradients blow up in this demo.
        nn.init.constant_(self.linear.weight, 10.0)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        out = self.linear(x)
        return out

def compute_grad_norm(parameters):
    """Return the L2 norm of all gradients in ``parameters`` taken together.

    Args:
        parameters: Iterable of tensors; entries whose ``.grad`` is ``None``
            are skipped.

    Returns:
        ``sqrt(sum(norm_i ** 2))`` as a Python float, where ``norm_i`` is the
        L2 norm of the i-th gradient. ``0.0`` when no entry has a gradient.
    """
    # Read p.grad directly rather than through the legacy .data attribute.
    squared_sum = sum(
        p.grad.norm(2).item() ** 2 for p in parameters if p.grad is not None
    )
    return squared_sum ** 0.5


model = SimpleModel()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Create some data that will cause large gradients
x = torch.tensor([[10.0, 10.0], [5.0, 5.0], [-10.0, -10.0]])
y_true = torch.tensor([[1.0], [0.5], [-1.0]])

print("Training without gradient clipping:")
gradient_norms = []

for step in range(5):
    optimizer.zero_grad()

    y_pred = model(x)
    loss = nn.functional.mse_loss(y_pred, y_true)
    loss.backward()

    # Gradient norm as produced by backward(), with no clipping applied
    total_norm = compute_grad_norm(model.parameters())
    gradient_norms.append(total_norm)

    print(f"Step {step}: Loss = {loss.item():.4f}, Grad norm = {total_norm:.4f}")

    optimizer.step()

print(f"\nAverage gradient norm: {np.mean(gradient_norms):.4f}")

# Reset model (and its optimizer) so the second run starts from the same
# constant initialisation as the first.
model = SimpleModel()
optimizer = optim.SGD(model.parameters(), lr=0.1)

print("\nTraining WITH gradient clipping (max_norm=1.0):")
clipped_gradient_norms = []

for step in range(5):
    optimizer.zero_grad()

    y_pred = model(x)
    loss = nn.functional.mse_loss(y_pred, y_true)
    loss.backward()

    # Clip gradients in place, after backward() and before step()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Gradient norm after clipping (this is what the optimizer will see)
    total_norm = compute_grad_norm(model.parameters())
    clipped_gradient_norms.append(total_norm)

    print(f"Step {step}: Loss = {loss.item():.4f}, Clipped grad norm = {total_norm:.4f}")

    optimizer.step()

print(f"\nAverage clipped gradient norm: {np.mean(clipped_gradient_norms):.4f}")
print("\nGradient clipping benefits:")
print("✓ Prevents exploding gradients")
print("✓ Stabilizes training")
print("✓ Allows higher learning rates")
print("✓ Essential for RNNs and deep networks")

In [None]:
# Advanced autograd example: Higher-order derivatives
print("=== Higher-Order Derivatives ===")

x = torch.tensor(2.0, requires_grad=True)

# f(x) = x^4
y = x**4

# First derivative: f'(x) = 4x^3
# create_graph=True records the derivative computation itself, so the
# returned gradient can be differentiated again.
grad1 = torch.autograd.grad(y, x, create_graph=True)[0]
print(f"f(x) = x^4 = {y.item()}")
print(f"f'(x) = 4x^3 = {grad1.item()} (analytical: {4 * x.item()**3})")

# Second derivative: f''(x) = 12x^2
# Still needs create_graph=True because grad2 is differentiated below.
grad2 = torch.autograd.grad(grad1, x, create_graph=True)[0]
print(f"f''(x) = 12x^2 = {grad2.item()} (analytical: {12 * x.item()**2})")

# Third derivative: f'''(x) = 24x
# Nothing differentiates grad3, so no further graph is requested here.
grad3 = torch.autograd.grad(grad2, x)[0]
print(f"f'''(x) = 24x = {grad3.item()} (analytical: {24 * x.item()})")

print("\n🎉 Autograd exploration completed!")
print("\nKey takeaways:")
print("• requires_grad=True enables gradient tracking")
print("• .backward() computes gradients")
print("• optimizer.zero_grad() clears old gradients")
print("• torch.no_grad() disables gradient computation")
print("• Gradient clipping prevents exploding gradients")