# 6. Learning Rate and Decay

The learning rate controls how large a step we take each time the weights are updated.
If it is too big, the updates overshoot the minimum; if it is too small, learning is slow.
Learning rate decay gradually reduces the learning rate over the course of training.


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt


## 1. What is Learning Rate?

The learning rate determines the step size of each weight update.
Weight update: new_weight = old_weight - learning_rate × gradient
A bigger learning rate means bigger steps; a smaller learning rate means smaller steps.


In [None]:
# Example: Gradient descent with different learning rates
# Simulate finding minimum of f(x) = (x - 2)²
# Gradient: f'(x) = 2(x - 2)
# Minimum is at x = 2

def gradient(x):
    """Return the derivative of f(x) = (x - 2)² at x, i.e. 2(x - 2).

    The result is zero at the minimum x = 2, positive to the right of it
    and negative to the left of it.
    """
    distance_from_minimum = x - 2
    return 2 * distance_from_minimum

# Starting point
x_start = 5.0

# Different learning rates
learning_rates = [0.1, 0.5, 1.0, 2.0]

# Number of update steps taken for every learning rate
steps = 10


def _descent_path(start, lr, num_steps):
    """Return the points visited by gradient descent on f(x) = (x - 2)².

    The returned list begins with ``start`` and gains one entry per update,
    so it holds ``num_steps + 1`` values.
    """
    x = start
    path = [x]
    for _ in range(num_steps):
        x = x - lr * gradient(x)  # Update: x = x - lr * gradient
        path.append(x)
    return path


# Run each descent once; the same paths feed both the printout and the plot.
paths = [_descent_path(x_start, lr, steps) for lr in learning_rates]

print("Gradient Descent with Different Learning Rates:")
print("Finding minimum of f(x) = (x - 2)² (minimum at x = 2)")
print(f"Starting point: x = {x_start}")
print()

for lr, path in zip(learning_rates, paths):
    final_x = path[-1]
    print(f"Learning rate {lr:3.1f}: Final x = {final_x:.4f} (target: 2.0)")
    print(f"              Steps: {path[:5]} ... → {final_x:.4f}")

# Visualize convergence
x_range = np.linspace(-1, 6, 100)
f_x = (x_range - 2) ** 2

plt.figure(figsize=(12, 8))
plt.plot(x_range, f_x, 'k-', linewidth=2, label='f(x) = (x-2)²', alpha=0.3)

colors = ['b', 'g', 'orange', 'r']
for lr, path, color in zip(learning_rates, paths, colors):
    plt.plot(path, [(p - 2) ** 2 for p in path], 'o-',
             color=color, linewidth=2, label=f'LR={lr}', alpha=0.7)

plt.axvline(x=2, color='k', linewidth=1, linestyle='--', label='Minimum (x=2)')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Gradient Descent with Different Learning Rates')
plt.grid(True, alpha=0.3)
plt.legend()
plt.xlim(-1, 6)
# The LR=2.0 path diverges (f(x) reaches ~3e10 after 10 steps). Without an
# explicit y-limit the autoscaled axis would squash the parabola and all
# converging paths into a flat line, so clamp y to the visible x-window.
plt.ylim(-0.5, f_x.max() + 0.5)
plt.show()

print("\nObservations:")
print("- LR too small (0.1): Converges slowly but safely")
print("- LR moderate (0.5): Good balance")
print("- LR too large (2.0): Overshoots, oscillates, may diverge!")


## 2. Simple Gradient Descent

Basic gradient descent repeatedly updates the weights by subtracting the learning rate times the gradient of the loss.
Let's implement it!


In [None]:
# Simple gradient descent example
# Goal: Find weights that minimize loss = (prediction - target)²
# Simple model: prediction = weight × input

# Data
inputs = torch.tensor([1.0, 2.0, 3.0])
targets = torch.tensor([2.0, 4.0, 6.0])  # Target: weight = 2.0

# Initialize weight
weight = torch.tensor(1.0, requires_grad=False)  # Start at 1.0

# Learning rate
learning_rate = 0.1

print("Simple Gradient Descent:")
print("Model: prediction = weight × input")
print("Target weight: 2.0")
print(f"Initial weight: {weight.item():.2f}")
print(f"Learning rate: {learning_rate}")
print()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Forward pass
    predictions = weight * inputs

    # Loss (mean squared error)
    errors = predictions - targets
    loss = (errors ** 2).mean()

    # Gradient (manual calculation)
    # d(loss)/d(weight) = 2 * mean((weight*input - target) * input)
    # Stored as `grad` so this tensor does not overwrite the gradient()
    # function defined earlier in this file.
    grad = 2 * (errors * inputs).mean()

    # Update weight
    weight = weight - learning_rate * grad

    # Note: the weight shown is the value AFTER this epoch's update, while
    # loss and gradient were computed BEFORE it.
    if epoch < 5 or epoch % 2 == 0:
        print(f"Epoch {epoch:2d}: weight = {weight.item():.4f}, loss = {loss.item():.4f}, gradient = {grad.item():.4f}")

print(f"\nFinal weight: {weight.item():.4f} (target: 2.0)")
print(f"Converged: {abs(weight.item() - 2.0) < 0.01}")
