# Coding Task (Solution): Adam vs GD Learning Rate Sensitivity
**Objective:** Compare the optimization paths of Adam and gradient descent (GD) at different learning rates on the following Rosenbrock function:
$$
f(x, y) = (1 - x)^2 + 10 (y - x^2)^2,
$$
which has a unique global minimum at (1,1).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline

# Rosenbrock function and gradient
def f(x, y):
    """Evaluate the Rosenbrock-type objective (1 - x)^2 + 10 (y - x^2)^2.

    Only arithmetic operators are used, so the inputs may be scalars or
    NumPy arrays (arrays are evaluated element-wise).
    """
    distance_term = (1 - x) ** 2
    valley_term = (y - x ** 2) ** 2
    return distance_term + 10 * valley_term

def grad(x, y):
    """Return the gradient of ``f`` at ``(x, y)`` as the array ``[df/dx, df/dy]``."""
    # Both partial derivatives share the factor (y - x^2).
    residual = y - x**2
    d_dx = -2*(1 - x) - 40*x*residual
    d_dy = 20*residual
    return np.array([d_dx, d_dy])

## 2. Function Visualization

In [None]:
# Sample the plotting window [-2, 2] x [-1, 3] on a 100 x 100 grid.
x = np.linspace(-2, 2, num=100)
y = np.linspace(-1, 3, num=100)
X, Y = np.meshgrid(x, y)
Z = f(X, Y)  # objective value at every grid node

# Filled contour plot of the objective with its minimiser marked.
plt.figure(figsize=(10, 6))
cs = plt.contourf(X, Y, Z, levels=50, cmap=cm.viridis)
plt.plot(1, 1, 'r*', markersize=15, label='Global Minimum')
plt.colorbar(cs)
plt.title("Rosenbrock Function Contour Plot")
plt.legend()
plt.show()

## 3. Optimization Algorithms

In [None]:
def gd(start, lr, n_iters=100):
    """Run vanilla gradient descent and record every iterate.

    Args:
        start: Initial point. Any real-valued array-like accepted by
            ``np.array`` (ndarray, list or tuple); its components are
            unpacked as the positional arguments of ``grad``. The caller's
            object is never modified.
        lr: Learning rate (step size) multiplying the gradient.
        n_iters: Number of update steps to perform.

    Returns:
        Float array with ``n_iters + 1`` rows: the starting point followed
        by the point reached after each update.
    """
    # Work on a private float copy. With an integer-dtype ``start`` the
    # in-place update below would raise a casting error, and tuples have
    # no ``.copy()`` method at all.
    p = np.array(start, dtype=float)
    path = [p.copy()]
    for _ in range(n_iters):
        p -= lr * grad(*p)
        # Store a snapshot: ``p`` itself keeps being mutated in place.
        path.append(p.copy())
    return np.array(path)

def adam(start, lr, beta1=0.9, beta2=0.999, eps=1e-8, n_iters=100):
    """Run the Adam optimizer and record every iterate.

    Args:
        start: Initial point. Any real-valued array-like accepted by
            ``np.array`` (ndarray, list or tuple); its components are
            unpacked as the positional arguments of ``grad``. The caller's
            object is never modified.
        lr: Learning rate (base step size).
        beta1: Decay rate of the running average of the gradient.
        beta2: Decay rate of the running average of the squared gradient.
        eps: Small constant added to the denominator to avoid division
            by zero.
        n_iters: Number of update steps to perform.

    Returns:
        Float array with ``n_iters + 1`` rows: the starting point followed
        by the point reached after each update.
    """
    # Work on a private float copy. With an integer-dtype ``start`` the
    # in-place update below would raise a casting error, and tuples have
    # no ``.copy()`` method at all.
    p = np.array(start, dtype=float)
    path = [p.copy()]
    v = np.zeros_like(p)  # running average of gradients (first moment)
    s = np.zeros_like(p)  # running average of squared gradients (second moment)

    # t starts at 1 so the bias-correction denominators are never zero.
    for t in range(1, n_iters + 1):
        g = grad(*p)
        v = beta1 * v + (1 - beta1) * g
        s = beta2 * s + (1 - beta2) * (g * g)
        # Both averages start at zero; dividing by (1 - beta**t) removes
        # the resulting bias towards zero in early iterations.
        v_hat = v / (1 - beta1**t)
        s_hat = s / (1 - beta2**t)
        p -= lr * v_hat / (np.sqrt(s_hat) + eps)
        # Store a snapshot: ``p`` itself keeps being mutated in place.
        path.append(p.copy())
    return np.array(path)

## 4. Learning Rate Comparison

In [None]:
lrs = [0.5, 0.1, 0.05, 0.01, 0.005]  # Test learning rates
start_point = np.array([-1.5, 2.5])    # Starting point
n_iters = 100000                       # Update steps per optimizer run

plt.figure(figsize=(15, 8))

# One column per learning rate: GD on the top row, Adam on the bottom row.
for i, lr in enumerate(lrs):
    # GD
    plt.subplot(2, len(lrs), i+1)
    # n_iters is an optimizer setting, so it is passed to gd() here;
    # contourf() has no such option.
    path = gd(start_point, lr, n_iters=n_iters)
    plt.contourf(X, Y, Z, levels=30, cmap=cm.viridis, alpha=0.6)
    plt.plot(*path.T, 'r.-', linewidth=1, markersize=2)
    plt.title(f"GD lr={lr}")

    # Adam (subplot index offset by len(lrs) selects the second row)
    plt.subplot(2, len(lrs), len(lrs)+i+1)
    path = adam(start_point, lr, n_iters=n_iters)
    plt.contourf(X, Y, Z, levels=30, cmap=cm.viridis, alpha=0.6)
    plt.plot(*path.T, 'b.-', linewidth=1, markersize=2)
    plt.title(f"Adam lr={lr}")

plt.tight_layout()
plt.show()

## 5. Observations
- **GD**: Highly sensitive to learning rate
  - Large LR (>0.1): Diverges
  - Small LR (<0.001): Slow convergence
- **Adam**: Robust across LRs
  - Stable convergence even with large LR=1.0
  - Fast convergence across different LRs

### Why Is Adam More Robust?
1. **Per-parameter learning rates**: Adapts to gradient magnitudes
2. **Momentum**: Smooths gradient updates

## 6. Exercises
1. Try different start positions
2. Test with other functions (Beale, Himmelblau)
3. Compare number of iterations to convergence
4. Implement RMSprop and compare