# 7. Adam Optimizer

Adam (Adaptive Moment Estimation) is a popular optimizer!
It adapts the learning rate for each parameter based on past gradients,
combining ideas from momentum and RMSprop.


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt


## 1. What is Adam?

Adam adapts the learning rate per parameter!
It keeps track of:
- **First moment (m)**: Exponential moving average of gradients (momentum)
- **Second moment (v)**: Exponential moving average of squared gradients

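Both averages start at zero, so at step t they are bias-corrected: m_hat = m / (1 - beta1^t) and v_hat = v / (1 - beta2^t).

Then it updates: weight = weight - lr × m_hat / (√v_hat + eps)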


In [None]:
class AdamOptimizer:
    """Adam optimizer from scratch.

    For every parameter it keeps an exponential moving average of the
    gradients (first moment ``m``) and of the squared gradients (second
    moment ``v``), and scales each update by their bias-corrected values.
    """

    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        """
        params: dictionary of parameters {name: tensor}
        lr: learning rate (must be >= 0)
        beta1: decay rate for first moment (momentum), in [0, 1)
        beta2: decay rate for second moment, in [0, 1)
        eps: small constant for numerical stability (must be >= 0)

        Raises ValueError if a hyperparameter is outside its valid range.
        """
        # Reject values that would silently corrupt the parameters later:
        # a decay rate of exactly 1 makes the bias-correction denominator
        # (1 - beta ** t) zero, which turns every update into NaN/inf.
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= beta1 < 1.0:
            raise ValueError(f"Invalid beta1 (expected 0 <= beta1 < 1): {beta1}")
        if not 0.0 <= beta2 < 1.0:
            raise ValueError(f"Invalid beta2 (expected 0 <= beta2 < 1): {beta2}")
        if eps < 0.0:
            raise ValueError(f"Invalid eps: {eps}")

        self.params = params
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps

        # Initialize moments (all zeros), one entry per parameter
        self.m = {name: torch.zeros_like(param) for name, param in params.items()}
        self.v = {name: torch.zeros_like(param) for name, param in params.items()}

        # Time step (number of calls to step() so far)
        self.t = 0

    def step(self, gradients):
        """
        gradients: dictionary of gradients {name: gradient}
        Update parameters using Adam

        Each entry of the params dictionary is rebound to a new tensor; the
        tensor objects originally stored in the dictionary are not modified
        in place, so read the updated values back from the dictionary.

        Raises KeyError if gradients has no entry for one of the parameters.
        """
        self.t += 1

        # The bias-correction denominators depend only on the step count,
        # so compute them once instead of once per parameter.
        m_correction = 1 - self.beta1 ** self.t
        v_correction = 1 - self.beta2 ** self.t

        # Only values of existing keys are reassigned below, which is safe
        # while iterating over the dictionary.
        for name, param in self.params.items():
            grad = gradients[name]

            # Update first moment (momentum)
            self.m[name] = self.beta1 * self.m[name] + (1 - self.beta1) * grad

            # Update second moment (squared gradients)
            self.v[name] = self.beta2 * self.v[name] + (1 - self.beta2) * (grad ** 2)

            # Bias correction: the moments start at zero, so early averages
            # are rescaled to compensate
            m_hat = self.m[name] / m_correction
            v_hat = self.v[name] / v_correction

            # Update parameter
            self.params[name] = param - self.lr * m_hat / (torch.sqrt(v_hat) + self.eps)

# Demo: minimise f(x) = (x - 2)², whose derivative is f'(x) = 2(x - 2)

# Starting point, wrapped in the {name: tensor} dictionary the optimizer takes
x = torch.tensor(5.0)
params = {'x': x}
optimizer = AdamOptimizer(params, lr=0.1)

print("Adam Optimizer:")
print(f"Initial x: {x.item():.2f}")
print(f"Target: x = 2.0 (minimum of f(x) = (x-2)²)")
print()

# Run 20 Adam updates
for epoch in range(20):
    # Analytic gradient at the current x (evaluated before the update)
    grad = (params['x'] - 2) * 2
    gradients = {'x': grad}

    # One Adam update; the new value is read back from params['x']
    optimizer.step(gradients)

    # Loss at the updated x
    loss = (params['x'] - 2).pow(2)

    # Report the first five epochs, then every fifth one
    if epoch < 5 or epoch % 5 == 0:
        print(", ".join([
            f"Epoch {epoch:2d}: x = {params['x'].item():.4f}",
            f"grad = {grad.item():.4f}",
            f"loss = {loss.item():.6f}",
            f"m = {optimizer.m['x'].item():.4f}",
            f"v = {optimizer.v['x'].item():.4f}",
        ]))

print(f"\nFinal x: {params['x'].item():.4f} (target: 2.0)")
