# Optimization for Neural Networks

This notebook contains PyTorch examples demonstrating optimization concepts essential for understanding neural networks.

## Table of Contents
1. [Gradient Descent](#gradient-descent)
2. [Adam Optimizer](#adam-optimizer)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Gradient Descent

**Formula:** $\theta_{t+1} = \theta_t - \alpha \nabla_\theta \mathcal{L}(\theta_t)$

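Gradient descent is the fundamental learning algorithm for neural networks: each step moves the parameters $\theta$ against the gradient of the loss, scaled by the learning rate $\alpha$.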

In [None]:
# Manual gradient descent
def manual_gradient_descent(start=(2.0, -3.0), learning_rate=0.1, steps=50, log_every=10):
    """Minimise the quadratic f(x) = sum(x_i ** 2) with hand-written gradient descent.

    The iterate starts away from the minimum at the origin so that the
    gradient is non-zero and the printed trace actually shows convergence.

    Args:
        start: Initial point, any sequence of numbers.
        learning_rate: Step size applied to the gradient.
        steps: Number of update steps to perform.
        log_every: Print the current iterate and its loss every this many
            steps; a falsy value (0 or None) disables printing.

    Returns:
        The final iterate as a tensor detached from the autograd graph.
    """
    x = torch.tensor(start, dtype=torch.float32, requires_grad=True)

    for step in range(steps):
        loss = (x ** 2).sum()  # Simple quadratic function
        loss.backward()

        # Log before updating so the printed x and loss describe the same iterate.
        if log_every and step % log_every == 0:
            print(f"Step {step}: x = {x.detach().numpy()}, loss = {loss.item():.6f}")

        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            x -= learning_rate * x.grad
            # backward() accumulates into .grad, so clear it for the next step.
            x.grad.zero_()

    return x.detach()

manual_gradient_descent()

# Comparing optimizers
# A reference model supplies the shared starting weights; the regression
# problem itself is random inputs mapped to random targets.
model = torch.nn.Linear(10, 1)
target = torch.randn(100, 1)
data = torch.randn(100, 10)

# SGD vs Adam comparison
losses_sgd, losses_adam = [], []
optimizer_runs = (
    (torch.optim.SGD, losses_sgd),
    (torch.optim.Adam, losses_adam),
)

for make_optimizer, history in optimizer_runs:
    # Every run trains its own copy that begins from the reference weights.
    model_copy = torch.nn.Linear(10, 1)
    model_copy.load_state_dict(model.state_dict())  # Same initialization
    optimizer = make_optimizer(model_copy.parameters(), lr=0.01)

    for _ in range(100):
        optimizer.zero_grad()
        prediction = model_copy(data)
        loss = torch.nn.functional.mse_loss(prediction, target)
        loss.backward()
        optimizer.step()
        # Record the loss measured before this step's weight update.
        history.append(loss.item())

print(f"Final loss - SGD: {losses_sgd[-1]:.6f}, Adam: {losses_adam[-1]:.6f}")

## Adam Optimizer

**Formula:** $m_t = \beta_1 m_{t-1} + (1-\beta_1)g_t$, $v_t = \beta_2 v_{t-1} + (1-\beta_2)g_t^2$, $\hat{m}_t = \frac{m_t}{1-\beta_1^t}$, $\hat{v}_t = \frac{v_t}{1-\beta_2^t}$, $\theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon}\hat{m}_t$

Adaptive learning rate optimizer with momentum.

In [None]:
# Manual Adam implementation
class ManualAdam:
    """Minimal Adam optimizer written out step by step.

    Keeps exponential moving averages of the gradient (first moment) and of
    the squared gradient (second moment) for every parameter, corrects their
    zero-initialisation bias, and scales each update by the corrected second
    moment.

    Args:
        params: Iterable of tensors to optimise; it is materialised into a
            list, so a generator such as ``model.parameters()`` is fine.
        lr: Learning rate.
        beta1: Decay rate of the first-moment average.
        beta2: Decay rate of the second-moment average.
        eps: Small constant added to the denominator for numerical stability.

    Attributes:
        t: Number of times ``step()`` has been called.
        steps: Per-parameter count of updates actually applied.
        m: Per-parameter first-moment estimates.
        v: Per-parameter second-moment estimates.
    """

    def __init__(self, params, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.params = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.t = 0
        # Bias correction must use the number of updates each parameter has
        # received, not the number of step() calls: a parameter whose gradient
        # first appears late still has freshly zero-initialised moments.
        self.steps = [0] * len(self.params)
        self.m = [torch.zeros_like(p) for p in self.params]
        self.v = [torch.zeros_like(p) for p in self.params]

    def zero_grad(self):
        """Clear the gradients of all managed parameters."""
        for param in self.params:
            param.grad = None

    @torch.no_grad()
    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        self.t += 1
        for i, param in enumerate(self.params):
            grad = param.grad
            if grad is None:
                continue

            self.steps[i] += 1
            t = self.steps[i]

            # Update first and second moment estimates
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad ** 2

            # Bias correction
            m_hat = self.m[i] / (1 - self.beta1 ** t)
            v_hat = self.v[i] / (1 - self.beta2 ** t)

            # Update parameters in place; autograd is disabled by the decorator.
            param -= self.lr * m_hat / (torch.sqrt(v_hat) + self.eps)

# Compare manual vs PyTorch Adam
# Two linear models share one initialisation so that any difference in the
# loss curves comes from the optimizer implementations alone.
model1 = torch.nn.Linear(5, 1)
model2 = torch.nn.Linear(5, 1)
model2.load_state_dict(model1.state_dict())

manual_adam = ManualAdam(model1.parameters())
pytorch_adam = torch.optim.Adam(model2.parameters())

# One fixed random batch, reused for every epoch and for both models.
x = torch.randn(32, 5)
y = torch.randn(32, 1)

for epoch in range(10):
    # Manual Adam: forward pass, clear stale gradients, backward pass, update.
    prediction1 = model1(x)
    loss1 = torch.nn.functional.mse_loss(prediction1, y)
    model1.zero_grad()
    loss1.backward()
    manual_adam.step()

    # PyTorch Adam: the same sequence driven by the built-in optimizer.
    prediction2 = model2(x)
    loss2 = torch.nn.functional.mse_loss(prediction2, y)
    pytorch_adam.zero_grad()
    loss2.backward()
    pytorch_adam.step()

    # Report on epochs 0, 3, 6 and 9.
    if not epoch % 3:
        print(f"Epoch {epoch}: Manual Adam loss: {loss1:.6f}, PyTorch Adam loss: {loss2:.6f}")