<a href="https://colab.research.google.com/github/scaomath/wustl-math450/blob/main/Lectures/Math_450_Notebook_11_(Momentum).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coding lecture 11 of Math 450

## Last couple of weeks
- A complete pipeline of training a machine learning model
- Validation

## Today
- Add momentum to the stochastic gradient descent (SGD) update formula.

In [None]:
import torch
import numpy as np
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Optimizer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

import warnings
warnings.filterwarnings("ignore")

In [None]:
# MNIST training split, stored under the current directory and downloaded
# on first use; ToTensor converts every image to a tensor.
train = datasets.MNIST(
    root='./',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

# Serve the full training split in mini-batches of 8 samples.
train_loader = DataLoader(train, batch_size=8)

class MLP(nn.Module):
    """Two-layer perceptron: flatten -> Linear -> ReLU -> Linear.

    Args:
        input_size: number of features per sample once flattened
            (default: 28*28).
        output_size: number of output scores per sample (default: 10).
    """

    def __init__(self,
                 input_size: int = 28*28,
                 output_size: int = 10):
        super().__init__()
        hidden_size = 256
        self.linear0 = nn.Linear(input_size, hidden_size)
        self.activation = nn.ReLU()
        self.linear1 = nn.Linear(hidden_size, output_size)
        # A dropout layer with p=0.1 is registered on the module,
        # but forward() below does not apply it.
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return the output scores for the batch ``x``.

        Every dimension after the first is flattened before the first
        linear layer, so the trailing dimensions of ``x`` must multiply
        to ``input_size``.
        """
        flat = x.view(x.size(0), -1)
        hidden = self.activation(self.linear0(flat))
        return self.linear1(hidden)

In [None]:
class SGD(Optimizer):
    """
    Implements SGD with momentum, simplified
    from the official torch one for Math 450 WashU.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate (default: 1e-3)
        momentum (float, optional): momentum factor (default: 0)
        dampening (float, optional): dampening applied to the gradient when it
            is accumulated into an existing momentum buffer (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        nesterov (bool, optional): whether to use Nesterov's momentum
            (default: False). The flag is stored in the parameter group but
            step() does not use it yet; see the note below.

    For final project:
        update a version with nesterov's momentum in it

    Example:
        >>> optimizer = SGD(model.parameters(), lr=1e-2)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()
    """

    def __init__(self, params, lr=1e-3, momentum=0, dampening=0,
                 weight_decay=0,
                 nesterov=False,
                 ):
        defaults = dict(lr=lr,
                        momentum=momentum,
                        dampening=dampening,
                        weight_decay=weight_decay,
                        nesterov=nesterov,
                        )
        super().__init__(params, defaults)

    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (callable, optional): a function that re-evaluates the
                model and returns the loss. It is called once, with gradient
                tracking enabled, before the parameters are updated.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure
            is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']

            for p in group['params']:
                if p.grad is None:
                    # parameter did not take part in the last backward pass
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    # out-of-place on purpose: p.grad itself is left untouched
                    d_p = d_p.add(p.data, alpha=weight_decay)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        # first step: the buffer starts as a copy of the
                        # gradient (no dampening is applied here)
                        buffer = param_state['momentum_buffer'] = d_p.clone()
                    else:
                        buffer = param_state['momentum_buffer']
                        # buffer <- momentum * buffer + (1 - dampening) * d_p
                        buffer.mul_(momentum).add_(d_p, alpha=1 - dampening)
                    d_p = buffer

                # p <- p - lr * d_p, done through .data so autograd
                # does not record the update
                p.data.add_(d_p, alpha=-lr)

        return loss

In [None]:
# Hyper-parameters for this run.
epochs = 2
learning_rate = 1e-3

# Freshly initialized network.
model = MLP()
# Cross-entropy loss between the model outputs and the class labels.
loss_func = nn.CrossEntropyLoss()
# momentum is left at its default (0), so this is plain SGD.
optimizer = SGD(model.parameters(), lr=learning_rate)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep only the first 10000 samples, with the images cast to float.
# NOTE(review): train.data is read directly, presumably bypassing the
# ToTensor transform given to the dataset - confirm the pixel scale.
X = train.data[:10000].float()
y = train.targets[:10000]
print(X.shape, y.shape)

# 80% of the samples go to training, the rest to validation;
# random_state is fixed so the split is reproducible.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, train_size=0.8, random_state=0
)

In [None]:
# Show the shapes of the training and validation inputs.
print(X_tr.shape, X_val.shape)

In [None]:
# Pair inputs with labels for each split.
train_set = TensorDataset(X_tr, y_tr)
valid_set = TensorDataset(X_val, y_val)

# Mini-batches of 32 for both splits; this re-binds train_loader,
# replacing the loader created over the full MNIST training set.
train_loader = DataLoader(train_set, batch_size=32)
val_loader = DataLoader(valid_set, batch_size=32)

In [None]:
# pipeline: one pass over train_loader per epoch, then one validation pass
for epoch in range(epochs):

    model.train()  # training mode; matters once layers such as dropout are applied

    loss_vals = []

    with tqdm(total=len(train_loader)) as pbar:  # progress bar
        for data, targets in train_loader:

            # forward pass
            outputs = model(data)

            # loss function
            loss = loss_func(outputs, targets)

            # record the loss value as a plain Python float
            loss_vals.append(loss.item())

            # clear the gradients left over from the previous iteration
            optimizer.zero_grad()

            # backprop (autograd)
            loss.backward()

            # stochastic gradient descent
            # no torch.no_grad() block needed: the optimizer updates through .data
            optimizer.step()

            # tqdm template
            desc = f"epoch: [{epoch+1}/{epochs}] loss: {np.mean(loss_vals):.2f}"
            pbar.set_description(desc)
            pbar.update()

    # Validation runs once per epoch, after all parameter updates of the
    # epoch, in eval mode and without gradient tracking.
    model.eval()
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        # distinct loop names so the global label tensor `y` is not overwritten
        for val_inputs, val_targets in val_loader:
            predictions = model(val_inputs).argmax(dim=-1)
            num_correct += (predictions == val_targets).sum().item()
            num_seen += val_targets.numel()

    # Count correct predictions over all samples, so a smaller last batch
    # does not bias the result the way a mean of per-batch means would.
    val_acc = num_correct / num_seen if num_seen else float("nan")
    print(f"accuracy on validation: {val_acc:.2f}")

# Explicit gradient checking


$$f = \sum_{i} \left(3a_i^3 - b_i^2\right)$$

In [None]:
# Two leaf tensors whose gradients are tracked by autograd.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

# Scalar objective: the sum over components of 3*a^3 - b^2.
f = torch.sum(3 * a**3 - b**2)

In [None]:
# Re-bind `optimizer` to a new SGD instance that updates a and b directly.
optimizer = SGD(params=[a, b], lr=learning_rate)

In [None]:
# Backpropagate through f. Since f = sum(3*a^3 - b^2), the analytic
# gradients to compare against are df/da = 9*a^2 and df/db = -2*b,
# which autograd stores in a.grad and b.grad.
f.backward()
# One optimizer step on a and b using the gradients just computed.
optimizer.step()