<a href="https://colab.research.google.com/github/scaomath/wustl-math450/blob/main/Lectures/Math_450_Notebook_11_(Momentum).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Coding lecture 11 of Math 450

## Last couple of weeks
- A complete pipeline of training a machine learning model
- Validation

## Today
- Add momentum to the formula.

In [34]:
import torch
import numpy as np
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Optimizer
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("dark")

import warnings
warnings.filterwarnings("ignore")

In [35]:
train = datasets.MNIST(root='./', 
                       train=True, 
                       download=True, 
                       transform = transforms.ToTensor());

train_loader = DataLoader(train, batch_size=8) 

class MLP(nn.Module):
    def __init__(self, 
                 input_size: int = 28*28,
                 output_size: int = 10):
        super(MLP, self).__init__() 
        self.linear0 = nn.Linear(input_size, 256)
        self.activation = nn.ReLU()
        self.linear1 = nn.Linear(256, output_size)
        self.dropout = nn.Dropout(0.1) 
        # 10% of the weight does not get updated: dropout
        
    def forward(self, x): 
        x = x.view(x.size(0), -1) 
        x1 = self.linear0(x)
        a1 = self.activation(x1)
        output = self.linear1(a1)

        return output

In [None]:
class SGD(Optimizer):
    """
      Implements the SGD with momentum simplified 
      from the torch official one for Math 450 WashU
      
      Args:
          params (iterable): iterable of parameters to optimize or dicts defining
              parameter groups
          lr (float): learning rate
          weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
          nesterov (bool, optional): whether to use Nesterov's momentum (default: False)
      
      For final project:
          update a version with nesterov's momentum in it
          
      Example:
          >>> optimizer = SGD(model.parameters(), lr=1e-2)
          >>> optimizer.zero_grad()
          >>> loss_fn(model(input), target).backward()
          >>> optimizer.step()
      """

    def __init__(self, params, lr=1e-3, 
                 momentum=0, # beta: momentum constant
                 dampening=0, # a dampening constant for momentum
                 weight_decay=0, # epsilon: weight decay constant
                 nesterov=False,
                 ):
      defaults = dict(lr=lr, 
                      momentum=momentum, 
                      dampening=dampening,
                      weight_decay=weight_decay,
                      nesterov=nesterov,
                      )
      super(SGD, self).__init__(params, defaults)
      

    def step(self, closure=None):

        for group in self.param_groups:
            '''
            self.param_groups: 
            dict that has every parameters of this optimizer
            as well as the weights of the NN
            '''

            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                '''
                if SGD(model.parameters(), lr=learning_rate)
                then group['params'] will be model.parameters()
                '''
                if p.grad is None:
                    '''
                    non-trainable layers
                    frozen layers (fine-tuning for downstream tasks)
                    '''
                    continue

                d_p = p.grad.data

                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)

                if momentum != 0:
                    '''
                    self.state[p]: the current SGD parameters state
                    for the NN's weights p
                    dictionary containing momentum
                    and other things
                    '''
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buffer = param_state['momentum_buffer'] \
                               = torch.zeros_like(p.data)
                        buffer.mul_(momentum).add_(d_p)
                    else:
                        # update the buffer
                        buffer = param_state['momentum_buffer']
                        buffer.mul_(momentum).add_(1 - dampening, d_p)
                    d_p = buffer

                # w_{k+1} = w_k + v_{k+1}
                p.data = p.data - group['lr']*d_p

        return loss

In [36]:
# add_
a = torch.tensor([2.0])
b = torch.tensor([3.0])
torch.add(a, b)

tensor([5.])

In [39]:
a.add(b)
print(a.add(b))
print(a)

tensor([5.])
tensor([2.])


In [41]:
torch.add(a, b, alpha=5) # a + b*5

tensor([17.])

In [43]:
a.add_(b, alpha=5)

tensor([17.])

In [44]:
print(a)

tensor([17.])


In [46]:
a.add_(5, b) # a <= a + 5*b

tensor([32.])

In [47]:
model = MLP() # initialize the model
loss_func = nn.CrossEntropyLoss() # set up the loss
# crossentropyloss is for the case of a balanced classification problem
epochs = 2
learning_rate = 1e-3
optimizer = SGD(model.parameters(), lr=learning_rate)

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X = train.data.float()[:10000]
y = train.targets[:10000]
print(X.size(), y.size())
X_tr, X_val, y_tr, y_val = \
train_test_split(X, y, random_state=0, train_size=0.8)

torch.Size([10000, 28, 28]) torch.Size([10000])


In [None]:
print(X_tr.size(), X_val.size())

In [50]:
train_set = TensorDataset(X_tr, y_tr)
train_loader = DataLoader(train_set, batch_size=32)

valid_set = TensorDataset(X_val, y_val)
val_loader = DataLoader(valid_set, batch_size=32)

In [51]:
# pipeline
for epoch in range(epochs):
    
    model.train() # formalism, useful when we have dropout
    
    loss_vals = []
    acc_on_valid = []
    
    with tqdm(total=len(train_loader)) as pbar: # progress bar
      for data, targets in train_loader:
        
        # forward pass
        outputs = model(data)
        
        # loss function
        loss = loss_func(outputs, targets)
        
        # record loss function values .item()
        loss_vals.append(loss.item())
        
        # clean the gradient from last iteration
        # param.grad is not zero in last iteration
        optimizer.zero_grad()
        
        # backprop
        # autograd
        loss.backward()
        
        # stochastic gradient descent
        # no with torch.no_grad(): block, param operation is using .data
        optimizer.step()
        
        # check accuracy (add validation here)
        with torch.no_grad():
           for x, y in val_loader:
             # x: validation image
             # y: validation target
             yhat = model(x) # yhat is (n_batch, 10)
             yhat = yhat.argmax(dim=-1) # yhat is (n_batch, )
             acc = (yhat == y).float().mean()
             acc_on_valid.append(acc)

        # tqdm template
        desc = f"epoch: [{epoch+1}/{epochs}] loss: {np.mean(loss_vals):.2f}"
        pbar.set_description(desc)
        pbar.update()
    print(f"accuracy on validation: {np.mean(acc_on_valid):.2f}")

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


accuracy on validation: 0.81


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


accuracy on validation: 0.90


# Explicit gradient checking

Element-wise loss function:
$$f = 3a^3 - b^2$$

In [80]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

f = (3*a**3 - b**2).mean()

In [81]:
print(f)

tensor(26.5000, grad_fn=<MeanBackward0>)


In [82]:
optimizer = SGD([a, b], lr=learning_rate, momentum=0)

In [83]:
optimizer.zero_grad()
f.backward()
optimizer.step();

In [85]:
for group in optimizer.param_groups:
  for p in group['params']:
    print(p)
    print(optimizer.state[p])
    # first step there when there is no momentum

tensor([1.9820, 2.9595], requires_grad=True)
{}
tensor([6.0060, 4.0040], requires_grad=True)
{}


In [86]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

f = (3*a**3 - b**2).mean()

optimizer = SGD([a, b], lr=learning_rate, momentum=0.9)

In [87]:
optimizer.zero_grad()
f.backward(retain_graph=True)
optimizer.step();

In [88]:
# first iteration will be the same
for group in optimizer.param_groups:
  for p in group['params']:
    print(p)
    print(optimizer.state[p])
    # first step there when there is no momentum
    # the result is the same with GD
    # but momentum buffer will be updated

tensor([1.9820, 2.9595], requires_grad=True)
{'momentum_buffer': tensor([18.0000, 40.5000])}
tensor([6.0060, 4.0040], requires_grad=True)
{'momentum_buffer': tensor([-6., -4.])}


In [89]:
print(a)
print(b)

tensor([1.9820, 2.9595], requires_grad=True)
tensor([6.0060, 4.0040], requires_grad=True)


In [90]:
# now we step again
f = (3*a**3 - b**2).mean()
optimizer.zero_grad()
f.backward(retain_graph=True)
optimizer.step();

In [91]:
for group in optimizer.param_groups:
  for p in group['params']:
    print(p)
    print(optimizer.state[p])

# the result will be different from GD
# and we can verify by hand that the momentum computation is good

tensor([1.9481, 2.8836], requires_grad=True)
{'momentum_buffer': tensor([33.8775, 75.8639])}
tensor([6.0174, 4.0116], requires_grad=True)
{'momentum_buffer': tensor([-11.4060,  -7.6040])}
