In [2]:
import torch as t
import utils
import typing
from typing import Callable, Iterable

# Optimizer Exercises

(The first part of the day was spent reading up on the different optimizers. Gained some interesting intuitions! I especially liked the one that essentially all optimizers act as preconditioners on your gradients, in an attempt to reflect 'how much the gradients are allowed to change'; plain gradient descent just means you aren't preconditioning.)

In [3]:
def rosenbrocks_banana(x: t.Tensor, y: t.Tensor, a=1, b=100) -> t.Tensor:
    '''Evaluate Rosenbrock's "banana" function, (a - x)^2 + b * (y - x^2)^2, elementwise.'''
    offset_term = (a - x) ** 2
    valley_term = b * (y - x ** 2) ** 2
    return offset_term + valley_term

# Ranges handed to the plotting helper below.
x_range = [-2, 3]
y_range = [-1, 3]
# NOTE(review): utils.plot_fn is a course helper defined outside this notebook;
# presumably it plots the function over these ranges on a log scale — confirm.
fig = utils.plot_fn(rosenbrocks_banana, x_range, y_range, log_scale = True)
fig  # last expression of the cell, so the notebook renders the figure

In [4]:
def opt_fn_with_sgd(fn: Callable, xy: t.Tensor, lr=0.001, momentum=0.98, n_iters: int = 100):
    '''
    Optimize a given function starting from the specified point, using torch.optim.SGD.

    fn: callable taking (x, y) and returning a scalar loss tensor.
    xy: shape (2,). The (x, y) starting point. Must have requires_grad=True; the
        optimizer updates it in place.
    lr: learning rate passed to torch.optim.SGD.
    momentum: momentum passed to torch.optim.SGD.
    n_iters: number of steps.

    Return: (n_iters, 2). The (x,y) BEFORE each step. So out[0] is the starting point.
    '''
    assert xy.requires_grad

    # Optimizers expect an ordered collection of parameters, so pass a list
    # (not a set), and name the hyperparameters to avoid positional mix-ups.
    sgd = t.optim.SGD([xy], lr=lr, momentum=momentum)

    out = t.zeros((n_iters, 2))
    for i in range(n_iters):
        out[i] = xy.detach()  # record the position before this step

        loss = fn(xy[0], xy[1])  # measure loss
        loss.backward()  # calculate gradients
        sgd.step()  # update xy in place
        sgd.zero_grad()

    # Print the last recorded point. Indexing with -1 (instead of a hard-coded 99)
    # keeps this correct for any n_iters; skip it when nothing was recorded.
    if n_iters > 0:
        print(out[-1])
    return out

    
    


# Starting point for the optimisation; requires_grad=True because
# opt_fn_with_sgd asserts it.
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]

# NOTE(review): utils.plot_optimization_sgd is a course helper not shown here;
# presumably it runs opt_fn_with_sgd on the function and plots the path — confirm.
fig = utils.plot_optimization_sgd(opt_fn_with_sgd, rosenbrocks_banana, xy, x_range, y_range, lr=0.001, momentum=0.98, show_min=True)

fig.show()

tensor([1.0234, 1.1983])


## Implementing Optimizers

My GOODNESS! You can literally read a whole entire paragraph warning about in place operations with tensors, but STILL spend over 20 minutes trying to debug an optimizer which simply didn't do an in place operation.

In [5]:
class SGD:
    params: list

    def __init__(self, params: Iterable[t.nn.parameter.Parameter], lr: float, momentum: float = 0., weight_decay: float = 0.):
        '''Implements SGD with momentum.

        Like the PyTorch version, but assume nesterov=False, maximize=False, and dampening=0
            https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD

        params: parameters to optimise; materialised into a list so a generator
            (e.g. model.parameters()) can be passed.
        lr: learning rate.
        momentum: momentum factor; 0 disables momentum.
        weight_decay: L2 penalty coefficient added to the gradient; 0 disables it.
        '''
        self.params = list(params)
        # One buffer per parameter, same shape as the parameter (and so as its grad).
        # Starting at zero means the first momentum update reduces to the raw gradient.
        self.lastGrads = [t.zeros_like(p) for p in self.params]

        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.t = 0  # number of completed step() calls

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to None.'''
        for param in self.params:
            param.grad = None

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one update to every parameter that currently has a gradient.'''
        for index, param in enumerate(self.params):
            grad = param.grad
            if grad is None:
                # Parameter did not take part in the last backward pass; leave it alone.
                continue

            g = grad
            if self.weight_decay != 0:
                # Out-of-place on purpose: param.grad itself must not be modified.
                g = g + self.weight_decay * param

            if self.momentum != 0:
                # The buffer starts at zero, so on the first update this equals g.
                g = (self.momentum * self.lastGrads[index]) + g

            # In-place update so the caller's tensor object is the one that changes.
            param -= self.lr * g

            # Clone when g is still param.grad itself; otherwise a later in-place
            # gradient accumulation would silently rewrite the stored buffer.
            self.lastGrads[index] = grad.clone() if g is grad else g

        self.t = self.t + 1

    def __repr__(self) -> str:
        return f"SGD(lr={self.lr}, momentum={self.momentum}, weight_decay={self.weight_decay})"
        

# Run the course-provided checks (utils.test_sgd, defined outside this notebook) on the SGD class.
utils.test_sgd(SGD)


Testing configuration:  {'lr': 0.1, 'momentum': 0.0, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.7, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'momentum': 0.5, 'weight_decay': 0.05}

Testing configuration:  {'lr': 0.2, 'momentum': 0.8, 'weight_decay': 0.05}


In [6]:
class RMSprop:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float = 0.01,
        alpha: float = 0.99,
        eps: float = 1e-08,
        weight_decay: float = 0.,
        momentum: float = 0.,
    ):
        '''Implements RMSprop.

        Like the PyTorch version, but assumes centered=False
            https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html#torch.optim.RMSprop

        params: parameters to optimise; materialised into a list so a generator can be passed.
        lr: learning rate.
        alpha: decay factor of the running average of squared gradients.
        eps: constant added to the denominator for numerical stability.
        weight_decay: L2 penalty coefficient added to the gradient; 0 disables it.
        momentum: momentum factor applied to the scaled gradient; 0 disables it.
        '''
        self.params = list(params)
        self.lr = lr
        self.alpha = alpha
        self.eps = eps
        self.weight_decay = weight_decay
        self.momentum = momentum
        # Per-parameter state: running average of squared gradients (vs)
        # and momentum buffers (bs), both starting at zero.
        self.vs = [t.zeros_like(p) for p in self.params]
        self.bs = [t.zeros_like(p) for p in self.params]

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to None.'''
        for param in self.params:
            param.grad = None

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one update to every parameter that currently has a gradient.'''
        for index, param in enumerate(self.params):
            g = param.grad
            if g is None:
                # Parameter did not take part in the last backward pass; leave it alone.
                continue

            if self.weight_decay != 0:
                # Out-of-place on purpose: `g += ...` would write the decay term
                # into param.grad itself and corrupt the stored gradient.
                g = g + self.weight_decay * param

            v = self.alpha * self.vs[index] + (1 - self.alpha) * (g * g)

            if self.momentum > 0:
                b = self.momentum * self.bs[index] + g / (t.sqrt(v) + self.eps)
                param -= self.lr * b
                self.bs[index] = b
            else:
                param -= self.lr * g / (t.sqrt(v) + self.eps)

            self.vs[index] = v

    def __repr__(self) -> str:
        return f"RMS(lr={self.lr}, alpha ={self.alpha}, momentum={self.momentum}, weight_decay={self.weight_decay})"



# Run the course-provided checks (utils.test_rmsprop, defined outside this notebook) on the RMSprop class.
utils.test_rmsprop(RMSprop)


Testing configuration:  {'lr': 0.1, 'alpha': 0.9, 'eps': 0.001, 'weight_decay': 0.0, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.5}

Testing configuration:  {'lr': 0.1, 'alpha': 0.95, 'eps': 0.0001, 'weight_decay': 0.05, 'momentum': 0.0}


In [7]:
class Adam:
    def __init__(
        self,
        params: Iterable[t.nn.parameter.Parameter],
        lr: float = 0.001,
        betas: tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-08,
        weight_decay: float = 0.,
    ):
        '''Implements Adam.

        Like the PyTorch version, but assumes amsgrad=False and maximize=False
            https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam

        params: parameters to optimise; materialised into a list so a generator can be passed.
        lr: learning rate.
        betas: decay factors for the running averages of the gradient and its square.
        eps: constant added to the denominator for numerical stability.
        weight_decay: L2 penalty coefficient added to the gradient; 0 disables it.
        '''
        self.params = list(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter running averages of the gradient (ms) and squared gradient (vs).
        self.ms = [t.zeros_like(p) for p in self.params]
        self.vs = [t.zeros_like(p) for p in self.params]
        # Per-parameter count of applied updates, used for bias correction so a
        # parameter that is skipped (no gradient) is still corrected properly later.
        self.steps = [0 for _ in self.params]
        # 1-based index of the upcoming step() call; kept for backward compatibility.
        self.t = 1

    def zero_grad(self) -> None:
        '''Reset every parameter's gradient to None.'''
        for param in self.params:
            param.grad = None

    @t.inference_mode()
    def step(self) -> None:
        '''Apply one update to every parameter that currently has a gradient.'''
        for index, param in enumerate(self.params):
            g = param.grad
            if g is None:
                # Parameter did not take part in the last backward pass; leave it alone.
                continue

            if self.weight_decay != 0:
                # Out-of-place on purpose: param.grad itself must not be modified.
                g = g + self.weight_decay * param

            self.steps[index] += 1
            n_updates = self.steps[index]

            m = self.beta1 * self.ms[index] + (1 - self.beta1) * g
            v = self.beta2 * self.vs[index] + (1 - self.beta2) * (g * g)

            # Bias correction: the averages start at zero, so early estimates are
            # too small by a factor of (1 - beta ** n_updates).
            m_hat = m / (1 - self.beta1 ** n_updates)
            v_hat = v / (1 - self.beta2 ** n_updates)

            param -= self.lr * m_hat / (v_hat.sqrt() + self.eps)

            self.ms[index] = m
            self.vs[index] = v

        self.t += 1

    def __repr__(self) -> str:
        return f"Adam(lr={self.lr}, beta1={self.beta1}, beta2={self.beta2}, weight_decay={self.weight_decay})"


# Run the course-provided checks (utils.test_adam, defined outside this notebook) on the Adam class.
utils.test_adam(Adam)


Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.95), 'eps': 0.001, 'weight_decay': 0.0}

Testing configuration:  {'lr': 0.1, 'betas': (0.8, 0.9), 'eps': 0.001, 'weight_decay': 0.05}

Testing configuration:  {'lr': 0.2, 'betas': (0.9, 0.95), 'eps': 0.01, 'weight_decay': 0.08}


# Comparing Optimizers

In [8]:
def opt_fn(fn: Callable, xy: t.Tensor, optimizer_class, optimizer_kwargs, n_iters: int = 100):
    '''Optimize a given function starting from the specified point.

    fn: callable taking (x, y) and returning a scalar loss tensor.
    xy: shape (2,). The (x, y) starting point; must have requires_grad=True.
    optimizer_class: one of the optimizers you've defined, either SGD, RMSprop, or Adam
    optimizer_kwargs: keyword arguments passed to your optimiser (e.g. lr and weight_decay)
    n_iters: number of optimisation steps.

    Return: (n_iters, 2). The (x, y) BEFORE each step, so out[0] is the starting point.
    '''
    assert xy.requires_grad

    optimizer = optimizer_class([xy], **optimizer_kwargs)

    out = t.zeros((n_iters, 2))
    for i in range(n_iters):
        out[i] = xy.detach()  # record the position before this step

        loss = fn(xy[0], xy[1])  # measure loss
        loss.backward()  # calculate gradients
        optimizer.step()  # update xy
        optimizer.zero_grad()

    return out

## Insights

Hi, it's me, back from a good night's sleep. I made the functions and everything worked well, but I feel as if I'm not quite understanding what's going on. Some parameters feel like black boxes, so I'm going to run small experiments to see if I can really understand these parameters.

RMSprop:
- momentum is designed to act as an "acceleration" of sorts. the higher the momentum, the more important past gradients are, and the less 'friction' there is. notice how, when optimizing, the optimizer will jump across back and forth, but then start to gain acceleration in the direction of highest optimization; this is because the momentum stores the results of previous gradients, and is in essence summing them together. the jumps, although generally going back and forth, are also sliding in one direction: that direction is preserved and added to, but the jumps are cancelled out since they aren't going anywhere.
- alpha is the parameter which resizes the gradients of specific features. it's trying to divide the super large gradients by super large numbers to give the smaller, more hidden, features their time to shine: the more you have updated a feature in the last few steps, the less you will update it in the upcoming steps. if you have a higher alpha, it takes past gradient sizes into account more than new ones.


I think these two explain the two trends in the graphs below: lower momentum better resembles SGD, and is faster.
Similarly, lower alphas … (TODO: finish this thought.)

In [9]:
# Alpha sweep: SGD as a baseline, then RMSprop with lr and momentum held fixed
# while alpha decreases from 0.8 to 0.05.
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
# Each entry is (optimizer class, keyword arguments for its constructor).
optimizers = [
    (SGD, dict(lr=1e-3, momentum=0.98)),
    (RMSprop, dict(lr =0.01, alpha=0.8, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.7, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.6, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.5, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.4, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.3, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.2, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.1, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.05, momentum = 0.9 )),

]

# NOTE(review): utils.plot_optimization is a course helper not shown here;
# presumably it runs opt_fn once per entry and plots each path — confirm.
fig = utils.plot_optimization(opt_fn, rosenbrocks_banana, xy, optimizers, x_range, y_range, n_iters = 100)

fig.show()

In [10]:
# Momentum sweep: SGD as a baseline, then RMSprop with lr and alpha held fixed
# while momentum decreases from 0.9 to 0.3.
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
# Each entry is (optimizer class, keyword arguments for its constructor).
optimizers = [
    (SGD, dict(lr=1e-3, momentum=0.98)),
    (RMSprop, dict(lr =0.01, alpha=0.5, momentum = 0.9 )),
    (RMSprop, dict(lr =0.01, alpha=0.5, momentum = 0.7 )),
    (RMSprop, dict(lr =0.01, alpha=0.5, momentum = 0.5 )),
    (RMSprop, dict(lr =0.01, alpha=0.5, momentum = 0.3 )),

]

# NOTE(review): utils.plot_optimization is a course helper not shown here;
# presumably it runs opt_fn once per entry and plots each path — confirm.
fig = utils.plot_optimization(opt_fn, rosenbrocks_banana, xy, optimizers, x_range, y_range, n_iters = 160)

fig.show()

I'll visit these later :)

# Learning Rate Schedulers

ExponentialLR: updates every epoch

In [30]:
class ExponentialLR():
    def __init__(self, optimizer, gamma):
        '''Implements ExponentialLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ExponentialLR.html

        optimizer: object exposing a mutable `lr` attribute.
        gamma: factor the learning rate is multiplied by on every step().
        '''
        self.optimizer, self.gamma = optimizer, gamma
        self.last_epoch = -1

    def step(self):
        '''Advance the epoch counter and scale the optimizer's lr by gamma.'''
        self.last_epoch = self.last_epoch + 1
        decayed_lr = self.optimizer.lr * self.gamma
        self.optimizer.lr = decayed_lr

    def __repr__(self):
        return f"ExponentialLR with gamma {self.gamma!s}"

# Run the course-provided checks (utils.test_ExponentialLR, defined outside this notebook) using the SGD class.
utils.test_ExponentialLR(ExponentialLR, SGD)

Testing ExponentialLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  gamma=0.5

All tests in `test_ExponentialLR` passed!


StepLR: updates every step_size steps

In [31]:
class StepLR():
    def __init__(self, optimizer, step_size, gamma=0.1):
        '''Implements StepLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html

        optimizer: object exposing a mutable `lr` attribute.
        step_size: number of step() calls between two learning-rate decays.
        gamma: factor the learning rate is multiplied by at each decay.
        '''
        self.step_size = step_size
        # Countdown to the next decay; reset to step_size after each decay.
        self.steps_till_update = step_size
        self.optimizer = optimizer
        self.gamma = gamma

    def step(self):
        '''Count one step; on every step_size-th call, multiply the optimizer's lr by gamma.'''
        self.steps_till_update -= 1
        if self.steps_till_update == 0:
            self.optimizer.lr = self.optimizer.lr * self.gamma
            self.steps_till_update = self.step_size

    def __repr__(self):
        # The original string lacked a space before the number ("step count3").
        return "StepLR with step count " + str(self.step_size)

# Run the course-provided checks (utils.test_StepLR, defined outside this notebook) using the SGD class.
utils.test_StepLR(StepLR, SGD)

Testing StepLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=30, gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=3, gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  step_size=1, gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  step_size=3, gamma=0.5

All tests in `test_StepLR` passed!


MultiStepLR: updates if the number of steps is in milestones

In [32]:
class MultiStepLR():
    def __init__(self, optimizer, milestones: list, gamma=0.1):
        '''Implements MultiStepLR.

        Like the PyTorch version, but assumes last_epoch=-1 and verbose=False
            https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.MultiStepLR.html

        optimizer: object exposing a mutable `lr` attribute.
        milestones: step indices (1-based count of step() calls) at which to decay.
            A milestone listed k times decays by gamma ** k at that step, as the
            PyTorch scheduler does.
        gamma: factor the learning rate is multiplied by at each milestone.
        '''
        self.optimizer = optimizer
        # Copy into a list so one-shot iterables survive repeated lookups and
        # later changes to the caller's sequence do not affect the schedule.
        self.milestones = list(milestones)
        self.steps = 0
        self.gamma = gamma

    def step(self):
        '''Count one step and decay the optimizer's lr if this step is a milestone.'''
        self.steps = self.steps + 1
        hits = self.milestones.count(self.steps)
        if hits:
            # gamma ** 1 == gamma, so distinct milestones behave exactly as before.
            self.optimizer.lr = self.optimizer.lr * self.gamma ** hits

    def __repr__(self):
        return "multistepLR with gamma " + str(self.gamma)

# Run the course-provided checks (utils.test_MultiStepLR, defined outside this notebook) using the SGD class.
utils.test_MultiStepLR(MultiStepLR, SGD)

Testing MultiStepLR, training loop has 30 epochs, 4 batches per epoch

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  milestones=[40], gamma=1.0

Testing configuration:
	optimizer:  lr=0.01, momentum=0.0, weight_decay=0.0 
	scheduler:  milestones=[10], gamma=0.5

Testing configuration:
	optimizer:  lr=0.01, momentum=0.9, weight_decay=0.1 
	scheduler:  milestones=[10, 15], gamma=0.5

All tests in `test_MultiStepLR` passed!


## Plotting LR Schedulers

In [33]:
def opt_fn_with_scheduler(
    fn: Callable, 
    xy: t.Tensor, 
    optimizer_class, 
    optimizer_kwargs, 
    scheduler_class = None, 
    scheduler_kwargs = None, 
    n_iters: int = 100
):
    '''Optimize a given function starting from the specified point.

    fn: callable taking (x, y) and returning a scalar loss tensor.
    xy: shape (2,). The (x, y) starting point; must have requires_grad=True.
    optimizer_class: optimizer to construct with [xy] and optimizer_kwargs.
    optimizer_kwargs: keyword arguments passed to the optimizer (e.g. lr).
    scheduler_class: one of the schedulers you've defined, either ExponentialLR, StepLR or MultiStepLR.
        None (the default) means the learning rate is never rescheduled.
    scheduler_kwargs: keyword arguments passed to your scheduler (e.g. gamma).
        None (the default) means no extra arguments.
    n_iters: number of optimisation steps.

    Return: (n_iters, 2). The (x, y) BEFORE each step, so out[0] is the starting point.
    '''
    assert xy.requires_grad
    optimizer = optimizer_class([xy], **optimizer_kwargs)

    scheduler = None
    if scheduler_class is not None:
        # A None default replaces the shared mutable `dict()` default argument.
        scheduler = scheduler_class(optimizer, **(scheduler_kwargs or {}))

    out = t.zeros((n_iters, 2))
    for i in range(n_iters):
        out[i] = xy.detach()  # record the position before this step
        loss = fn(xy[0], xy[1])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            # The scheduler steps once per iteration, after the optimizer.
            scheduler.step()
    return out

In [40]:
# Compare the same SGD configuration with no scheduler and with each of the
# three schedulers defined in this notebook.
xy = t.tensor([-1.5, 2.5], requires_grad=True)
x_range = [-2, 2]
y_range = [-1, 3]
# Four identical (optimizer class, kwargs) entries, one per scheduler entry below.
optimizers = [
    (SGD, dict(lr=1e-3, momentum=0.98)),
    (SGD, dict(lr=1e-3, momentum=0.98)),
    (SGD, dict(lr=1e-3, momentum=0.98)),
    (SGD, dict(lr=1e-3, momentum=0.98))



]
# Each entry is (scheduler class, kwargs), or empty for no scheduler.
schedulers = [
    (), # Empty tuple stands for no scheduler
    (ExponentialLR, dict(gamma=0.99)),
    (StepLR, dict(step_size = 4, gamma=0.99)),
    (MultiStepLR, dict(milestones = [20,40,50,70], gamma = 0.9))

]

# NOTE(review): utils.plot_optimization_with_schedulers is a course helper not shown here;
# presumably it pairs optimizers[i] with schedulers[i] and plots each path — confirm.
fig = utils.plot_optimization_with_schedulers(opt_fn_with_scheduler, rosenbrocks_banana, xy, optimizers, schedulers, x_range, y_range, show_min=True)

fig.show()

# 