# Optimizer Management
> Handling multiple PyTorch optimizers

In [None]:
# default_exp ftorch.optimizer

This handler manages several optimizers at once, for situations where training uses more than one optimizer.

In [1]:
# export
import torch

class Opts(object):
    """
    Named collection of optimizers that can be zeroed / stepped together.

    Every optimizer is reachable three ways: as an attribute
    (``opts.name``), by subscription (``opts["name"]``) and through the
    parallel lists ``optnames`` / ``optlist`` (same order, same length).
    """

    def __init__(self, *args, **kwargs):
        """
        opts = Opts(opt1 = opt1, opt2 = opt2)
        opts.opt2.zero_grad()
        opts["opt3"] = opt3
        print(len(opts))

        Positional optimizers are named ``optimizer_no1``, ``optimizer_no2``,
        ... in the order given; keyword optimizers keep their keyword as name.
        A keyword that repeats an auto-generated name replaces that entry
        instead of being registered a second time.
        """
        self.optlist = []
        self.optnames = []
        # Registration goes through __setitem__ so the attribute and both
        # lists are always updated together.
        for position, optimizer in enumerate(args, start=1):
            self[f"optimizer_no{position}"] = optimizer
        for name, optimizer in kwargs.items():
            self[name] = optimizer

    def __repr__(self):
        """
        One entry per optimizer: name, class and the settings returned
        by ``read_opt``.
        """
        return "\n".join(
            f"{name}\n\t{optimizer.__class__}\n\t{self.read_opt(optimizer)}"
            for name, optimizer in zip(self.optnames, self.optlist))

    def get_pg(self, opt):
        """
        Get paramgroups dictionary, information about an optimizer
        opt:torch.optim.optimizer

        Returns a shallow copy of the FIRST parameter group only, so
        adding / removing keys on the result does not touch the optimizer.
        """
        return dict.copy(opt.param_groups[0])

    def read_opt(self, opt):
        """
        Settings of the first parameter group of ``opt`` without the
        "params" entry (the parameter tensors themselves).
        """
        settings = self.get_pg(opt)
        settings.pop("params", None)
        return settings

    def __len__(self):
        """
        Total number of optimizers
        """
        return len(self.optlist)

    def __contains__(self, item):
        """
        True when ``item`` is a registered name or a registered optimizer.
        """
        return item in self.optnames or item in self.optlist

    def __getitem__(self, item):
        """
        Return the optimizer stored under the name ``item``.

        The lookup is a plain attribute lookup, so an unknown name raises
        AttributeError.
        """
        return getattr(self, item)

    def __setitem__(self, key, optimizer):
        """
        Register ``optimizer`` under the name ``key``.

        Re-using an existing name replaces the previous optimizer in place,
        so the old one is no longer touched by ``zero_all`` / ``step_all``.
        """
        # Set the attribute first: if `key` is not a valid attribute name
        # this raises before the lists are modified.
        setattr(self, key, optimizer)
        if key in self.optnames:
            self.optlist[self.optnames.index(key)] = optimizer
        else:
            self.optnames.append(key)
            self.optlist.append(optimizer)

    def zero_all(self):
        """
        Zero gradient on all the optimizers
        """
        for opt in self.optlist:
            opt.zero_grad()

    def step_all(self):
        """
        All the optimizers take a step
        """
        for opt in self.optlist:
            opt.step()

## Experiment

In [4]:
import torch
from torch import nn

In [5]:
# Two independent 5 -> 5 linear layers, so each can get its own optimizer
layer1 = nn.Linear(5,5)
layer2 = nn.Linear(5,5)

In [8]:
# One optimizer per layer, different algorithms, no hyperparameters overridden
op1 = torch.optim.Adam(layer1.parameters())
op2 = torch.optim.Adagrad(layer2.parameters())

In [9]:
opts = Opts(op1, op2)

In [10]:
opts.optimizer_no1

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [11]:
opts.optimizer_no2

Adagrad (
Parameter Group 0
    eps: 1e-10
    initial_accumulator_value: 0
    lr: 0.01
    lr_decay: 0
    weight_decay: 0
)

In [12]:
opts

optimizer_no1
	<class 'torch.optim.adam.Adam'>
	{'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}
optimizer_no2
	<class 'torch.optim.adagrad.Adagrad'>
	{'lr': 0.01, 'lr_decay': 0, 'eps': 1e-10, 'weight_decay': 0, 'initial_accumulator_value': 0}

Let's create some gradients

In [28]:
# Random input of shape (2, 5)
x = torch.rand(2,5)
# Scalar to backpropagate: negated mean of layer2(relu(layer1(x)))
y_ = -(layer2(torch.nn.functional.relu(layer1(x))).mean())

In [29]:
y_.backward()

In [30]:
layer1.weight.grad,layer1.bias.grad,layer2.weight.grad,layer2.bias.grad

(tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [-5.7813e-02, -5.0724e-02, -6.1752e-02, -1.1506e-01, -6.0402e-02],
         [-4.5386e-04, -2.3771e-04, -2.1409e-04, -6.2308e-04, -3.3177e-04],
         [-1.3012e-01, -1.1416e-01, -1.3899e-01, -2.5898e-01, -1.3595e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]]),
 tensor([ 0.0000, -0.1238, -0.0007, -0.2787,  0.0000]),
 tensor([[ 0.0000, -0.1671, -0.0036, -0.1062,  0.0000],
         [ 0.0000, -0.1671, -0.0036, -0.1062,  0.0000],
         [ 0.0000, -0.1671, -0.0036, -0.1062,  0.0000],
         [ 0.0000, -0.1671, -0.0036, -0.1062,  0.0000],
         [ 0.0000, -0.1671, -0.0036, -0.1062,  0.0000]]),
 tensor([-0.2000, -0.2000, -0.2000, -0.2000, -0.2000]))

### Take a step for all optimizers

In [31]:
opts.step_all()

### Zero gradient on all optimizers

In [26]:
opts.zero_all()

In [27]:
layer1.weight.grad,layer1.bias.grad,layer2.weight.grad,layer2.bias.grad

(tensor([[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]),
 tensor([0., 0., 0., 0., 0.]),
 tensor([[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]),
 tensor([0., 0., 0., 0., 0.]))