# In [4]:
import torch
import torch.optim as optim
import math

class Adagrad(optim.Optimizer):
    """Adagrad optimizer.

    For every parameter element a running sum of squared gradients is kept
    (the "accumulator"). Each step divides the gradient by the square root
    of that sum, so elements with a large gradient history take smaller steps.

    Args:
        params: Iterable of parameters, or of dicts defining parameter groups.
        lr: Base learning rate.
        lr_decay: Learning-rate decay; the effective rate on step ``t``
            (1-based) is ``lr / (1 + (t - 1) * lr_decay)``.
        weight_decay: L2 penalty coefficient; ``weight_decay * param`` is
            added to the gradient before the update.
        initial_accumulator_value: Starting value of the squared-gradient
            accumulator.
        eps: Constant added to the square root of the accumulator to avoid
            division by zero.

    Raises:
        ValueError: If any hyperparameter is negative.
    """

    def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0,
                 initial_accumulator_value=0, eps=1e-10):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= lr_decay:
            raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                "Invalid initial_accumulator_value value: {}".format(
                    initial_accumulator_value
                )
            )
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))

        defaults = dict(
            lr=lr,
            lr_decay=lr_decay,
            weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value,
            eps=eps,
        )
        super(Adagrad, self).__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and
                returns the loss. It is run with gradient tracking enabled
                so it may call ``backward()``.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure
            was given.
        """
        loss = None
        if closure is not None:
            # step() itself runs under no_grad; the closure needs autograd
            # turned back on, otherwise its backward() call cannot work.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            lr_decay = group['lr_decay']
            weight_decay = group['weight_decay']
            initial_accumulator_value = group['initial_accumulator_value']
            eps = group['eps']

            for param in group['params']:
                if param.grad is None:
                    continue
                grad = param.grad

                if weight_decay != 0:
                    # Out-of-place add so that param.grad itself is untouched.
                    grad = grad.add(param, alpha=weight_decay)

                state = self.state[param]

                # Lazy per-parameter state initialisation on first use.
                if not state:
                    state['step'] = 0
                    state['accumulator'] = torch.full_like(
                        param, initial_accumulator_value
                    )

                state['step'] += 1
                accumulator = state['accumulator']

                # accumulator += grad * grad
                accumulator.addcmul_(grad, grad)

                # The first step (step == 1) uses the undecayed learning rate.
                adjusted_lr = lr / (1 + (state['step'] - 1) * lr_decay)

                std = accumulator.sqrt().add_(eps)
                # param -= adjusted_lr * grad / std
                param.addcdiv_(grad, std, value=-adjusted_lr)

        return loss