# py-optim/PyOptim/algorithms/vsgd.py

# `mean` and friends were once re-exported by scipy; modern scipy removed
# those aliases, so import them from numpy directly.
from numpy import mean, ones_like, clip, logical_or, median, zeros_like

from sgd import SGD
from core.gradientalgos import BbpropHessians, FiniteDifferenceHessians


class vSGD(SGD, BbpropHessians):
    """ vSGD: SGD with variance-adapted learning rates,
    as described in Schaul, Zhang & LeCun 2012. """
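
    # The per-parameter learning rate (see the `learning_rate` property
    # below) is eta_i = gbar_i**2 / (vbar_i * hbar_i): gbar, vbar and hbar
    # are moving averages of the gradient, the squared gradient and the
    # diagonal Hessian, each with an adaptive time constant tau_i.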

    # how to initialize the running averages
    slow_constant = 2
    init_samples = 10

    # avoiding numerical instability
    epsilon = 1e-9
    outlier_level = 1

    def _additionalInit(self):
        # default setting
        if self.slow_constant is None:
            self.slow_constant = max(1, int(self.paramdim / 10.))

        if self.init_samples > 0:
            # get a few initial samples to work with
            tmp = self.batch_size
            self.batch_size = self.init_samples * tmp
            self._collectGradients()
            self._num_updates += self.init_samples
            self.batch_size = tmp

            # mean gradient vector
            self._gbar = mean(self._last_gradients, axis=0)
            # mean squared gradient
            self._vbar = (mean(self._last_gradients ** 2, axis=0) + self.epsilon) * self.slow_constant
            # mean diagonal Hessian
            #hs = clip(mean(self._last_diaghessians, axis=0), 1, 1 / self.epsilon)
            #self._hbar = hs * self.slow_constant
            self._hbar = mean(self._last_diaghessians, axis=0)
        else:
            # just start small, in the hope that the statistics will become accurate quickly enough
            self._gbar = zeros_like(self.parameters)
            self._vbar = ones_like(self.parameters) * self.epsilon
            self._hbar = ones_like(self.parameters)
        self._vpart = self._gbar ** 2 / self._vbar

        # time constants
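        # tau_i is the effective memory of the moving averages in dimension
        # i: each update mixes in a fraction 1/tau_i of the newest
        # statistics. Starting near 2 lets the estimates adapt quickly.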
        self._taus = (ones_like(self.parameters) + self.epsilon) * 2  # alternative: * self.init_samples

        # for debugging
        self._print_quantities = [('p', self.parameters),
                                  ('tau', self._taus),
                                  ('g', self._gbar),
                                  ('v', self._vbar),
                                  ('vpa', self._vpart),
                                  ('h', self._hbar),
                                  ]

    def _detectOutliers(self):
        """ Binary vector for which dimension saw an outlier gradient. """
        var = (self._vbar - self._gbar ** 2) / self.batch_size
        return (self._last_gradient - self._gbar) ** 2 > self.outlier_level ** 2 * var

    def _computeStatistics(self):
        grads = self._last_gradients
        hs = mean(self._last_diaghessians, axis=0)

        # slow down updates if the last sample was an outlier
        if self.outlier_level is not None:
            self._taus[self._detectOutliers()] += 1

        # update statistics
        fract = 1. / self._taus
        self._gbar *= 1. - fract
        self._gbar += fract * mean(grads, axis=0)
        self._vbar *= 1. - fract
        self._vbar += fract * mean(grads ** 2, axis=0) + self.epsilon
        self._hbar *= 1. - fract
        self._hbar += fract * hs

        # update time constants based on the variance-part of the learning rate
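        # With mini-batches of size n, vbar tracks the second moment of
        # single-sample gradients, so the variance part becomes
        # gbar**2 / ((1/n) * vbar + ((n-1)/n) * gbar**2): the expected
        # fraction of the gradient that is signal rather than noise.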
        self._vpart *= 0
        if self.batch_size > 1:
            self._vpart += self._gbar ** 2 / (1. / self.batch_size * self._vbar +
                                              (self.batch_size - 1.) / self.batch_size * self._gbar ** 2)
        else:
            self._vpart += self._gbar ** 2 / self._vbar
        self._taus *= 1. - self._vpart
        self._taus += 1 + self.epsilon
        del hs
        return fract

    @property
    def learning_rate(self):
        return self._vpart / (self._hbar + self.epsilon)

    def _printStuff(self):
        print(self._num_updates, end=' ')
        for n, a in self._print_quantities:
            #print(n, type(a))
            if abs(median(a)) > 1e4 or abs(median(a)) < 1e-3:
                print(n, median(a), end='\t')
            else:
                print(n, round(median(a), 4), end='\t')


class vSGD_original(vSGD):
    """ The original version from 2012 had no outlier detection. """
    outlier_level = None


class vSGDfd(FiniteDifferenceHessians, vSGD):
    """ vSGD with finite-difference estimate of diagonal Hessian,
    as described in Schaul & LeCun 2013. """

    @property
    def learning_rate(self):
        return self._vpart * self._hpart

    def _additionalInit(self):
        vSGD._additionalInit(self)
        #h2s = clip(mean(self._last_diaghessians ** 2, axis=0), 1, 1 / self.epsilon)
        #self._vhbar = h2s * self.slow_constant #** 2
        if self.init_samples > 0:
            self._vhbar = mean(self._last_diaghessians ** 2, axis=0)
        else:
            self._hbar = zeros_like(self.parameters)
            self._vhbar = ones_like(self.parameters) * self.epsilon
        self._hpart = (self._hbar + self.epsilon) / (self._vhbar + self.epsilon)
        self._print_quantities.extend([('vh', self._vhbar),
                                       ('hpa', self._hpart),
                                       ])

    def _computeStatistics(self):
        fract = vSGD._computeStatistics(self)
        self._vhbar *= (1. - fract)
        self._vhbar += fract * mean(self._last_diaghessians ** 2, axis=0)
        self._hpart *= 0
        self._hpart += (self._hbar + self.epsilon) / (self._vhbar + self.epsilon)
        del fract

    def _fdDirection(self):
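        # Direction along which parameters are perturbed for the
        # finite-difference Hessian estimate; epsilon guards against a
        # zero vector.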
        if self._num_updates == 0:
            return self._last_gradient + self.epsilon
        else:
            return self._gbar + self.epsilon

    def _detectOutliers(self):
        """ Binary vector for which dimension saw an outlier gradient:
        considers outlier curvature as well. """
        hs = mean(self._last_diaghessians, axis=0)
        var = (self._vhbar - self._hbar ** 2) / self.batch_size
        res = logical_or(vSGD._detectOutliers(self),
                         (hs - self._hbar) ** 2 > self.outlier_level ** 2 * var)
        del hs, var
        return res
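

if __name__ == '__main__':
    # Illustrative sketch, not part of the original module: the core vSGD
    # update eta = gbar**2 / (vbar * hbar), computed with plain moving
    # averages on a noisy 1-d quadratic whose true curvature is known.
    # All names below are local to this demo.
    import numpy as np

    rng = np.random.default_rng(0)
    h_true, noise, x = 2.0, 0.5, 5.0
    gbar, vbar, hbar, tau = 0.0, 1e-9, h_true, 2.0
    for _ in range(200):
        g = h_true * x + noise * rng.standard_normal()  # noisy gradient sample
        f = 1. / tau
        gbar = (1. - f) * gbar + f * g
        vbar = (1. - f) * vbar + f * g ** 2
        vpart = gbar ** 2 / vbar          # signal-to-noise part of the rate
        tau = tau * (1. - vpart) + 1.     # shrink memory when gradients agree
        x -= (vpart / hbar) * g           # vSGD step with adaptive rate
    print('x after 200 vSGD steps (optimum is 0):', x)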