Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions test/optim/test_optim.py
Original file line number Diff line number Diff line change
Expand Up @@ -1085,8 +1085,10 @@ def test_adam(self):
constructor_accepts_foreach=True,
)
self._test_complex_2d(optim.Adam)
self._test_complex_2d(functools.partial(optim.Adam, foreach=True))
self._test_complex_2d(functools.partial(optim.Adam, foreach=True, weight_decay=0.2))
self._test_complex_2d(functools.partial(optim.Adam, foreach=False))
self._test_complex_2d(functools.partial(optim.Adam, foreach=False, amsgrad=True))
self._test_complex_2d(functools.partial(optim.Adam, weight_decay=0.2))
self._test_complex_2d(functools.partial(optim.Adam, weight_decay=0.2, amsgrad=True))

with self.assertRaisesRegex(
ValueError, "Invalid beta parameter at index 0: 1.0"
Expand Down
16 changes: 13 additions & 3 deletions torch/optim/adam.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ def _single_tensor_adam(params: List[Tensor],
grad = torch.view_as_real(grad)
exp_avg = torch.view_as_real(exp_avg)
exp_avg_sq = torch.view_as_real(exp_avg_sq)
if amsgrad:
max_exp_avg_sqs[i] = torch.view_as_real(max_exp_avg_sqs[i])
param = torch.view_as_real(param)

# Decay the first and second moment running average coefficient
Expand All @@ -375,10 +377,12 @@ def _single_tensor_adam(params: List[Tensor],
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
if differentiable:
max_exp_avg_sqs_i = max_exp_avg_sqs[i].clone()
max_exp_avg_sq = max_exp_avg_sqs[i].clone()
else:
max_exp_avg_sqs_i = max_exp_avg_sqs[i]
max_exp_avg_sqs[i].copy_(torch.maximum(max_exp_avg_sqs_i, exp_avg_sq))
max_exp_avg_sq = max_exp_avg_sqs[i]

max_exp_avg_sqs[i].copy_(torch.maximum(max_exp_avg_sq, exp_avg_sq))

# Uses the max. for normalizing running avg. of gradient
# Fold the (admittedly ugly) 1-element step_size math in here to avoid an extra param-set-sized read+write
# (it can't be folded into addcdiv_ below because addcdiv_ requires `value` to be a Number, not a Tensor)
Expand All @@ -400,13 +404,18 @@ def _single_tensor_adam(params: List[Tensor],
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])

# Use the max. for normalizing running avg. of gradient
denom = (max_exp_avg_sqs[i].sqrt() / bias_correction2_sqrt).add_(eps)
else:
denom = (exp_avg_sq.sqrt() / bias_correction2_sqrt).add_(eps)

param.addcdiv_(exp_avg, denom, value=-step_size)

# Lastly, switch back to complex view
if amsgrad and torch.is_complex(params[i]):
max_exp_avg_sqs[i] = torch.view_as_complex(max_exp_avg_sqs[i])


def _multi_tensor_adam(params: List[Tensor],
grads: List[Tensor],
Expand Down Expand Up @@ -456,6 +465,7 @@ def _multi_tensor_adam(params: List[Tensor],
device_grads = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_grads]
device_exp_avgs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_exp_avgs]
device_exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_exp_avg_sqs]
device_max_exp_avg_sqs = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_max_exp_avg_sqs]
device_params = [torch.view_as_real(x) if torch.is_complex(x) else x for x in device_params]

# update steps
Expand Down