In [1]:
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""AdamW optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras import backend_config
from keras.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.AdamW",
    v1=["keras.optimizers.AdamW", "keras.optimizers.legacy.AdamW"],
)
class AdamW(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the AdamW algorithm.
    AdamW optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments, with weight-decay.
    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use, The
        learning rate. Defaults to 0.001.
      weight_decay: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use, The
        weight decay rate. Defaults to 0.01.
      beta_1: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 1st moment estimates. Defaults to 0.9.
      beta_2: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use, The
        exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        1e-7.
      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
        the paper "On the Convergence of AdamW and beyond". Defaults to `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to `"AdamW"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set the gradient of all weights is
        clipped so that their global norm is no higher than this value.
    Usage:
    >>> opt = tf.keras.optimizers.legacy.AdamW(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0       # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate = 0.001,
        weight_decay  = 0.01,
        beta_1        = 0.9,
        beta_2        = 0.999,
        epsilon       = 1e-7,
        amsgrad       = False,
        name="AdamW",
        **kwargs
    ):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        #self._set_hyper("weight_decay" , kwargs.get("wd", weight_decay))
        self._set_hyper("decay"        , self._initial_decay)
        self._set_hyper("beta_1"       , beta_1)
        self._set_hyper("beta_2"       , beta_2)
        self.epsilon      = epsilon or backend_config.epsilon()
        self.weight_decay = weight_decay
        self.amsgrad      = amsgrad

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step   = tf.cast(self.iterations + 1, var_dtype)
        #wd_t         = tf.identity(self._get_hyper("wd"    , var_dtype))
        beta_1_t     = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t     = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr                 = lr,
                #wd_t               = wd_t,
                wd                 = tf.convert_to_tensor(self.weight_decay, var_dtype),
                epsilon            = tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t           = beta_1_t,
                beta_1_power       = beta_1_power,
                one_minus_beta_1_t = 1 - beta_1_t,
                beta_2_t           = beta_2_t,
                beta_2_power       = beta_2_power,
                one_minus_beta_2_t = 1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
        # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        if not self.amsgrad:
            raise NotImplementedError("tf.raw_ops.ResourceApplyAdamW idoes not exist")
            return tf.raw_ops.ResourceApplyAdamW(
                var         = var.handle,
                m           = m.handle,
                v           = v.handle,
                beta1_power = coefficients["beta_1_power"],
                beta2_power = coefficients["beta_2_power"],
                lr          = coefficients["lr_t"],
                wd          = coefficients["wd"],
                beta1       = coefficients["beta_1_t"],
                beta2       = coefficients["beta_2_t"],
                epsilon     = coefficients["epsilon"],
                grad        = grad,
                use_locking = self._use_locking,
            )
        else:
            vhat = self.get_slot(var, "vhat")
            raise NotImplementedError("tf.raw_ops.ResourceApplyAdamWWithAmsgrad idoes not exist")
            return tf.raw_ops.ResourceApplyAdamWWithAmsgrad(
                var         = var.handle,
                m           = m.handle,
                v           = v.handle,
                vhat        = vhat.handle,
                beta1_power = coefficients["beta_1_power"],
                beta2_power = coefficients["beta_2_power"],
                lr          = coefficients["lr_t"],
                wd          = coefficients["wd"],
                beta1       = coefficients["beta_1_t"],
                beta2       = coefficients["beta_2_t"],
                epsilon     = coefficients["epsilon"],
                grad        = grad,
                use_locking = self._use_locking,
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m_t = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_t = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

        lr  = coefficients["lr"]
        wd  = coefficients["wd"]
        eps = coefficients["epsilon"]
        
        if not self.amsgrad:
            v_sqrt = tf.sqrt(v_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                lr * (wd*var + m_t / (v_sqrt + eps)),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t])
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat_t = tf.maximum(v_hat, v_t)
            with tf.control_dependencies([v_hat_t]):
                v_hat_t = tf.compat.v1.assign(
                    v_hat, v_hat_t, use_locking=self._use_locking
                )
            v_hat_sqrt = tf.sqrt(v_hat_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                lr * (wd*var + m_t / (v_hat_sqrt + eps)),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t, v_hat_t])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate" : self._serialize_hyperparameter("learning_rate"),
                "weight_decay"  : self.weight_decay,
                "decay"         : self._initial_decay,
                "beta_1"        : self._serialize_hyperparameter("beta_1"),
                "beta_2"        : self._serialize_hyperparameter("beta_2"),
                "epsilon"       : self.epsilon,
                "amsgrad"       : self.amsgrad,
            }
        )
        return config


In [2]:

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

x_in = Input(1)
x    = Dense(1)(x_in)

model = Model(x_in, x, name="test_model")
model.compile(loss="mse", optimizer=AdamW(learning_rate=1e-3, weight_decay=1e-2))


In [3]:
import numpy as np

X = np.random.normal(size=(200,1))
Y = np.random.normal(size=(200,1))

X = tf.constant(X)
Y = tf.constant(Y)

model.fit(X, Y)

2023-06-22 20:01:16.769608: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


NotImplementedError: in user code:

    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/engine/training.py", line 1054, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 588, in minimize
        return self.apply_gradients(grads_and_vars, name=name)
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 747, in apply_gradients
        return tf.__internal__.distribute.interim.maybe_merge_call(
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 806, in _distributed_apply
        update_op = distribution.extended.update(
    File "/Users/Ste/miniforge3/envs/tf_macos_230511/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 785, in apply_grad_to_update_var  **
        update_op = self._resource_apply_dense(grad, var, **apply_kwargs)
    File "/var/folders/6_/gprzxt797d5098h8dtk22nch0000gn/T/ipykernel_32360/3753812170.py", line 155, in _resource_apply_dense
        raise NotImplementedError("tf.raw_ops.ResourceApplyAdamW idoes not exist")

    NotImplementedError: tf.raw_ops.ResourceApplyAdamW idoes not exist
