feat(lac): implement Han et al. 2020 hyperparameters (#399)
This commit aligns the hyperparameters for the LAC algorithm,
specifically adjusting the value of gamma ($\gamma$) and the neural
network architecture, to match those described in the research paper by
[Han et al. 2020](https://arxiv.org/abs/2004.14288).
rickstaa committed Feb 6, 2024
1 parent 779201c commit 574b651
Showing 10 changed files with 70 additions and 50 deletions.
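As the updated docstrings below note, the LAC ``gamma`` argument defaults to ``0.99`` (per Haarnoja et al. 2018) rather than the paper's ``0.995``; in effective-horizon terms that is roughly $1/(1-\gamma) = 100$ steps versus about 200. The snippet below is a minimal usage sketch, not part of the commit: it assumes a Gymnasium environment and the PyTorch ``lac`` entry point touched in this diff, passes the new ``256 x 2`` network defaults explicitly, and overrides ``gamma`` to the exact paper value (the old ``64 x 2`` / ``128 x 2`` sizes can be restored the same way, or via the ``--hid_a``/``--hid_c`` flags changed further down).

```python
# Minimal usage sketch (not part of this commit). The environment id and the
# use of Gymnasium are assumptions made only for illustration.
import gymnasium as gym
import torch.nn as nn

from stable_learning_control.algos.pytorch.lac.lac import lac

lac(
    lambda: gym.make("Pendulum-v1"),  # hypothetical example environment
    ac_kwargs=dict(
        hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},  # new defaults
        activation=nn.ReLU,
        output_activation=nn.ReLU,
    ),
    gamma=0.995,  # explicit Han et al. 2020 value; the docstring default below stays 0.99
)
```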
36 changes: 20 additions & 16 deletions stable_learning_control/algos/pytorch/lac/lac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

import argparse
import glob
import itertools
@@ -94,7 +95,7 @@ def __init__(
env,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.ReLU,
output_activation={"actor": nn.ReLU},
),
@@ -156,8 +157,8 @@ def __init__(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`torch.nn.ReLU`
``output_activation`` :class:`torch.nn.ReLU`
======================= ============================================
@@ -171,7 +172,8 @@ def __init__(
labda (float, optional): The Lyapunov Lagrange multiplier. Defaults to
``0.99``.
gamma (float, optional): Discount factor. (Always between 0 and 1.).
- Defaults to ``0.99``.
+ Defaults to ``0.99`` per Haarnoja et al. 2018, not ``0.995`` as in
+ Han et al. 2020.
polyak (float, optional): Interpolation factor in polyak averaging for
target networks. Target networks are updated towards main networks
according to:
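(The formula itself sits in a collapsed part of the diff. In Spinning Up-style code bases such as this one it is presumably the usual polyak average, with $\rho$ being the ``polyak`` argument: $\theta_{\text{targ}} \leftarrow \rho\,\theta_{\text{targ}} + (1 - \rho)\,\theta$.)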
@@ -201,7 +203,7 @@ def __init__(
This class will behave differently when the ``actor_critic`` argument
is set to the :class:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic`.
For more information see the :ref:`LATC <latc>` documentation.
""" # noqa: E501
""" # noqa: E501, D301
super().__init__()
self._setup_kwargs = {
k: v for k, v in locals().items() if k not in ["self", "__class__", "env"]
@@ -693,9 +695,9 @@ def state_dict(self):
saves the current class name. This is used to enable easy loading of the model.
"""
state_dict = super().state_dict()
- state_dict[
- "alg_name"
- ] = self.__class__.__name__ # Save algorithm name state dict.
+ state_dict["alg_name"] = (
+ self.__class__.__name__
+ ) # Save algorithm name state dict.
return state_dict

def bound_lr(
@@ -845,7 +847,7 @@ def lac(
env_fn,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.ReLU,
output_activation=nn.ReLU,
),
@@ -925,8 +927,8 @@ def lac(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`torch.nn.ReLU`
``output_activation`` :class:`torch.nn.ReLU`
======================= ============================================
@@ -1017,7 +1019,7 @@ def lac(
- policy (:class:`LAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.pytorch.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.pytorch.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1309,6 +1311,8 @@ def lac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
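To make the step-based trigger in the NOTE above concrete, here is a small self-contained sketch; the ``update_after``/``update_every`` values are illustrative, not the repository defaults. Updates now fire every ``update_every`` environment steps once ``update_after`` steps have elapsed, independent of episode boundaries.

```python
# Standalone illustration of the step-based update trigger shown above.
update_after, update_every = 1000, 100  # illustrative values only

update_steps = [
    t
    for t in range(3000)
    if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0
]
print(update_steps[:5])  # [999, 1099, 1199, 1299, 1399] -> one update batch per 100 steps
```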
@@ -1465,14 +1469,14 @@ def lac(
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
9 changes: 5 additions & 4 deletions stable_learning_control/algos/pytorch/latc/latc.py
@@ -22,6 +22,7 @@
:meth:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic.update`
method is modified so that it makes use of both critics.
""" # noqa: E501

import argparse
import os.path as osp
import time
@@ -120,14 +121,14 @@ def latc(env_fn, actor_critic=None, *args, **kwargs):
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
@@ -3,6 +3,7 @@
This module contains a Pytorch implementation of the Lyapunov Actor Critic policy of
`Han et al. 2020 <https://arxiv.org/abs/2004.14288>`_.
"""

import torch
import torch.nn as nn

@@ -14,7 +15,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.ReLU, "critic": nn.ReLU}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.ReLU,
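For orientation, the sketch below shows what a ``(256, 256)`` ReLU hidden stack built from these defaults looks like in plain PyTorch. It is illustrative only: it does not use the repository's own network builders, and the observation/action sizes are made up.

```python
import torch.nn as nn

HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}


def mlp(sizes, activation=nn.ReLU):
    """Plain MLP builder used only for this illustration."""
    layers = []
    for in_dim, out_dim in zip(sizes[:-1], sizes[1:]):
        layers += [nn.Linear(in_dim, out_dim), activation()]
    return nn.Sequential(*layers)


obs_dim, act_dim = 3, 1  # e.g. a small pendulum-like task
critic_body = mlp([obs_dim + act_dim, *HIDDEN_SIZES_DEFAULT["critic"]])
print(critic_body)  # Linear(4->256) -> ReLU -> Linear(256->256) -> ReLU
```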
@@ -5,6 +5,7 @@
algorithm, this LAC variant uses two critics instead of one to mitigate a possible
underestimation bias, while the original LAC only uses one critic.
"""

import torch
import torch.nn as nn

@@ -16,7 +17,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.ReLU, "critic": nn.ReLU}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.ReLU,
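The twin-critic idea referenced in this module's docstring can be sketched in a few lines: two independently parameterised critic heads are evaluated on the same input and combined conservatively. The element-wise maximum below is only an illustration of how underestimation of a cost-like Lyapunov value can be countered; the rule LATC actually uses lives in its ``update`` method, which this diff does not show.

```python
import torch
import torch.nn as nn

# Two independently initialised critic heads over the same (obs, act) input.
# All dimensions here are illustrative.
obs_act_dim, hidden = 4, 256
critic_1 = nn.Sequential(nn.Linear(obs_act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
critic_2 = nn.Sequential(nn.Linear(obs_act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))

obs_act = torch.randn(32, obs_act_dim)  # a batch of concatenated obs-action pairs
l_1, l_2 = critic_1(obs_act), critic_2(obs_act)

# Conservative estimate: taking the larger of the two outputs makes it harder to
# underestimate the learned Lyapunov (cost) value.
l_conservative = torch.max(l_1, l_2)
```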
13 changes: 8 additions & 5 deletions stable_learning_control/algos/pytorch/sac/sac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

import argparse
import glob
import itertools
@@ -182,7 +183,7 @@ def __init__(
Defaults to ``1e-4``.
device (str, optional): The device the networks are placed on (``cpu``
or ``gpu``). Defaults to ``cpu``.
""" # noqa: E501
""" # noqa: E501, D301
super().__init__()
self._setup_kwargs = {
k: v for k, v in locals().items() if k not in ["self", "__class__", "env"]
@@ -600,9 +601,9 @@ def state_dict(self):
saves the current class name. This is used to enable easy loading of the model.
"""
state_dict = super().state_dict()
- state_dict[
- "alg_name"
- ] = self.__class__.__name__ # Save algorithm name state dict.
+ state_dict["alg_name"] = (
+ self.__class__.__name__
+ ) # Save algorithm name state dict.
return state_dict

def bound_lr(self, lr_a_final=None, lr_c_final=None, lr_alpha_final=None):
@@ -886,7 +887,7 @@ def sac(
- policy (:class:`SAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1135,6 +1136,8 @@ def sac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
32 changes: 18 additions & 14 deletions stable_learning_control/algos/tf2/lac/lac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

# noqa: E402
import argparse
import os
@@ -91,7 +92,7 @@ def __init__(
env,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.relu,
output_activation={"actor": nn.relu},
),
@@ -154,8 +155,8 @@ def __init__(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`tf.nn.relu`
``output_activation`` :class:`tf.nn.relu`
======================= ============================================
@@ -169,7 +170,8 @@ def __init__(
labda (float, optional): The Lyapunov Lagrange multiplier. Defaults to
``0.99``.
gamma (float, optional): Discount factor. (Always between 0 and 1.).
- Defaults to ``0.99``.
+ Defaults to ``0.99`` per Haarnoja et al. 2018, not ``0.995`` as in
+ Han et al. 2020.
polyak (float, optional): Interpolation factor in polyak averaging for
target networks. Target networks are updated towards main networks
according to:
@@ -199,7 +201,7 @@ def __init__(
This class will behave differently when the ``actor_critic`` argument
is set to the :class:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic`.
For more information see the :ref:`LATC <latc>` documentation.
""" # noqa: E501
""" # noqa: E501, D301
self._device = set_device(
device
) # NOTE: Needs to be called before super().__init__() call.
@@ -642,7 +644,7 @@ def build(self):
def summary(self):
"""Small wrapper around the :meth:`tf.keras.Model.summary()` method used to
apply a custom format to the model summary.
"""
""" # noqa: D402
if not self.built: # Ensure the model is built.
self.build()
super().summary()
@@ -777,7 +779,7 @@ def lac(
env_fn,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.relu,
output_activation=nn.relu,
),
@@ -857,8 +859,8 @@ def lac(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`tf.nn.ReLU`
``output_activation`` :class:`tf.nn.ReLU`
======================= ============================================
@@ -949,7 +951,7 @@ def lac(
- policy (:class:`LAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1221,6 +1223,8 @@ def lac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
@@ -1384,14 +1388,14 @@ def lac(
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
9 changes: 5 additions & 4 deletions stable_learning_control/algos/tf2/latc/latc.py
@@ -23,6 +23,7 @@
:meth:`~stable_learning_control.algos.tf2.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic.update`
method is modified so that it makes use of both critics.
""" # noqa: E501

import argparse
import os.path as osp
import time
@@ -122,14 +123,14 @@ def latc(env_fn, actor_critic=None, *args, **kwargs):
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
@@ -3,6 +3,7 @@
This module contains a TensorFlow 2.x implementation of the Lyapunov Actor Critic policy
of `Han et al. 2020 <https://arxiv.org/abs/2004.14288>`_.
"""

import tensorflow as tf
from tensorflow import nn

@@ -13,7 +14,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.relu, "critic": nn.relu}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.relu,
@@ -5,6 +5,7 @@
algorithm, this LAC variant uses two critics instead of one to mitigate a possible
underestimation bias, while the original LAC only uses one critic.
"""

import tensorflow as tf
from tensorflow import nn

@@ -15,7 +16,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.relu, "critic": nn.relu}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.relu,
