feat(lac): implement Han et al. 2020 hyperparameters (#399)
This commit aligns the hyperparameters for the LAC algorithm,
specifically adjusting the value of gamma ($\gamma$) and the neural
network architecture, to match those described in the research paper by
[Han et al. 2020](https://arxiv.org/abs/2004.14288).
rickstaa committed Feb 6, 2024
1 parent 779201c commit 574b651
Showing 10 changed files with 70 additions and 50 deletions.
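As the updated docstrings below note, the LAC ``gamma`` argument defaults to ``0.99`` (per Haarnoja et al. 2018) rather than the paper's ``0.995``; in effective-horizon terms that is roughly $1/(1-\gamma) = 100$ steps versus about 200. The snippet below is a minimal usage sketch, not part of the commit: it assumes a Gymnasium environment and the PyTorch ``lac`` entry point touched in this diff, passes the new ``256 x 2`` network defaults explicitly, and overrides ``gamma`` to the exact paper value (the old ``64 x 2`` / ``128 x 2`` sizes can be restored the same way, or via the ``--hid_a``/``--hid_c`` flags changed further down).

```python
# Minimal usage sketch (not part of this commit). The environment id and the
# use of Gymnasium are assumptions made only for illustration.
import gymnasium as gym
import torch.nn as nn

from stable_learning_control.algos.pytorch.lac.lac import lac

lac(
    lambda: gym.make("Pendulum-v1"),  # hypothetical example environment
    ac_kwargs=dict(
        hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},  # new defaults
        activation=nn.ReLU,
        output_activation=nn.ReLU,
    ),
    gamma=0.995,  # explicit Han et al. 2020 value; the docstring default below stays 0.99
)
```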
36 changes: 20 additions & 16 deletions stable_learning_control/algos/pytorch/lac/lac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

import argparse
import glob
import itertools
@@ -94,7 +95,7 @@ def __init__(
env,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.ReLU,
output_activation={"actor": nn.ReLU},
),
@@ -156,8 +157,8 @@ def __init__(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`torch.nn.ReLU`
``output_activation`` :class:`torch.nn.ReLU`
======================= ============================================
@@ -171,7 +172,8 @@ def __init__(
labda (float, optional): The Lyapunov Lagrange multiplier. Defaults to
``0.99``.
gamma (float, optional): Discount factor. (Always between 0 and 1.).
- Defaults to ``0.99``.
+ Defaults to ``0.99`` per Haarnoja et al. 2018, not ``0.995`` as in
+ Han et al. 2020.
polyak (float, optional): Interpolation factor in polyak averaging for
target networks. Target networks are updated towards main networks
according to:
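(The formula itself sits in a collapsed part of the diff. In Spinning Up-style code bases such as this one it is presumably the usual polyak average, with $\rho$ being the ``polyak`` argument: $\theta_{\text{targ}} \leftarrow \rho\,\theta_{\text{targ}} + (1 - \rho)\,\theta$.)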
@@ -201,7 +203,7 @@ def __init__(
This class will behave differently when the ``actor_critic`` argument
is set to the :class:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic`.
For more information see the :ref:`LATC <latc>` documentation.
""" # noqa: E501
""" # noqa: E501, D301
super().__init__()
self._setup_kwargs = {
k: v for k, v in locals().items() if k not in ["self", "__class__", "env"]
@@ -693,9 +695,9 @@ def state_dict(self):
saves the current class name. This is used to enable easy loading of the model.
"""
state_dict = super().state_dict()
- state_dict[
- "alg_name"
- ] = self.__class__.__name__ # Save algorithm name state dict.
+ state_dict["alg_name"] = (
+ self.__class__.__name__
+ ) # Save algorithm name state dict.
return state_dict

def bound_lr(
@@ -845,7 +847,7 @@ def lac(
env_fn,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.ReLU,
output_activation=nn.ReLU,
),
@@ -925,8 +927,8 @@ def lac(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`torch.nn.ReLU`
``output_activation`` :class:`torch.nn.ReLU`
======================= ============================================
@@ -1017,7 +1019,7 @@ def lac(
- policy (:class:`LAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.pytorch.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.pytorch.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1309,6 +1311,8 @@ def lac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
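To make the step-based trigger in the NOTE above concrete, here is a small self-contained sketch; the ``update_after``/``update_every`` values are illustrative, not the repository defaults. Updates now fire every ``update_every`` environment steps once ``update_after`` steps have elapsed, independent of episode boundaries.

```python
# Standalone illustration of the step-based update trigger shown above.
update_after, update_every = 1000, 100  # illustrative values only

update_steps = [
    t
    for t in range(3000)
    if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0
]
print(update_steps[:5])  # [999, 1099, 1199, 1299, 1399] -> one update batch per 100 steps
```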
@@ -1465,14 +1469,14 @@ def lac(
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
9 changes: 5 additions & 4 deletions stable_learning_control/algos/pytorch/latc/latc.py
@@ -22,6 +22,7 @@
:meth:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic.update`
method is modified so that it makes use of both critics.
""" # noqa: E501

import argparse
import os.path as osp
import time
@@ -120,14 +121,14 @@ def latc(env_fn, actor_critic=None, *args, **kwargs):
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
@@ -3,6 +3,7 @@
This module contains a Pytorch implementation of the Lyapunov Actor Critic policy of
`Han et al. 2020 <https://arxiv.org/abs/2004.14288>`_.
"""

import torch
import torch.nn as nn

@@ -14,7 +15,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.ReLU, "critic": nn.ReLU}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.ReLU,
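For orientation, the sketch below shows what a ``(256, 256)`` ReLU hidden stack built from these defaults looks like in plain PyTorch. It is illustrative only: it does not use the repository's own network builders, and the observation/action sizes are made up.

```python
import torch.nn as nn

HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}


def mlp(sizes, activation=nn.ReLU):
    """Plain MLP builder used only for this illustration."""
    layers = []
    for in_dim, out_dim in zip(sizes[:-1], sizes[1:]):
        layers += [nn.Linear(in_dim, out_dim), activation()]
    return nn.Sequential(*layers)


obs_dim, act_dim = 3, 1  # e.g. a small pendulum-like task
critic_body = mlp([obs_dim + act_dim, *HIDDEN_SIZES_DEFAULT["critic"]])
print(critic_body)  # Linear(4->256) -> ReLU -> Linear(256->256) -> ReLU
```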
@@ -5,6 +5,7 @@
algorithm, this LAC variant uses two critics instead of one to mitigate a possible
underestimation bias, while the original LAC only uses one critic.
"""

import torch
import torch.nn as nn

@@ -16,7 +17,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.ReLU, "critic": nn.ReLU}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.ReLU,
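The twin-critic idea referenced in this module's docstring can be sketched in a few lines: two independently parameterised critic heads are evaluated on the same input and combined conservatively. The element-wise maximum below is only an illustration of how underestimation of a cost-like Lyapunov value can be countered; the rule LATC actually uses lives in its ``update`` method, which this diff does not show.

```python
import torch
import torch.nn as nn

# Two independently initialised critic heads over the same (obs, act) input.
# All dimensions here are illustrative.
obs_act_dim, hidden = 4, 256
critic_1 = nn.Sequential(nn.Linear(obs_act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))
critic_2 = nn.Sequential(nn.Linear(obs_act_dim, hidden), nn.ReLU(), nn.Linear(hidden, 1))

obs_act = torch.randn(32, obs_act_dim)  # a batch of concatenated obs-action pairs
l_1, l_2 = critic_1(obs_act), critic_2(obs_act)

# Conservative estimate: taking the larger of the two outputs makes it harder to
# underestimate the learned Lyapunov (cost) value.
l_conservative = torch.max(l_1, l_2)
```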
13 changes: 8 additions & 5 deletions stable_learning_control/algos/pytorch/sac/sac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

import argparse
import glob
import itertools
@@ -182,7 +183,7 @@ def __init__(
Defaults to ``1e-4``.
device (str, optional): The device the networks are placed on (``cpu``
or ``gpu``). Defaults to ``cpu``.
""" # noqa: E501
""" # noqa: E501, D301
super().__init__()
self._setup_kwargs = {
k: v for k, v in locals().items() if k not in ["self", "__class__", "env"]
@@ -600,9 +601,9 @@ def state_dict(self):
saves the current class name. This is used to enable easy loading of the model.
"""
state_dict = super().state_dict()
- state_dict[
- "alg_name"
- ] = self.__class__.__name__ # Save algorithm name state dict.
+ state_dict["alg_name"] = (
+ self.__class__.__name__
+ ) # Save algorithm name state dict.
return state_dict

def bound_lr(self, lr_a_final=None, lr_c_final=None, lr_alpha_final=None):
@@ -886,7 +887,7 @@ def sac(
- policy (:class:`SAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1135,6 +1136,8 @@ def sac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
32 changes: 18 additions & 14 deletions stable_learning_control/algos/tf2/lac/lac.py
@@ -9,6 +9,7 @@
- We use a `targ` suffix to distinguish actions/values coming from the target
network.
"""

# noqa: E402
import argparse
import os
@@ -91,7 +92,7 @@ def __init__(
env,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.relu,
output_activation={"actor": nn.relu},
),
@@ -154,8 +155,8 @@ def __init__(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`tf.nn.relu`
``output_activation`` :class:`tf.nn.relu`
======================= ============================================
@@ -169,7 +170,8 @@ def __init__(
labda (float, optional): The Lyapunov Lagrange multiplier. Defaults to
``0.99``.
gamma (float, optional): Discount factor. (Always between 0 and 1.).
- Defaults to ``0.99``.
+ Defaults to ``0.99`` per Haarnoja et al. 2018, not ``0.995`` as in
+ Han et al. 2020.
polyak (float, optional): Interpolation factor in polyak averaging for
target networks. Target networks are updated towards main networks
according to:
@@ -199,7 +201,7 @@ def __init__(
This class will behave differently when the ``actor_critic`` argument
is set to the :class:`~stable_learning_control.algos.pytorch.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic`.
For more information see the :ref:`LATC <latc>` documentation.
""" # noqa: E501
""" # noqa: E501, D301
self._device = set_device(
device
) # NOTE: Needs to be called before super().__init__() call.
@@ -642,7 +644,7 @@ def build(self):
def summary(self):
"""Small wrapper around the :meth:`tf.keras.Model.summary()` method used to
apply a custom format to the model summary.
"""
""" # noqa: D402
if not self.built: # Ensure the model is built.
self.build()
super().summary()
@@ -777,7 +779,7 @@ def lac(
env_fn,
actor_critic=None,
ac_kwargs=dict(
hidden_sizes={"actor": [64] * 2, "critic": [128] * 2},
hidden_sizes={"actor": [256] * 2, "critic": [256] * 2},
activation=nn.relu,
output_activation=nn.relu,
),
@@ -857,8 +859,8 @@ def lac(
======================= ============================================
Kwarg Value
======================= ============================================
- ``hidden_sizes_actor`` ``64 x 2``
- ``hidden_sizes_critic`` ``128 x 2``
+ ``hidden_sizes_actor`` ``256 x 2``
+ ``hidden_sizes_critic`` ``256 x 2``
``activation`` :class:`tf.nn.ReLU`
``output_activation`` :class:`tf.nn.ReLU`
======================= ============================================
@@ -949,7 +951,7 @@ def lac(
- policy (:class:`LAC`): The trained actor-critic policy.
- replay_buffer (union[:class:`~stable_learning_control.algos.common.buffers.ReplayBuffer`, :class:`~stable_learning_control.algos.common.buffers.FiniteHorizonReplayBuffer`]):
The replay buffer used during training.
""" # noqa: E501
""" # noqa: E501, D301
validate_args(**locals())

# Retrieve hyperparameters while filtering out the logger_kwargs.
@@ -1221,6 +1223,8 @@ def lac(
ep_ret, ep_len = 0, 0

# Update handling.
+ # NOTE: Improved compared to Han et al. 2020. Previously, updates were based on
+ # memory size, which only changed at terminal states.
if (t + 1) >= update_after and ((t + 1) - update_after) % update_every == 0:
# Step based learning rate decay.
if lr_decay_ref.lower() == "step":
@@ -1384,14 +1388,14 @@ def lac(
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
9 changes: 5 additions & 4 deletions stable_learning_control/algos/tf2/latc/latc.py
@@ -23,6 +23,7 @@
:meth:`~stable_learning_control.algos.tf2.policies.lyapunov_actor_twin_critic.LyapunovActorTwinCritic.update`
method is modified so that it makes use of both critics.
""" # noqa: E501

import argparse
import os.path as osp
import time
@@ -122,14 +123,14 @@ def latc(env_fn, actor_critic=None, *args, **kwargs):
parser.add_argument(
"--hid_a",
type=int,
- default=64,
- help="hidden layer size of the actor (default: 64)",
+ default=256,
+ help="hidden layer size of the actor (default: 256)",
)
parser.add_argument(
"--hid_c",
type=int,
- default=128,
- help="hidden layer size of the lyapunov critic (default: 128)",
+ default=256,
+ help="hidden layer size of the lyapunov critic (default: 256)",
)
parser.add_argument(
"--l_a",
@@ -3,6 +3,7 @@
This module contains a TensorFlow 2.x implementation of the Lyapunov Actor Critic policy
of `Han et al. 2020 <https://arxiv.org/abs/2004.14288>`_.
"""

import tensorflow as tf
from tensorflow import nn

@@ -13,7 +14,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.relu, "critic": nn.relu}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.relu,
@@ -5,6 +5,7 @@
algorithm, this LAC variant uses two critics instead of one to mitigate a possible
underestimation bias, while the original LAC only uses one critic.
"""

import tensorflow as tf
from tensorflow import nn

@@ -15,7 +16,7 @@
from stable_learning_control.common.helpers import strict_dict_update
from stable_learning_control.utils.log_utils.helpers import log_to_std_out

HIDDEN_SIZES_DEFAULT = {"actor": (64, 64), "critic": (128, 128)}
HIDDEN_SIZES_DEFAULT = {"actor": (256, 256), "critic": (256, 256)}
ACTIVATION_DEFAULT = {"actor": nn.relu, "critic": nn.relu}
OUTPUT_ACTIVATION_DEFAULT = {
"actor": nn.relu,
