
[RLlib] CQL BC loss fixes; PPO/PG/A2|3C action normalization fixes #16531

Merged
merged 74 commits into from
Jun 30, 2021
74 commits
47c91c1
wip
sven1977 Jun 10, 2021
649103c
wip
sven1977 Jun 11, 2021
fbfbd5b
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 11, 2021
aa472ca
fix and LINT.
sven1977 Jun 11, 2021
281654d
wip.
sven1977 Jun 13, 2021
8c049fe
wip.
sven1977 Jun 13, 2021
a052fcd
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 14, 2021
9d675af
wip.
sven1977 Jun 14, 2021
8dfbec9
fix
sven1977 Jun 14, 2021
eaa6afb
fix
sven1977 Jun 14, 2021
dc6a774
fix
sven1977 Jun 15, 2021
6406687
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 15, 2021
e0b6311
wip.
sven1977 Jun 15, 2021
46e84fc
wip.
sven1977 Jun 15, 2021
2835b56
wip.
sven1977 Jun 16, 2021
4351570
wip.
sven1977 Jun 16, 2021
265454a
wip.
sven1977 Jun 16, 2021
ab79eac
wip.
sven1977 Jun 16, 2021
3fb411d
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 16, 2021
9443460
wip.
sven1977 Jun 16, 2021
230adee
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 16, 2021
f2b4c20
wip.
sven1977 Jun 17, 2021
6e4037c
wip.
sven1977 Jun 17, 2021
45fb626
wip.
sven1977 Jun 17, 2021
2fd6ff7
wip.
sven1977 Jun 17, 2021
e2d0378
wip.
sven1977 Jun 18, 2021
2ad07aa
Merge branch 'master' of https://github.com/ray-project/ray into poli…
sven1977 Jun 18, 2021
97ca8dc
wip.
sven1977 Jun 18, 2021
b5e9542
wip.
sven1977 Jun 18, 2021
967fd1e
wip
sven1977 Jun 18, 2021
f1d0bde
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 18, 2021
58fb139
wip
sven1977 Jun 18, 2021
09ca30f
wip
sven1977 Jun 18, 2021
e12f086
wip
sven1977 Jun 18, 2021
88a49df
wip
sven1977 Jun 18, 2021
3a28efc
wip
sven1977 Jun 18, 2021
9b86f84
fix
sven1977 Jun 18, 2021
6a07e44
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 19, 2021
8700e97
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 19, 2021
cfcd54e
wip
sven1977 Jun 19, 2021
43056d8
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 20, 2021
463901a
fix
sven1977 Jun 20, 2021
b50a138
wip
sven1977 Jun 20, 2021
a3cc35b
Merge branch 'master' into policy_support_add_and_delete
sven1977 Jun 20, 2021
18c41ca
wip
sven1977 Jun 20, 2021
ca44258
wip
sven1977 Jun 20, 2021
2b859e0
wip
sven1977 Jun 21, 2021
6baa539
wip
sven1977 Jun 21, 2021
f33b8b1
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 21, 2021
4598be1
Merge branch 'policy_support_add_and_delete' into cql_fix_bc_loss_term
sven1977 Jun 21, 2021
a7bf42e
LINT
sven1977 Jun 21, 2021
9cb1d60
wip.
sven1977 Jun 21, 2021
503d538
LINT.
sven1977 Jun 21, 2021
9024118
fixes.
sven1977 Jun 21, 2021
13fa9aa
fix and lint
sven1977 Jun 21, 2021
c28b096
fix and lint
sven1977 Jun 22, 2021
6b62aab
fix.
sven1977 Jun 22, 2021
7f40479
fix and lint
sven1977 Jun 23, 2021
9307aca
wip.
sven1977 Jun 23, 2021
42d8c5d
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 25, 2021
3b08816
wip
sven1977 Jun 25, 2021
c079185
wip
sven1977 Jun 28, 2021
15492ab
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 28, 2021
4e5de74
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 29, 2021
a8ab846
wip
sven1977 Jun 29, 2021
28e949f
wip
sven1977 Jun 29, 2021
e43ae8f
wip
sven1977 Jun 29, 2021
3a0f859
wip
sven1977 Jun 29, 2021
ca9f092
wip
sven1977 Jun 29, 2021
6013492
wip
sven1977 Jun 30, 2021
7507cc9
Merge branch 'master' of https://github.com/ray-project/ray into cql_…
sven1977 Jun 30, 2021
5fb5f80
wip
sven1977 Jun 30, 2021
7b3e1ba
wip
sven1977 Jun 30, 2021
b443510
fix
sven1977 Jun 30, 2021
49 changes: 49 additions & 0 deletions rllib/BUILD
@@ -156,6 +156,35 @@ py_test(
args = ["--yaml-dir=tuned_examples/ars", "--framework=torch"]
)

# CQL
py_test(
name = "run_regression_tests_pendulum_cql_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
# Include the zipped json data file as well.
data = [
"tuned_examples/cql/pendulum-cql.yaml",
"tests/data/pendulum/huge.zip",
],
args = ["--yaml-dir=tuned_examples/cql"]
)

py_test(
name = "run_regression_tests_pendulum_cql_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
# Include the zipped json data file as well.
data = [
"tuned_examples/cql/pendulum-cql.yaml",
"tests/data/pendulum/huge.zip",
],
args = ["--yaml-dir=tuned_examples/cql", "--framework=torch"]
)

# DDPG
py_test(
name = "run_regression_tests_pendulum_ddpg_tf",
@@ -465,6 +494,26 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac", "--framework=torch"]
)

py_test(
name = "run_regression_tests_transformed_actions_pendulum_sac_tf",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_tf", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-transformed-actions-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac"]
)

py_test(
name = "run_regression_tests_transformed_actions_pendulum_sac_torch",
main = "tests/run_regression_tests.py",
tags = ["learning_tests_torch", "learning_tests_pendulum"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-transformed-actions-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--framework=torch"]
)


# TD3
py_test(
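For context, the new CQL regression-test targets added above simply wrap the generic runner with a yaml directory. A rough local equivalent of the torch target (running it outside Bazel and from the rllib/ directory are assumptions; flags and paths are copied from the target):

import subprocess

# Rough local equivalent of run_regression_tests_pendulum_cql_torch.
subprocess.run(
    [
        "python", "tests/run_regression_tests.py",
        "--yaml-dir=tuned_examples/cql",
        "--framework=torch",
    ],
    check=True,
)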
15 changes: 8 additions & 7 deletions rllib/agents/cql/cql.py
@@ -24,21 +24,22 @@
SAC_CONFIG, {
# You should override this to point to an offline dataset.
"input": "sampler",
# Offline RL does not need IS estimators.
# Switch off off-policy evaluation.
"input_evaluation": [],
# Number of iterations with Behavior Cloning Pretraining.
"bc_iters": 20000,
# CQL Loss Temperature.
# CQL loss temperature.
"temperature": 1.0,
# Num Actions to sample for CQL Loss.
# Number of actions to sample for CQL loss.
"num_actions": 10,
# Whether to use the Lagrangian for Alpha Prime (in CQL Loss).
# Whether to use the Lagrangian for Alpha Prime (in CQL loss).
"lagrangian": False,
# Lagrangian Threshold.
# Lagrangian threshold.
"lagrangian_thresh": 5.0,
# Min Q Weight multiplier.
# Min Q weight multiplier.
"min_q_weight": 5.0,
# Replay Buffer should be size of offline dataset.
# Replay buffer should be larger than or equal to the size of the
# offline dataset.
"buffer_size": int(1e6),
})
# __sphinx_doc_end__
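A minimal usage sketch of the config keys documented above (the trainer import path and all example values are assumptions, not part of this PR):

import ray
from ray.rllib.agents.cql import CQLTrainer  # assumed import path for this era

ray.init()
config = {
    "env": "Pendulum-v0",
    "input": ["/path/to/offline_data.json"],  # hypothetical offline dataset
    "input_evaluation": [],    # off-policy evaluation switched off
    "bc_iters": 20000,         # behavior-cloning pretraining iterations
    "temperature": 1.0,        # CQL loss temperature
    "num_actions": 10,         # actions sampled for the CQL loss
    "lagrangian": False,       # no Lagrangian for alpha prime
    "min_q_weight": 5.0,
    "buffer_size": int(1e6),   # >= size of the offline dataset
}
trainer = CQLTrainer(config=config)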
21 changes: 2 additions & 19 deletions rllib/agents/cql/cql_tf_policy.py
@@ -19,8 +19,6 @@
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils.numpy import SMALL_NUMBER, MIN_LOG_NN_OUTPUT, \
MAX_LOG_NN_OUTPUT
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.exploration.random import Random
@@ -126,25 +124,10 @@ def cql_loss(policy: Policy, model: ModelV2,
actor_loss = tf.reduce_mean(
tf.stop_gradient(alpha) * log_pis_t - min_q)
else:

def bc_log(model, obs, actions):
z = tf.math.atanh(actions)
logits = model.get_policy_output(obs)
mean, log_std = tf.split(logits, 2, axis=-1)
# Mean Clamping for Stability
mean = tf.clip_by_value(mean, MEAN_MIN, MEAN_MAX)
log_std = tf.clip_by_value(log_std, MIN_LOG_NN_OUTPUT,
MAX_LOG_NN_OUTPUT)
std = tf.math.exp(log_std)
normal_dist = tfp.distributions.Normal(mean, std)
return tf.reduce_sum(
normal_dist.log_prob(z) -
tf.math.log(1 - actions * actions + SMALL_NUMBER),
axis=-1)

bc_logp = bc_log(model, model_out_t, actions)
bc_logp = action_dist_t.logp(actions)
actor_loss = tf.reduce_mean(
tf.stop_gradient(alpha) * log_pis_t - bc_logp)
# actor_loss = -tf.reduce_mean(bc_logp)

# Critic Loss (Standard SAC Critic L2 Loss + CQL Entropy Loss)
# SAC Loss:
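The net effect of this hunk: the hand-rolled bc_log helper (atanh, re-built Normal, squash correction) is dropped and the behavior-cloning log-prob comes straight from the action distribution. Conceptually, using the names from the loss above (a sketch, not the full cql_loss):

import tensorflow as tf

def bc_actor_loss(action_dist_t, actions, log_pis_t, alpha):
    # The squashed-Gaussian action distribution already handles unsquashing
    # and the tanh change-of-variables correction internally.
    bc_logp = action_dist_t.logp(actions)
    return tf.reduce_mean(tf.stop_gradient(alpha) * log_pis_t - bc_logp)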
29 changes: 3 additions & 26 deletions rllib/agents/cql/cql_torch_policy.py
@@ -17,14 +17,12 @@
from ray.rllib.policy.policy import LEARNER_STATS_KEY
from ray.rllib.policy.policy_template import build_policy_class
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.numpy import SMALL_NUMBER, MIN_LOG_NN_OUTPUT, \
MAX_LOG_NN_OUTPUT
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.typing import LocalOptimizer, TensorType, \
TrainerConfigDict
from ray.rllib.utils.torch_ops import apply_grad_clipping, atanh, \
from ray.rllib.utils.torch_ops import apply_grad_clipping, \
convert_to_torch_tensor

torch, nn = try_import_torch()
@@ -130,30 +128,9 @@ def cql_loss(policy: Policy, model: ModelV2,
actor_loss = (alpha.detach() * log_pis_t - min_q).mean()
else:

def bc_log(model, obs, actions):
# Stabilize input to atanh.
normed_actions = \
(actions - action_dist_t.low) / \
(action_dist_t.high - action_dist_t.low) * 2.0 - 1.0
save_normed_actions = torch.clamp(
normed_actions, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER)
z = atanh(save_normed_actions)

logits = model.get_policy_output(obs)
mean, log_std = torch.chunk(logits, 2, dim=-1)
# Mean Clamping for Stability
mean = torch.clamp(mean, MEAN_MIN, MEAN_MAX)
log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT,
MAX_LOG_NN_OUTPUT)
std = torch.exp(log_std)
normal_dist = torch.distributions.Normal(mean, std)
return torch.sum(
normal_dist.log_prob(z) -
torch.log(1 - actions * actions + SMALL_NUMBER),
dim=-1)

bc_logp = bc_log(model, model_out_t, actions)
bc_logp = action_dist_t.logp(actions)
actor_loss = (alpha.detach() * log_pis_t - bc_logp).mean()
# actor_loss = -bc_logp.mean()

if obs.shape[0] == policy.config["train_batch_size"]:
policy.actor_optim.zero_grad()
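Same simplification on the torch side. For reference, the log-prob of a tanh-squashed Gaussian that the one-liner now relies on has to compute roughly the following (a hedged sketch of the math the removed helper did by hand; RLlib's TorchSquashedGaussian encapsulates it):

import torch

def squashed_gaussian_logp(normal_dist, squashed_actions, eps=1e-6):
    # Undo the tanh squashing, evaluate the base Normal, then apply the
    # change-of-variables correction for tanh.
    z = torch.atanh(torch.clamp(squashed_actions, -1.0 + eps, 1.0 - eps))
    return (normal_dist.log_prob(z)
            - torch.log(1.0 - squashed_actions ** 2 + eps)).sum(dim=-1)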
13 changes: 10 additions & 3 deletions rllib/agents/cql/tests/test_cql.py
@@ -40,11 +40,18 @@ def test_cql_compilation(self):
config["env"] = "Pendulum-v0"
config["input"] = [data_file]

# In the files we use here for testing, actions have already
# been normalized.
# This is usually the case when the file was generated by another
# RLlib algorithm (e.g. PPO or SAC).
config["actions_in_input_normalized"] = False
config["clip_actions"] = True
config["train_batch_size"] = 2000

config["num_workers"] = 0 # Run locally.
config["twin_q"] = True
config["clip_actions"] = True
config["normalize_actions"] = True
config["learning_starts"] = 0
config["bc_iters"] = 2 # 2 BC iters, 2 CQL iters.
config["rollout_fragment_length"] = 1

# Switch on off-policy evaluation.
@@ -56,7 +63,7 @@
config["evaluation_parallel_to_training"] = True
config["evaluation_num_workers"] = 2

num_iterations = 3
num_iterations = 4

# Test for tf/torch frameworks.
for fw in framework_iterator(config):
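The comment above refers to offline files written by another RLlib algorithm. A minimal sketch of producing such a file (the algorithm choice, output path, and iteration count are illustrative only):

import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
trainer = PPOTrainer(
    env="Pendulum-v0",
    config={
        "output": "/tmp/pendulum-out",  # JSON sample files are written here
        "num_workers": 0,
    })
for _ in range(2):
    trainer.train()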
10 changes: 3 additions & 7 deletions rllib/agents/sac/sac.py
@@ -72,22 +72,18 @@
"custom_model": None, # Use this to define a custom policy model.
"custom_model_config": {},
},
# Unsquash actions to the upper and lower bounds of env's action space.
# Ignored for discrete action spaces.
"normalize_actions": True,
# Actions are already normalized, no need to clip them further.
"clip_actions": False,

# === Learning ===
# Disable setting done=True at end of episode. This should be set to True
# for infinite-horizon MDPs (e.g., many continuous control problems).
"no_done_at_end": False,
# Update the target by \tau * policy + (1-\tau) * target_policy.
"tau": 5e-3,
# Initial value to use for the entropy weight alpha.
"initial_alpha": 1.0,
# Target entropy lower bound. If "auto", will be set to -|A| (e.g. -2.0 for
# Discrete(2), -3.0 for Box(shape=(3,))).
# This is the inverse of reward scale, and will be optimized automatically.
"target_entropy": None,
"target_entropy": "auto",
# N-step target updates. If >1, sars' tuples in trajectories will be
# postprocessed to become sa[discounted sum of R][s t+n] tuples.
"n_step": 1,
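With target_entropy now defaulting to "auto", the resolved value follows the -|A| convention described in the comment above. A small sketch of that convention (the helper is illustrative, not RLlib's implementation):

import numpy as np
from gym.spaces import Box, Discrete

def auto_target_entropy(action_space):
    # -|A| per the config comment above.
    if isinstance(action_space, Discrete):
        return -float(action_space.n)
    return -float(np.prod(action_space.shape))

print(auto_target_entropy(Discrete(2)))                 # -2.0
print(auto_target_entropy(Box(-1.0, 1.0, shape=(3,))))  # -3.0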
1 change: 1 addition & 0 deletions rllib/agents/sac/sac_tf_policy.py
@@ -155,6 +155,7 @@ def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space
elif isinstance(action_space, Simplex):
return Dirichlet
else:
assert isinstance(action_space, Box)
if config["normalize_actions"]:
return SquashedGaussian if \
not config["_use_beta_distribution"] else Beta
3 changes: 2 additions & 1 deletion rllib/agents/sac/sac_torch_policy.py
@@ -3,7 +3,7 @@
"""

import gym
from gym.spaces import Discrete
from gym.spaces import Box, Discrete
import logging
from typing import Dict, List, Optional, Tuple, Type, Union

@@ -48,6 +48,7 @@ def _get_dist_class(config: TrainerConfigDict, action_space: gym.spaces.Space
elif isinstance(action_space, Simplex):
return TorchDirichlet
else:
assert isinstance(action_space, Box)
if config["normalize_actions"]:
return TorchSquashedGaussian if \
not config["_use_beta_distribution"] else TorchBeta
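The new assert (added in both the TF and torch variants) makes unsupported action spaces fail fast instead of silently falling into the continuous-distribution branch. A tiny sketch of the guarded branch (error message wording is illustrative):

from gym.spaces import Box, MultiDiscrete

def check_sac_action_space(action_space):
    # Mirrors the final else-branch above: anything that is not Discrete or
    # Simplex must be a Box.
    assert isinstance(action_space, Box), (
        "SAC only supports Discrete, Simplex, or Box action spaces, "
        f"got {action_space}.")

check_sac_action_space(Box(-1.0, 1.0, shape=(3,)))  # OK
# check_sac_action_space(MultiDiscrete([3, 3]))     # would raise AssertionError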