[RLlib] [CI] Deflake longer running RLlib learning tests for off policy algorithms. Fix seeding issue in TransformedAction Environments #21685

Merged 18 commits on Feb 4, 2022
14 changes: 13 additions & 1 deletion .buildkite/pipeline.ml.yml
@@ -35,9 +35,21 @@
- RLLIB_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options)
--build_tests_only
--test_tag_filters=learning_tests_continuous,-fake_gpus,-torch_only,-multi_gpu
--test_tag_filters=learning_tests_continuous,-fake_gpus,-torch_only,-multi_gpu,-gpu,-learning_tests_continuous_tf2_eager_off_policy
--test_arg=--framework=tf2
rllib/...

- label: ":brain: RLlib: Learning cont. actions TF2-eager-tracing Off Policy (from rllib/tuned_examples/*.yaml)"
conditions: ["RAY_CI_RLLIB_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- RLLIB_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options)
--build_tests_only
--test_tag_filters=learning_tests_continuous_tf2_eager_off_policy,-fake_gpus,-torch_only,-multi_gpu,-gpu
--test_arg=--framework=tf2
rllib/...

- label: ":brain: RLlib: Learning discr. actions TF1-static-graph (from rllib/tuned_examples/*.yaml)"
conditions: ["RAY_CI_RLLIB_DIRECTLY_AFFECTED"]
commands:
31 changes: 31 additions & 0 deletions rllib/BUILD
@@ -225,6 +225,16 @@ py_test(
args = ["--yaml-dir=tuned_examples/ddpg"]
)

py_test(
name = "learning_tests_pendulum_ddpg_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--yaml-dir=tuned_examples/ddpg", "--override-mean-reward=-750.0"]
)

py_test(
name = "learning_tests_pendulum_ddpg_fake_gpus",
main = "tests/run_regression_tests.py",
@@ -523,6 +533,27 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac"]
)

py_test(
name = "learning_tests_pendulum_sac_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--override-mean-reward=-900.0"]
)

py_test(
name = "learning_tests_transformed_actions_pendulum_sac_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-transformed-actions-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac" ,"--override-mean-reward=-850.0"]
)


py_test(
name = "learning_tests_pendulum_sac_fake_gpus",
main = "tests/run_regression_tests.py",
50 changes: 16 additions & 34 deletions rllib/examples/env/transformed_action_space_env.py
@@ -1,8 +1,6 @@
import gym
from typing import Type

from ray.rllib.utils.annotations import override


class ActionTransform(gym.ActionWrapper):
def __init__(self, env, low, high):
@@ -27,11 +25,11 @@ def transform_action_space(env_name_or_creator) -> Type[gym.Env]:
env_maker function.

Returns:
New TransformedActionSpaceEnv class
to be used as env. The constructor takes a config dict with `_low`
and `_high` keys specifying the new action range
(default -1.0 to 1.0). The reset of the config dict will be
passed on to the underlying/wrapped env's constructor.
New transformed_action_space_env function that returns an environment
wrapped by the ActionTransform wrapper. The function takes a
config dict with `low` and `high` keys specifying the new action
range (default -1.0 to 1.0). The rest of the config dict will be
passed on to the underlying/wrapped env's constructor.

Examples:
>>> # By gym string:
@@ -42,33 +40,17 @@ def transform_action_space(env_name_or_creator) -> Type[gym.Env]:
... gym.spaces.Box(-15.0, 1.0, (1, ), "float32")
"""

class TransformedActionSpaceEnv(gym.Env):
"""PendulumEnv w/ an action space of range 300.0 to 500.0."""

def __init__(self, config):
self._low = config.pop("low", -1.0)
self._high = config.pop("high", 1.0)
if isinstance(env_name_or_creator, str):
self.env = gym.make(env_name_or_creator)
else:
self.env = env_name_or_creator(config)
self.env = ActionTransform(self.env, self._low, self._high)
self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

@override(gym.Env)
def reset(self):
return self.env.reset()

@override(gym.Env)
def step(self, actions):
return self.env.step(actions)

@override(gym.Env)
def render(self, mode=None):
return self.env.render(mode)

return TransformedActionSpaceEnv
def transformed_action_space_env(config):
if isinstance(env_name_or_creator, str):
inner_env = gym.make(env_name_or_creator)
else:
inner_env = env_name_or_creator(config)
_low = config.pop("low", -1.0)
_high = config.pop("high", 1.0)
env = ActionTransform(inner_env, _low, _high)
return env

return transformed_action_space_env


TransformedActionPendulum = transform_action_space("Pendulum-v1")
17 changes: 16 additions & 1 deletion rllib/tests/run_regression_tests.py
@@ -46,7 +46,16 @@
action="store_true",
help="Run ray in local mode for easier debugging.",
)

parser.add_argument(
"--override-mean-reward",
type=float,
default=0.0,
help=(
"Override "
"the mean reward specified by the yaml file in the stopping criteria. This "
"is particularly useful for timed tests."
),
)
# Obsoleted arg, use --framework=torch instead.
parser.add_argument(
"--torch", action="store_true", help="Runs all tests with PyTorch enabled."
@@ -92,6 +101,12 @@
exp = list(experiments.values())[0]
exp["config"]["framework"] = args.framework

# Override the mean reward if specified. This is used by the Ray CI to
# relax the episode-reward-mean stopping criterion for long-running
# off-policy TF2 learning tests such as SAC and DDPG on Pendulum.
if args.override_mean_reward != 0.0:
exp["stop"]["episode_reward_mean"] = args.override_mean_reward

# QMIX does not support tf yet -> skip.
if exp["run"] == "QMIX" and args.framework != "torch":
print(f"Skipping framework='{args.framework}' for QMIX.")
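
To make the new flag concrete, here is a stand-alone sketch (not part of the diff) of how it interacts with an experiment's stop criteria. The argument values mirror the learning_tests_pendulum_sac_tf2 BUILD target added above, the stop values are taken from pendulum-sac.yaml further down, and the parser is a simplified stand-in for the script's own.

```python
# Stand-alone sketch of the --override-mean-reward behavior added above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--yaml-dir", type=str, default="")
parser.add_argument("--override-mean-reward", type=float, default=0.0)

# Args as passed by the learning_tests_pendulum_sac_tf2 BUILD target:
args = parser.parse_args(
    ["--yaml-dir=tuned_examples/sac", "--override-mean-reward=-900.0"]
)

# Stop criteria as defined in pendulum-sac.yaml:
stop = {"episode_reward_mean": -150, "timesteps_total": 10000}

# A non-zero override replaces the yaml's mean-reward criterion, so the timed
# CI test can stop at a looser reward threshold.
if args.override_mean_reward != 0.0:
    stop["episode_reward_mean"] = args.override_mean_reward

print(stop)  # {'episode_reward_mean': -900.0, 'timesteps_total': 10000}
```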
9 changes: 6 additions & 3 deletions rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml
@@ -2,10 +2,11 @@ pendulum-ddpg-fake-gpus:
env: Pendulum-v1
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
episode_reward_mean: -1000
timesteps_total: 40000
config:
# Works for both torch and tf.
seed: 42
framework: tf
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
@@ -21,7 +22,9 @@ pendulum-ddpg-fake-gpus:
learning_starts: 500
train_batch_size: 64
num_workers: 0
worker_side_prioritization: False
worker_side_prioritization: false
actor_lr: 0.0001
critic_lr: 0.0001

# Fake 2 GPUs.
num_gpus: 2
9 changes: 6 additions & 3 deletions rllib/tuned_examples/ddpg/pendulum-ddpg.yaml
@@ -3,11 +3,14 @@ pendulum-ddpg:
env: Pendulum-v1
run: DDPG
stop:
episode_reward_mean: -600
timesteps_total: 100000
episode_reward_mean: -320
timesteps_total: 30000
config:
# Works for both torch and tf.
framework: tf
seed: 42
soft_horizon: false
no_done_at_end: true
framework: torch
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
73 changes: 38 additions & 35 deletions rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml
@@ -1,38 +1,41 @@
pendulum-sac-fake-gpus:
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -700
training_iteration: 200
config:
# Works for both torch and tf.
framework: tf
horizon: 200
soft_horizon: true
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 256
num_workers: 0
metrics_smoothing_episodes: 5
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -270
timesteps_total: 10000
config:
# Works for both torch and tf.
seed: 42
framework: tf
horizon: 200
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [ 256, 256 ]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [ 256, 256 ]
tau: 0.005
target_entropy: auto
no_done_at_end: true
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 256
num_workers: 0
metrics_smoothing_episodes: 5

# 1x batch size (despite 2 GPUs).
# train_batch_size: 256
optimization:
actor_learning_rate: 0.001
critic_learning_rate: 0.001
entropy_learning_rate: 0.001
# 1x batch size (despite 2 GPUs).
# train_batch_size: 256
optimization:
actor_learning_rate: 0.001
critic_learning_rate: 0.001
entropy_learning_rate: 0.001

# Fake 2 GPUs.
num_gpus: 2
_fake_gpus: true
# Fake 2 GPUs.
num_gpus: 2
_fake_gpus: true
7 changes: 4 additions & 3 deletions rllib/tuned_examples/sac/pendulum-sac.yaml
@@ -4,13 +4,14 @@ pendulum-sac:
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -600
episode_reward_mean: -150
timesteps_total: 10000
config:
# Works for both torch and tf.
seed: 42
framework: tf
horizon: 200
soft_horizon: true
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
@@ -20,7 +21,7 @@ pendulum-sac:
tau: 0.005
target_entropy: auto
no_done_at_end: true
n_step: 3
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml
@@ -1,14 +1,15 @@
# TransformedActionPendulum SAC can attain -150+ reward in 6-7k
# Configurations are similar to the original softlearning/sac codebase
pendulum-sac:
transformed-actions-pendulum-sac-dummy-torch:
env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum
run: SAC
stop:
episode_reward_mean: -500
episode_reward_mean: -200
timesteps_total: 10000
config:
# Works for both torch and tf.
framework: tf
seed: 42
framework: torch

# Test whether SAC is able to learn in "distorted" action spaces.
env_config:
@@ -17,7 +18,7 @@ pendulum-sac:
high: 500.0

horizon: 200
soft_horizon: true
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]