[RLlib] [CI] Deflake longer running RLlib learning tests for off policy algorithms. Fix seeding issue in TransformedAction Environments #21685

Merged 18 commits on Feb 4, 2022
14 changes: 13 additions & 1 deletion .buildkite/pipeline.ml.yml
@@ -35,9 +35,21 @@
- RLLIB_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options)
--build_tests_only
--test_tag_filters=learning_tests_continuous,-fake_gpus,-torch_only,-multi_gpu
--test_tag_filters=learning_tests_continuous,-fake_gpus,-torch_only,-multi_gpu,-gpu,-learning_tests_continuous_tf2_eager_off_policy
--test_arg=--framework=tf2
rllib/...

- label: ":brain: RLlib: Learning cont. actions TF2-eager-tracing Off Policy (from rllib/tuned_examples/*.yaml)"
conditions: ["RAY_CI_RLLIB_AFFECTED"]
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/travis/upload_build_info.sh; fi }; trap cleanup EXIT
- RLLIB_TESTING=1 PYTHON=3.7 ./ci/travis/install-dependencies.sh
- bazel test --config=ci $(./scripts/bazel_export_options)
--build_tests_only
--test_tag_filters=learning_tests_continuous_tf2_eager_off_policy,-fake_gpus,-torch_only,-multi_gpu,-gpu
--test_arg=--framework=tf2
rllib/...

- label: ":brain: RLlib: Learning discr. actions TF1-static-graph (from rllib/tuned_examples/*.yaml)"
conditions: ["RAY_CI_RLLIB_DIRECTLY_AFFECTED"]
commands:
31 changes: 31 additions & 0 deletions rllib/BUILD
@@ -225,6 +225,16 @@ py_test(
args = ["--yaml-dir=tuned_examples/ddpg"]
)

py_test(
name = "learning_tests_pendulum_ddpg_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = glob(["tuned_examples/ddpg/pendulum-ddpg.yaml"]),
args = ["--yaml-dir=tuned_examples/ddpg", "--override-mean-reward=-750.0"]
)

py_test(
name = "learning_tests_pendulum_ddpg_fake_gpus",
main = "tests/run_regression_tests.py",
@@ -523,6 +533,27 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac"]
)

py_test(
name = "learning_tests_pendulum_sac_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac", "--override-mean-reward=-900.0"]
)

py_test(
name = "learning_tests_transformed_actions_pendulum_sac_tf2",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests_continuous_tf2_eager_off_policy"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/sac/pendulum-transformed-actions-sac.yaml"],
args = ["--yaml-dir=tuned_examples/sac" ,"--override-mean-reward=-850.0"]
)


py_test(
name = "learning_tests_pendulum_sac_fake_gpus",
main = "tests/run_regression_tests.py",
50 changes: 16 additions & 34 deletions rllib/examples/env/transformed_action_space_env.py
@@ -1,8 +1,6 @@
import gym
from typing import Type

from ray.rllib.utils.annotations import override


class ActionTransform(gym.ActionWrapper):
def __init__(self, env, low, high):
@@ -27,11 +25,11 @@ def transform_action_space(env_name_or_creator) -> Type[gym.Env]:
env_maker function.

Returns:
New TransformedActionSpaceEnv class
to be used as env. The constructor takes a config dict with `_low`
and `_high` keys specifying the new action range
(default -1.0 to 1.0). The reset of the config dict will be
passed on to the underlying/wrapped env's constructor.
New transformed_action_space_env function that returns an environment
wrapped by the ActionTransform wrapper. The function takes a
config dict with `low` and `high` keys specifying the new action
range (default -1.0 to 1.0). The rest of the config dict will be
passed on to the underlying/wrapped env's constructor.

Examples:
>>> # By gym string:
@@ -42,33 +40,17 @@ def transform_action_space(env_name_or_creator) -> Type[gym.Env]:
... gym.spaces.Box(-15.0, 1.0, (1, ), "float32")
"""

class TransformedActionSpaceEnv(gym.Env):
"""PendulumEnv w/ an action space of range 300.0 to 500.0."""

def __init__(self, config):
self._low = config.pop("low", -1.0)
self._high = config.pop("high", 1.0)
if isinstance(env_name_or_creator, str):
self.env = gym.make(env_name_or_creator)
else:
self.env = env_name_or_creator(config)
self.env = ActionTransform(self.env, self._low, self._high)
self.observation_space = self.env.observation_space
self.action_space = self.env.action_space

@override(gym.Env)
def reset(self):
return self.env.reset()

@override(gym.Env)
def step(self, actions):
return self.env.step(actions)

@override(gym.Env)
def render(self, mode=None):
return self.env.render(mode)

return TransformedActionSpaceEnv
def transformed_action_space_env(config):
if isinstance(env_name_or_creator, str):
inner_env = gym.make(env_name_or_creator)
else:
inner_env = env_name_or_creator(config)
_low = config.pop("low", -1.0)
_high = config.pop("high", 1.0)
env = ActionTransform(inner_env, _low, _high)
return env

return transformed_action_space_env


TransformedActionPendulum = transform_action_space("Pendulum-v1")
17 changes: 16 additions & 1 deletion rllib/tests/run_regression_tests.py
@@ -46,7 +46,16 @@
action="store_true",
help="Run ray in local mode for easier debugging.",
)

parser.add_argument(
"--override-mean-reward",
type=float,
default=0.0,
help=(
"Override "
"the mean reward specified by the yaml file in the stopping criteria. This "
"is particularly useful for timed tests."
),
)
# Obsoleted arg, use --framework=torch instead.
parser.add_argument(
"--torch", action="store_true", help="Runs all tests with PyTorch enabled."
@@ -92,6 +101,12 @@
exp = list(experiments.values())[0]
exp["config"]["framework"] = args.framework

# Override the mean reward if specified. This is used by the Ray CI to
# relax the episode-reward-mean stopping criterion for long-running
# off-policy TF2 learning tests such as SAC and DDPG on Pendulum.
if args.override_mean_reward != 0.0:
exp["stop"]["episode_reward_mean"] = args.override_mean_reward

# QMIX does not support tf yet -> skip.
if exp["run"] == "QMIX" and args.framework != "torch":
print(f"Skipping framework='{args.framework}' for QMIX.")
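
To make the new flag concrete, here is a stand-alone sketch (not part of the diff) of how it interacts with an experiment's stop criteria. The argument values mirror the learning_tests_pendulum_sac_tf2 BUILD target added above, the stop values are taken from pendulum-sac.yaml further down, and the parser is a simplified stand-in for the script's own.

```python
# Stand-alone sketch of the --override-mean-reward behavior added above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--yaml-dir", type=str, default="")
parser.add_argument("--override-mean-reward", type=float, default=0.0)

# Args as passed by the learning_tests_pendulum_sac_tf2 BUILD target:
args = parser.parse_args(
    ["--yaml-dir=tuned_examples/sac", "--override-mean-reward=-900.0"]
)

# Stop criteria as defined in pendulum-sac.yaml:
stop = {"episode_reward_mean": -150, "timesteps_total": 10000}

# A non-zero override replaces the yaml's mean-reward criterion, so the timed
# CI test can stop at a looser reward threshold.
if args.override_mean_reward != 0.0:
    stop["episode_reward_mean"] = args.override_mean_reward

print(stop)  # {'episode_reward_mean': -900.0, 'timesteps_total': 10000}
```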
9 changes: 6 additions & 3 deletions rllib/tuned_examples/ddpg/pendulum-ddpg-fake-gpus.yaml
@@ -2,10 +2,11 @@ pendulum-ddpg-fake-gpus:
env: Pendulum-v1
run: DDPG
stop:
episode_reward_mean: -700
timesteps_total: 100000
episode_reward_mean: -1000
timesteps_total: 40000
config:
# Works for both torch and tf.
seed: 42
framework: tf
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
@@ -21,7 +22,9 @@ pendulum-ddpg-fake-gpus:
learning_starts: 500
train_batch_size: 64
num_workers: 0
worker_side_prioritization: False
worker_side_prioritization: false
actor_lr: 0.0001
critic_lr: 0.0001

# Fake 2 GPUs.
num_gpus: 2
9 changes: 6 additions & 3 deletions rllib/tuned_examples/ddpg/pendulum-ddpg.yaml
@@ -3,11 +3,14 @@ pendulum-ddpg:
env: Pendulum-v1
run: DDPG
stop:
episode_reward_mean: -600
timesteps_total: 100000
episode_reward_mean: -320
timesteps_total: 30000
config:
# Works for both torch and tf.
framework: tf
seed: 42
soft_horizon: false
no_done_at_end: true
framework: torch
# === Model ===
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
73 changes: 38 additions & 35 deletions rllib/tuned_examples/sac/pendulum-sac-fake-gpus.yaml
@@ -1,38 +1,41 @@
pendulum-sac-fake-gpus:
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -700
training_iteration: 200
config:
# Works for both torch and tf.
framework: tf
horizon: 200
soft_horizon: true
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
tau: 0.005
target_entropy: auto
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 256
num_workers: 0
metrics_smoothing_episodes: 5
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -270
timesteps_total: 10000
config:
# Works for both torch and tf.
seed: 42
framework: tf
horizon: 200
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [ 256, 256 ]
policy_model:
fcnet_activation: relu
fcnet_hiddens: [ 256, 256 ]
tau: 0.005
target_entropy: auto
no_done_at_end: true
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
target_network_update_freq: 1
timesteps_per_iteration: 1000
learning_starts: 256
num_workers: 0
metrics_smoothing_episodes: 5

# 1x batch size (despite 2 GPUs).
# train_batch_size: 256
optimization:
actor_learning_rate: 0.001
critic_learning_rate: 0.001
entropy_learning_rate: 0.001
# 1x batch size (despite 2 GPUs).
# train_batch_size: 256
optimization:
actor_learning_rate: 0.001
critic_learning_rate: 0.001
entropy_learning_rate: 0.001

# Fake 2 GPUs.
num_gpus: 2
_fake_gpus: true
# Fake 2 GPUs.
num_gpus: 2
_fake_gpus: true
7 changes: 4 additions & 3 deletions rllib/tuned_examples/sac/pendulum-sac.yaml
@@ -4,13 +4,14 @@ pendulum-sac:
env: Pendulum-v1
run: SAC
stop:
episode_reward_mean: -600
episode_reward_mean: -150
timesteps_total: 10000
config:
# Works for both torch and tf.
seed: 42
framework: tf
horizon: 200
soft_horizon: true
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]
@@ -20,7 +21,7 @@ pendulum-sac:
tau: 0.005
target_entropy: auto
no_done_at_end: true
n_step: 3
n_step: 1
rollout_fragment_length: 1
prioritized_replay: true
train_batch_size: 256
rllib/tuned_examples/sac/pendulum-transformed-actions-sac.yaml
@@ -1,14 +1,15 @@
# TransformedActionPendulum SAC can attain -150+ reward in 6-7k
# Configurations are similar to the original softlearning/sac codebase
pendulum-sac:
transformed-actions-pendulum-sac-dummy-torch:
env: ray.rllib.examples.env.transformed_action_space_env.TransformedActionPendulum
run: SAC
stop:
episode_reward_mean: -500
episode_reward_mean: -200
timesteps_total: 10000
config:
# Works for both torch and tf.
framework: tf
seed: 42
framework: torch

# Test whether SAC is able to learn in "distorted" action spaces.
env_config:
@@ -17,7 +18,7 @@ pendulum-sac:
high: 500.0

horizon: 200
soft_horizon: true
soft_horizon: false
Q_model:
fcnet_activation: relu
fcnet_hiddens: [256, 256]