Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RLlib] Preparatory PR for multi-agent multi-GPU learner (alpha-star style) #03 #21652

Merged
45 changes: 26 additions & 19 deletions rllib/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ py_test(
)

py_test(
name = "learning_cartpole_a2c_fake_gpus",
name = "learning_tests_cartpole_a2c_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -126,15 +126,22 @@ py_test(

# APPO
py_test(
name = "learning_tests_cartpole_appo",
name = "learning_tests_cartpole_appo_no_vtrace",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
data = ["tuned_examples/ppo/cartpole-appo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)

py_test(
name = "learning_tests_cartpole_appo_vtrace",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-appo-vtrace.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)

Expand All @@ -151,7 +158,7 @@ py_test(
)

py_test(
name = "learning_frozenlake_appo",
name = "learning_tests_frozenlake_appo",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
Expand All @@ -161,7 +168,7 @@ py_test(
)

py_test(
name = "learning_cartpole_appo_fake_gpus",
name = "learning_tests_cartpole_appo_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -208,7 +215,7 @@ py_test(
)

py_test(
name = "learning_pendulum_ddpg_fake_gpus",
name = "learning_tests_pendulum_ddpg_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -263,7 +270,7 @@ py_test(
)

py_test(
name = "learning_cartpole_dqn_fake_gpus",
name = "learning_tests_cartpole_dqn_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand All @@ -286,7 +293,7 @@ py_test(
)

py_test(
name = "learning_cartpole_simpleq_fake_gpus",
name = "learning_tests_cartpole_simpleq_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "medium",
Expand Down Expand Up @@ -318,7 +325,7 @@ py_test(
)

py_test(
name = "learning_cartpole_impala_fake_gpus",
name = "learning_tests_cartpole_impala_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -352,7 +359,7 @@ py_test(
)

py_test(
name = "learning_cartpole_pg_fake_gpus",
name = "learning_tests_cartpole_pg_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -403,7 +410,7 @@ py_test(
)

py_test(
name = "learning_cartpole_ppo_fake_gpus",
name = "learning_tests_cartpole_ppo_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -455,7 +462,7 @@ py_test(
)

py_test(
name = "learning_stateless_cartpole_r2d2_fake_gpus",
name = "learning_tests_stateless_cartpole_r2d2_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -506,7 +513,7 @@ py_test(
)

py_test(
name = "learning_pendulum_sac_fake_gpus",
name = "learning_tests_pendulum_sac_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
size = "large",
Expand Down Expand Up @@ -845,7 +852,7 @@ py_test(
"--env", "Pendulum-v1",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\"}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_time_s_per_reporting\": 1, \"batch_mode\": \"complete_episodes\"}'",
"--ray-num-cpus", "4",
]
)
Expand Down Expand Up @@ -928,7 +935,7 @@ py_test(
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
Expand All @@ -942,7 +949,7 @@ py_test(
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/a3c/a2c.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
A3C_CONFIG,
{
"rollout_fragment_length": 20,
"min_iter_time_s": 10,
"min_time_s_per_reporting": 10,
"sample_async": False,

# A2C supports microbatching, in which we accumulate gradients over
Expand Down
4 changes: 2 additions & 2 deletions rllib/agents/a3c/a3c.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
"entropy_coeff": 0.01,
# Entropy coefficient schedule
"entropy_coeff_schedule": None,
# Min time per iteration
"min_iter_time_s": 5,
# Min time per reporting
"min_time_s_per_reporting": 5,
# Workers sample async. Note that this increases the effective
# rollout_fragment_length by up to 5x due to async buffering of batches.
"sample_async": True,
Expand Down
4 changes: 2 additions & 2 deletions rllib/agents/a3c/tests/test_a2c.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_a2c_compilation(self):
trainer.stop()

def test_a2c_exec_impl(ray_start_regular):
config = {"min_iter_time_s": 0}
config = {"min_time_s_per_reporting": 0}
for _ in framework_iterator(config):
trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
results = trainer.train()
Expand All @@ -46,7 +46,7 @@ def test_a2c_exec_impl(ray_start_regular):

def test_a2c_exec_impl_microbatch(ray_start_regular):
config = {
"min_iter_time_s": 0,
"min_time_s_per_reporting": 0,
"microbatch_size": 10,
}
for _ in framework_iterator(config):
Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/a3c/tests/test_a3c.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_a3c_entropy_coeff_schedule(self):
config["timesteps_per_iteration"] = 20
# Zero metrics-reporting delay: this makes sure the timestep,
# which the entropy coeff depends on, is updated after each worker rollout.
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
# Initial lr, doesn't really matter because of the schedule below.
config["entropy_coeff"] = 0.01
schedule = [
Expand Down
42 changes: 28 additions & 14 deletions rllib/agents/ars/ars.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,30 +228,44 @@ def validate_config(self, config: TrainerConfigDict) -> None:
"`NoFilter` for ARS!")

@override(Trainer)
def _init(self, config, env_creator):
self.validate_config(config)
env_context = EnvContext(config["env_config"] or {}, worker_index=0)
env = env_creator(env_context)
def setup(self, config):
# Set up our config: Merge the user-supplied config (which could
# be a partial config dict) with the class' default.
self.config = self.merge_trainer_configs(
self.get_default_config(), config, self._allow_unknown_configs)

self._policy_class = get_policy_class(config)
# Validate our config dict.
self.validate_config(self.config)

# Generate `self.env_creator` callable to create an env instance.
self.env_creator = self._get_env_creator_from_env_id(self._env_id)
# Generate the local env.
env_context = EnvContext(
self.config["env_config"] or {}, worker_index=0)
env = self.env_creator(env_context)

self.callbacks = self.config["callbacks"]()

self._policy_class = get_policy_class(self.config)
self.policy = self._policy_class(env.observation_space,
env.action_space, config)
self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])
env.action_space, self.config)
self.optimizer = optimizers.SGD(self.policy,
self.config["sgd_stepsize"])

self.rollouts_used = config["rollouts_used"]
self.num_rollouts = config["num_rollouts"]
self.report_length = config["report_length"]
self.rollouts_used = self.config["rollouts_used"]
self.num_rollouts = self.config["num_rollouts"]
self.report_length = self.config["report_length"]

# Create the shared noise table.
logger.info("Creating shared noise table.")
noise_id = create_shared_noise.remote(config["noise_size"])
noise_id = create_shared_noise.remote(self.config["noise_size"])
self.noise = SharedNoiseTable(ray.get(noise_id))

# Create the actors.
logger.info("Creating actors.")
self.workers = [
Worker.remote(config, env_creator, noise_id, idx + 1)
for idx in range(config["num_workers"])
Worker.remote(self.config, self.env_creator, noise_id, idx + 1)
for idx in range(self.config["num_workers"])
]

self.episodes_so_far = 0
Expand Down Expand Up @@ -375,7 +389,7 @@ def compute_single_action(self, observation, *args, **kwargs):
return action[0], [], {}
return action[0]

@Deprecated(new="compute_single_action", error=False)
@Deprecated(new="compute_single_action", error=True)
def compute_action(self, observation, *args, **kwargs):
return self.compute_single_action(observation, *args, **kwargs)

Expand Down
3 changes: 2 additions & 1 deletion rllib/agents/ars/tests/test_ars.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def test_ars_compilation(self):
config["model"]["fcnet_hiddens"] = [10]
config["model"]["fcnet_activation"] = None
config["noise_size"] = 2500000
# Test eval workers ("normal" Trainer eval WorkerSet, unusual for ARS).
# Test eval workers ("normal" WorkerSet, unlike ARS' list of
# RolloutWorkers used for collecting train batches).
config["evaluation_interval"] = 1
config["evaluation_num_workers"] = 1

Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/apex.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
"min_time_s_per_reporting": 30,
},
_allow_unknown_configs=True,
)
Expand Down
4 changes: 2 additions & 2 deletions rllib/agents/ddpg/ddpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@
"num_workers": 0,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
# Prevent reporting frequency from going lower than this time span.
"min_time_s_per_reporting": 1,
})
# __sphinx_doc_end__
# yapf: enable
Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_apex_ddpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
config["num_workers"] = 2
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["learning_starts"] = 0
config["optimizer"]["num_replay_buffer_shards"] = 1
num_iterations = 1
Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_ddpg.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_ddpg_loss_function(self):
config["actor_hiddens"] = [10]
config["critic_hiddens"] = [10]
# Make sure timing differences do not affect trainer.train().
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
config["timesteps_per_iteration"] = 100

map_ = {
Expand Down
2 changes: 1 addition & 1 deletion rllib/agents/dqn/apex.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
"timesteps_per_iteration": 25000,
"exploration_config": {"type": "PerWorkerEpsilonGreedy"},
"worker_side_prioritization": True,
"min_iter_time_s": 30,
"min_time_s_per_reporting": 30,
# If set, this will fix the ratio of replayed from a buffer and learned
# on timesteps to sampled from an environment and stored in the replay
# buffer timesteps. Otherwise, replay will proceed as fast as possible.
Expand Down
4 changes: 2 additions & 2 deletions rllib/agents/dqn/simple_q.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Prevent iterations from going lower than this time span.
"min_iter_time_s": 1,
# Prevent reporting frequency from going lower than this time span.
"min_time_s_per_reporting": 1,
})
# __sphinx_doc_end__
# yapf: enable
Expand Down
6 changes: 3 additions & 3 deletions rllib/agents/dqn/tests/test_apex_dqn.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_apex_zero_workers(self):
config["learning_starts"] = 1000
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["optimizer"]["num_replay_buffer_shards"] = 1
for _ in framework_iterator(config):
trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
Expand All @@ -41,7 +41,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
config["learning_starts"] = 1000
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["optimizer"]["num_replay_buffer_shards"] = 1

for _ in framework_iterator(config, with_eager_tracing=True):
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_apex_lr_schedule(self):
config["timesteps_per_iteration"] = 10
# Zero metrics-reporting delay: this makes sure the timestep,
# which the lr depends on, is updated after each worker rollout.
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
config["optimizer"]["num_replay_buffer_shards"] = 1
# This makes sure learning schedule is checked every 10 timesteps.
config["optimizer"]["max_weight_sync_delay"] = 10
Expand Down