[RLlib] Preparatory PR for multi-agent multi-GPU learner (alpha-star style) #3 (#21652)
sven1977 committed Jan 25, 2022
1 parent b2cd123 commit d5bfb7b
Showing 43 changed files with 375 additions and 281 deletions.
45 changes: 26 additions & 19 deletions rllib/BUILD
@@ -81,7 +81,7 @@ py_test(
)

py_test(
name = "learning_cartpole_a2c_fake_gpus",
name = "learning_tests_cartpole_a2c_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -126,15 +126,22 @@ py_test(

# APPO
py_test(
name = "learning_tests_cartpole_appo",
name = "learning_tests_cartpole_appo_no_vtrace",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = [
"tuned_examples/ppo/cartpole-appo.yaml",
"tuned_examples/ppo/cartpole-appo-vtrace.yaml"
],
data = ["tuned_examples/ppo/cartpole-appo.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)

py_test(
name = "learning_tests_cartpole_appo_vtrace",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"],
size = "large",
srcs = ["tests/run_regression_tests.py"],
data = ["tuned_examples/ppo/cartpole-appo-vtrace.yaml"],
args = ["--yaml-dir=tuned_examples/ppo"]
)

@@ -151,7 +158,7 @@ py_test(
)

py_test(
name = "learning_frozenlake_appo",
name = "learning_tests_frozenlake_appo",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_discrete"],
size = "large",
@@ -161,7 +168,7 @@ py_test(
)

py_test(
name = "learning_cartpole_appo_fake_gpus",
name = "learning_tests_cartpole_appo_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -208,7 +215,7 @@ py_test(
)

py_test(
name = "learning_pendulum_ddpg_fake_gpus",
name = "learning_tests_pendulum_ddpg_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
size = "large",
@@ -263,7 +270,7 @@ py_test(
)

py_test(
name = "learning_cartpole_dqn_fake_gpus",
name = "learning_tests_cartpole_dqn_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -286,7 +293,7 @@ py_test(
)

py_test(
name = "learning_cartpole_simpleq_fake_gpus",
name = "learning_tests_cartpole_simpleq_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "medium",
@@ -318,7 +325,7 @@ py_test(
)

py_test(
name = "learning_cartpole_impala_fake_gpus",
name = "learning_tests_cartpole_impala_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -352,7 +359,7 @@ py_test(
)

py_test(
name = "learning_cartpole_pg_fake_gpus",
name = "learning_tests_cartpole_pg_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -403,7 +410,7 @@ py_test(
)

py_test(
name = "learning_cartpole_ppo_fake_gpus",
name = "learning_tests_cartpole_ppo_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete", "fake_gpus"],
size = "large",
@@ -455,7 +462,7 @@ py_test(
)

py_test(
name = "learning_stateless_cartpole_r2d2_fake_gpus",
name = "learning_tests_stateless_cartpole_r2d2_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_cartpole", "fake_gpus"],
size = "large",
@@ -506,7 +513,7 @@ py_test(
)

py_test(
name = "learning_pendulum_sac_fake_gpus",
name = "learning_tests_pendulum_sac_fake_gpus",
main = "tests/run_regression_tests.py",
tags = ["team:ml", "learning_tests", "learning_tests_pendulum", "learning_tests_continuous", "fake_gpus"],
size = "large",
@@ -845,7 +852,7 @@ py_test(
"--env", "Pendulum-v1",
"--run", "APEX_DDPG",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_iter_time_s\": 1, \"batch_mode\": \"complete_episodes\"}'",
"--config", "'{\"framework\": \"tf\", \"num_workers\": 2, \"optimizer\": {\"num_replay_buffer_shards\": 1}, \"learning_starts\": 100, \"min_time_s_per_reporting\": 1, \"batch_mode\": \"complete_episodes\"}'",
"--ray-num-cpus", "4",
]
)
@@ -928,7 +935,7 @@ py_test(
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0}'",
"--ray-num-cpus", "4",
]
)
@@ -942,7 +949,7 @@ py_test(
"--env", "CartPole-v0",
"--run", "IMPALA",
"--stop", "'{\"training_iteration\": 1}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_iter_time_s\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--config", "'{\"framework\": \"tf\", \"num_gpus\": 0, \"num_workers\": 2, \"min_time_s_per_reporting\": 1, \"num_multi_gpu_tower_stacks\": 2, \"replay_buffer_num_slots\": 100, \"replay_proportion\": 1.0, \"model\": {\"use_lstm\": true}}'",
"--ray-num-cpus", "4",
]
)
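
The split APPO targets above each stage a single tuned-example yaml and hand it to the regression-test runner. As a rough, non-authoritative sketch (flag and paths taken from the BUILD target above; outside the Bazel sandbox the script would see every yaml in the directory, not only the one listed under `data`), the local equivalent could look like:

```python
# Sketch only: run the APPO regression yaml(s) the way the Bazel target does.
# Assumes a checkout of the ray repo at this commit; adjust paths as needed.
import subprocess

subprocess.run(
    [
        "python",
        "tests/run_regression_tests.py",
        "--yaml-dir=tuned_examples/ppo",  # same flag as in the BUILD `args`
    ],
    cwd="rllib",  # the BUILD paths are relative to the rllib/ package
    check=True,
)
```
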
2 changes: 1 addition & 1 deletion rllib/agents/a3c/a2c.py
@@ -17,7 +17,7 @@
A3C_CONFIG,
{
"rollout_fragment_length": 20,
"min_iter_time_s": 10,
"min_time_s_per_reporting": 10,
"sample_async": False,

# A2C supports microbatching, in which we accumulate gradients over
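
For context, a minimal usage sketch of the renamed key (assuming this commit's API and a local `CartPole-v0`); the `microbatch_size` setting mirrors the A2C microbatching test further down and is not part of this hunk:

```python
# Minimal sketch: A2C with the renamed reporting key and gradient microbatching.
import ray
import ray.rllib.agents.a3c as a3c

ray.init()
config = {
    "min_time_s_per_reporting": 0,  # renamed from `min_iter_time_s` in this PR
    "microbatch_size": 10,          # accumulate gradients over 10-sample microbatches
}
trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
print(trainer.train()["episode_reward_mean"])
```
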
4 changes: 2 additions & 2 deletions rllib/agents/a3c/a3c.py
@@ -39,8 +39,8 @@
"entropy_coeff": 0.01,
# Entropy coefficient schedule
"entropy_coeff_schedule": None,
# Min time per iteration
"min_iter_time_s": 5,
# Min time per reporting
"min_time_s_per_reporting": 5,
# Workers sample async. Note that this increases the effective
# rollout_fragment_length by up to 5x due to async buffering of batches.
"sample_async": True,
4 changes: 2 additions & 2 deletions rllib/agents/a3c/tests/test_a2c.py
@@ -35,7 +35,7 @@ def test_a2c_compilation(self):
trainer.stop()

def test_a2c_exec_impl(ray_start_regular):
config = {"min_iter_time_s": 0}
config = {"min_time_s_per_reporting": 0}
for _ in framework_iterator(config):
trainer = a3c.A2CTrainer(env="CartPole-v0", config=config)
results = trainer.train()
@@ -46,7 +46,7 @@ def test_a2c_exec_impl(ray_start_regular):

def test_a2c_exec_impl_microbatch(ray_start_regular):
config = {
"min_iter_time_s": 0,
"min_time_s_per_reporting": 0,
"microbatch_size": 10,
}
for _ in framework_iterator(config):
2 changes: 1 addition & 1 deletion rllib/agents/a3c/tests/test_a3c.py
@@ -51,7 +51,7 @@ def test_a3c_entropy_coeff_schedule(self):
config["timesteps_per_iteration"] = 20
# 0 metrics reporting delay, this makes sure timestep,
# which entropy coeff depends on, is updated after each worker rollout.
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
# Initial lr, doesn't really matter because of the schedule below.
config["entropy_coeff"] = 0.01
schedule = [
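
The schedule this test builds is a list of `[timestep, value]` pairs. A hedged illustration (placeholder numbers, not necessarily the test's exact values) of such a config:

```python
# Illustrative only: entropy-coefficient schedule as [timestep, value] pairs,
# with reporting delay 0 so the schedule is exercised after every rollout.
import ray.rllib.agents.a3c as a3c

config = a3c.DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["min_time_s_per_reporting"] = 0  # renamed from `min_iter_time_s`
config["entropy_coeff"] = 0.01          # initial value (start of the schedule)
config["entropy_coeff_schedule"] = [
    [0, 0.01],      # placeholder: start at 0.01 ...
    [100, 0.0001],  # ... and anneal to 0.0001 by timestep 100
]
```
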
42 changes: 28 additions & 14 deletions rllib/agents/ars/ars.py
@@ -228,30 +228,44 @@ def validate_config(self, config: TrainerConfigDict) -> None:
"`NoFilter` for ARS!")

@override(Trainer)
def _init(self, config, env_creator):
self.validate_config(config)
env_context = EnvContext(config["env_config"] or {}, worker_index=0)
env = env_creator(env_context)
def setup(self, config):
# Set up our config: Merge the user-supplied config (which could
# be a partial config dict) with the class's default config.
self.config = self.merge_trainer_configs(
self.get_default_config(), config, self._allow_unknown_configs)

self._policy_class = get_policy_class(config)
# Validate our config dict.
self.validate_config(self.config)

# Generate `self.env_creator` callable to create an env instance.
self.env_creator = self._get_env_creator_from_env_id(self._env_id)
# Generate the local env.
env_context = EnvContext(
self.config["env_config"] or {}, worker_index=0)
env = self.env_creator(env_context)

self.callbacks = self.config["callbacks"]()

self._policy_class = get_policy_class(self.config)
self.policy = self._policy_class(env.observation_space,
env.action_space, config)
self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])
env.action_space, self.config)
self.optimizer = optimizers.SGD(self.policy,
self.config["sgd_stepsize"])

self.rollouts_used = config["rollouts_used"]
self.num_rollouts = config["num_rollouts"]
self.report_length = config["report_length"]
self.rollouts_used = self.config["rollouts_used"]
self.num_rollouts = self.config["num_rollouts"]
self.report_length = self.config["report_length"]

# Create the shared noise table.
logger.info("Creating shared noise table.")
noise_id = create_shared_noise.remote(config["noise_size"])
noise_id = create_shared_noise.remote(self.config["noise_size"])
self.noise = SharedNoiseTable(ray.get(noise_id))

# Create the actors.
logger.info("Creating actors.")
self.workers = [
Worker.remote(config, env_creator, noise_id, idx + 1)
for idx in range(config["num_workers"])
Worker.remote(self.config, self.env_creator, noise_id, idx + 1)
for idx in range(self.config["num_workers"])
]

self.episodes_so_far = 0
@@ -375,7 +389,7 @@ def compute_single_action(self, observation, *args, **kwargs):
return action[0], [], {}
return action[0]

@Deprecated(new="compute_single_action", error=False)
@Deprecated(new="compute_single_action", error=True)
def compute_action(self, observation, *args, **kwargs):
return self.compute_single_action(observation, *args, **kwargs)

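
A short sketch of the user-facing effect of the hardened deprecation above (`error=True`), assuming this commit's API and the small noise-table setting used in `test_ars.py` below:

```python
# Sketch: the old `compute_action` alias now raises; use `compute_single_action`.
import gym
import ray
import ray.rllib.agents.ars as ars

ray.init()
config = ars.DEFAULT_CONFIG.copy()
config["num_workers"] = 2
config["noise_size"] = 2500000  # small shared noise table (as in test_ars.py)
trainer = ars.ARSTrainer(config=config, env="CartPole-v0")

obs = gym.make("CartPole-v0").reset()
action = trainer.compute_single_action(obs)  # supported path
# trainer.compute_action(obs)  # would now raise a deprecation error (error=True)
```
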
3 changes: 2 additions & 1 deletion rllib/agents/ars/tests/test_ars.py
@@ -22,7 +22,8 @@ def test_ars_compilation(self):
config["model"]["fcnet_hiddens"] = [10]
config["model"]["fcnet_activation"] = None
config["noise_size"] = 2500000
# Test eval workers ("normal" Trainer eval WorkerSet, unusual for ARS).
# Test eval workers ("normal" WorkerSet, unlike ARS' list of
# RolloutWorkers used for collecting train batches).
config["evaluation_interval"] = 1
config["evaluation_num_workers"] = 1

2 changes: 1 addition & 1 deletion rllib/agents/ddpg/apex.py
@@ -38,7 +38,7 @@
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
"min_time_s_per_reporting": 30,
},
_allow_unknown_configs=True,
)
4 changes: 2 additions & 2 deletions rllib/agents/ddpg/ddpg.py
@@ -171,8 +171,8 @@
"num_workers": 0,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
# Prevent reporting frequency from going lower than this time span.
"min_time_s_per_reporting": 1,
})
# __sphinx_doc_end__
# yapf: enable
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_apex_ddpg.py
@@ -20,7 +20,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
config["num_workers"] = 2
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["learning_starts"] = 0
config["optimizer"]["num_replay_buffer_shards"] = 1
num_iterations = 1
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/tests/test_ddpg.py
@@ -154,7 +154,7 @@ def test_ddpg_loss_function(self):
config["actor_hiddens"] = [10]
config["critic_hiddens"] = [10]
# Make sure, timing differences do not affect trainer.train().
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
config["timesteps_per_iteration"] = 100

map_ = {
2 changes: 1 addition & 1 deletion rllib/agents/dqn/apex.py
@@ -78,7 +78,7 @@
"timesteps_per_iteration": 25000,
"exploration_config": {"type": "PerWorkerEpsilonGreedy"},
"worker_side_prioritization": True,
"min_iter_time_s": 30,
"min_time_s_per_reporting": 30,
# If set, this will fix the ratio of replayed-from-buffer (i.e., learned-on)
# timesteps to sampled-from-environment (i.e., newly stored) timesteps.
# Otherwise, replay will proceed as fast as possible.
4 changes: 2 additions & 2 deletions rllib/agents/dqn/simple_q.py
@@ -103,8 +103,8 @@
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Prevent iterations from going lower than this time span.
"min_iter_time_s": 1,
# Prevent reporting frequency from going lower than this time span.
"min_time_s_per_reporting": 1,
})
# __sphinx_doc_end__
# yapf: enable
6 changes: 3 additions & 3 deletions rllib/agents/dqn/tests/test_apex_dqn.py
@@ -24,7 +24,7 @@ def test_apex_zero_workers(self):
config["learning_starts"] = 1000
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["optimizer"]["num_replay_buffer_shards"] = 1
for _ in framework_iterator(config):
trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
@@ -41,7 +41,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
config["learning_starts"] = 1000
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_iter_time_s"] = 1
config["min_time_s_per_reporting"] = 1
config["optimizer"]["num_replay_buffer_shards"] = 1

for _ in framework_iterator(config, with_eager_tracing=True):
@@ -81,7 +81,7 @@ def test_apex_lr_schedule(self):
config["timesteps_per_iteration"] = 10
# 0 metrics reporting delay, this makes sure timestep,
# which lr depends on, is updated after each worker rollout.
config["min_iter_time_s"] = 0
config["min_time_s_per_reporting"] = 0
config["optimizer"]["num_replay_buffer_shards"] = 1
# This makes sure learning schedule is checked every 10 timesteps.
config["optimizer"]["max_weight_sync_delay"] = 10
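
As a closing sketch (assumptions: this commit's rllib, enough local CPUs for two rollout workers plus the replay and learner actors), the Ape-X DQN settings exercised in the tests above translate into a plain config with the renamed reporting key in place of `min_iter_time_s`:

```python
# Hedged example mirroring the Ape-X DQN test settings shown above.
import ray
import ray.rllib.agents.dqn.apex as apex

ray.init()
config = apex.APEX_DEFAULT_CONFIG.copy()
config["num_workers"] = 2
config["learning_starts"] = 1000
config["prioritized_replay"] = True
config["timesteps_per_iteration"] = 100
config["min_time_s_per_reporting"] = 1             # renamed from `min_iter_time_s`
config["optimizer"]["num_replay_buffer_shards"] = 1

trainer = apex.ApexTrainer(config=config, env="CartPole-v0")
print(trainer.train()["timesteps_total"])
trainer.stop()
```
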
