[RLlib] Fix all example scripts to run on GPUs. #11105

Merged
Changes from 27 commits

Commits (29):
74ef8f4  WIP (sven1977, Aug 26, 2020)
c82ae60  Merge branch 'master' of https://github.com/ray-project/ray (sven1977, Aug 29, 2020)
8ac9dd0  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Aug 31, 2020)
a843af7  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 2, 2020)
e1266bb  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 3, 2020)
cebe1d9  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 4, 2020)
86bfeb6  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 5, 2020)
903fae9  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 6, 2020)
13509d2  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 9, 2020)
ecbf30a  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 9, 2020)
8341c6d  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 11, 2020)
07ad4f2  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 16, 2020)
b917d46  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 17, 2020)
1f40e0a  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 17, 2020)
a1b73c5  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 17, 2020)
9fc078b  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 18, 2020)
22d3f43  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 18, 2020)
998f0db  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 19, 2020)
cef2368  Merge branch 'master' of https://github.com/ray-project/ray (sven1977, Sep 20, 2020)
d2f9a52  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 21, 2020)
dfbaafb  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 24, 2020)
52b7ad0  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 27, 2020)
0353bfc  Merge branch 'master' of https://github.com/ray-project/ray (sven1977, Sep 27, 2020)
311fc90  Merge branch 'master' of https://github.com/ray-project/ray (sven1977, Sep 28, 2020)
6525cf1  Merge branch 'master' of https://github.com/ray-project/ray into master (sven1977, Sep 28, 2020)
812f4ae  WIP. (sven1977, Sep 28, 2020)
a9aef70  Merge https://github.com/ray-project/ray into fix_example_scripts_to_… (sven1977, Sep 29, 2020)
ea412f9  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Oct 2, 2020)
700bc8a  LINT. (sven1977, Oct 2, 2020)
6 changes: 0 additions & 6 deletions rllib/agents/ddpg/apex.py
@@ -28,13 +28,7 @@
)


def validate_config(config):
if config.get("framework") == "tfe":
raise ValueError("APEX_DDPG does not support tf-eager yet!")


ApexDDPGTrainer = DDPGTrainer.with_updates(
name="APEX_DDPG",
default_config=APEX_DDPG_DEFAULT_CONFIG,
validate_config=validate_config,
execution_plan=apex_execution_plan)
5 changes: 4 additions & 1 deletion rllib/agents/ppo/ppo_tf_policy.py
@@ -251,7 +251,10 @@ def __init__(self, config):
self.kl_coeff_val = config["kl_coeff"]
# The current KL value (as tf Variable for in-graph operations).
self.kl_coeff = get_variable(
float(self.kl_coeff_val), tf_name="kl_coeff", trainable=False)
float(self.kl_coeff_val),
tf_name="kl_coeff",
trainable=False,
framework=config["framework"])
# Constant target value.
self.kl_target = config["kl_target"]

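For context on the `framework` argument added above: `get_variable` (from `ray.rllib.utils.framework`) needs to know the active backend so the KL coefficient becomes a tf Variable only when a tf graph is actually in use; for other frameworks it can fall back to a plain value (the exact non-tf behavior is an assumption here, not something this diff shows). A sketch of the updated call in isolation:

# Sketch only; mirrors the call in ppo_tf_policy.py above.
from ray.rllib.utils.framework import get_variable

kl_coeff = get_variable(
    0.2,                  # initial KL coefficient value
    tf_name="kl_coeff",   # name used if a tf Variable is created
    trainable=False,      # never updated directly by the optimizer
    framework="tf")       # now passed explicitly, per this PR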
2 changes: 1 addition & 1 deletion rllib/agents/ppo/tests/test_ppo.py
@@ -37,7 +37,7 @@
class TestPPO(unittest.TestCase):
@classmethod
def setUpClass(cls):
ray.init(local_mode=True)
ray.init()

@classmethod
def tearDownClass(cls):
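Several scripts in this PR also drop `local_mode=True` from `ray.init()`, as in the test above. Local mode runs all tasks serially inside the driver process, which is handy for debugging but can hide GPU placement and parallelism issues; the default multi-process mode is closer to how the examples run in CI. A quick reminder of the two forms:

import ray

# Default: workers run as separate processes (what the examples now use).
ray.init()

# Debug-only alternative: run everything serially in the driver process.
# ray.init(local_mode=True)

ray.shutdown()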
3 changes: 2 additions & 1 deletion rllib/agents/sac/tests/test_sac.py
@@ -166,7 +166,8 @@ def test_sac_loss_function(self):

# Set all weights (of all nets) to fixed values.
if weights_dict is None:
assert fw in ["tf", "tfe"] # Start with the tf vars-dict.
# Start with the tf vars-dict.
assert fw in ["tf2", "tf", "tfe"]
weights_dict = policy.get_weights()
if fw == "tfe":
log_alpha = weights_dict[10]
3 changes: 3 additions & 0 deletions rllib/examples/attention_net.py
@@ -1,4 +1,5 @@
import argparse
import os

import ray
from ray import tune
@@ -42,6 +43,8 @@
"repeat_delay": 2,
},
"gamma": 0.99,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", 0)),
"num_workers": 0,
"num_envs_per_worker": 20,
"entropy_coeff": 0.001,
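The `num_gpus` override above is the pattern this PR applies across the example scripts: read the GPU count from the `RLLIB_NUM_GPUS` environment variable (presumably set by the GPU CI job) and default to 0 so the scripts still run on CPU-only machines. A minimal standalone sketch of the idiom, with a hypothetical launch command in the comment:

import os

from ray import tune

config = {
    "env": "CartPole-v0",
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
    "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
    "framework": "tf",
}

# Hypothetical launch: RLLIB_NUM_GPUS=1 python attention_net.py
tune.run("PPO", config=config, stop={"training_iteration": 1}, verbose=1)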
6 changes: 4 additions & 2 deletions rllib/examples/autoregressive_action_dist.py
@@ -11,6 +11,7 @@
"""

import argparse
import os

import ray
from ray import tune
@@ -44,7 +45,8 @@
config = {
"env": CorrelatedActionsEnv,
"gamma": 0.5,
"num_gpus": 0,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"model": {
"custom_model": "autoregressive_model",
"custom_action_dist": "binary_autoreg_dist",
@@ -58,7 +60,7 @@
"episode_reward_mean": args.stop_reward,
}

results = tune.run(args.run, stop=stop, config=config)
results = tune.run(args.run, stop=stop, config=config, verbose=1)

if args.as_test:
check_learning_achieved(results, args.stop_reward)
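The other recurring tweak is passing `verbose=1` to `tune.run`, as in the line above. In Tune of this vintage, 0 is silent, 1 prints only status updates, and 2 additionally prints per-trial results (treat the exact level semantics as an assumption); level 1 keeps example and CI logs compact. For instance:

from ray import tune

# Compact output: trial status updates only, no full result dumps.
results = tune.run(
    "PG",
    config={"env": "CartPole-v0"},
    stop={"training_iteration": 2},
    verbose=1)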
7 changes: 5 additions & 2 deletions rllib/examples/batch_norm_model.py
@@ -1,6 +1,7 @@
"""Example of using a custom model with batch norm."""

import argparse
import os

import ray
from ray import tune
@@ -22,7 +23,7 @@

if __name__ == "__main__":
args = parser.parse_args()
ray.init(local_mode=True)
ray.init()

ModelCatalog.register_custom_model(
"bn_model", TorchBatchNormModel if args.torch else BatchNormModel)
@@ -32,6 +33,8 @@
"model": {
"custom_model": "bn_model",
},
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"framework": "torch" if args.torch else "tf",
}
@@ -42,7 +45,7 @@
"episode_reward_mean": args.stop_reward,
}

results = tune.run(args.run, stop=stop, config=config)
results = tune.run(args.run, stop=stop, config=config, verbose=1)

if args.as_test:
check_learning_achieved(results, args.stop_reward)
8 changes: 6 additions & 2 deletions rllib/examples/cartpole_lstm.py
@@ -1,4 +1,5 @@
import argparse
import os

from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.rllib.utils.test_utils import check_learning_achieved
@@ -35,8 +36,11 @@
}

config = dict(
configs[args.run], **{
configs[args.run],
**{
"env": StatelessCartPole,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"model": {
"use_lstm": True,
"lstm_use_prev_action_reward": args.use_prev_action_reward,
@@ -50,7 +54,7 @@
"episode_reward_mean": args.stop_reward,
}

results = tune.run(args.run, config=config, stop=stop)
results = tune.run(args.run, config=config, stop=stop, verbose=1)

if args.as_test:
check_learning_achieved(results, args.stop_reward)
25 changes: 18 additions & 7 deletions rllib/examples/centralized_critic.py
@@ -16,6 +16,7 @@
import argparse
import numpy as np
from gym.spaces import Discrete
import os

import ray
from ray import tune
@@ -90,7 +91,7 @@ def centralized_critic_postprocessing(policy,
sample_batch[OPPONENT_OBS], policy.device),
convert_to_torch_tensor(
sample_batch[OPPONENT_ACTION], policy.device)) \
.detach().numpy()
.cpu().detach().numpy()
else:
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
@@ -137,14 +138,22 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
return loss


def setup_mixins(policy, obs_space, action_space, config):
# copied from PPO
def setup_tf_mixins(policy, obs_space, action_space, config):
# Copied from PPOTFPolicy (w/o ValueNetworkMixin).
KLCoeffMixin.__init__(policy, config)
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
config["entropy_coeff_schedule"])
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def setup_torch_mixins(policy, obs_space, action_space, config):
# Copied from PPOTorchPolicy (w/o ValueNetworkMixin).
TorchKLCoeffMixin.__init__(policy, config)
TorchEntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
config["entropy_coeff_schedule"])
TorchLR.__init__(policy, config["lr"], config["lr_schedule"])


def central_vf_stats(policy, train_batch, grads):
# Report the explained variance of the central value function.
return {
@@ -158,7 +167,7 @@ def central_vf_stats(policy, train_batch, grads):
name="CCPPOTFPolicy",
postprocess_fn=centralized_critic_postprocessing,
loss_fn=loss_with_central_critic,
before_loss_init=setup_mixins,
before_loss_init=setup_tf_mixins,
grad_stats_fn=central_vf_stats,
mixins=[
LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
@@ -169,7 +178,7 @@ def central_vf_stats(policy, train_batch, grads):
name="CCPPOTorchPolicy",
postprocess_fn=centralized_critic_postprocessing,
loss_fn=loss_with_central_critic,
before_init=setup_mixins,
before_init=setup_torch_mixins,
mixins=[
TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
CentralizedValueMixin
@@ -188,7 +197,7 @@ def get_policy_class(config):
)

if __name__ == "__main__":
ray.init(local_mode=True)
ray.init()
args = parser.parse_args()

ModelCatalog.register_custom_model(
@@ -198,6 +207,8 @@ def get_policy_class(config):
config = {
"env": TwoStepGame,
"batch_mode": "complete_episodes",
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"multiagent": {
"policies": {
@@ -222,7 +233,7 @@ def get_policy_class(config):
"episode_reward_mean": args.stop_reward,
}

results = tune.run(CCTrainer, config=config, stop=stop)
results = tune.run(CCTrainer, config=config, stop=stop, verbose=1)

if args.as_test:
check_learning_achieved(results, args.stop_reward)
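Two GPU-related details in this file are worth spelling out. First, the opponent value-function output is now moved to host memory with `.cpu()` before `.numpy()`, because converting a CUDA tensor (or any tensor that requires grad) straight to NumPy raises an error. Second, the shared `setup_mixins` is split into `setup_tf_mixins` and `setup_torch_mixins` so each policy class initializes its own framework's mixins. A tiny standalone torch illustration of the first point:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
t = torch.randn(4, requires_grad=True, device=device)

# t.numpy() would fail here; copy to CPU and detach from the graph first,
# matching the `.cpu().detach().numpy()` chain in the diff above.
arr = t.cpu().detach().numpy()
print(arr.shape)  # (4,)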
3 changes: 3 additions & 0 deletions rllib/examples/centralized_critic_2.py
@@ -12,6 +12,7 @@
import numpy as np
from gym.spaces import Dict, Discrete
import argparse
import os

from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks
@@ -87,6 +88,8 @@ def central_critic_observer(agent_obs, **kw):
"env": TwoStepGame,
"batch_mode": "complete_episodes",
"callbacks": FillInActions,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"multiagent": {
"policies": {
7 changes: 5 additions & 2 deletions rllib/examples/complex_struct_space.py
@@ -8,6 +8,7 @@
"""

import argparse
import os

import ray
from ray import tune
@@ -18,10 +19,10 @@

parser = argparse.ArgumentParser()
parser.add_argument(
"--framework", choices=["tf", "tfe", "torch"], default="tf")
"--framework", choices=["tf2", "tf", "tfe", "torch"], default="tf2")

if __name__ == "__main__":
ray.init(local_mode=True)
ray.init()
args = parser.parse_args()
if args.framework == "torch":
ModelCatalog.register_custom_model("my_model", CustomTorchRPGModel)
@@ -37,6 +38,8 @@
"env": SimpleRPG,
"rollout_fragment_length": 1,
"train_batch_size": 2,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"model": {
"custom_model": "my_model",
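Like the SAC test earlier in this diff, this example now accepts "tf2" and makes it the default. Informally: "tf" is TF1-style static-graph mode, "tfe" is TF1 with eager execution enabled, and "tf2" is native TensorFlow 2.x eager (a rough gloss, not an official definition). The argparse fragment on its own:

import argparse

parser = argparse.ArgumentParser()
# "tf2" added as a choice and made the default for this example.
parser.add_argument(
    "--framework", choices=["tf2", "tf", "tfe", "torch"], default="tf2")
args = parser.parse_args()
print(args.framework)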
3 changes: 3 additions & 0 deletions rllib/examples/custom_env.py
@@ -11,6 +11,7 @@
import gym
from gym.spaces import Discrete, Box
import numpy as np
import os

import ray
from ray import tune
@@ -114,6 +115,8 @@ def value_function(self):
"env_config": {
"corridor_length": 5,
},
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"model": {
"custom_model": "my_model",
},
5 changes: 4 additions & 1 deletion rllib/examples/custom_eval.py
@@ -67,6 +67,7 @@
"""

import argparse
import os

import ray
from ray import tune
@@ -137,7 +138,9 @@ def custom_eval_function(trainer, eval_workers):
"corridor_length": 10,
},
"horizon": 20,
"log_level": "INFO",

# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),

# Training rollouts will be collected using just the learner
# process, but evaluation will be done in parallel with two
8 changes: 5 additions & 3 deletions rllib/examples/custom_fast_model.py
@@ -5,6 +5,7 @@
"""

import argparse
import os

import ray
import ray.tune as tune
@@ -32,15 +33,16 @@
"model": {
"custom_model": "fast_model"
},
"num_gpus": 0,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 2,
"num_envs_per_worker": 10,
"num_data_loader_buffers": 1,
"num_aggregation_workers": 1,
"broadcast_interval": 50,
"rollout_fragment_length": 100,
"train_batch_size": sample_from(
lambda spec: 1000 * max(1, spec.config.num_gpus)),
lambda spec: 1000 * max(1, spec.config.num_gpus or 1)),
"fake_sampler": True,
"framework": "torch" if args.torch else "tf",
}
@@ -50,6 +52,6 @@
"timesteps_total": args.stop_timesteps,
}

tune.run("IMPALA", config=config, stop=stop)
tune.run("IMPALA", config=config, stop=stop, verbose=1)

ray.shutdown()
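The `train_batch_size` change above also guards against `num_gpus` resolving to 0 now that the value comes from the environment: `sample_from` evaluates its lambda against the resolved trial config, and the `or 1` keeps the multiplier at least 1 on CPU-only runs. The same idiom in isolation (a sketch, assuming `tune.sample_from` and attribute-style `spec.config` access as used in the script):

from ray import tune

config = {
    "num_gpus": 0,  # e.g. int(os.environ.get("RLLIB_NUM_GPUS", "0"))
    # Resolved per trial; scales with GPU count, never below 1000.
    "train_batch_size": tune.sample_from(
        lambda spec: 1000 * max(1, spec.config.num_gpus or 1)),
}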
8 changes: 5 additions & 3 deletions rllib/examples/custom_keras_model.py
@@ -1,6 +1,7 @@
"""Example of using a custom ModelV2 Keras-style model."""

import argparse
import os

import ray
from ray import tune
@@ -119,11 +120,12 @@ def check_has_custom_metric(result):
args.run,
stop={"episode_reward_mean": args.stop},
config=dict(
extra_config, **{
"log_level": "INFO",
extra_config,
**{
"env": "BreakoutNoFrameskip-v4"
if args.use_vision_network else "CartPole-v0",
"num_gpus": 0,
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"callbacks": {
"on_train_result": check_has_custom_metric,
},
4 changes: 3 additions & 1 deletion rllib/examples/custom_loss.py
@@ -50,6 +50,8 @@

config = {
"env": "CartPole-v0",
# Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
"num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
"num_workers": 0,
"model": {
"custom_model": "custom_loss",
@@ -64,4 +66,4 @@
"training_iteration": args.stop_iters,
}

tune.run("PG", config=config, stop=stop)
tune.run("PG", config=config, stop=stop, verbose=1)