[rllib] Add DDPG documentation, rename DDPG2 <=> DDPG #1946

Merged: 8 commits, Apr 30, 2018
2 changes: 2 additions & 0 deletions doc/source/policy-optimizers.rst
@@ -20,6 +20,8 @@ Example of constructing and using a policy optimizer `(link to full example) <ht
print("optimizer stats", optimizer.stats())
print("local evaluator stats", optimizer.local_evaluator.stats())

Read more about policy optimizers in this post: `Distributed Policy Optimizers for Scalable and Reproducible Deep RL <https://rise.cs.berkeley.edu/blog/distributed-policy-optimizers-for-scalable-and-reproducible-deep-rl/>`__.

Here are the steps for using an RLlib policy optimizer with an existing algorithm.

1. Implement the `Policy evaluator interface <rllib-dev.html#policy-evaluators-and-optimizers>`__.
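A rough sketch of what step 1 could look like follows; the method names (``sample``, ``compute_gradients``, ``apply_gradients``, ``get_weights``, ``set_weights``) are assumptions based on the policy evaluator interface linked above, not code from this PR::

    # Hypothetical sketch of step 1; method names assumed from the developer docs.
    import numpy as np


    class SketchPolicyEvaluator(object):
        """Holds a policy copy and produces experience batches for the optimizer."""

        def __init__(self, env_creator):
            self.env = env_creator()
            self.weights = np.zeros(4)  # stand-in for real model parameters

        def sample(self):
            # Roll out the current policy and return a batch of experiences.
            obs = self.env.reset()
            return [{"obs": obs, "action": 0, "reward": 0.0}]  # placeholder batch

        def compute_gradients(self, samples):
            # Return a gradient estimate computed over the given batch.
            return np.zeros_like(self.weights)

        def apply_gradients(self, grads):
            self.weights -= 0.01 * grads

        def get_weights(self):
            return self.weights

        def set_weights(self, weights):
            self.weights = weights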
28 changes: 16 additions & 12 deletions doc/source/rllib.rst
@@ -7,19 +7,19 @@ You can find the code for RLlib `here on GitHub <https://github.com/ray-project/

RLlib's policy optimizers serve as the basis for RLlib's reference algorithms, which include:

- `Proximal Policy Optimization (PPO) <https://arxiv.org/abs/1707.06347>`__ which
is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.
- Proximal Policy Optimization (`PPO <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ppo>`__) which is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.

- `The Asynchronous Advantage Actor-Critic (A3C) <https://arxiv.org/abs/1602.01783>`__.
- Policy Gradients (`PG <https://github.com/ray-project/ray/tree/master/python/ray/rllib/pg>`__).

- `Deep Q Networks (DQN) <https://arxiv.org/abs/1312.5602>`__.
- Asynchronous Advantage Actor-Critic (`A3C <https://github.com/ray-project/ray/tree/master/python/ray/rllib/a3c>`__).

- `Ape-X Distributed Prioritized Experience Replay <https://arxiv.org/abs/1803.00933>`__.
- Deep Q Networks (`DQN <https://github.com/ray-project/ray/tree/master/python/ray/rllib/dqn>`__).

- Evolution Strategies, as described in `this
paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
is adapted from
`here <https://github.com/openai/evolution-strategies-starter>`__.
- Deep Deterministic Policy Gradients (`DDPG <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ddpg>`__, `DDPG2 <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ddpg2>`__).

- Ape-X Distributed Prioritized Experience Replay, including both `DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/apex.py>`__ and `DDPG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ddpg/apex.py>`__ variants.

- Evolution Strategies (`ES <https://github.com/ray-project/ray/tree/master/python/ray/rllib/es>`__), as described in `this paper <https://arxiv.org/abs/1703.03864>`__.

These algorithms can be run on any `OpenAI Gym MDP <https://github.com/openai/gym>`__,
including custom ones written and registered by the user.
@@ -76,18 +76,22 @@ The ``train.py`` script has a number of options you can show by running
The most important options are for choosing the environment
with ``--env`` (any OpenAI gym environment including ones registered by the user
can be used) and for choosing the algorithm with ``--run``
(available options are ``PPO``, ``A3C``, ``ES``, ``DQN`` and ``APEX``).
(available options are ``PPO``, ``PG``, ``A3C``, ``ES``, ``DDPG``, ``DDPG2``, ``DQN``, ``APEX``, and ``APEX_DDPG``).

Specifying Parameters
~~~~~~~~~~~~~~~~~~~~~

Each algorithm has specific hyperparameters that can be set with ``--config`` - see the
``DEFAULT_CONFIG`` variable in
`PPO <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ppo/ppo.py>`__,
`PG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/pg/pg.py>`__,
`A3C <https://github.com/ray-project/ray/blob/master/python/ray/rllib/a3c/a3c.py>`__,
`ES <https://github.com/ray-project/ray/blob/master/python/ray/rllib/es/es.py>`__,
`DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/dqn.py>`__ and
`APEX <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/apex.py>`__.
`DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/dqn.py>`__,
`DDPG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ddpg/ddpg.py>`__,
`DDPG2 <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ddpg2/ddpg.py>`__,
`APEX <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/apex.py>`__, and
`APEX_DDPG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ddpg/apex.py>`__.

In an example below, we train A3C by specifying 8 workers through the config flag.
function that creates the env to refer to it by name. The contents of the env_config agent config field will be passed to that function to allow the environment to be configured. The return type should be an OpenAI gym.Env. For example:
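A minimal sketch of such a setup — ``register_env``, the agent constructor signature, and the ``num_workers`` key are assumptions based on RLlib docs of this era, not code in this PR::

    # Sketch only: register a custom env by name, then train A3C with 8 workers.
    import gym

    import ray
    from ray.rllib import a3c
    from ray.tune.registry import register_env


    def env_creator(env_config):
        # env_config is the dict supplied through the agent's ``env_config`` field.
        return gym.make("CartPole-v0")


    register_env("my_env", env_creator)

    ray.init()
    config = a3c.DEFAULT_CONFIG.copy()
    config["num_workers"] = 8                       # "specifying 8 workers"
    agent = a3c.A3CAgent(config=config, env="my_env")
    print(agent.train())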
19 changes: 10 additions & 9 deletions python/ray/rllib/README.rst
@@ -5,18 +5,19 @@ Ray RLlib is an RL execution toolkit built on the Ray distributed execution fram

RLlib includes the following reference algorithms:

- `Proximal Policy Optimization (PPO) <https://arxiv.org/abs/1707.06347>`__ which
is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.
- Proximal Policy Optimization (`PPO <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ppo>`__) which is a proximal variant of `TRPO <https://arxiv.org/abs/1502.05477>`__.

- `The Asynchronous Advantage Actor-Critic (A3C) <https://arxiv.org/abs/1602.01783>`__.
- Policy Gradients (`PG <https://github.com/ray-project/ray/tree/master/python/ray/rllib/pg>`__).

- `Deep Q Networks (DQN) <https://arxiv.org/abs/1312.5602>`__.
- Asynchronous Advantage Actor-Critic (`A3C <https://github.com/ray-project/ray/tree/master/python/ray/rllib/a3c>`__).

- `Ape-X Distributed Prioritized Experience Replay <https://arxiv.org/abs/1803.00933>`__.
- Deep Q Networks (`DQN <https://github.com/ray-project/ray/tree/master/python/ray/rllib/dqn>`__).

- Evolution Strategies, as described in `this
paper <https://arxiv.org/abs/1703.03864>`__. Our implementation
is adapted from
`here <https://github.com/openai/evolution-strategies-starter>`__.
- Deep Deterministic Policy Gradients (`DDPG <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ddpg>`__, `DDPG2 <https://github.com/ray-project/ray/tree/master/python/ray/rllib/ddpg2>`__).

- Ape-X Distributed Prioritized Experience Replay, including both `DQN <https://github.com/ray-project/ray/blob/master/python/ray/rllib/dqn/apex.py>`__ and `DDPG <https://github.com/ray-project/ray/blob/master/python/ray/rllib/ddpg/apex.py>`__ variants.

- Evolution Strategies (`ES <https://github.com/ray-project/ray/tree/master/python/ray/rllib/es>`__), as described in `this
paper <https://arxiv.org/abs/1703.03864>`__.

These algorithms can be run on any OpenAI Gym MDP, including custom ones written and registered by the user.
2 changes: 1 addition & 1 deletion python/ray/rllib/__init__.py
@@ -9,7 +9,7 @@

def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
"DDPG2", "APEX_DDPG2", "__fake", "__sigmoid_fake_data",
"DDPG2", "APEX_DDPG", "__fake", "__sigmoid_fake_data",
"__parameter_tuning"]:
from ray.rllib.agent import get_agent_class
register_trainable(key, get_agent_class(key))
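The string keys registered above are what Tune resolves when an experiment names an algorithm. A sketch of how the renamed ``APEX_DDPG`` key could be used; the ``run_experiments`` spec format is an assumption from the Tune docs of this era::

    # Sketch only: the registered trainable name "APEX_DDPG" (renamed from
    # "APEX_DDPG2" in this PR) is referenced by string in a Tune experiment spec.
    import ray
    from ray.tune import run_experiments

    ray.init()
    run_experiments({
        "apex_ddpg_pendulum": {
            "run": "APEX_DDPG",             # trainable key from _register_all()
            "env": "Pendulum-v0",
            "stop": {"training_iteration": 5},
            "config": {"num_workers": 4},   # assumed config key, illustration only
        },
    })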
12 changes: 6 additions & 6 deletions python/ray/rllib/agent.py
@@ -234,9 +234,12 @@ def get_agent_class(alg):
if alg == "DDPG2":
from ray.rllib import ddpg2
return ddpg2.DDPG2Agent
elif alg == "APEX_DDPG2":
from ray.rllib import ddpg2
return ddpg2.ApexDDPG2Agent
elif alg == "DDPG":
from ray.rllib import ddpg
return ddpg.DDPGAgent
elif alg == "APEX_DDPG":
from ray.rllib import ddpg
return ddpg.ApexDDPGAgent
elif alg == "PPO":
from ray.rllib import ppo
return ppo.PPOAgent
@@ -258,9 +261,6 @@ def get_agent_class(alg):
elif alg == "PG":
from ray.rllib import pg
return pg.PGAgent
elif alg == "DDPG":
from ray.rllib import ddpg
return ddpg.DDPGAgent
elif alg == "script":
from ray.tune import script_runner
return script_runner.ScriptRunner
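The rearranged branches above resolve algorithm names to agent classes; a quick check grounded in the class names shown in this diff, assuming a full Ray installation::

    # Each algorithm string resolves to the class returned by the matching
    # ``elif`` arm after this PR's rename.
    from ray.rllib.agent import get_agent_class

    assert get_agent_class("DDPG").__name__ == "DDPGAgent"
    assert get_agent_class("APEX_DDPG").__name__ == "ApexDDPGAgent"
    assert get_agent_class("DDPG2").__name__ == "DDPG2Agent"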
1 change: 1 addition & 0 deletions python/ray/rllib/ddpg/README.md
@@ -0,0 +1 @@
Implementation of deep deterministic policy gradients (https://arxiv.org/abs/1509.02971), including an Ape-X variant.
7 changes: 6 additions & 1 deletion python/ray/rllib/ddpg/__init__.py
@@ -1,3 +1,8 @@
from __future__ import absolute_import

Contributor Author: Hm, looks like GitHub didn't detect the moves correctly. Basically all of these files were moved unchanged (sans 2/1 renaming) from ddpg => ddpg2 and vice versa.

from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg.apex import ApexDDPGAgent
from ray.rllib.ddpg.ddpg import DDPGAgent, DEFAULT_CONFIG

__all__ = ["DDPGAgent", "DEFAULT_CONFIG"]
__all__ = ["DDPGAgent", "ApexDDPGAgent", "DEFAULT_CONFIG"]
@@ -2,7 +2,7 @@
from __future__ import division
from __future__ import print_function

from ray.rllib.ddpg2.ddpg import DDPG2Agent, DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.ddpg.ddpg import DDPGAgent, DEFAULT_CONFIG as DDPG_CONFIG

APEX_DDPG_DEFAULT_CONFIG = dict(DDPG_CONFIG,
**dict(
@@ -28,7 +28,7 @@
))


class ApexDDPG2Agent(DDPG2Agent):
class ApexDDPGAgent(DDPGAgent):
"""DDPG variant that uses the Ape-X distributed policy optimizer.

By default, this is configured for a large single node (32 cores). For
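The ``APEX_DDPG_DEFAULT_CONFIG = dict(DDPG_CONFIG, **dict(...))`` line above builds the Ape-X config by overlaying overrides on the base DDPG config. A self-contained illustration of that merge pattern, with keys and values invented for the example rather than taken from the real defaults::

    # Dict-merge override pattern: keyword arguments win over the base dict.
    DDPG_CONFIG = {"num_workers": 1, "learning_starts": 1000}

    APEX_DDPG_DEFAULT_CONFIG = dict(DDPG_CONFIG, **dict(
        num_workers=32,    # Ape-X style: scale out to many evaluators
    ))

    assert APEX_DDPG_DEFAULT_CONFIG["num_workers"] == 32        # override applied
    assert APEX_DDPG_DEFAULT_CONFIG["learning_starts"] == 1000  # base value kept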