[RLlib] Policies get/set_state fixes and enhancements. #16354

Merged · 8 commits · Jun 15, 2021
2 changes: 1 addition & 1 deletion python/ray/tune/trainable.py
@@ -315,7 +315,7 @@ def get_state(self):
def save(self, checkpoint_dir=None):
"""Saves the current model state to a checkpoint.

Subclasses should override ``_save()`` instead to save state.
Subclasses should override ``save_checkpoint()`` instead to save state.
This method dumps additional metadata alongside the saved path.

Args:
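
For context on the corrected docstring: ``save()`` wraps the checkpointing logic, while subclasses provide the actual state via ``save_checkpoint()`` and restore it via ``load_checkpoint()``. A minimal sketch of that pattern, not taken from this PR; the class and metric names are illustrative:

from ray import tune

class MyTrainable(tune.Trainable):
    def setup(self, config):
        self.train_steps = 0

    def step(self):
        self.train_steps += 1
        return {"my_metric": self.train_steps}

    def save_checkpoint(self, tmp_checkpoint_dir):
        # Returning a dict lets Tune handle serialization; writing files
        # into `tmp_checkpoint_dir` and returning the dir also works.
        return {"train_steps": self.train_steps}

    def load_checkpoint(self, checkpoint):
        self.train_steps = checkpoint["train_steps"]
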
7 changes: 7 additions & 0 deletions rllib/BUILD
@@ -1286,6 +1286,13 @@ py_test(
srcs = ["policy/tests/test_compute_log_likelihoods.py"]
)

py_test(
name = "policy/tests/test_policy",
tags = ["policy"],
size = "medium",
srcs = ["policy/tests/test_policy.py"]
)

py_test(
name = "policy/tests/test_sample_batch",
tags = ["policy"],
4 changes: 2 additions & 2 deletions rllib/agents/ddpg/tests/test_apex_ddpg.py
@@ -31,7 +31,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):

# Test per-worker scale distribution.
infos = trainer.workers.foreach_policy(
lambda p, _: p.get_exploration_info())
lambda p, _: p.get_exploration_state())
scale = [i["cur_scale"] for i in infos]
expected = [
0.4**(1 + (i + 1) / float(config["num_workers"] - 1) * 7)
@@ -46,7 +46,7 @@ def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
# Test again per-worker scale distribution
# (should not have changed).
infos = trainer.workers.foreach_policy(
lambda p, _: p.get_exploration_info())
lambda p, _: p.get_exploration_state())
scale = [i["cur_scale"] for i in infos]
check(scale, [0.0] + expected)

2 changes: 1 addition & 1 deletion rllib/agents/dqn/apex.py
@@ -222,7 +222,7 @@ def add_apex_metrics(result: dict) -> dict:
replay_stats = ray.get(replay_actors[0].stats.remote(
config["optimizer"].get("debug")))
exploration_infos = workers.foreach_trainable_policy(
lambda p, _: p.get_exploration_info())
lambda p, _: p.get_exploration_state())
result["info"].update({
"exploration_infos": exploration_infos,
"learner_queue": learner_thread.learner_queue_size.stats(),
4 changes: 2 additions & 2 deletions rllib/agents/dqn/tests/test_apex_dqn.py
@@ -43,7 +43,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):

# Test per-worker epsilon distribution.
infos = trainer.workers.foreach_policy(
lambda p, _: p.get_exploration_info())
lambda p, _: p.get_exploration_state())
expected = [0.4, 0.016190862, 0.00065536]
check([i["cur_epsilon"] for i in infos], [0.0] + expected)

@@ -55,7 +55,7 @@ def test_apex_dqn_compilation_and_per_worker_epsilon_values(self):
# Test again per-worker epsilon distribution
# (should not have changed).
infos = trainer.workers.foreach_policy(
lambda p, _: p.get_exploration_info())
lambda p, _: p.get_exploration_state())
check([i["cur_epsilon"] for i in infos], [0.0] + expected)

trainer.stop()
4 changes: 2 additions & 2 deletions rllib/evaluation/rollout_worker.py
@@ -1083,7 +1083,7 @@ def get_filters(self, flush_after: bool = False) -> dict:
return return_filters

@DeveloperAPI
def save(self) -> str:
def save(self) -> bytes:
filters = self.get_filters(flush_after=True)
state = {
pid: self.policy_map[pid].get_state()
@@ -1092,7 +1092,7 @@ def restore(self, objs: str) -> None:
return pickle.dumps({"filters": filters, "state": state})

@DeveloperAPI
def restore(self, objs: str) -> None:
def restore(self, objs: bytes) -> None:
objs = pickle.loads(objs)
self.sync_filters(objs["filters"])
for pid, state in objs["state"].items():
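
The corrected annotations reflect what these methods actually exchange: ``save()`` returns a pickled byte string (filters plus per-policy state) and ``restore()`` consumes it. A hedged usage sketch, not from the PR; ``trainer`` is assumed to be an already built RLlib Trainer:

# Illustrative round-trip of a worker's state.
local_worker = trainer.workers.local_worker()

blob = local_worker.save()       # bytes: pickle.dumps({"filters": ..., "state": ...})
assert isinstance(blob, bytes)

# Later, e.g. on a freshly created worker with the same policy setup:
local_worker.restore(blob)
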
18 changes: 12 additions & 6 deletions rllib/policy/eager_tf_policy.py
@@ -617,8 +617,8 @@ def apply_gradients(self, gradients):
for g in gradients], self.model.trainable_variables()))

@override(Policy)
def get_exploration_info(self):
return _convert_to_numpy(self.exploration.get_info())
def get_exploration_state(self):
return _convert_to_numpy(self.exploration.get_state())

@override(Policy)
def get_weights(self, as_dict=False):
@@ -637,18 +637,20 @@ def set_weights(self, weights):

@override(Policy)
def get_state(self):
state = {"_state": super().get_state()}
state = super().get_state()
if self._optimizer and \
len(self._optimizer.variables()) > 0:
state["_optimizer_variables"] = \
self._optimizer.variables()
# Add exploration state.
state["_exploration_state"] = self.exploration.get_state()
return state

@override(Policy)
def set_state(self, state):
state = state.copy() # shallow copy
# Set optimizer vars first.
optimizer_vars = state.pop("_optimizer_variables", None)
optimizer_vars = state.get("_optimizer_variables", None)
if optimizer_vars and self._optimizer.variables():
logger.warning(
"Cannot restore an optimizer's state for tf eager! Keras "
@@ -658,8 +660,11 @@ def set_state(self, state):
for opt_var, value in zip(self._optimizer.variables(),
optimizer_vars):
opt_var.assign(value)
# Set exploration's state.
if hasattr(self, "exploration") and "_exploration_state" in state:
self.exploration.set_state(state=state["_exploration_state"])
# Then the Policy's (NN) weights.
super().set_state(state["_state"])
super().set_state(state)

def variables(self):
"""Return the list of all savable variables for this policy."""
@@ -698,9 +703,10 @@ def loss_initialized(self):
def export_model(self, export_dir):
pass

# TODO: (sven) Deprecate this in favor of `save()`.
@override(Policy)
def export_checkpoint(self, export_dir):
pass
deprecation_warning("export_checkpoint", "save")

def _get_is_training_placeholder(self):
return tf.convert_to_tensor(self._is_training)
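
After this change the eager policy no longer nests the base state under a ``"_state"`` key; everything lives in one flat dict. A rough sketch of the resulting layout (keys taken from the code above, values illustrative; ``policy`` is assumed to be an eager-mode TF policy obtained via ``trainer.get_policy()``):

state = policy.get_state()
# Roughly:
# {
#     "weights": ...,                 # NN weights (from the base Policy)
#     "global_timestep": ...,         # from the base Policy
#     "_optimizer_variables": [...],  # only if the optimizer has variables
#     "_exploration_state": {...},    # whatever self.exploration.get_state() returns
# }
policy.set_state(state)  # restores optimizer, exploration state, and weights
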
45 changes: 29 additions & 16 deletions rllib/policy/policy.py
@@ -10,6 +10,7 @@
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.view_requirement import ViewRequirement
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.deprecation import deprecation_warning
from ray.rllib.utils.exploration.exploration import Exploration
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.from_config import from_config
@@ -424,7 +425,7 @@ def set_weights(self, weights: ModelWeights) -> None:
raise NotImplementedError

@DeveloperAPI
def get_exploration_info(self) -> Dict[str, TensorType]:
def get_exploration_state(self) -> Dict[str, TensorType]:
"""Returns the current exploration information of this policy.

This information depends on the policy's Exploration object.
@@ -433,7 +434,12 @@ def get_exploration_info(self) -> Dict[str, TensorType]:
Dict[str, TensorType]: Serializable information on the
`self.exploration` object.
"""
return self.exploration.get_info()
return self.exploration.get_state()

# TODO: (sven) Deprecate this method.
def get_exploration_info(self) -> Dict[str, TensorType]:
deprecation_warning("get_exploration_info", "get_exploration_state")
return self.get_exploration_state()

@DeveloperAPI
def is_recurrent(self) -> bool:
@@ -464,22 +470,28 @@ def get_initial_state(self) -> List[TensorType]:

@DeveloperAPI
def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]:
"""Saves all local state.
"""Returns all local state.

Returns:
Union[Dict[str, TensorType], List[TensorType]]: Serialized local
state.
"""
return self.get_weights()
state = {
"weights": self.get_weights(),
"global_timestep": self.global_timestep,
}
return state

@DeveloperAPI
def set_state(self, state: object) -> None:
"""Restores all local state.
"""Restores all local state to the provided `state`.

Args:
state (obj): Serialized local state.
state (object): The new state to set this policy to. Can be
obtained by calling `Policy.get_state()`.
"""
self.set_weights(state)
self.set_weights(state["weights"])
self.global_timestep = state["global_timestep"]

@DeveloperAPI
def on_global_var_update(self, global_vars: Dict[str, TensorType]) -> None:
@@ -506,15 +518,6 @@ def export_model(self, export_dir: str) -> None:
"""
raise NotImplementedError

@DeveloperAPI
def export_checkpoint(self, export_dir: str) -> None:
"""Export Policy checkpoint to local directory.

Args:
export_dir (str): Local writable directory.
"""
raise NotImplementedError

@DeveloperAPI
def import_model_from_h5(self, import_file: str) -> None:
"""Imports Policy from local file.
@@ -810,6 +813,16 @@ def _update_model_view_requirements_from_init_state(self):
view_reqs["state_out_{}".format(i)] = ViewRequirement(
space=space, used_for_training=True)

# TODO: (sven) Deprecate this in favor of `save()`.
def export_checkpoint(self, export_dir: str) -> None:
"""Export Policy checkpoint to local directory.

Args:
export_dir (str): Local writable directory.
"""
deprecation_warning("export_checkpoint", "save")
raise NotImplementedError


def clip_action(action, action_space):
"""Clips all actions in `flat_actions` according to the given Spaces.
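
The base implementation now returns a dict (weights plus ``global_timestep``) instead of bare weights. Subclasses that carry extra state can follow the same pattern as the TF policies in this PR: extend the dict in ``get_state()`` and consume their own keys in ``set_state()`` before delegating to ``super()``. A minimal sketch; the class and key names are made up and the rest of the Policy implementation is omitted:

from ray.rllib.policy.policy import Policy

class MyStatefulPolicy(Policy):
    def get_state(self):
        state = super().get_state()           # {"weights": ..., "global_timestep": ...}
        state["my_counter"] = getattr(self, "my_counter", 0)
        return state

    def set_state(self, state):
        self.my_counter = state.get("my_counter", 0)
        super().set_state(state)              # restores weights + global_timestep
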
43 changes: 43 additions & 0 deletions rllib/policy/tests/test_policy.py
@@ -0,0 +1,43 @@
import unittest

import ray
from ray.rllib.agents.dqn import DQNTrainer, DEFAULT_CONFIG
from ray.rllib.utils.test_utils import check, framework_iterator


class TestPolicy(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
ray.init()

@classmethod
def tearDownClass(cls) -> None:
ray.shutdown()

def test_policy_save_restore(self):
config = DEFAULT_CONFIG.copy()
for _ in framework_iterator(config):
trainer = DQNTrainer(config=config, env="CartPole-v0")
policy = trainer.get_policy()
state1 = policy.get_state()
trainer.train()
state2 = policy.get_state()
check(
state1["_exploration_state"]["last_timestep"],
state2["_exploration_state"]["last_timestep"],
false=True)
check(
state1["global_timestep"],
state2["global_timestep"],
false=True)
# Reset policy to its original state and compare.
policy.set_state(state1)
state3 = policy.get_state()
# Make sure everything is the same.
check(state1, state3)


if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
25 changes: 20 additions & 5 deletions rllib/policy/tf_policy.py
@@ -14,6 +14,7 @@
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.debug import summarize
from ray.rllib.utils.deprecation import deprecation_warning
from ray.rllib.utils.framework import try_import_tf, get_variable
from ray.rllib.utils.schedules import PiecewiseSchedule
from ray.rllib.utils.tf_run_builder import TFRunBuilder
@@ -478,8 +479,13 @@ def apply_gradients(self, gradients: ModelGradients) -> None:

@override(Policy)
@DeveloperAPI
def get_exploration_state(self) -> Dict[str, TensorType]:
return self.exploration.get_state(sess=self.get_session())

# TODO: (sven) Deprecate this method.
def get_exploration_info(self) -> Dict[str, TensorType]:
return self.exploration.get_info(sess=self.get_session())
deprecation_warning("get_exploration_info", "get_exploration_state")
return self.get_exploration_state()

@override(Policy)
@DeveloperAPI
@@ -500,17 +506,24 @@ def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]:
len(self._optimizer_variables.variables) > 0:
state["_optimizer_variables"] = \
self._sess.run(self._optimizer_variables.variables)
# Add exploration state.
state["_exploration_state"] = \
self.exploration.get_state(self.get_session())
return state

@override(Policy)
@DeveloperAPI
def set_state(self, state) -> None:
state = state.copy() # shallow copy
def set_state(self, state: dict) -> None:
# Set optimizer vars first.
optimizer_vars = state.pop("_optimizer_variables", None)
optimizer_vars = state.get("_optimizer_variables", None)
if optimizer_vars:
self._optimizer_variables.set_weights(optimizer_vars)
# Then the Policy's (NN) weights.
# Set exploration's state.
if hasattr(self, "exploration") and "_exploration_state" in state:
self.exploration.set_state(
state=state["_exploration_state"], sess=self.get_session())

# Set the Policy's (NN) weights.
super().set_state(state)

@override(Policy)
@@ -527,12 +540,14 @@ def export_model(self, export_dir: str) -> None:
graph=self._sess.graph))
builder.save()

# TODO: (sven) Deprecate this in favor of `save()`.
@override(Policy)
@DeveloperAPI
def export_checkpoint(self,
export_dir: str,
filename_prefix: str = "model") -> None:
"""Export tensorflow checkpoint to export_dir."""
deprecation_warning("export_checkpoint", "save")
try:
os.makedirs(export_dir)
except OSError as e:
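
Taken together, these changes mean a normal Trainer checkpoint round-trip now carries each (TF) policy's ``global_timestep`` and exploration state along with the weights. A hedged end-to-end sketch, not from the PR; the minimal config is illustrative:

import ray
from ray.rllib.agents.dqn import DQNTrainer

ray.init()
trainer = DQNTrainer(config={"num_workers": 0}, env="CartPole-v0")
trainer.train()
checkpoint = trainer.save()                  # Trainable.save() -> checkpoint path

restored = DQNTrainer(config={"num_workers": 0}, env="CartPole-v0")
restored.restore(checkpoint)
# For the policies touched in this PR, restored.get_policy().get_state()
# should now also contain "global_timestep" and "_exploration_state".
ray.shutdown()
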