[tune] Search alg checkpointing during training (#9803)

Co-authored-by: krfricke <krfricke@users.noreply.github.com>
ray-project · Aug 3, 2020 · c6404e8 · c6404e8
1 parent db09f70
commit c6404e8
Show file tree

Hide file tree

Showing 11 changed files with 321 additions and 46 deletions.
diff --git a/doc/source/tune/api_docs/suggestion.rst b/doc/source/tune/api_docs/suggestion.rst
@@ -72,6 +72,54 @@ Tune also provides helpful utilities to use with Search Algorithms:
  * :ref:`repeater`: Support for running each *sampled hyperparameter* with multiple random seeds.
  * :ref:`limiter`: Limits the amount of concurrent trials when running optimization.
 
+Saving and Restoring
+--------------------
+
+Certain search algorithms have ``save/restore`` implemented,
+allowing reuse of learnings across multiple tuning runs.
+
+.. code-block:: python
+
+    search_alg = HyperOptSearch()
+
+    experiment_1 = tune.run(
+        trainable,
+        search_alg=search_alg)
+
+    search_alg.save("./my-checkpoint.pkl")
+
+    # Restore the saved state onto another search algorithm
+
+    search_alg2 = HyperOptSearch()
+    search_alg2.restore("./my-checkpoint.pkl")
+
+    experiment_2 = tune.run(
+        trainable,
+        search_alg=search_alg2)
+
+Further, Tune automatically saves the search algorithm's state inside the current experiment folder ("Result Dir") during tuning.
+
+Note that if you have two Tune runs with the same experiment folder,
+the previous state checkpoint will be overwritten. You can
+avoid this by making sure ``tune.run(name=...)`` is set to a unique
+identifier.
+
+.. code-block:: python
+
+    search_alg = HyperOptSearch()
+    experiment_1 = tune.run(
+        cost,
+        num_samples=5,
+        search_alg=search_alg,
+        verbose=0,
+        name="my-experiment-1",
+        local_dir="~/my_results")
+
+    search_alg2 = HyperOptSearch()
+    search_alg2.restore_from_dir(
+        os.path.join(os.path.expanduser("~/my_results"), "my-experiment-1"))
+
+.. note:: This is currently not implemented for: AxSearch, TuneBOHB, SigOptSearch, and DragonflySearch.
 
 .. _tune-ax:
 
@@ -87,6 +135,7 @@ Bayesian Optimization (tune.suggest.bayesopt.BayesOptSearch)
 
 
 .. autoclass:: ray.tune.suggest.bayesopt.BayesOptSearch
+  :members: save, restore
 
 .. _`BayesianOptimization search space specification`: https://github.com/fmfn/BayesianOptimization/blob/master/examples/advanced-tour.ipynb
 
@@ -115,20 +164,23 @@ Dragonfly (tune.suggest.dragonfly.DragonflySearch)
 --------------------------------------------------
 
 .. autoclass:: ray.tune.suggest.dragonfly.DragonflySearch
+  :members: save, restore
 
 .. _tune-hyperopt:
 
 HyperOpt (tune.suggest.hyperopt.HyperOptSearch)
 -----------------------------------------------
 
 .. autoclass:: ray.tune.suggest.hyperopt.HyperOptSearch
+  :members: save, restore
 
 .. _nevergrad:
 
 Nevergrad (tune.suggest.nevergrad.NevergradSearch)
 --------------------------------------------------
 
 .. autoclass:: ray.tune.suggest.nevergrad.NevergradSearch
+  :members: save, restore
 
 .. _`Nevergrad README's Optimization section`: https://github.com/facebookresearch/nevergrad/blob/master/docs/optimization.rst#choosing-an-optimizer
 
@@ -147,6 +199,7 @@ Scikit-Optimize (tune.suggest.skopt.SkOptSearch)
 ------------------------------------------------
 
 .. autoclass:: ray.tune.suggest.skopt.SkOptSearch
+  :members: save, restore
 
 .. _`skopt Optimizer object`: https://scikit-optimize.github.io/#skopt.Optimizer
 
@@ -156,6 +209,7 @@ ZOOpt (tune.suggest.zoopt.ZOOptSearch)
 --------------------------------------
 
 .. autoclass:: ray.tune.suggest.zoopt.ZOOptSearch
+  :members: save, restore
 
 .. _repeater:
 
@@ -188,8 +242,8 @@ Use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency w
 
 .. _byo-algo:
 
-Implementing your own Search Algorithm
---------------------------------------
+Custom Search Algorithms (tune.suggest.Searcher)
+------------------------------------------------
 
 If you are interested in implementing or contributing a new Search Algorithm, provide the following interface:
 

diff --git a/python/ray/tune/suggest/bayesopt.py b/python/ray/tune/suggest/bayesopt.py
@@ -270,16 +270,16 @@ def _register_result(self, params, result):
         """Register given tuple of params and results."""
         self.optimizer.register(params, self._metric_op * result[self.metric])
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         """Storing current optimizer state."""
-        with open(checkpoint_dir, "wb") as f:
+        with open(checkpoint_path, "wb") as f:
             pickle.dump(
                 (self.optimizer, self._buffered_trial_results,
                  self._total_random_search_trials, self._config_counter), f)
 
-    def restore(self, checkpoint_dir):
+    def restore(self, checkpoint_path):
         """Restoring current optimizer state."""
-        with open(checkpoint_dir, "rb") as f:
+        with open(checkpoint_path, "rb") as f:
             (self.optimizer, self._buffered_trial_results,
              self._total_random_search_trials,
              self._config_counter) = pickle.load(f)
diff --git a/python/ray/tune/suggest/hyperopt.py b/python/ray/tune/suggest/hyperopt.py
@@ -212,13 +212,13 @@ def _get_hyperopt_trial(self, trial_id):
             t for t in self._hpopt_trials.trials if t["tid"] == hyperopt_tid
         ][0]
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         trials_object = (self._hpopt_trials, self.rstate.get_state())
-        with open(checkpoint_dir, "wb") as outputFile:
+        with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(trials_object, outputFile)
 
-    def restore(self, checkpoint_dir):
-        with open(checkpoint_dir, "rb") as inputFile:
+    def restore(self, checkpoint_path):
+        with open(checkpoint_path, "rb") as inputFile:
             trials_object = pickle.load(inputFile)
         self._hpopt_trials = trials_object[0]
         self.rstate.set_state(trials_object[1])
diff --git a/python/ray/tune/suggest/nevergrad.py b/python/ray/tune/suggest/nevergrad.py
@@ -137,13 +137,13 @@ def _process_result(self, trial_id, result):
         self._nevergrad_opt.tell(ng_trial_info,
                                  self._metric_op * result[self._metric])
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         trials_object = (self._nevergrad_opt, self._parameters)
-        with open(checkpoint_dir, "wb") as outputFile:
+        with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(trials_object, outputFile)
 
-    def restore(self, checkpoint_dir):
-        with open(checkpoint_dir, "rb") as inputFile:
+    def restore(self, checkpoint_path):
+        with open(checkpoint_path, "rb") as inputFile:
             trials_object = pickle.load(inputFile)
         self._nevergrad_opt = trials_object[0]
         self._parameters = trials_object[1]
diff --git a/python/ray/tune/suggest/search.py b/python/ray/tune/suggest/search.py
@@ -62,3 +62,9 @@ def is_finished(self):
     def set_finished(self):
         """Marks the search algorithm as finished."""
         self._finished = True
+
+    def save(self, *args):
+        pass
+
+    def restore(self, *args):
+        pass
diff --git a/python/ray/tune/suggest/sigopt.py b/python/ray/tune/suggest/sigopt.py
@@ -130,13 +130,13 @@ def on_trial_complete(self, trial_id, result=None, error=False):
                 failed=True, suggestion=self._live_trial_mapping[trial_id].id)
         del self._live_trial_mapping[trial_id]
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         trials_object = (self.conn, self.experiment)
-        with open(checkpoint_dir, "wb") as outputFile:
+        with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(trials_object, outputFile)
 
-    def restore(self, checkpoint_dir):
-        with open(checkpoint_dir, "rb") as inputFile:
+    def restore(self, checkpoint_path):
+        with open(checkpoint_path, "rb") as inputFile:
             trials_object = pickle.load(inputFile)
         self.conn = trials_object[0]
         self.experiment = trials_object[1]
diff --git a/python/ray/tune/suggest/skopt.py b/python/ray/tune/suggest/skopt.py
@@ -157,13 +157,13 @@ def _process_result(self, trial_id, result):
         self._skopt_opt.tell(skopt_trial_info,
                              self._metric_op * result[self._metric])
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         trials_object = (self._initial_points, self._skopt_opt)
-        with open(checkpoint_dir, "wb") as outputFile:
+        with open(checkpoint_path, "wb") as outputFile:
             pickle.dump(trials_object, outputFile)
 
-    def restore(self, checkpoint_dir):
-        with open(checkpoint_dir, "rb") as inputFile:
+    def restore(self, checkpoint_path):
+        with open(checkpoint_path, "rb") as inputFile:
             trials_object = pickle.load(inputFile)
         self._initial_points = trials_object[0]
         self._skopt_opt = trials_object[1]
diff --git a/python/ray/tune/suggest/suggestion.py b/python/ray/tune/suggest/suggestion.py
@@ -1,5 +1,6 @@
 import copy
 import logging
+import os
 
 from ray.tune.error import TuneError
 from ray.tune.experiment import convert_to_experiment_list
@@ -58,6 +59,7 @@ def on_trial_complete(self, trial_id, result, **kwargs):
 
     """
     FINISHED = "FINISHED"
+    CKPT_FILE = "searcher-state.pkl"
 
     def __init__(self,
                  metric="episode_reward_mean",
@@ -130,14 +132,108 @@ def suggest(self, trial_id):
         """
         raise NotImplementedError
 
-    def save(self, checkpoint_dir):
-        """Save function for this object."""
+    def save(self, checkpoint_path):
+        """Save state to path for this search algorithm.
+
+        Args:
+            checkpoint_path (str): File where the search algorithm
+                state is saved. This path should be used later when
+                restoring from file.
+
+        Example:
+
+        .. code-block:: python
+
+            search_alg = Searcher(...)
+
+            analysis = tune.run(
+                cost,
+                num_samples=5,
+                search_alg=search_alg,
+                name=self.experiment_name,
+                local_dir=self.tmpdir)
+
+            search_alg.save("./my_favorite_path.pkl")
+
+        .. versionchanged:: 0.8.7
+            Save is automatically called by `tune.run`. You can use
+            `restore_from_dir` to restore from an experiment directory
+            such as `~/ray_results/trainable`.
+
+        """
         raise NotImplementedError
 
-    def restore(self, checkpoint_dir):
-        """Restore function for this object."""
+    def restore(self, checkpoint_path):
+        """Restore state for this search algorithm
+
+
+        Args:
+            checkpoint_path (str): File where the search algorithm
+                state is saved. This path should be the same
+                as the one provided to "save".
+
+        Example:
+
+        .. code-block:: python
+
+            search_alg.save("./my_favorite_path.pkl")
+
+            search_alg2 = Searcher(...)
+            search_alg2 = ConcurrencyLimiter(search_alg2, 1)
+            search_alg2.restore("./my_favorite_path.pkl")
+            tune.run(cost, num_samples=5, search_alg=search_alg2)
+
+        """
         raise NotImplementedError
 
+    def save_to_dir(self, checkpoint_dir):
+        """Automatically saves the given searcher to the checkpoint_dir.
+
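+        This is automatically used by tune.run during a Tune job. The state
+        is first written to a temporary file inside ``checkpoint_dir`` and
+        then renamed to ``Searcher.CKPT_FILE``. If ``save`` raises
+        ``NotImplementedError``, a warning is logged and the checkpoint
+        file is not updated.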
+        """
+        tmp_search_ckpt_path = os.path.join(checkpoint_dir,
+                                            ".tmp_searcher_ckpt")
+        success = True
+        try:
+            self.save(tmp_search_ckpt_path)
+        except NotImplementedError as e:
+            logger.warning(e)
+            success = False
+
+        if success and os.path.exists(tmp_search_ckpt_path):
+            os.rename(tmp_search_ckpt_path,
+                      os.path.join(checkpoint_dir, Searcher.CKPT_FILE))
+
+    def restore_from_dir(self, checkpoint_dir):
+        """Restores the state of a searcher from a given checkpoint_dir.
+
+        Typically, you should use this function to restore from an
+        experiment directory such as `~/ray_results/trainable`.
+
+        .. code-block:: python
+
+            experiment_1 = tune.run(
+                cost,
+                num_samples=5,
+                search_alg=search_alg,
+                verbose=0,
+                name=self.experiment_name,
+                local_dir="~/my_results")
+
+            search_alg2 = Searcher()
+            search_alg2.restore_from_dir(
+                os.path.join(
+                    os.path.expanduser("~/my_results"),
+                    self.experiment_name))
+        """
+
+        checkpoint_path = os.path.join(checkpoint_dir, Searcher.CKPT_FILE)
+        if os.path.exists(checkpoint_path):
+            self.restore(checkpoint_path)
+        else:
+            raise FileNotFoundError(
+                "{filename} not found in {directory}. Unable to restore "
+                "searcher state from directory.".format(
+                    filename=Searcher.CKPT_FILE, directory=checkpoint_dir))
+
     @property
     def metric(self):
         """The training result objective value attribute."""
@@ -294,6 +390,12 @@ def on_trial_complete(self, trial_id, result=None, error=False):
     def is_finished(self):
         return self._counter >= self._total_samples or self._finished
 
+    def save(self, checkpoint_path):
+        self.searcher.save(checkpoint_path)
+
+    def restore(self, checkpoint_path):
+        self.searcher.restore(checkpoint_path)
+
 
 class _MockSearcher(Searcher):
     def __init__(self, **kwargs):

diff --git a/python/ray/tune/suggest/zoopt.py b/python/ray/tune/suggest/zoopt.py
@@ -133,12 +133,12 @@ def on_trial_complete(self, trial_id, result=None, error=False):
 
         del self._live_trial_mapping[trial_id]
 
-    def save(self, checkpoint_dir):
+    def save(self, checkpoint_path):
         trials_object = self.optimizer
-        with open(checkpoint_dir, "wb") as output:
+        with open(checkpoint_path, "wb") as output:
             pickle.dump(trials_object, output)
 
-    def restore(self, checkpoint_dir):
-        with open(checkpoint_dir, "rb") as input:
+    def restore(self, checkpoint_path):
+        with open(checkpoint_path, "rb") as input:
             trials_object = pickle.load(input)
         self.optimizer = trials_object