Skip to content

Commit

Permalink
[tune] Search alg checkpointing during training (#9803)
Browse files Browse the repository at this point in the history
Co-authored-by: krfricke <krfricke@users.noreply.github.com>
  • Loading branch information
richardliaw and krfricke committed Aug 3, 2020
1 parent db09f70 commit c6404e8
Show file tree
Hide file tree
Showing 11 changed files with 321 additions and 46 deletions.
58 changes: 56 additions & 2 deletions doc/source/tune/api_docs/suggestion.rst
Expand Up @@ -72,6 +72,54 @@ Tune also provides helpful utilities to use with Search Algorithms:
* :ref:`repeater`: Support for running each *sampled hyperparameter* with multiple random seeds.
* :ref:`limiter`: Limits the number of concurrent trials when running optimization.

Saving and Restoring
--------------------

Certain search algorithms have ``save/restore`` implemented,
allowing reuse of learnings across multiple tuning runs.

.. code-block:: python
search_alg = HyperOptSearch()
experiment_1 = tune.run(
trainable,
search_alg=search_alg)
search_alg.save("./my-checkpoint.pkl")
# Restore the saved state onto another search algorithm
search_alg2 = HyperOptSearch()
search_alg2.restore("./my-checkpoint.pkl")
experiment_2 = tune.run(
trainable,
search_alg=search_alg2)
Further, Tune automatically saves the search algorithm's state inside the current experiment folder ("Result Dir") during tuning.

Note that if you have two Tune runs with the same experiment folder,
the previous state checkpoint will be overwritten. You can
avoid this by making sure ``tune.run(name=...)`` is set to a unique
identifier.

.. code-block:: python
search_alg = HyperOptSearch()
experiment_1 = tune.run(
cost,
num_samples=5,
search_alg=search_alg,
verbose=0,
name="my-experiment-1",
local_dir="~/my_results")
search_alg2 = HyperOptSearch()
search_alg2.restore_from_dir(
os.path.join(os.path.expanduser("~/my_results"), "my-experiment-1"))
.. note:: This is currently not implemented for: AxSearch, TuneBOHB, SigOptSearch, and DragonflySearch.

.. _tune-ax:

Expand All @@ -87,6 +135,7 @@ Bayesian Optimization (tune.suggest.bayesopt.BayesOptSearch)


.. autoclass:: ray.tune.suggest.bayesopt.BayesOptSearch
:members: save, restore

.. _`BayesianOptimization search space specification`: https://github.com/fmfn/BayesianOptimization/blob/master/examples/advanced-tour.ipynb

Expand Down Expand Up @@ -115,20 +164,23 @@ Dragonfly (tune.suggest.dragonfly.DragonflySearch)
--------------------------------------------------

.. autoclass:: ray.tune.suggest.dragonfly.DragonflySearch
:members: save, restore

.. _tune-hyperopt:

HyperOpt (tune.suggest.hyperopt.HyperOptSearch)
-----------------------------------------------

.. autoclass:: ray.tune.suggest.hyperopt.HyperOptSearch
:members: save, restore

.. _nevergrad:

Nevergrad (tune.suggest.nevergrad.NevergradSearch)
--------------------------------------------------

.. autoclass:: ray.tune.suggest.nevergrad.NevergradSearch
:members: save, restore

.. _`Nevergrad README's Optimization section`: https://github.com/facebookresearch/nevergrad/blob/master/docs/optimization.rst#choosing-an-optimizer

Expand All @@ -147,6 +199,7 @@ Scikit-Optimize (tune.suggest.skopt.SkOptSearch)
------------------------------------------------

.. autoclass:: ray.tune.suggest.skopt.SkOptSearch
:members: save, restore

.. _`skopt Optimizer object`: https://scikit-optimize.github.io/#skopt.Optimizer

Expand All @@ -156,6 +209,7 @@ ZOOpt (tune.suggest.zoopt.ZOOptSearch)
--------------------------------------

.. autoclass:: ray.tune.suggest.zoopt.ZOOptSearch
:members: save, restore

.. _repeater:

Expand Down Expand Up @@ -188,8 +242,8 @@ Use ``ray.tune.suggest.ConcurrencyLimiter`` to limit the amount of concurrency w

.. _byo-algo:

Implementing your own Search Algorithm
--------------------------------------
Custom Search Algorithms (tune.suggest.Searcher)
------------------------------------------------

If you are interested in implementing or contributing a new Search Algorithm, provide the following interface:

Expand Down
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/bayesopt.py
Expand Up @@ -270,16 +270,16 @@ def _register_result(self, params, result):
"""Register given tuple of params and results."""
self.optimizer.register(params, self._metric_op * result[self.metric])

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
"""Storing current optimizer state."""
with open(checkpoint_dir, "wb") as f:
with open(checkpoint_path, "wb") as f:
pickle.dump(
(self.optimizer, self._buffered_trial_results,
self._total_random_search_trials, self._config_counter), f)

def restore(self, checkpoint_dir):
def restore(self, checkpoint_path):
"""Restoring current optimizer state."""
with open(checkpoint_dir, "rb") as f:
with open(checkpoint_path, "rb") as f:
(self.optimizer, self._buffered_trial_results,
self._total_random_search_trials,
self._config_counter) = pickle.load(f)
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/hyperopt.py
Expand Up @@ -212,13 +212,13 @@ def _get_hyperopt_trial(self, trial_id):
t for t in self._hpopt_trials.trials if t["tid"] == hyperopt_tid
][0]

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
trials_object = (self._hpopt_trials, self.rstate.get_state())
with open(checkpoint_dir, "wb") as outputFile:
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(trials_object, outputFile)

def restore(self, checkpoint_dir):
with open(checkpoint_dir, "rb") as inputFile:
def restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as inputFile:
trials_object = pickle.load(inputFile)
self._hpopt_trials = trials_object[0]
self.rstate.set_state(trials_object[1])
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/nevergrad.py
Expand Up @@ -137,13 +137,13 @@ def _process_result(self, trial_id, result):
self._nevergrad_opt.tell(ng_trial_info,
self._metric_op * result[self._metric])

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
trials_object = (self._nevergrad_opt, self._parameters)
with open(checkpoint_dir, "wb") as outputFile:
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(trials_object, outputFile)

def restore(self, checkpoint_dir):
with open(checkpoint_dir, "rb") as inputFile:
def restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as inputFile:
trials_object = pickle.load(inputFile)
self._nevergrad_opt = trials_object[0]
self._parameters = trials_object[1]
6 changes: 6 additions & 0 deletions python/ray/tune/suggest/search.py
Expand Up @@ -62,3 +62,9 @@ def is_finished(self):
def set_finished(self):
"""Marks the search algorithm as finished."""
self._finished = True

def save(self, *args):
pass

def restore(self, *args):
pass
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/sigopt.py
Expand Up @@ -130,13 +130,13 @@ def on_trial_complete(self, trial_id, result=None, error=False):
failed=True, suggestion=self._live_trial_mapping[trial_id].id)
del self._live_trial_mapping[trial_id]

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
trials_object = (self.conn, self.experiment)
with open(checkpoint_dir, "wb") as outputFile:
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(trials_object, outputFile)

def restore(self, checkpoint_dir):
with open(checkpoint_dir, "rb") as inputFile:
def restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as inputFile:
trials_object = pickle.load(inputFile)
self.conn = trials_object[0]
self.experiment = trials_object[1]
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/skopt.py
Expand Up @@ -157,13 +157,13 @@ def _process_result(self, trial_id, result):
self._skopt_opt.tell(skopt_trial_info,
self._metric_op * result[self._metric])

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
trials_object = (self._initial_points, self._skopt_opt)
with open(checkpoint_dir, "wb") as outputFile:
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(trials_object, outputFile)

def restore(self, checkpoint_dir):
with open(checkpoint_dir, "rb") as inputFile:
def restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as inputFile:
trials_object = pickle.load(inputFile)
self._initial_points = trials_object[0]
self._skopt_opt = trials_object[1]
110 changes: 106 additions & 4 deletions python/ray/tune/suggest/suggestion.py
@@ -1,5 +1,6 @@
import copy
import logging
import os

from ray.tune.error import TuneError
from ray.tune.experiment import convert_to_experiment_list
Expand Down Expand Up @@ -58,6 +59,7 @@ def on_trial_complete(self, trial_id, result, **kwargs):
"""
FINISHED = "FINISHED"
CKPT_FILE = "searcher-state.pkl"

def __init__(self,
metric="episode_reward_mean",
Expand Down Expand Up @@ -130,14 +132,108 @@ def suggest(self, trial_id):
"""
raise NotImplementedError

def save(self, checkpoint_dir):
"""Save function for this object."""
def save(self, checkpoint_path):
"""Save state to path for this search algorithm.
Args:
checkpoint_path (str): File where the search algorithm
state is saved. This path should be used later when
restoring from file.
Example:
.. code-block:: python
search_alg = Searcher(...)
analysis = tune.run(
cost,
num_samples=5,
search_alg=search_alg,
name=self.experiment_name,
local_dir=self.tmpdir)
search_alg.save("./my_favorite_path.pkl")
.. versionchanged:: 0.8.7
Save is automatically called by `tune.run`. You can use
`restore_from_dir` to restore from an experiment directory
such as `~/ray_results/trainable`.
"""
raise NotImplementedError

def restore(self, checkpoint_dir):
"""Restore function for this object."""
def restore(self, checkpoint_path):
"""Restore state for this search algorithm.
Args:
checkpoint_path (str): File where the search algorithm
state is saved. This path should be the same
as the one provided to "save".
Example:
.. code-block:: python
search_alg.save("./my_favorite_path.pkl")
search_alg2 = Searcher(...)
search_alg2 = ConcurrencyLimiter(search_alg2, 1)
search_alg2.restore("./my_favorite_path.pkl")
tune.run(cost, num_samples=5, search_alg=search_alg2)
"""
raise NotImplementedError

def save_to_dir(self, checkpoint_dir):
    """Automatically saves the given searcher to the checkpoint_dir.

    This is automatically used by tune.run during a Tune job.

    The state is first written to a temporary file inside
    ``checkpoint_dir``; only after ``self.save`` has returned is that
    file moved onto ``Searcher.CKPT_FILE``. If ``self.save`` raises
    ``NotImplementedError``, a warning is logged and no checkpoint file
    is produced.

    Args:
        checkpoint_dir (str): Directory in which the searcher state
            file is stored.
    """
    tmp_search_ckpt_path = os.path.join(checkpoint_dir,
                                        ".tmp_searcher_ckpt")
    success = True
    try:
        self.save(tmp_search_ckpt_path)
    except NotImplementedError as e:
        logger.warning(e)
        success = False

    if success and os.path.exists(tmp_search_ckpt_path):
        # os.replace (unlike os.rename) overwrites an existing destination
        # on every platform. This method runs repeatedly during a job, so
        # the destination normally exists from the previous save.
        os.replace(tmp_search_ckpt_path,
                   os.path.join(checkpoint_dir, Searcher.CKPT_FILE))

def restore_from_dir(self, checkpoint_dir):
    """Restores the state of a searcher from a given checkpoint_dir.

    Typically, you should use this function to restore from an
    experiment directory such as `~/ray_results/trainable`.

    .. code-block:: python

        experiment_1 = tune.run(
            cost,
            num_samples=5,
            search_alg=search_alg,
            verbose=0,
            name=self.experiment_name,
            local_dir="~/my_results")

        search_alg2 = Searcher()
        search_alg2.restore_from_dir(
            os.path.join("~/my_results", self.experiment_name))

    Args:
        checkpoint_dir (str): Directory containing the searcher state
            file (``Searcher.CKPT_FILE``). A leading ``~`` is expanded
            to the user's home directory.

    Raises:
        FileNotFoundError: If the searcher state file does not exist
            inside ``checkpoint_dir``.
    """
    # The example above passes a "~"-prefixed path; os.path.exists does
    # not expand "~" itself, so expand it before looking for the file.
    checkpoint_dir = os.path.expanduser(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, Searcher.CKPT_FILE)

    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(
            "{filename} not found in {directory}. Unable to restore "
            "searcher state from directory.".format(
                filename=Searcher.CKPT_FILE, directory=checkpoint_dir))

    self.restore(checkpoint_path)

@property
def metric(self):
"""The training result objective value attribute."""
Expand Down Expand Up @@ -294,6 +390,12 @@ def on_trial_complete(self, trial_id, result=None, error=False):
def is_finished(self):
return self._counter >= self._total_samples or self._finished

def save(self, checkpoint_path):
self.searcher.save(checkpoint_path)

def restore(self, checkpoint_path):
self.searcher.restore(checkpoint_path)


class _MockSearcher(Searcher):
def __init__(self, **kwargs):
Expand Down
8 changes: 4 additions & 4 deletions python/ray/tune/suggest/zoopt.py
Expand Up @@ -133,12 +133,12 @@ def on_trial_complete(self, trial_id, result=None, error=False):

del self._live_trial_mapping[trial_id]

def save(self, checkpoint_dir):
def save(self, checkpoint_path):
trials_object = self.optimizer
with open(checkpoint_dir, "wb") as output:
with open(checkpoint_path, "wb") as output:
pickle.dump(trials_object, output)

def restore(self, checkpoint_dir):
with open(checkpoint_dir, "rb") as input:
def restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as input:
trials_object = pickle.load(input)
self.optimizer = trials_object

0 comments on commit c6404e8

Please sign in to comment.