[tune] Fix Trial.node_ip property (#40028) (#40052)
This PR fixes the `Trial.node_ip` property, which has been broken since the `location` attribute was moved to `trial.temporary_state.location`. It also makes some small cleanups, adds type annotations, and adds comments.

Signed-off-by: Justin Yu <justinvyu@anyscale.com>
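For context, here is a minimal, self-contained sketch (not the actual Ray Tune classes, just stand-ins named after them) of the attribute layout the fix relies on: the trial's location now lives on `trial.temporary_state.location`, so `Trial.node_ip` must read `self.temporary_state.location.hostname` rather than the old `self.location.hostname`.

```python
from typing import Optional


class _Location:
    """Simplified stand-in for the trial location (hostname + port)."""

    def __init__(self, hostname: Optional[str] = None, port: Optional[int] = None):
        self.hostname = hostname
        self.port = port


class _TemporaryTrialState:
    """Simplified stand-in for the runtime-only trial state."""

    def __init__(self):
        self.location = _Location()


class Trial:
    """Simplified stand-in for ray.tune.experiment.trial.Trial."""

    def __init__(self):
        self.temporary_state = _TemporaryTrialState()

    @property
    def node_ip(self):
        # The fix: `location` now lives on `temporary_state`, so the property
        # must read `self.temporary_state.location.hostname` instead of the
        # stale `self.location.hostname`.
        return self.temporary_state.location.hostname


trial = Trial()
trial.temporary_state.location = _Location(hostname="10.0.0.5", port=6379)
assert trial.node_ip == "10.0.0.5"
```

The assert at the end mirrors how callers read `trial.node_ip` after the trial has been placed on a node.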
justinvyu committed Oct 3, 2023
1 parent c0d2c3d commit 97763c6
Showing 2 changed files with 14 additions and 11 deletions.
python/ray/tune/execution/tune_controller.py (8 additions, 5 deletions)
@@ -1631,10 +1631,9 @@ def _schedule_trial_pause(self, trial: Trial, should_checkpoint: bool = True):

if should_checkpoint:
self._cached_trial_decisions[trial.trial_id] = TrialScheduler.PAUSE
future_result = self._schedule_trial_save(
self._schedule_trial_save(
trial=trial, storage=CheckpointStorage.PERSISTENT
)
trial.temporary_state.saving_to = future_result
else:
self._schedule_trial_stop(trial)
self._set_trial_status(trial, Trial.PAUSED)
@@ -1906,12 +1905,16 @@ def _schedule_trial_save(
on_error=self._trial_task_failure,
_return_future=True,
)
# TODO(justinvyu): `trial.saving_to` is needed in order to prevent
# a done=True result from executing a STOP decision
# TODO(justinvyu): `trial.saving_to` (and trial.is_saving) is needed
# in order to prevent a done=True result from executing a STOP decision
# (which clears all futures) before the save gets processed.
# Keep this in for now while `train` and `save` are 2 separate steps.
# TODO(justinvyu): Remove the return value?
trial.temporary_state.saving_to = _FutureTrainingResult(future)

# `trial.saving_to` holds a future training result -- this is only used
# in the case of PBT to block until the checkpoint is ready.
# In all other situations, the checkpoint future is processed by the
# actor event manager when it is ready.
return trial.temporary_state.saving_to

if storage == CheckpointStorage.MEMORY:
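The comment added in this hunk explains why the save future is still stored on the trial: only PBT blocks on `trial.saving_to` until the checkpoint is ready, while every other path lets the actor event manager consume the future asynchronously. A rough, hypothetical illustration of that blocking-versus-asynchronous pattern, using `concurrent.futures` as a stand-in rather than the real `_FutureTrainingResult` API:

```python
from concurrent.futures import Future, ThreadPoolExecutor
import time


def schedule_checkpoint_save(executor: ThreadPoolExecutor) -> Future:
    """Kick off a (fake) checkpoint save and return a future for its result."""

    def _save():
        time.sleep(0.1)  # stand-in for persisting/uploading the checkpoint
        return {"checkpoint_path": "/tmp/checkpoint_000001"}

    return executor.submit(_save)


with ThreadPoolExecutor(max_workers=1) as executor:
    saving_to = schedule_checkpoint_save(executor)

    # Normal path: hand `saving_to` to an event manager and keep scheduling;
    # the future is consumed whenever it resolves.
    # PBT-style path: block right away so the checkpoint is guaranteed to
    # exist before the trial is paused/perturbed.
    must_wait_for_checkpoint = True
    if must_wait_for_checkpoint:
        result = saving_to.result()  # blocking wait, analogous to PBT's use
        print("checkpoint ready at", result["checkpoint_path"])
```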
python/ray/tune/experiment/trial.py (6 additions, 6 deletions)
@@ -29,9 +29,9 @@
from ray.train import Checkpoint
from ray.train.constants import RAY_CHDIR_TO_TRIAL_DIR
from ray.train._internal.checkpoint_manager import (
_TrainingResult,
_CheckpointManager as _NewCheckpointManager,
)
from ray.train._internal.session import _FutureTrainingResult, _TrainingResult
from ray.train._internal.storage import _use_storage_context, StorageContext
from ray.tune import TuneError
from ray.tune.error import _TuneRestoreError
@@ -209,12 +209,12 @@ class _TemporaryTrialState:
def __init__(self):
self.location = _Location()

self.ray_actor = None
self.ray_actor: Optional[ray.actor.ActorHandle] = None

self.saving_to = None
self.restoring_from = None
self.saving_to: Optional[_FutureTrainingResult] = None
self.restoring_from: Optional[_TrainingResult] = None

self.num_restore_failures = 0
self.num_restore_failures: int = 0

def __getstate__(self):
return {}
@@ -811,7 +811,7 @@ def has_reported_at_least_once(self) -> bool:

@property
def node_ip(self):
return self.location.hostname
return self.temporary_state.location.hostname

@property
def sync_on_checkpoint(self):
