diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 652b137a45603f..69eb8a0931d32c 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -16,6 +16,7 @@ import pyarrow.fs +from ray._private.storage import _get_storage_uri from ray._private.thirdparty.tabulate.tabulate import tabulate from ray.util.annotations import PublicAPI, Deprecated from ray.widgets import Template, make_table_html_repr @@ -581,10 +582,14 @@ class RunConfig: Args: name: Name of the trial or experiment. If not provided, will be deduced from the Trainable. - storage_path: [Beta] Path to store results at. Can be a local directory or - a destination on cloud storage. If Ray storage is set up, - defaults to the storage location. Otherwise, this defaults to - the local ``~/ray_results`` directory. + storage_path: [Beta] Path where all results and checkpoints are persisted. + Can be a local directory or a destination on cloud storage. + For multi-node training/tuning runs, this must be set to a + shared storage location (e.g., S3, NFS). + This defaults to the local ``~/ray_results`` directory. + storage_filesystem: [Beta] A custom filesystem to use for storage. + If this is provided, `storage_path` should be a path with its + prefix stripped (e.g., `s3://bucket/path` -> `bucket/path`). failure_config: Failure mode configuration. checkpoint_config: Checkpointing configuration. sync_config: Configuration object for syncing. See train.SyncConfig. @@ -634,8 +639,21 @@ class RunConfig: def __post_init__(self): from ray.train import SyncConfig + from ray.train.constants import DEFAULT_STORAGE_PATH from ray.tune.experimental.output import AirVerbosity, get_air_verbosity + if self.storage_path is None: + self.storage_path = DEFAULT_STORAGE_PATH + + # If no remote path is set, try to get Ray Storage URI + ray_storage_uri: Optional[str] = _get_storage_uri() + if ray_storage_uri is not None: + logger.info( + "Using configured Ray Storage URI as the `storage_path`: " + f"{ray_storage_uri}" + ) + self.storage_path = ray_storage_uri + if not self.failure_config: self.failure_config = FailureConfig() diff --git a/python/ray/air/integrations/mlflow.py b/python/ray/air/integrations/mlflow.py index 50b55c9f288053..8d1352eef937be 100644 --- a/python/ray/air/integrations/mlflow.py +++ b/python/ray/air/integrations/mlflow.py @@ -3,7 +3,6 @@ from typing import Dict, Optional, Union import ray -from ray.air import session from ray.air._internal.mlflow import _MLflowLoggerUtil from ray.air._internal import usage as air_usage from ray.air.constants import TRAINING_ITERATION @@ -148,13 +147,14 @@ def train_fn(config): ) try: + train_context = ray.train.get_context() + # Do a try-catch here if we are not in a train session - _session = session._get_session(warn=False) - if _session and rank_zero_only and session.get_world_rank() != 0: + if rank_zero_only and train_context.get_world_rank() != 0: return _NoopModule() - default_trial_id = session.get_trial_id() - default_trial_name = session.get_trial_name() + default_trial_id = train_context.get_trial_id() + default_trial_name = train_context.get_trial_name() except RuntimeError: default_trial_id = None diff --git a/python/ray/air/tests/test_configs.py b/python/ray/air/tests/test_configs.py index 81ae761d34bf8b..7d4092195cb599 100644 --- a/python/ray/air/tests/test_configs.py +++ b/python/ray/air/tests/test_configs.py @@ -34,8 +34,7 @@ def test_repr(config): def test_storage_filesystem_repr(): config = RunConfig(storage_filesystem=pyarrow.fs.S3FileSystem()) - 
representation = repr(config) - assert len(representation) < MAX_REPR_LENGTH + repr(config) def test_failure_config_init(): diff --git a/python/ray/air/tests/test_integration_mlflow.py b/python/ray/air/tests/test_integration_mlflow.py index 5cb4091daf298b..c587c19d822481 100644 --- a/python/ray/air/tests/test_integration_mlflow.py +++ b/python/ray/air/tests/test_integration_mlflow.py @@ -3,13 +3,11 @@ import tempfile import unittest from collections import namedtuple -from unittest.mock import patch +from unittest.mock import patch, MagicMock from mlflow.tracking import MlflowClient from ray._private.dict import flatten_dict -from ray.train._internal.session import init_session, shutdown_session -from ray.train._internal.storage import StorageContext from ray.air.integrations.mlflow import MLflowLoggerCallback, setup_mlflow, _NoopModule from ray.air._internal.mlflow import _MLflowLoggerUtil @@ -49,8 +47,7 @@ def setUp(self): assert client.get_experiment_by_name("existing_experiment").experiment_id == "1" def tearDown(self) -> None: - # Shutdown session to clean up for next test - shutdown_session() + pass def testMlFlowLoggerCallbackConfig(self): # Explicitly pass in all args. @@ -225,23 +222,14 @@ def testMlFlowSetupExplicit(self): ) mlflow.end_run() - def testMlFlowSetupRankNonRankZero(self): + @patch("ray.train.get_context") + def testMlFlowSetupRankNonRankZero(self, mock_get_context): """Assert that non-rank-0 workers get a noop module""" - storage = StorageContext( - storage_path=tempfile.mkdtemp(), - experiment_dir_name="exp_name", - trial_dir_name="trial_name", - ) + mock_context = MagicMock() + mock_context.get_world_rank.return_value = 1 + + mock_get_context.return_value = mock_context - init_session( - training_func=None, - world_rank=1, - local_rank=1, - node_rank=1, - local_world_size=2, - world_size=2, - storage=storage, - ) mlflow = setup_mlflow({}) assert isinstance(mlflow, _NoopModule) diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index fc60fc21bdac74..8201e1f59dfa5e 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -222,15 +222,14 @@ def reset( self.training_started = False self._first_report = True - # Change the working directory to the local trial directory. - # -> All workers on the same node share a working directory. - os.makedirs(storage.trial_local_path, exist_ok=True) + # Change the working directory to a special trial folder. + # This is to ensure that all Ray Train workers have a common working directory. 
+ os.makedirs(storage.trial_working_directory, exist_ok=True) if bool(int(os.environ.get(RAY_CHDIR_TO_TRIAL_DIR, "1"))): logger.debug( - "Switching the working directory to the trial directory: " - f"{storage.trial_local_path}" + f"Changing the working directory to: {storage.trial_working_directory}" ) - os.chdir(storage.trial_local_path) + os.chdir(storage.trial_working_directory) def pause_reporting(self): """Ignore all future ``session.report()`` calls.""" diff --git a/python/ray/train/_internal/storage.py b/python/ray/train/_internal/storage.py index 6e88148f5f3974..812a6fb50075c5 100644 --- a/python/ray/train/_internal/storage.py +++ b/python/ray/train/_internal/storage.py @@ -21,7 +21,6 @@ ) from e # isort: on -import dataclasses import fnmatch import logging import os @@ -29,10 +28,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type, Union -from ray._private.storage import _get_storage_uri from ray.air._internal.filelock import TempFileLock from ray.train._internal.syncer import SyncConfig, Syncer, _BackgroundSyncer -from ray.train.constants import _get_defaults_results_dir +from ray.train.constants import _get_ray_train_session_dir +from ray.util.annotations import DeveloperAPI if TYPE_CHECKING: from ray.train._checkpoint import Checkpoint @@ -346,27 +345,29 @@ def _delete_command(self, uri: str) -> Tuple[Callable, Dict]: return _delete_fs_path, dict(fs=self.storage_filesystem, fs_path=fs_path) +@DeveloperAPI class StorageContext: - """Shared context that holds all paths and storage utilities, passed along from - the driver to workers. + """Shared context that holds the source of truth for all paths and + storage utilities, passed along from the driver to workers. - The properties of this context may not all be set at once, depending on where - the context lives. - For example, on the driver, the storage context is initialized, only knowing - the experiment path. On the Trainable actor, the trial_dir_name is accessible. - - There are 2 types of paths: + This object defines a few types of paths: 1. *_fs_path: A path on the `storage_filesystem`. This is a regular path which has been prefix-stripped by pyarrow.fs.FileSystem.from_uri and can be joined with `Path(...).as_posix()`. - 2. *_local_path: The path on the local filesystem where results are saved to - before persisting to storage. + 2. *_driver_staging_path: The temporary staging directory on the local filesystem + where driver artifacts are saved to before persisting them to storage. + 3. trial_working_directory: The local filesystem path that the remote + actors' working directories are moved to by default. + This is separated from the driver staging path so that driver syncing + does not implicitly upload the trial working directory, for trials on the + driver node. Example with storage_path="mock:///bucket/path?param=1": + >>> import ray >>> from ray.train._internal.storage import StorageContext >>> import os - >>> os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = "/tmp/ray_results" + >>> _ = ray.init() >>> storage = StorageContext( ... storage_path="mock://netloc/bucket/path?param=1", ... 
experiment_dir_name="exp_name", @@ -375,36 +376,31 @@ class StorageContext: >> storage.experiment_fs_path 'bucket/path/exp_name' - >>> storage.experiment_local_path - '/tmp/ray_results/exp_name' + >>> storage.experiment_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts' >>> storage.trial_dir_name = "trial_dir" >>> storage.trial_fs_path 'bucket/path/exp_name/trial_dir' - >>> storage.trial_local_path - '/tmp/ray_results/exp_name/trial_dir' + >>> storage.trial_driver_staging_path # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/driver_artifacts/trial_dir' + >>> storage.trial_working_directory # doctest: +ELLIPSIS + '/tmp/ray/session_.../artifacts/.../exp_name/working_dirs/trial_dir' >>> storage.current_checkpoint_index = 1 >>> storage.checkpoint_fs_path 'bucket/path/exp_name/trial_dir/checkpoint_000001' + >>> ray.shutdown() - Example with storage_path=None: + Example with storage_path="/tmp/ray_results": >>> from ray.train._internal.storage import StorageContext - >>> import os - >>> os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = "/tmp/ray_results" >>> storage = StorageContext( - ... storage_path=None, + ... storage_path="/tmp/ray_results", ... experiment_dir_name="exp_name", ... ) - >>> storage.storage_fs_path # Auto-resolved + >>> storage.storage_fs_path '/tmp/ray_results' - >>> storage.storage_local_path - '/tmp/ray_results' - >>> storage.experiment_local_path - '/tmp/ray_results/exp_name' >>> storage.experiment_fs_path '/tmp/ray_results/exp_name' - >>> storage.syncer is None - True >>> storage.storage_filesystem # Auto-resolved # doctest: +ELLIPSIS None: depending on the `sync_period` + `sync_artifacts_on_checkpoint` settings of `SyncConfig`. - `(local_fs, trial_local_path) -> (storage_filesystem, trial_fs_path)` + `(local_fs, trial_working_dir) -> (storage_filesystem, trial_fs_path)` Args: force: If True, wait for a previous sync to finish, launch a new one, @@ -587,20 +570,20 @@ def persist_artifacts(self, force: bool = False) -> None: if not self.sync_config.sync_artifacts: return - # Skip if we don't need to sync (e.g., storage_path == storage_local_path, and - # all trial artifacts are already in the right place) - if not self.syncer: + # Skip if there are no artifacts to sync + is_empty = not any(os.scandir(self.trial_working_directory)) + if is_empty: return if force: self.syncer.wait() self.syncer.sync_up( - local_dir=self.trial_local_path, remote_dir=self.trial_fs_path + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path ) self.syncer.wait() else: self.syncer.sync_up_if_needed( - local_dir=self.trial_local_path, remote_dir=self.trial_fs_path + local_dir=self.trial_working_directory, remote_dir=self.trial_fs_path ) @property @@ -613,38 +596,91 @@ def experiment_fs_path(self) -> str: """ return Path(self.storage_fs_path, self.experiment_dir_name).as_posix() + def _get_session_path(self) -> str: + """The Ray Train/Tune session local directory used to stage files + before persisting to the storage filesystem.""" + return Path( + _get_ray_train_session_dir(), self._timestamp, self.experiment_dir_name + ).as_posix() + @property - def experiment_local_path(self) -> str: - """The local filesystem path to the experiment directory. + def experiment_driver_staging_path(self) -> str: + """The local filesystem path of the experiment directory on the driver node. + + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. 
+ + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts` - This local "cache" path refers to location where files are dumped before - syncing them to the `storage_path` on the `storage_filesystem`. + This should be used as the temporary staging location for files *on the driver* + before syncing them to `experiment_fs_path`. + For example, the search algorithm should dump its state to this directory. + See `trial_driver_staging_path` for writing trial-specific artifacts. + + The directory is synced to + `{storage_path}/{experiment_dir_name}` periodically. + See `_ExperimentCheckpointManager.checkpoint` for where that happens. """ - return Path(self.storage_local_path, self.experiment_dir_name).as_posix() + return Path(self._get_session_path(), "driver_artifacts").as_posix() @property - def trial_local_path(self) -> str: - """The local filesystem path to the trial directory. + def trial_fs_path(self) -> str: + """The trial directory path on the `storage_filesystem`. Raises a ValueError if `trial_dir_name` is not set beforehand. """ if self.trial_dir_name is None: raise RuntimeError( - "Should not access `trial_local_path` without setting `trial_dir_name`" + "Should not access `trial_fs_path` without setting `trial_dir_name`" ) - return Path(self.experiment_local_path, self.trial_dir_name).as_posix() + return Path(self.experiment_fs_path, self.trial_dir_name).as_posix() @property - def trial_fs_path(self) -> str: - """The trial directory path on the `storage_filesystem`. + def trial_driver_staging_path(self) -> str: + """The local filesystem path of the trial directory on the driver. - Raises a ValueError if `trial_dir_name` is not set beforehand. + The driver is the node where `Trainer.fit`/`Tuner.fit` is being called. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /driver_artifacts/` + + This should be used as the temporary location for files on the driver + before persisting them to `trial_fs_path`. + + For example, callbacks (e.g., JsonLoggerCallback) should write trial-specific + logfiles within this directory. """ if self.trial_dir_name is None: raise RuntimeError( - "Should not access `trial_fs_path` without setting `trial_dir_name`" + "Should not access `trial_driver_staging_path` " + "without setting `trial_dir_name`" ) - return Path(self.experiment_fs_path, self.trial_dir_name).as_posix() + return Path(self.experiment_driver_staging_path, self.trial_dir_name).as_posix() + + @property + def trial_working_directory(self) -> str: + """The local filesystem path to trial working directory. + + This path is of the form: + `/tmp/ray/session_/artifacts// + /working_dirs/` + + Ray Train/Tune moves the remote actor's working directory to this path + by default, unless disabled by `RAY_CHDIR_TO_TRIAL_DIR` environment variable. + + Writing files to this directory allows users to persist training artifacts + if `SyncConfig(sync_artifacts=True)` is set. 
+ """ + if self.trial_dir_name is None: + raise RuntimeError( + "Cannot access `trial_working_directory` without " + "setting `trial_dir_name`" + ) + return Path( + self._get_session_path(), "working_dirs", self.trial_dir_name + ).as_posix() @property def checkpoint_fs_path(self) -> str: diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index efb36bd57eab1e..d643aad732225e 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -1,6 +1,7 @@ import os from pathlib import Path +import ray from ray.air.constants import ( # noqa: F401 COPY_DIRECTORY_CHECKPOINTS_INSTEAD_OF_MOVING_ENV, EVALUATION_DATASET_KEY, @@ -10,6 +11,13 @@ ) +def _get_ray_train_session_dir() -> str: + assert ray.is_initialized(), "Ray must be initialized to get the session dir." + return Path( + ray._private.worker._global_node.get_session_dir_path(), "artifacts" + ).as_posix() + + def _get_defaults_results_dir() -> str: return ( # This can be overwritten by our libraries @@ -24,6 +32,8 @@ def _get_defaults_results_dir() -> str: ) +DEFAULT_STORAGE_PATH = Path("~/ray_results").expanduser().as_posix() + # Autofilled ray.train.report() metrics. Keys should be consistent with Tune. CHECKPOINT_DIR_NAME = "checkpoint_dir_name" TIME_TOTAL_S = "_time_total_s" diff --git a/python/ray/train/tests/test_new_persistence.py b/python/ray/train/tests/test_new_persistence.py index d3aaff30d1b250..dc530bfb1122dd 100644 --- a/python/ray/train/tests/test_new_persistence.py +++ b/python/ray/train/tests/test_new_persistence.py @@ -2,9 +2,9 @@ import os import pickle import re -import shutil import tempfile import time +import uuid from contextlib import contextmanager from pathlib import Path from typing import List, Optional, Tuple @@ -63,20 +63,8 @@ def dummy_context_manager(*args, **kwargs): yield "dummy value" -@pytest.fixture(autouse=True) -def disable_driver_artifact_sync(): - # NOTE: Hack to make sure that the driver doesn't sync the artifacts. - from ray.tune import execution - - execution.experiment_state._DRIVER_SYNC_EXCLUDE_PATTERNS = [ - "*/checkpoint_*", - "*/artifact-*.txt", - ] - - @pytest.fixture(autouse=True, scope="module") def ray_start_4_cpus(): - # Make sure to set the env var before calling ray.init() ray.init(num_cpus=4) yield ray.shutdown() @@ -120,8 +108,8 @@ def _resolve_storage_type( def _get_local_inspect_dir( root_local_path: Path, storage_path: str, - storage_local_path: Path, storage_filesystem: Optional[pyarrow.fs.FileSystem], + storage_local_path: Path = None, ) -> Tuple[Path, str]: """Downloads the storage path -> local dir for inspecting contents. @@ -177,7 +165,9 @@ def train_fn(config): assert train_session.storage.checkpoint_fs_path # Check that the working dir for each worker is the shared trial dir. - assert os.getcwd() == train_session.storage.trial_local_path + assert ( + Path.cwd() == Path(train_session.storage.trial_working_directory).resolve() + ) start = 0 @@ -416,13 +406,12 @@ def _assert_storage_contents( @pytest.mark.parametrize("trainable", [train_fn, ClassTrainable]) -@pytest.mark.parametrize("storage_path_type", [None, "nfs", "cloud", "custom_fs"]) +@pytest.mark.parametrize("storage_path_type", ["nfs", "cloud", "custom_fs"]) @pytest.mark.parametrize( "checkpoint_config", [train.CheckpointConfig(), train.CheckpointConfig(num_to_keep=2)], ) def test_tuner( - monkeypatch, tmp_path, trainable, storage_path_type, @@ -456,16 +445,21 @@ def test_tuner( └── train_fn_a2b9e_00001_1_... └── ... 
<- Same as above """ - # Set the cache dir to some temp directory - LOCAL_CACHE_DIR = tmp_path / "ray_results" - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(LOCAL_CACHE_DIR)) - - exp_name = "simple_persistence_test" + exp_name = f"tuner_persistence_test-{uuid.uuid4().hex}" with _resolve_storage_type(storage_path_type, tmp_path) as ( storage_path, storage_filesystem, ): + run_config = train.RunConfig( + storage_path=storage_path, + storage_filesystem=storage_filesystem, + name=exp_name, + verbose=0, + failure_config=train.FailureConfig(max_failures=1), + checkpoint_config=checkpoint_config, + sync_config=train.SyncConfig(sync_artifacts=True), + ) tuner = tune.Tuner( trainable, param_space={ @@ -475,26 +469,15 @@ def test_tuner( "save_checkpoint_as_dict": tune.grid_search([True, False]), "tmp_path": tmp_path, }, - run_config=train.RunConfig( - storage_path=storage_path, - storage_filesystem=storage_filesystem, - name=exp_name, - verbose=0, - failure_config=train.FailureConfig(max_failures=1), - checkpoint_config=checkpoint_config, - sync_config=train.SyncConfig(sync_artifacts=True), - ), + run_config=run_config, # 2 samples (from the grid search). Run 1 at at time to test actor reuse tune_config=tune.TuneConfig(num_samples=1, max_concurrent_trials=1), ) result_grid = tuner.fit() assert result_grid.errors - if storage_path: - shutil.rmtree(LOCAL_CACHE_DIR, ignore_errors=True) - restored_tuner = tune.Tuner.restore( - path=str(URI(storage_path or str(LOCAL_CACHE_DIR)) / exp_name), + path=str(URI(run_config.storage_path) / exp_name), trainable=trainable, storage_filesystem=storage_filesystem, resume_errored=True, @@ -504,8 +487,7 @@ def test_tuner( local_inspect_dir, storage_fs_path = _get_local_inspect_dir( root_local_path=tmp_path, - storage_path=storage_path, - storage_local_path=LOCAL_CACHE_DIR, + storage_path=run_config.storage_path, storage_filesystem=storage_filesystem, ) @@ -532,7 +514,7 @@ def test_tuner( ) -@pytest.mark.parametrize("storage_path_type", [None, "nfs", "cloud", "custom_fs"]) +@pytest.mark.parametrize("storage_path_type", ["nfs", "cloud", "custom_fs"]) @pytest.mark.parametrize( "checkpoint_config", [ @@ -545,7 +527,7 @@ def test_tuner( ], ) def test_trainer( - tmp_path, monkeypatch, storage_path_type, checkpoint_config: train.CheckpointConfig + tmp_path, storage_path_type, checkpoint_config: train.CheckpointConfig ): """Same end-to-end test as `test_tuner`, but also includes a `DataParallelTrainer(resume_from_checkpoint)` test at the end. @@ -575,15 +557,22 @@ def test_trainer( ├── artifact-rank=1-iter=1.txt └── ... 
""" - LOCAL_CACHE_DIR = tmp_path / "ray_results" - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(LOCAL_CACHE_DIR)) - exp_name = "trainer_new_persistence" + exp_name = f"trainer_persistence_test-{uuid.uuid4().hex}" no_checkpoint_ranks = [0] with _resolve_storage_type(storage_path_type, tmp_path) as ( storage_path, storage_filesystem, ): + run_config = train.RunConfig( + storage_path=storage_path, + storage_filesystem=storage_filesystem, + name=exp_name, + verbose=0, + checkpoint_config=checkpoint_config, + failure_config=train.FailureConfig(max_failures=1), + sync_config=train.SyncConfig(sync_artifacts=True), + ) trainer = DataParallelTrainer( train_fn, train_loop_config={ @@ -594,15 +583,7 @@ def test_trainer( "no_checkpoint_ranks": no_checkpoint_ranks, }, scaling_config=train.ScalingConfig(num_workers=TestConstants.NUM_WORKERS), - run_config=train.RunConfig( - storage_path=storage_path, - storage_filesystem=storage_filesystem, - name=exp_name, - verbose=0, - checkpoint_config=checkpoint_config, - failure_config=train.FailureConfig(max_failures=1), - sync_config=train.SyncConfig(sync_artifacts=True), - ), + run_config=run_config, ) print("\nStarting initial run.\n") with pytest.raises(TrainingFailedError): @@ -610,26 +591,19 @@ def test_trainer( print("\nStarting manually restored run.\n") restored_trainer = DataParallelTrainer.restore( - path=str(URI(storage_path or str(LOCAL_CACHE_DIR)) / exp_name), + path=str(URI(run_config.storage_path) / exp_name), storage_filesystem=storage_filesystem, ) result = restored_trainer.fit() - with monkeypatch.context() as m: - # This is so that the `resume_from_checkpoint` run doesn't mess up the - # assertions later for the `storage_path=None` case. - m.setenv( - "RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path / "resume_from_checkpoint") - ) - _resume_from_checkpoint( - result.checkpoint, - expected_state={"iter": TestConstants.NUM_ITERATIONS - 1}, - ) + _resume_from_checkpoint( + result.checkpoint, + expected_state={"iter": TestConstants.NUM_ITERATIONS - 1}, + ) local_inspect_dir, storage_fs_path = _get_local_inspect_dir( root_local_path=tmp_path, - storage_path=storage_path, - storage_local_path=LOCAL_CACHE_DIR, + storage_path=run_config.storage_path, storage_filesystem=storage_filesystem, ) @@ -650,29 +624,6 @@ def test_trainer( ) -def test_local_dir(tmp_path): - """Test that local_dir can do the same job as `RAY_AIR_LOCAL_CACHE_DIR`.""" - - def train_fn(config): - from ray.train._internal.session import get_session - - assert get_session().storage.storage_local_path == str(tmp_path) - - tune.run(train_fn, local_dir=str(tmp_path)) - - results = tune.Tuner( - train_fn, run_config=train.RunConfig(local_dir=str(tmp_path)) - ).fit() - assert not results.errors - - trainer = DataParallelTrainer( - train_fn, - scaling_config=train.ScalingConfig(num_workers=2), - run_config=train.RunConfig(local_dir=str(tmp_path)), - ) - trainer.fit() - - if __name__ == "__main__": import sys diff --git a/python/ray/train/tests/test_result.py b/python/ray/train/tests/test_result.py index 803c1e7a66ceed..b3592979688315 100644 --- a/python/ray/train/tests/test_result.py +++ b/python/ray/train/tests/test_result.py @@ -65,11 +65,7 @@ def build_dummy_tuner(configs): @pytest.mark.parametrize("storage", ["local", "remote"]) @pytest.mark.parametrize("mode", ["trainer", "tuner"]) -def test_result_restore( - ray_start_4_cpus, monkeypatch, tmpdir, mock_s3_bucket_uri, storage, mode -): - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmpdir / "ray_results")) - +def 
test_result_restore(ray_start_4_cpus, tmpdir, mock_s3_bucket_uri, storage, mode): NUM_ITERATIONS = 5 NUM_CHECKPOINTS = 3 if storage == "local": diff --git a/python/ray/train/tests/test_session.py b/python/ray/train/tests/test_session.py index 057eef209ad7be..c70e4ecbbd8e4f 100644 --- a/python/ray/train/tests/test_session.py +++ b/python/ray/train/tests/test_session.py @@ -34,6 +34,13 @@ ) +@pytest.fixture(autouse=True, scope="module") +def ray_start_4_cpus(): + ray.init(num_cpus=4) + yield + ray.shutdown() + + @pytest.fixture(scope="function") def session(): def f(): @@ -94,7 +101,7 @@ def test_train(session): session.finish() -def test_get_dataset_shard(shutdown_only): +def test_get_dataset_shard(): dataset = ray.data.from_items([1, 2, 3]) init_session( training_func=lambda: 1, diff --git a/python/ray/train/tests/test_storage.py b/python/ray/train/tests/test_storage.py index b5c2c6edc2a8b1..910305c33c6b7a 100644 --- a/python/ray/train/tests/test_storage.py +++ b/python/ray/train/tests/test_storage.py @@ -1,9 +1,11 @@ import os +import uuid from pathlib import Path import pyarrow.fs import pytest +import ray import ray.cloudpickle as ray_pickle from ray.train import Checkpoint, SyncConfig from ray.train._internal.storage import ( @@ -14,7 +16,7 @@ from ray.train.tests.test_new_persistence import _resolve_storage_type -@pytest.fixture(params=[None, "nfs", "cloud", "custom_fs"]) +@pytest.fixture(params=["nfs", "cloud", "custom_fs"]) def storage(request, tmp_path) -> StorageContext: storage_type = request.param with _resolve_storage_type(storage_type, tmp_path) as ( @@ -23,7 +25,7 @@ def storage(request, tmp_path) -> StorageContext: ): yield StorageContext( storage_path=storage_path, - experiment_dir_name="exp_name", + experiment_dir_name=f"storage_type={storage_type}-{uuid.uuid4().hex}", storage_filesystem=storage_filesystem, trial_dir_name="trial_name", sync_config=SyncConfig( @@ -32,11 +34,12 @@ def storage(request, tmp_path) -> StorageContext: ) -@pytest.fixture(autouse=True) -def local_path(tmp_path, monkeypatch): - local_dir = str(tmp_path / "ray_results") - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", local_dir) - yield local_dir +@pytest.fixture(autouse=True, scope="module") +def ray_init(): + # NOTE: This is needed to set the `/tmp/ray/session_*` directory. 
+ ray.init() + yield + ray.shutdown() def test_custom_fs_validation(tmp_path): @@ -100,12 +103,6 @@ def test_storage_path_inputs(): StorageContext(storage_path=Path(path), experiment_dir_name=exp_name) -def test_no_syncing_needed(local_path): - """Tests that we don't create a syncer if no syncing is needed.""" - storage = StorageContext(storage_path=local_path, experiment_dir_name="test_dir") - assert storage.syncer is None - - def test_storage_validation_marker(storage: StorageContext): # A marker should have been created at initialization storage._check_validation_file() @@ -162,26 +159,18 @@ def test_persist_current_checkpoint(storage: StorageContext, tmp_path): def test_persist_artifacts(storage: StorageContext): """Tests typical `StorageContext.persist_artifacts(force=True/False)` usage.""" - trial_local_path = Path(storage.trial_local_path) - trial_local_path.mkdir(parents=True) - trial_local_path.joinpath("1.txt").touch() + trial_working_dir = Path(storage.trial_working_directory) + trial_working_dir.mkdir(parents=True) + trial_working_dir.joinpath("1.txt").touch() storage.persist_artifacts() - - if not storage.syncer: - # No syncing is needed -- pass early if storage_path == storage_local_path - assert _list_at_fs_path(storage.storage_filesystem, storage.trial_fs_path) == [ - "1.txt" - ] - return - storage.syncer.wait() assert sorted( _list_at_fs_path(storage.storage_filesystem, storage.trial_fs_path) ) == ["1.txt"] - trial_local_path.joinpath("2.txt").touch() + trial_working_dir.joinpath("2.txt").touch() # A new sync should not be triggered because sync_period is 1000 seconds storage.persist_artifacts() @@ -202,11 +191,6 @@ def test_persist_artifacts(storage: StorageContext): def test_persist_artifacts_failures(storage: StorageContext): """Tests `StorageContext.persist_artifacts` edge cases (empty directory).""" - if not storage.syncer: - # Should be a no-op if storage_path == storage_local_path (no syncing needed) - storage.persist_artifacts() - return - # Uploading before the trial directory has been created should fail with pytest.raises(FileNotFoundError): storage.persist_artifacts() diff --git a/python/ray/train/tests/test_tune.py b/python/ray/train/tests/test_tune.py index 0d513a172a19d3..2a9859bd3a20c4 100644 --- a/python/ray/train/tests/test_tune.py +++ b/python/ray/train/tests/test_tune.py @@ -218,12 +218,7 @@ def train_func(): assert len(df[TRAINING_ITERATION]) == 4 -def test_restore_with_new_trainer( - ray_start_4_cpus, tmpdir, propagate_logs, caplog, monkeypatch -): - - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmpdir)) - +def test_restore_with_new_trainer(ray_start_4_cpus, tmpdir, propagate_logs, caplog): def train_func(config): raise RuntimeError("failing!") @@ -231,7 +226,7 @@ def train_func(config): train_func, backend_config=TestConfig(), scaling_config=ScalingConfig(num_workers=1), - run_config=RunConfig(name="restore_new_trainer"), + run_config=RunConfig(name="restore_new_trainer", storage_path=str(tmpdir)), datasets={"train": ray.data.from_items([{"a": i} for i in range(10)])}, ) results = Tuner(trainer).fit() @@ -272,11 +267,14 @@ def train_func(config): @pytest.mark.parametrize("in_trainer", [True, False]) @pytest.mark.parametrize("in_tuner", [True, False]) def test_run_config_in_trainer_and_tuner( - propagate_logs, tmp_path, monkeypatch, caplog, in_trainer, in_tuner + propagate_logs, tmp_path, caplog, in_trainer, in_tuner ): - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path)) - trainer_run_config = RunConfig(name="trainer") if in_trainer 
else None - tuner_run_config = RunConfig(name="tuner") if in_tuner else None + trainer_run_config = ( + RunConfig(name="trainer", storage_path=str(tmp_path)) if in_trainer else None + ) + tuner_run_config = ( + RunConfig(name="tuner", storage_path=str(tmp_path)) if in_tuner else None + ) trainer = DataParallelTrainer( lambda config: None, backend_config=TestConfig(), @@ -284,22 +282,20 @@ def test_run_config_in_trainer_and_tuner( run_config=trainer_run_config, ) with caplog.at_level(logging.INFO, logger="ray.tune.impl.tuner_internal"): - Tuner(trainer, run_config=tuner_run_config) + tuner = Tuner(trainer, run_config=tuner_run_config) both_msg = ( "`RunConfig` was passed to both the `Tuner` and the `DataParallelTrainer`" ) + run_config = tuner._local_tuner.get_run_config() if in_trainer and in_tuner: - assert (tmp_path / "tuner").exists() - assert not (tmp_path / "trainer").exists() + assert run_config.name == "tuner" assert both_msg in caplog.text elif in_trainer and not in_tuner: - assert not (tmp_path / "tuner").exists() - assert (tmp_path / "trainer").exists() + assert run_config.name == "trainer" assert both_msg not in caplog.text elif not in_trainer and in_tuner: - assert (tmp_path / "tuner").exists() - assert not (tmp_path / "trainer").exists() + assert run_config.name == "tuner" assert both_msg not in caplog.text else: assert both_msg not in caplog.text diff --git a/python/ray/train/tests/test_utils.py b/python/ray/train/tests/test_utils.py index 0974b00bb13fa0..6f1880eeb74a03 100644 --- a/python/ray/train/tests/test_utils.py +++ b/python/ray/train/tests/test_utils.py @@ -1,6 +1,18 @@ from pathlib import Path +import pytest + +import ray from ray.train._internal.utils import construct_path +from ray.train.constants import _get_ray_train_session_dir + + +@pytest.fixture +def ray_init_custom_tmpdir(): + custom_tmpdir = "/tmp/custom" + ray.init(_temp_dir=custom_tmpdir) + yield custom_tmpdir + ray.shutdown() def test_construct_path(): @@ -17,9 +29,13 @@ def test_construct_path(): assert construct_path(Path("a"), Path("b")) == Path("b/a").resolve() +def test_customize_local_staging_path(ray_init_custom_tmpdir): + """Test that the staging directory where driver artifacts are written + before being persisted to storage path can be customized.""" + assert str(ray_init_custom_tmpdir) in _get_ray_train_session_dir() + + if __name__ == "__main__": import sys - import pytest - sys.exit(pytest.main(["-v", "-x", __file__])) diff --git a/python/ray/train/tests/util.py b/python/ray/train/tests/util.py index a18181cf903dd7..205848923f8893 100644 --- a/python/ray/train/tests/util.py +++ b/python/ray/train/tests/util.py @@ -27,20 +27,24 @@ def load_dict_checkpoint(checkpoint: Checkpoint) -> Dict[str, Any]: def mock_storage_context( exp_name: str = "exp_name", - delete_syncer: bool = True, storage_path: Optional[str] = None, storage_context_cls: Type = StorageContext, ) -> StorageContext: storage_path = storage_path or tempfile.mkdtemp() exp_name = exp_name trial_name = "trial_name" + storage = storage_context_cls( storage_path=storage_path, experiment_dir_name=exp_name, trial_dir_name=trial_name, ) - storage.storage_local_path = storage_path - if delete_syncer: - storage.syncer = None - os.makedirs(os.path.join(storage_path, exp_name, trial_name), exist_ok=True) + # Patch the default /tmp/ray/session_* so we don't require ray + # to be initialized in unit tests. 
+ session_path = tempfile.mkdtemp() + storage._get_session_path = lambda: session_path + + os.makedirs(storage.trial_fs_path, exist_ok=True) + os.makedirs(storage.trial_driver_staging_path, exist_ok=True) + return storage diff --git a/python/ray/tune/analysis/experiment_analysis.py b/python/ray/tune/analysis/experiment_analysis.py index e8972f7fdaa36f..73aeb1916f4303 100644 --- a/python/ray/tune/analysis/experiment_analysis.py +++ b/python/ray/tune/analysis/experiment_analysis.py @@ -1,5 +1,4 @@ import copy -import fnmatch import io import json import logging @@ -18,10 +17,10 @@ ) from ray.train import Checkpoint from ray.train._internal.storage import ( - _list_at_fs_path, _exists_at_fs_path, get_fs_and_path, ) +from ray.tune.execution.experiment_state import _find_newest_experiment_checkpoint from ray.tune.execution.tune_controller import TuneController from ray.tune.experiment import Trial from ray.tune.result import ( @@ -97,10 +96,8 @@ def __init__( else: self._experiment_fs_path = experiment_checkpoint_path - experiment_json_fs_path = ( - ExperimentAnalysis._find_newest_experiment_checkpoint( - self._fs, self._experiment_fs_path - ) + experiment_json_fs_path = _find_newest_experiment_checkpoint( + experiment_path=self._experiment_fs_path, fs=self._fs ) if experiment_json_fs_path is None: pattern = TuneController.CKPT_FILE_TMPL.format("*") @@ -212,19 +209,6 @@ def get_all_configs(self, prefix: bool = False) -> Dict[str, Dict]: for trial in self.trials } - @classmethod - def _find_newest_experiment_checkpoint( - cls, fs: pyarrow.fs.FileSystem, experiment_fs_path: Union[str, os.PathLike] - ) -> Optional[str]: - """Return the most recent experiment checkpoint path.""" - filenames = _list_at_fs_path(fs=fs, fs_path=experiment_fs_path) - pattern = TuneController.CKPT_FILE_TMPL.format("*") - matching = fnmatch.filter(filenames, pattern) - if not matching: - return None - filename = max(matching) - return Path(experiment_fs_path, filename).as_posix() - @property def experiment_path(self) -> str: """Path pointing to the experiment directory on persistent storage. diff --git a/python/ray/tune/execution/experiment_state.py b/python/ray/tune/execution/experiment_state.py index 298ee9e064a8ba..6ab92335026429 100644 --- a/python/ray/tune/execution/experiment_state.py +++ b/python/ray/tune/execution/experiment_state.py @@ -1,16 +1,19 @@ from collections import Counter +import fnmatch from pathlib import Path from typing import Callable, Dict, Optional, Union - import logging import os import time +import pyarrow.fs + from ray.train._internal.storage import ( StorageContext, get_fs_and_path, _download_from_fs_path, _list_at_fs_path, + _upload_to_fs_path, ) from ray.tune.experiment import Trial from ray.tune.impl.out_of_band_serialize_dataset import out_of_band_serialize_dataset @@ -31,23 +34,29 @@ def _experiment_checkpoint_exists(experiment_dir: str) -> bool: return bool(_find_newest_experiment_checkpoint(experiment_dir=experiment_dir)) -def _find_newest_experiment_checkpoint(experiment_dir: str) -> Optional[str]: +def _find_newest_experiment_checkpoint( + experiment_path: str, fs: Optional[pyarrow.fs.FileSystem] = None +) -> Optional[str]: """Returns file name of most recently created experiment checkpoint. Args: - experiment_dir: Local or remote path to the experiment directory + experiment_path: Local or remote path to the experiment directory containing at least one experiment checkpoint file. Returns: str: The local or remote path to the latest experiment checkpoint file based on timestamp. 
None if no experiment checkpoints were found. """ - from ray.tune.analysis import ExperimentAnalysis + from ray.tune.execution.tune_controller import TuneController - fs, path = get_fs_and_path(experiment_dir) - return ExperimentAnalysis._find_newest_experiment_checkpoint( - fs=fs, experiment_fs_path=path - ) + fs, experiment_fs_path = get_fs_and_path(experiment_path, storage_filesystem=fs) + filenames = _list_at_fs_path(fs=fs, fs_path=experiment_fs_path) + pattern = TuneController.CKPT_FILE_TMPL.format("*") + matching = fnmatch.filter(filenames, pattern) + if not matching: + return None + filename = max(matching) + return Path(experiment_fs_path, filename).as_posix() class _ExperimentCheckpointManager: @@ -154,10 +163,8 @@ def checkpoint( wait: Wait until sync to cloud has finished. """ - experiment_local_path = self._storage.experiment_local_path - if not experiment_local_path: - return - + driver_staging_path = self._storage.experiment_driver_staging_path + # TODO(justinvyu): [local_dir] Probably want to disable this num_to_keep force force = force or self._should_force_cloud_sync now = time.time() @@ -175,8 +182,11 @@ def checkpoint( with out_of_band_serialize_dataset(): save_fn() - # Sync to cloud - self.sync_up(force=force, wait=wait) + _upload_to_fs_path( + local_path=driver_staging_path, + fs=self._storage.storage_filesystem, + fs_path=self._storage.experiment_fs_path, + ) checkpoint_time_taken = time.monotonic() - checkpoint_time_start @@ -185,7 +195,6 @@ def checkpoint( # Finish self._last_save_time = time.time() - return experiment_local_path def sync_up(self, force: bool = False, wait: bool = False) -> bool: syncer = self._storage.syncer @@ -198,7 +207,7 @@ def sync_up(self, force: bool = False, wait: bool = False) -> bool: # But for now, this is needed to upload driver artifacts that live in the # trial directory. exclude = _DRIVER_SYNC_EXCLUDE_PATTERNS - experiment_local_path = self._storage.experiment_local_path + experiment_local_path = self._storage.experiment_driver_staging_path experiment_fs_path = self._storage.experiment_fs_path if force: @@ -244,25 +253,13 @@ def sync_up(self, force: bool = False, wait: bool = False) -> bool: sync_time_taken = now - start_time if sync_time_taken > self._slow_sync_threshold: - try: - import fsspec - except Exception: - fsspec = None - - fsspec_msg = "" - if fsspec is None: - fsspec_msg = ( - "If your data is small, try installing fsspec " - "(`pip install fsspec`) for more efficient local file parsing. " - ) - logger.warning( - "Syncing the experiment checkpoint to cloud took a long time with " + "Syncing the experiment checkpoint to cloud took a " f"{sync_time_taken:.2f} seconds. This can be due to a large number " f"of trials, large logfiles, or throttling from the " - f"remote storage provider for too frequent syncs. {fsspec_msg}" - f"If your `CheckpointConfig.num_to_keep` is a low number, this can " - f"trigger frequent syncing, in which case you should increase it. " + f"remote storage provider for too frequent syncs.\n" + f"If you set `CheckpointConfig.num_to_keep` to a low number, this can " + f"trigger frequent syncing. Try increasing the `num_to_keep`. 
" ) if not synced: @@ -300,62 +297,13 @@ def sync_down_experiment_state(self) -> None: ] for relpath in matches: fs_path = Path(self._storage.experiment_fs_path, relpath).as_posix() - local_path = Path(self._storage.experiment_local_path, relpath).as_posix() + local_path = Path( + self._storage.experiment_driver_staging_path, relpath + ).as_posix() _download_from_fs_path(fs=fs, fs_path=fs_path, local_path=local_path) logger.debug( f"Copied {matches} from:\n(fs, path) = " f"({self._storage.storage_filesystem.type_name}, " f"{self._storage.experiment_fs_path})\n" - f"-> {self._storage.experiment_local_path}" + f"-> {self._storage.experiment_driver_staging_path}" ) - - def resume(self) -> bool: - """Checks whether to resume experiment. - - The experiment can be resumed if a metadata file uploaded from a - previous run can be found at the specified experiment directory on storage. - If experiment should be resumed, this method will pull the necessary - experiment state from storage. - - Returns: - can_restore: Whether the experiment can be restored. - """ - experiment_local_path = self._storage.experiment_local_path - experiment_fs_path = self._storage.experiment_fs_path - - syncer = self._storage.syncer - - # syncer is not None when the local path != storage path - if syncer: - logger.info( - f"Trying to find and download experiment checkpoint from: " - f"{experiment_fs_path}" - ) - try: - self.sync_down_experiment_state() - except Exception: - logger.exception( - "Got error when trying to sync down experiment state from " - f"{experiment_fs_path}\n" - "Please check this error message for potential " - "access problems - if a directory was not found, " - "that is expected at this stage when you're starting " - "a new experiment." - ) - return False - - latest_experiment_checkpoint_path = _find_newest_experiment_checkpoint( - experiment_local_path - ) - if latest_experiment_checkpoint_path is None: - logger.warning( - f"No experiment metadata was found at {experiment_fs_path}. " - "Starting a new run..." 
- ) - return False - - logger.info( - f"The run will now start from the experiment state found in: " - f"{latest_experiment_checkpoint_path}" - ) - return True diff --git a/python/ray/tune/execution/tune_controller.py b/python/ray/tune/execution/tune_controller.py index 8f8df9c36c5c44..e51bbdfef60606 100644 --- a/python/ray/tune/execution/tune_controller.py +++ b/python/ray/tune/execution/tune_controller.py @@ -1,8 +1,9 @@ import copy import json +import logging +import os import time import traceback -import uuid import warnings from collections import defaultdict, deque from datetime import datetime @@ -10,9 +11,6 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Set -import logging -import os - import ray from ray.air import ResourceRequest from ray.air.constants import TIME_THIS_ITER_S @@ -214,7 +212,6 @@ def __init__( self._stopper = stopper or NoopStopper() self._start_time = time.time() - self._last_checkpoint_time = -float("inf") self._session_str = datetime.fromtimestamp(self._start_time).strftime( "%Y-%m-%d_%H-%M-%S" @@ -230,9 +227,6 @@ def __init__( self._resumed = False if resume_config is not None: - # Sync down state from storage - self._checkpoint_manager.resume() - # Use the metadata file to restore TuneController state try: self.resume(resume_config=resume_config) @@ -240,7 +234,7 @@ def __init__( except Exception as e: if has_verbosity(Verbosity.V3_TRIAL_DETAILS): logger.error(str(e)) - logger.exception("Runner restore failed.") + logger.exception("Failed to restore the run state.") if self._fail_fast: raise logger.info("Restarting experiment.") @@ -316,7 +310,8 @@ def experiment_state_file_name(self) -> str: def experiment_state_path(self) -> str: """Returns the local experiment checkpoint path.""" return Path( - self._storage.experiment_local_path, self.experiment_state_file_name + self._storage.experiment_driver_staging_path, + self.experiment_state_file_name, ).as_posix() @property @@ -338,7 +333,7 @@ def checkpoint_exists(cls, directory: str) -> bool: return _experiment_checkpoint_exists(directory) def save_to_dir(self): - """Save TuneController state to the local experiment directory. + """Save TuneController state to the local staging experiment directory. This includes: - trial states @@ -346,8 +341,6 @@ def save_to_dir(self): - the searcher state - the callback states """ - experiment_dir = self._storage.experiment_local_path - # Get state from trial executor and runner runner_state = { # Trials @@ -355,97 +348,19 @@ def save_to_dir(self): # Experiment data "runner_data": self.__getstate__(), # Metadata - "stats": { - "start_time": self._start_time, - "timestamp": self._last_checkpoint_time, - }, + "stats": {"start_time": self._start_time}, } - tmp_file_name = Path( - experiment_dir, f".tmp_experiment_state_{uuid.uuid4()}" - ).as_posix() - - with open(tmp_file_name, "w") as f: - json.dump(runner_state, f, indent=2, cls=TuneFunctionEncoder) - - os.replace( - tmp_file_name, - Path(experiment_dir, self.experiment_state_file_name).as_posix(), - ) - - self._search_alg.save_to_dir(experiment_dir, session_str=self._session_str) - self._callbacks.save_to_dir(experiment_dir, session_str=self._session_str) - - def restore_from_dir(self) -> List[Trial]: - """Restore TrialRunner state from local experiment directory. - - This method will restore the trial runner state, the searcher state, - and the callback states. It will then parse the trial states - and return them as a list of Trial objects. 
- """ - experiment_dir = self._storage.experiment_local_path - - # Find newest state file - newest_state_path = _find_newest_experiment_checkpoint(experiment_dir) - - if not newest_state_path: - raise ValueError( - f"Tried to resume experiment from directory " - f"`{experiment_dir}`, but no " - f"experiment checkpoint data was found." - ) - - # Set checkpoint file to load - logger.warning( - f"Attempting to resume experiment from {experiment_dir}. " - "This will ignore any new changes to the specification." - ) - logger.info( - "Using the newest experiment state file found within the " - f"experiment directory: {Path(newest_state_path).name}" - ) - - # Actually load data - with open(newest_state_path, "r") as f: - runner_state = json.load(f, cls=TuneFunctionDecoder) - - # 1. Restore trial runner state - self.__setstate__(runner_state["runner_data"]) + driver_staging_path = self._storage.experiment_driver_staging_path + os.makedirs(driver_staging_path, exist_ok=True) + with open( + Path(driver_staging_path, self.experiment_state_file_name), + "w", + ) as f: + json.dump(runner_state, f, cls=TuneFunctionEncoder) - # 2. Restore search algorithm and callback state - if self._search_alg.has_checkpoint(experiment_dir): - self._search_alg.restore_from_dir(experiment_dir) - - if self._callbacks.can_restore(experiment_dir): - self._callbacks.restore_from_dir(experiment_dir) - - # 3. Load trials - trials = [] - for trial_json_state, trial_runtime_metadata in runner_state["trial_data"]: - trial = Trial.from_json_state(trial_json_state) - trial.restore_run_metadata(trial_runtime_metadata) - - # The following properties may be updated on restoration - # Ex: moved local/cloud experiment directory - - # Propagate updated storage ctx properties to the trial's restored copy. - new_storage = copy.copy(trial.storage) - new_storage.storage_filesystem = self._storage.storage_filesystem - new_storage.storage_fs_path = self._storage.storage_fs_path - new_storage.experiment_dir_name = self._storage.experiment_dir_name - # ATTN: `trial.set_storage` is used intentionally, since it - # also updates the absolute paths and filesystem of tracked checkpoints. - trial.set_storage(new_storage) - - # Avoid creating logdir in client mode for returned trial results, - # since the dir might not be creatable locally. - # TODO(ekl) this is kind of a hack. - if not ray.util.client.ray.is_connected(): - trial.init_local_path() # Create logdir if it does not exist - - trials.append(trial) - - return trials + self._search_alg.save_to_dir(driver_staging_path, session_str=self._session_str) + self._callbacks.save_to_dir(driver_staging_path, session_str=self._session_str) def checkpoint(self, force: bool = False, wait: bool = False): """Saves execution state to the local experiment path. @@ -478,14 +393,9 @@ def checkpoint(self, force: bool = False, wait: bool = False): save_fn=self.save_to_dir, force=force, wait=wait ) - def resume(self, resume_config: ResumeConfig): - """Resumes all checkpointed trials from previous run. - - Requires user to manually re-register their objects. Also stops - all ongoing trials. 
- """ - trials = self.restore_from_dir() - + def _requeue_restored_trials( + self, trials: List[Trial], resume_config: ResumeConfig + ): # Set trial statuses according to the resume configuration for trial in sorted( trials, key=lambda t: t.run_metadata.last_result_time, reverse=True @@ -519,6 +429,84 @@ def resume(self, resume_config: ResumeConfig): self.add_trial(trial_to_add) + def _restore_trials(self, experiment_state: Dict) -> List[Trial]: + trials = [] + for trial_json_state, trial_runtime_metadata in experiment_state["trial_data"]: + trial = Trial.from_json_state(trial_json_state) + trial.restore_run_metadata(trial_runtime_metadata) + + # The following properties may be updated on restoration + # Ex: moved local/cloud experiment directory + + # Propagate updated storage ctx properties to the trial's restored copy. + new_storage = copy.copy(trial.storage) + new_storage.storage_filesystem = self._storage.storage_filesystem + new_storage.storage_fs_path = self._storage.storage_fs_path + new_storage.experiment_dir_name = self._storage.experiment_dir_name + + # ATTN: `trial.set_storage` is used intentionally, since it + # also updates the absolute paths and filesystem of tracked checkpoints. + trial.set_storage(new_storage) + + # Avoid creating logdir in client mode for returned trial results, + # since the dir might not be creatable locally. + # TODO(ekl) this is kind of a hack. + if not ray.util.client.ray.is_connected(): + trial.init_local_path() # Create logdir if it does not exist + + trials.append(trial) + + # NOTE: The restored run should reuse the same driver staging directory. + self._storage._timestamp = trials[0].storage._timestamp + + return trials + + def resume(self, resume_config: ResumeConfig): + """Resumes all checkpointed trials from previous run. + + Requires user to manually re-register their objects. Also stops + all ongoing trials. + """ + # 1. Restore TuneController state + # Find newest state file + newest_state_path = _find_newest_experiment_checkpoint( + self._storage.experiment_fs_path, fs=self._storage.storage_filesystem + ) + + if newest_state_path is None: + raise ValueError( + f"Tried to resume experiment from directory " + f"'{self._storage.experiment_fs_path}', but no " + f"experiment state file of the form '{TuneController.CKPT_FILE_TMPL}' " + "was found. This is expected if you are launching a new experiment." + ) + + logger.info( + "Restoring the run from the latest experiment state file: " + f"{Path(newest_state_path).name}" + ) + with self._storage.storage_filesystem.open_input_stream(newest_state_path) as f: + experiment_state = json.loads(f.readall(), cls=TuneFunctionDecoder) + + self.__setstate__(experiment_state["runner_data"]) + + # 2. Get the trial states that the run left off at. + trials = self._restore_trials(experiment_state) + + # 3. Restore search algorithm and callback state + # Download the search algorithm and callback state to the driver staging dir. + self._checkpoint_manager.sync_down_experiment_state() + + driver_staging_dir = self._storage.experiment_driver_staging_path + if self._search_alg.has_checkpoint(driver_staging_dir): + self._search_alg.restore_from_dir(driver_staging_dir) + + if self._callbacks.can_restore(driver_staging_dir): + self._callbacks.restore_from_dir(driver_staging_dir) + + # 4. Re-queue trials as needed, depending on their status. 
+ self._requeue_restored_trials(trials, resume_config) + def update_max_pending_trials(self, max_pending_trials: Optional[int] = None): self._max_pending_trials = max_pending_trials or _get_max_pending_trials( self._search_alg @@ -1961,7 +1949,9 @@ def _schedule_trial_reset( extra_config[STDOUT_FILE] = stdout_file extra_config[STDERR_FILE] = stderr_file - logger_creator = partial(_noop_logger_creator, logdir=trial.local_path) + logger_creator = partial( + _noop_logger_creator, logdir=trial.storage.trial_working_directory + ) self._resetting_trials.add(trial) self._schedule_trial_task( diff --git a/python/ray/tune/experiment/experiment.py b/python/ray/tune/experiment/experiment.py index 2819a534314fe6..287196bcffcd5a 100644 --- a/python/ray/tune/experiment/experiment.py +++ b/python/ray/tune/experiment/experiment.py @@ -22,6 +22,7 @@ import ray from ray.exceptions import RpcError from ray.train import CheckpointConfig, SyncConfig +from ray.train.constants import DEFAULT_STORAGE_PATH from ray.train._internal.storage import StorageContext from ray.tune.error import TuneError from ray.tune.registry import register_trainable, is_function_trainable @@ -163,6 +164,7 @@ def __init__( if not name: name = StorageContext.get_experiment_dir_name(run) + storage_path = storage_path or DEFAULT_STORAGE_PATH self.storage = self._storage_context_cls( storage_path=storage_path, storage_filesystem=storage_filesystem, @@ -365,7 +367,7 @@ def stopper(self): @property def local_path(self) -> Optional[str]: - return self.storage.experiment_local_path + return self.storage.experiment_driver_staging_path @property @Deprecated("Replaced by `local_path`") diff --git a/python/ray/tune/experiment/trial.py b/python/ray/tune/experiment/trial.py index f4654295c11e1c..d1bffb1cb0d31a 100644 --- a/python/ray/tune/experiment/trial.py +++ b/python/ray/tune/experiment/trial.py @@ -28,7 +28,7 @@ ) from ray.train._internal.checkpoint_manager import _CheckpointManager from ray.train._internal.session import _FutureTrainingResult, _TrainingResult -from ray.train._internal.storage import StorageContext +from ray.train._internal.storage import StorageContext, _exists_at_fs_path from ray.tune import TuneError from ray.tune.logger import NoopLogger @@ -206,7 +206,9 @@ def _noop_logger_creator(config: Dict[str, Any], logdir: str): def _get_trainable_kwargs(trial: "Trial") -> Dict[str, Any]: trial.init_local_path() - logger_creator = partial(_noop_logger_creator, logdir=trial.local_path) + logger_creator = partial( + _noop_logger_creator, logdir=trial.storage.trial_working_directory + ) trial_config = copy.deepcopy(trial.config) trial_config[TRIAL_INFO] = _TrialInfo(trial) @@ -536,7 +538,7 @@ def remote_experiment_path(self) -> str: @property def local_experiment_path(self) -> str: - return self.storage.experiment_local_path + return self.storage.experiment_driver_staging_path @property @Deprecated("Replaced by `local_path`") @@ -546,7 +548,7 @@ def logdir(self) -> Optional[str]: @property def local_path(self) -> Optional[str]: - return self.storage.trial_local_path + return self.storage.trial_driver_staging_path @property def path(self) -> Optional[str]: @@ -753,6 +755,44 @@ def pickled_error_file(self): self.local_path, self.run_metadata.pickled_error_filename ).as_posix() + def get_pickled_error(self) -> Optional[Exception]: + """Returns the pickled error object if it exists in storage. + + This is a pickled version of the latest error that the trial encountered. 
+ """ + error_filename = self.run_metadata.pickled_error_filename + if error_filename is None: + return None + + fs = self.storage.storage_filesystem + pickled_error_fs_path = Path( + self.storage.trial_fs_path, error_filename + ).as_posix() + + if _exists_at_fs_path(fs=fs, fs_path=pickled_error_fs_path): + with fs.open_input_stream(pickled_error_fs_path) as f: + return cloudpickle.loads(f.readall()) + return None + + def get_error(self) -> Optional[TuneError]: + """Returns the error text file trace as a TuneError object + if it exists in storage. + + This is a text trace of the latest error that the trial encountered, + which is used in the case that the error is not picklable. + """ + error_filename = self.run_metadata.error_filename + if error_filename is None: + return None + + fs = self.storage.storage_filesystem + txt_error_fs_path = Path(self.storage.trial_fs_path, error_filename).as_posix() + + if _exists_at_fs_path(fs=fs, fs_path=txt_error_fs_path): + with fs.open_input_stream(txt_error_fs_path) as f: + return f.readall().decode() + return None + def _handle_restore_error(self, exc: Exception): if self.temporary_state.num_restore_failures >= int( os.environ.get("TUNE_RESTORE_RETRY_NUM", 0) diff --git a/python/ray/tune/result_grid.py b/python/ray/tune/result_grid.py index 86ac8b65585e3e..08dc91ddc108e4 100644 --- a/python/ray/tune/result_grid.py +++ b/python/ray/tune/result_grid.py @@ -1,10 +1,8 @@ -import os import pandas as pd import pyarrow from typing import Optional, Union from ray.air.result import Result -from ray.cloudpickle import cloudpickle from ray.exceptions import RayTaskError from ray.tune.analysis import ExperimentAnalysis from ray.tune.error import TuneError @@ -254,15 +252,7 @@ def num_terminated(self): def _populate_exception(trial: Trial) -> Optional[Union[TuneError, RayTaskError]]: if trial.status == Trial.TERMINATED: return None - # TODO(justinvyu): [populate_exception] for storage_path != None - if trial.pickled_error_file and os.path.exists(trial.pickled_error_file): - with open(trial.pickled_error_file, "rb") as f: - e = cloudpickle.load(f) - return e - elif trial.error_file and os.path.exists(trial.error_file): - with open(trial.error_file, "r") as f: - return TuneError(f.read()) - return None + return trial.get_pickled_error() or trial.get_error() def _trial_to_result(self, trial: Trial) -> Result: cpm = trial.run_metadata.checkpoint_manager diff --git a/python/ray/tune/tests/execution/test_actor_caching.py b/python/ray/tune/tests/execution/test_actor_caching.py index 793f1295d9fea5..6808f6c92ffd64 100644 --- a/python/ray/tune/tests/execution/test_actor_caching.py +++ b/python/ray/tune/tests/execution/test_actor_caching.py @@ -1,12 +1,20 @@ import pytest import sys +import ray from ray.tune import PlacementGroupFactory from ray.tune.tests.execution.utils import create_execution_test_objects, TestingTrial -def test_actor_cached(tmpdir): +@pytest.fixture +def ray_start_2_cpus(): + address_info = ray.init(num_cpus=2) + yield address_info + ray.shutdown() + + +def test_actor_cached(tmpdir, ray_start_2_cpus): tune_controller, actor_manger, resource_manager = create_execution_test_objects( max_pending_trials=8 ) @@ -20,7 +28,7 @@ def test_actor_cached(tmpdir): assert cls_name == "trainable1" -def test_actor_reuse_unstaged(tmpdir): +def test_actor_reuse_unstaged(tmpdir, ray_start_2_cpus): """A trial that hasn't been staged can re-use an actor. In specific circumstances, this can lead to errors. 
Notably, when an diff --git a/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py index 96456c5c38baf6..9cf7f265cd6276 100644 --- a/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py +++ b/python/ray/tune/tests/execution/test_controller_checkpointing_integration.py @@ -57,6 +57,13 @@ def on_trial_complete(self, trial_id, error=False, **kwargs): return searchalg, scheduler +def num_checkpoints(trial): + return sum( + item.startswith("checkpoint_") + for item in os.listdir(trial.storage.trial_fs_path) + ) + + @pytest.mark.parametrize( "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) @@ -316,12 +323,6 @@ def test_checkpoint_freq_buffered( os.environ, {"TUNE_RESULT_BUFFER_LENGTH": "7", "TUNE_RESULT_BUFFER_MIN_TIME_S": "1"}, ): - - def num_checkpoints(trial): - return sum( - item.startswith("checkpoint_") for item in os.listdir(trial.local_path) - ) - trial = Trial( "__fake", checkpoint_config=CheckpointConfig(checkpoint_frequency=3), @@ -367,12 +368,6 @@ def test_checkpoint_at_end_not_buffered( os.environ, {"TUNE_RESULT_BUFFER_LENGTH": "7", "TUNE_RESULT_BUFFER_MIN_TIME_S": "0.5"}, ): - - def num_checkpoints(trial): - return sum( - item.startswith("checkpoint_") for item in os.listdir(trial.local_path) - ) - trial = Trial( "__fake", checkpoint_config=CheckpointConfig( @@ -482,11 +477,6 @@ def test_checkpoint_user_checkpoint_buffered( Legacy test: test_trial_runner_3.py::TrialRunnerTest::testUserCheckpointBuffered """ - def num_checkpoints(trial): - return sum( - item.startswith("checkpoint_") for item in os.listdir(trial.local_path) - ) - with mock.patch.dict( os.environ, {"TUNE_RESULT_BUFFER_LENGTH": "8", "TUNE_RESULT_BUFFER_MIN_TIME_S": "1"}, @@ -547,13 +537,10 @@ def test_checkpoint_auto_period( Legacy test: test_trial_runner_3.py::TrialRunnerTest::testCheckpointAutoPeriod """ - storage = mock_storage_context(delete_syncer=False) + storage = mock_storage_context() - with mock.patch.object( - storage.syncer, "sync_up" - ) as sync_up, tempfile.TemporaryDirectory() as local_dir: + with tempfile.TemporaryDirectory() as local_dir: storage.storage_local_path = local_dir - sync_up.side_effect = lambda *a, **kw: time.sleep(2) runner = TuneController( resource_manager_factory=lambda: resource_manager_cls(), @@ -561,15 +548,21 @@ def test_checkpoint_auto_period( checkpoint_period="auto", ) - runner.add_trial( - Trial("__fake", config={"user_checkpoint_freq": 1}, storage=storage) - ) + with mock.patch.object(runner, "save_to_dir") as save_to_dir: + save_to_dir.side_effect = lambda *a, **kw: time.sleep(2) + + runner.add_trial( + Trial("__fake", config={"user_checkpoint_freq": 1}, storage=storage) + ) - runner.step() # Run one step, this will trigger checkpointing + runner.step() # Run one step, this will trigger checkpointing assert runner._checkpoint_manager._checkpoint_period > 38.0 +@pytest.mark.skip( + "TODO(justinvyu): We should remove forceful checkpointing based on num_to_keep." 
+) @pytest.mark.parametrize( "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) @@ -582,7 +575,7 @@ def test_checkpoint_force_with_num_to_keep( Legacy test: test_trial_runner_3.py::TrialRunnerTest:: testCloudCheckpointForceWithNumToKeep """ - storage = mock_storage_context(delete_syncer=False) + storage = mock_storage_context() # Needed to avoid infinite recursion error on CI runners storage.syncer.__getstate__ = lambda *a, **kw: {} @@ -635,6 +628,7 @@ def get_json_state(self): assert sync_up.call_count == 6 +@pytest.mark.skip("TODO(justinvyu): Handle hanging/failing uploads.") @pytest.mark.parametrize( "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) @@ -647,7 +641,7 @@ def test_checkpoint_forced_cloud_sync_timeout( Legacy test: test_trial_runner_3.py::TrialRunnerTest:: testForcedCloudCheckpointSyncTimeout """ - storage = mock_storage_context(delete_syncer=False) + storage = mock_storage_context() storage.syncer.sync_period = 60 storage.syncer.sync_timeout = 0.001 @@ -679,6 +673,7 @@ def _sync_up_command(self, local_path: str, uri: str, exclude=None): assert sync_up_cmd.call_count == 2 +@pytest.mark.skip("TODO(justinvyu): Handle hanging/failing uploads.") @pytest.mark.parametrize( "resource_manager_cls", [FixedResourceManager, PlacementGroupResourceManager] ) @@ -691,7 +686,7 @@ def test_checkpoint_periodic_cloud_sync_timeout( Legacy test: test_trial_runner_3.py::TrialRunnerTest:: testPeriodicCloudCheckpointSyncTimeout """ - storage = mock_storage_context(delete_syncer=False) + storage = mock_storage_context() storage.syncer.sync_period = 60 storage.syncer.sync_timeout = 0.5 diff --git a/python/ray/tune/tests/execution/test_controller_resume_integration.py b/python/ray/tune/tests/execution/test_controller_resume_integration.py index 5237624f234fa9..75723d48efad2f 100644 --- a/python/ray/tune/tests/execution/test_controller_resume_integration.py +++ b/python/ray/tune/tests/execution/test_controller_resume_integration.py @@ -423,7 +423,7 @@ def count_checkpoints(cdir): for fname in os.listdir(cdir) ) - tmpdir = storage.experiment_local_path + tmpdir = storage.experiment_driver_staging_path # The Trial `local_dir` must match the TrialRunner `local_checkpoint_dir` # to match the directory structure assumed by `TrialRunner.resume`. 
# See `test_trial_runner2.TrialRunnerTest2.testPauseResumeCheckpointCount` diff --git a/python/ray/tune/tests/execution/utils.py b/python/ray/tune/tests/execution/utils.py index 9e8efef258511f..8803ee9d07d7d5 100644 --- a/python/ray/tune/tests/execution/utils.py +++ b/python/ray/tune/tests/execution/utils.py @@ -105,6 +105,9 @@ def get_num_cpus(self): def get_num_gpus(self) -> int: return self._resource_manager._total_resources.get("GPU", 0) + def update_avail_resources(self, *args, **kwargs): + pass + class TestingTrial(Trial): def __init__(self, *args, **kwargs): diff --git a/python/ray/tune/tests/test_actor_reuse.py b/python/ray/tune/tests/test_actor_reuse.py index b16dd924d37daa..37f02a3eaa35a6 100644 --- a/python/ray/tune/tests/test_actor_reuse.py +++ b/python/ray/tune/tests/test_actor_reuse.py @@ -299,17 +299,24 @@ def test_trial_reuse_log_to_file(trainable, ray_start_1_cpu, tmp_path): reuse_actors=True, ).trials + def get_trial_logfiles(trial): + return ( + os.path.join(trial.storage.trial_working_directory, "stdout"), + os.path.join(trial.storage.trial_working_directory, "stderr"), + ) + # Check trial 1 assert trial1.last_result["num_resets"] == 2 - assert os.path.exists(os.path.join(trial1.local_path, "stdout")) - assert os.path.exists(os.path.join(trial1.local_path, "stderr")) + [stdout, stderr] = get_trial_logfiles(trial1) + assert os.path.exists(stdout) + assert os.path.exists(stderr) # We expect that only "First" output is found in the first trial output - with open(os.path.join(trial1.local_path, "stdout"), "rt") as fp: + with open(stdout, "rt") as fp: content = fp.read() assert "PRINT_STDOUT: First" in content assert "PRINT_STDOUT: Second" not in content - with open(os.path.join(trial1.local_path, "stderr"), "rt") as fp: + with open(stderr, "rt") as fp: content = fp.read() assert "PRINT_STDERR: First" in content assert "LOG_STDERR: First" in content @@ -318,15 +325,16 @@ def test_trial_reuse_log_to_file(trainable, ray_start_1_cpu, tmp_path): # Check trial 2 assert trial2.last_result["num_resets"] == 3 - assert os.path.exists(os.path.join(trial2.local_path, "stdout")) - assert os.path.exists(os.path.join(trial2.local_path, "stderr")) + [stdout, stderr] = get_trial_logfiles(trial2) + assert os.path.exists(stdout) + assert os.path.exists(stderr) # We expect that only "Second" output is found in the first trial output - with open(os.path.join(trial2.local_path, "stdout"), "rt") as fp: + with open(stdout, "rt") as fp: content = fp.read() assert "PRINT_STDOUT: Second" in content assert "PRINT_STDOUT: First" not in content - with open(os.path.join(trial2.local_path, "stderr"), "rt") as fp: + with open(stderr, "rt") as fp: content = fp.read() assert "PRINT_STDERR: Second" in content assert "LOG_STDERR: Second" in content @@ -481,15 +489,12 @@ class MyTrainable(Trainable): assert _check_mixin(dummy_mixin(MyTrainable)) -def test_remote_trial_dir_with_reuse_actors( - trainable, ray_start_2_cpus, monkeypatch, tmp_path -): +def test_remote_trial_dir_with_reuse_actors(trainable, ray_start_2_cpus, tmp_path): """Check that the trainable has its remote directory set to the right location, when new trials get swapped in on actor reuse. Each trial runs for 2 iterations, with checkpoint_frequency=1, so each remote trial dir should have 2 checkpoints. 
""" - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path)) tmp_target = str(tmp_path / "upload_dir") exp_name = "remote_trial_dir_update_on_actor_reuse" diff --git a/python/ray/tune/tests/test_api.py b/python/ray/tune/tests/test_api.py index 9b866ed55b1c30..3e2da6588843c2 100644 --- a/python/ray/tune/tests/test_api.py +++ b/python/ray/tune/tests/test_api.py @@ -367,38 +367,23 @@ def train_fn(config): } ) - def testLogdir(self): - logdir = os.path.join(ray._private.utils.get_user_temp_dir(), "logdir") - - def train_fn(config): - assert logdir in os.getcwd(), os.getcwd() - train.report(dict(timesteps_total=1)) - - register_trainable("f1", train_fn) - with unittest.mock.patch.dict(os.environ, {"RAY_AIR_LOCAL_CACHE_DIR": logdir}): - tune.run("f1") - def testLongFilename(self): - logdir = os.path.join(ray._private.utils.get_user_temp_dir(), "logdir") - def train_fn(config): - assert os.path.join(logdir, "foo") in os.getcwd(), os.getcwd() train.report(dict(timesteps_total=1)) register_trainable("f1", train_fn) - with unittest.mock.patch.dict(os.environ, {"RAY_AIR_LOCAL_CACHE_DIR": logdir}): - run_experiments( - { - "foo": { - "run": "f1", - "config": { - "a" * 50: tune.sample_from(lambda spec: 5.0 / 7), - "b" * 50: tune.sample_from(lambda spec: "long" * 40), - }, - } + run_experiments( + { + "foo": { + "run": "f1", + "config": { + "a" * 50: tune.sample_from(lambda spec: 5.0 / 7), + "b" * 50: tune.sample_from(lambda spec: "long" * 40), + }, } - ) + } + ) def testBadParams(self): def f(): @@ -826,6 +811,8 @@ def track_train(config): ) def testLotsOfStops(self): + tmpdir = self.tmpdir + class TestTrainable(Trainable): def step(self): result = {"name": self.trial_name, "trial_id": self.trial_id} @@ -833,13 +820,14 @@ def step(self): def cleanup(self): time.sleep(0.3) - open(os.path.join(self.logdir, "marker"), "a").close() + open(os.path.join(tmpdir, f"marker-{self.trial_id}"), "a").close() return 1 - analysis = tune.run(TestTrainable, num_samples=10, stop={TRAINING_ITERATION: 1}) - for trial in analysis.trials: - path = os.path.join(trial.local_path, "marker") - assert os.path.exists(path) + num_samples = 10 + tune.run(TestTrainable, num_samples=num_samples, stop={TRAINING_ITERATION: 1}) + + markers = [m for m in os.listdir(tmpdir) if "marker" in m] + assert len(markers) == num_samples def testReportTimeStep(self): # Test that no timestep count are logged if never the Trainable never @@ -1098,27 +1086,39 @@ def train_fn(config): # Do not log to file [trial] = tune.run("f1", log_to_file=False).trials - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stdout"))) - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stderr"))) + trial_working_dir = trial.storage.trial_working_directory + self.assertFalse( + os.path.exists( + os.path.join(trial.storage.trial_working_directory, "stdout") + ) + ) + self.assertFalse( + os.path.exists( + os.path.join(trial.storage.trial_working_directory, "stderr") + ) + ) # Log to default files [trial] = tune.run("f1", log_to_file=True).trials - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "stdout"))) - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "stderr"))) - with open(os.path.join(trial.local_path, "stdout"), "rt") as fp: + trial_working_dir = trial.storage.trial_working_directory + + self.assertTrue(os.path.exists(os.path.join(trial_working_dir, "stdout"))) + self.assertTrue(os.path.exists(os.path.join(trial_working_dir, "stderr"))) + with open(os.path.join(trial_working_dir, "stdout"), "rt") as fp: 
content = fp.read() self.assertIn("PRINT_STDOUT", content) - with open(os.path.join(trial.local_path, "stderr"), "rt") as fp: + with open(os.path.join(trial_working_dir, "stderr"), "rt") as fp: content = fp.read() self.assertIn("PRINT_STDERR", content) self.assertIn("LOG_STDERR", content) # Log to one file [trial] = tune.run("f1", log_to_file="combined").trials - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stdout"))) - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stderr"))) - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "combined"))) - with open(os.path.join(trial.local_path, "combined"), "rt") as fp: + trial_working_dir = trial.storage.trial_working_directory + self.assertFalse(os.path.exists(os.path.join(trial_working_dir, "stdout"))) + self.assertFalse(os.path.exists(os.path.join(trial_working_dir, "stderr"))) + self.assertTrue(os.path.exists(os.path.join(trial_working_dir, "combined"))) + with open(os.path.join(trial_working_dir, "combined"), "rt") as fp: content = fp.read() self.assertIn("PRINT_STDOUT", content) self.assertIn("PRINT_STDERR", content) @@ -1126,15 +1126,16 @@ def train_fn(config): # Log to two files [trial] = tune.run("f1", log_to_file=("alt.stdout", "alt.stderr")).trials - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stdout"))) - self.assertFalse(os.path.exists(os.path.join(trial.local_path, "stderr"))) - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "alt.stdout"))) - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "alt.stderr"))) + trial_working_dir = trial.storage.trial_working_directory + self.assertFalse(os.path.exists(os.path.join(trial_working_dir, "stdout"))) + self.assertFalse(os.path.exists(os.path.join(trial_working_dir, "stderr"))) + self.assertTrue(os.path.exists(os.path.join(trial_working_dir, "alt.stdout"))) + self.assertTrue(os.path.exists(os.path.join(trial_working_dir, "alt.stderr"))) - with open(os.path.join(trial.local_path, "alt.stdout"), "rt") as fp: + with open(os.path.join(trial_working_dir, "alt.stdout"), "rt") as fp: content = fp.read() self.assertIn("PRINT_STDOUT", content) - with open(os.path.join(trial.local_path, "alt.stderr"), "rt") as fp: + with open(os.path.join(trial_working_dir, "alt.stderr"), "rt") as fp: content = fp.read() self.assertIn("PRINT_STDERR", content) self.assertIn("LOG_STDERR", content) diff --git a/python/ray/tune/tests/test_api_checkpoint_integration.py b/python/ray/tune/tests/test_api_checkpoint_integration.py index b44f3fb924a14e..20ed0aeae82246 100644 --- a/python/ray/tune/tests/test_api_checkpoint_integration.py +++ b/python/ray/tune/tests/test_api_checkpoint_integration.py @@ -41,13 +41,14 @@ def test_checkpoint_freq_dir_name( def num_checkpoints(trial): return sum( - item.startswith("checkpoint_") for item in os.listdir(trial.local_path) + item.startswith("checkpoint_") + for item in os.listdir(trial.storage.trial_fs_path) ) def last_checkpoint_dir(trial): return max( item - for item in os.listdir(trial.local_path) + for item in os.listdir(trial.storage.trial_fs_path) if item.startswith("checkpoint_") ) @@ -108,12 +109,11 @@ def _update_checkpoint_index(self, metrics): ) storage = mock_storage_context( - delete_syncer=False, storage_context_cls=CustomStorageContext, storage_path=tmp_path, ) else: - storage = mock_storage_context(delete_syncer=False, storage_path=tmp_path) + storage = mock_storage_context(storage_path=tmp_path) trial = Trial( "test_checkpoint_freq", diff --git 
a/python/ray/tune/tests/test_experiment_analysis.py b/python/ray/tune/tests/test_experiment_analysis.py index f65bde1fb39125..9e6d91568e103d 100644 --- a/python/ray/tune/tests/test_experiment_analysis.py +++ b/python/ray/tune/tests/test_experiment_analysis.py @@ -73,8 +73,6 @@ def experiment_analysis(request): load_from = request.param tmp_path = Path(tempfile.mkdtemp()) - os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = str(tmp_path / "ray_results") - context_manager = ( mock_s3_bucket_uri if load_from == "cloud" else dummy_context_manager ) diff --git a/python/ray/tune/tests/test_function_api.py b/python/ray/tune/tests/test_function_api.py index a9b7d180e02cce..8c533c308914db 100644 --- a/python/ray/tune/tests/test_function_api.py +++ b/python/ray/tune/tests/test_function_api.py @@ -1,7 +1,6 @@ import json import os import sys -import shutil import tempfile import unittest @@ -33,8 +32,10 @@ def logger_creator(config): class FunctionCheckpointingTest(unittest.TestCase): def setUp(self): - self.logdir = tempfile.mkdtemp() - self.logger_creator = creator_generator(self.logdir) + self.tmpdir = tempfile.TemporaryDirectory() + self.logger_creator = creator_generator( + os.path.join(self.tmpdir.name, "logdir") + ) def create_trainable(self, train_fn): return wrap_function(train_fn)( @@ -42,7 +43,7 @@ def create_trainable(self, train_fn): ) def tearDown(self): - shutil.rmtree(self.logdir) + self.tmpdir.cleanup() def testCheckpointReuse(self): """Test that repeated save/restore never reuses same checkpoint dir.""" diff --git a/python/ray/tune/tests/test_run_experiment.py b/python/ray/tune/tests/test_run_experiment.py index 865705b53d6f98..942e7155b5836a 100644 --- a/python/ray/tune/tests/test_run_experiment.py +++ b/python/ray/tune/tests/test_run_experiment.py @@ -129,7 +129,11 @@ def _export_model(self, export_formats, export_dir): ) for trial in trials: self.assertEqual(trial.status, Trial.TERMINATED) - self.assertTrue(os.path.exists(os.path.join(trial.local_path, "exported"))) + self.assertTrue( + os.path.exists( + os.path.join(trial.storage.trial_working_directory, "exported") + ) + ) def testInvalidExportFormats(self): class MyTrainable(Trainable): diff --git a/python/ray/tune/tests/test_trial.py b/python/ray/tune/tests/test_trial.py index fdc121749f4cd4..5338c2133b3adf 100644 --- a/python/ray/tune/tests/test_trial.py +++ b/python/ray/tune/tests/test_trial.py @@ -1,4 +1,3 @@ -import os import sys import pytest @@ -114,7 +113,7 @@ def test_trial_logdir_length(): storage=mock_storage_context(), ) trial.init_local_path() - assert len(os.path.basename(trial.local_path)) < 200 + assert len(trial.storage.trial_dir_name) < 200 if __name__ == "__main__": diff --git a/python/ray/tune/tests/test_trial_scheduler.py b/python/ray/tune/tests/test_trial_scheduler.py index c1d44e799db038..019c4ffcf59c37 100644 --- a/python/ray/tune/tests/test_trial_scheduler.py +++ b/python/ray/tune/tests/test_trial_scheduler.py @@ -1112,7 +1112,6 @@ def basicSetup( self.storage = StorageContext( storage_path=tmpdir, experiment_dir_name="test_trial_scheduler" ) - self.storage.storage_local_path = tmpdir runner = _MockTrialRunner(pbt) for i in range(num_trials): trial_hyperparams = hyperparams or { @@ -1122,6 +1121,7 @@ def basicSetup( "id_factor": i, } trial = _MockTrial(i, trial_hyperparams, self.storage) + trial.init_local_path() runner.add_trial(trial) trial.status = Trial.RUNNING for i in range(num_trials): @@ -1614,11 +1614,11 @@ def check_policy(policy): for log_file in log_files: self.assertTrue( os.path.exists( - 
os.path.join(self.storage.experiment_local_path, log_file) + os.path.join(self.storage.experiment_driver_staging_path, log_file) ) ) raw_policy = open( - os.path.join(self.storage.experiment_local_path, log_file), "r" + os.path.join(self.storage.experiment_driver_staging_path, log_file), "r" ).readlines() for line in raw_policy: check_policy(json.loads(line)) @@ -1651,11 +1651,11 @@ def check_policy(policy): for log_file in log_files: self.assertTrue( os.path.exists( - os.path.join(self.storage.experiment_local_path, log_file) + os.path.join(self.storage.experiment_driver_staging_path, log_file) ) ) raw_policy = open( - os.path.join(self.storage.experiment_local_path, log_file), "r" + os.path.join(self.storage.experiment_driver_staging_path, log_file), "r" ).readlines() for line in raw_policy: check_policy(json.loads(line)) @@ -1792,7 +1792,7 @@ def load_checkpoint(self, checkpoint_dir): replay = PopulationBasedTrainingReplay( os.path.join( - self.storage.experiment_local_path, + self.storage.experiment_driver_staging_path, "pbt_policy_{}.txt".format(trial.trial_id), ) ) @@ -1809,7 +1809,7 @@ def load_checkpoint(self, checkpoint_dir): with self.assertRaises(ValueError): replay = PopulationBasedTrainingReplay( os.path.join( - self.storage.experiment_local_path, + self.storage.experiment_driver_staging_path, "pbt_policy_{}.txt".format(trials[1].trial_id), ) ) diff --git a/python/ray/tune/tests/test_trial_scheduler_pbt.py b/python/ray/tune/tests/test_trial_scheduler_pbt.py index e7a9ea6fdd6e84..b70750d1643356 100644 --- a/python/ray/tune/tests/test_trial_scheduler_pbt.py +++ b/python/ray/tune/tests/test_trial_scheduler_pbt.py @@ -578,20 +578,18 @@ def status(self): def status(self, status): pass - trial1 = MockTrial("PPO", config=dict(num=1), storage=storage_context) - trial2 = MockTrial("PPO", config=dict(num=2), storage=storage_context) - trial3 = MockTrial("PPO", config=dict(num=3), storage=storage_context) - trial4 = MockTrial("PPO", config=dict(num=4), storage=storage_context) - - runner.add_trial(trial1) - runner.add_trial(trial2) - runner.add_trial(trial3) - runner.add_trial(trial4) - - scheduler.on_trial_add(runner, trial1) - scheduler.on_trial_add(runner, trial2) - scheduler.on_trial_add(runner, trial3) - scheduler.on_trial_add(runner, trial4) + trials = [ + MockTrial("PPO", config=dict(num=i), storage=storage_context) + for i in range(1, 5) + ] + trial1, trial2, trial3, trial4 = trials + + for trial in trials: + trial.init_local_path() + runner.add_trial(trial) + + for trial in trials: + scheduler.on_trial_add(runner, trial) # Add initial results. 
scheduler.on_trial_result( diff --git a/python/ray/tune/tests/test_tuner.py b/python/ray/tune/tests/test_tuner.py index a0e1692dae9c7e..f5c054b08904d4 100644 --- a/python/ray/tune/tests/test_tuner.py +++ b/python/ray/tune/tests/test_tuner.py @@ -101,10 +101,8 @@ class TunerTest(unittest.TestCase): """The e2e test for hparam tuning using Tuner API.""" @pytest.fixture(autouse=True) - def local_dir(self, tmp_path, monkeypatch): - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path / "ray_results")) - self.local_dir = str(tmp_path / "ray_results") - yield self.local_dir + def tmp_path(self, tmp_path): + self.tmp_path = tmp_path def setUp(self): ray.init() @@ -192,7 +190,9 @@ def on_step_end(self, iteration, trials, **kwargs): tuner = Tuner( trainable=trainer, run_config=RunConfig( - name="test_tuner_driver_fail", callbacks=[FailureInjectionCallback()] + name="test_tuner_driver_fail", + storage_path=str(self.tmp_path), + callbacks=[FailureInjectionCallback()], ), param_space=param_space, tune_config=TuneConfig(mode="min", metric="train-error"), @@ -204,7 +204,7 @@ def on_step_end(self, iteration, trials, **kwargs): tuner.fit() # Test resume - restore_path = os.path.join(self.local_dir, "test_tuner_driver_fail") + restore_path = os.path.join(self.tmp_path, "test_tuner_driver_fail") tuner = Tuner.restore(restore_path, trainable=trainer, param_space=param_space) # A hack before we figure out RunConfig semantics across resumes. tuner._local_tuner._run_config.callbacks = None @@ -448,7 +448,9 @@ def test_trainer_no_chdir_to_trial_dir( @pytest.mark.parametrize("runtime_env", [{}, {"working_dir": "."}]) -def test_tuner_relative_pathing_with_env_vars(shutdown_only, chdir_tmpdir, runtime_env): +def test_tuner_relative_pathing_with_env_vars( + shutdown_only, chdir_tmpdir, tmp_path, runtime_env +): """Tests that `TUNE_ORIG_WORKING_DIR` environment variable can be used to access relative paths to the original working directory. 
""" @@ -471,14 +473,21 @@ def train_func(config): data_path = orig_working_dir / "read.txt" assert os.path.exists(data_path) and open(data_path, "r").read() == "data" - trial_dir = Path(train.get_context().get_trial_dir()) - # Tune should have changed the working directory to the trial directory - assert str(trial_dir) == os.getcwd() + # Tune chdirs to the trial working directory + storage = train.get_context().get_storage() + assert Path(storage.trial_working_directory).resolve() == Path.cwd().resolve() - with open(trial_dir / "write.txt", "w") as f: + with open("write.txt", "w") as f: f.write(f"{config['id']}") - tuner = Tuner(train_func, param_space={"id": tune.grid_search(list(range(4)))}) + tuner = Tuner( + train_func, + param_space={"id": tune.grid_search(list(range(4)))}, + run_config=RunConfig( + storage_path=str(tmp_path), + sync_config=train.SyncConfig(sync_artifacts=True), + ), + ) results = tuner.fit() assert not results.errors for result in results: diff --git a/python/ray/tune/tests/test_tuner_restore.py b/python/ray/tune/tests/test_tuner_restore.py index ccfde333f5eef4..5c0ba30dccdcbb 100644 --- a/python/ray/tune/tests/test_tuner_restore.py +++ b/python/ray/tune/tests/test_tuner_restore.py @@ -5,6 +5,7 @@ import time import unittest +import pyarrow.fs import pytest import ray @@ -19,11 +20,7 @@ ) from ray.air._internal.uri_utils import URI from ray.train.data_parallel_trainer import DataParallelTrainer -from ray.train._internal.storage import ( - get_fs_and_path, - _download_from_fs_path, - _upload_to_fs_path, -) +from ray.train._internal.storage import get_fs_and_path, _download_from_fs_path from ray.tune import Callback, Trainable from ray.tune.analysis import ExperimentAnalysis from ray.tune.execution.experiment_state import _find_newest_experiment_checkpoint @@ -337,12 +334,9 @@ def test_tuner_restore_restart_errored(ray_start_2_cpus, tmpdir): def test_tuner_resume_unfinished(ray_start_2_cpus, tmpdir, monkeypatch): """Resuming unfinished trials should pick up existing state""" monkeypatch.setenv("TUNE_GLOBAL_CHECKPOINT_S", "0") - - # TODO(justinvyu): Setting storage_path to this tempdir causes this test to fail. - # This is because the error raised by the driver callback doesn't let the - # experiment sync happen (from ~/ray_results -> tmpdir). This would also - # be the case for real cloud syncing. - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmpdir)) + # Make sure that only one trial is pending at a time to prevent + # the trial order from getting shuffled around. 
+ monkeypatch.setenv("TUNE_MAX_PENDING_TRIALS_PG", "1") fail_marker = tmpdir / "fail_marker" fail_marker.write_text("", encoding="utf-8") @@ -350,6 +344,17 @@ def test_tuner_resume_unfinished(ray_start_2_cpus, tmpdir, monkeypatch): hang_marker = tmpdir / "hang_marker" hang_marker.write_text("", encoding="utf-8") + param_space = { + # First trial succeeds, second hangs, third fails, fourth hangs + "failing_hanging": tune.grid_search( + [ + (None, None), + (None, hang_marker), + (fail_marker, None), + (None, hang_marker), + ] + ), + } tuner = Tuner( _train_fn_sometimes_failing, tune_config=TuneConfig(num_samples=1), @@ -359,17 +364,7 @@ def test_tuner_resume_unfinished(ray_start_2_cpus, tmpdir, monkeypatch): failure_config=FailureConfig(fail_fast=False), callbacks=[_FailOnStats(num_trials=4, num_finished=2, delay=1)], ), - param_space={ - # First trial succeeds, second hangs, third fails, fourth hangs - "failing_hanging": tune.grid_search( - [ - (None, None), - (None, hang_marker), - (fail_marker, None), - (None, hang_marker), - ] - ), - }, + param_space=param_space, ) # Catch the FailOnStats error with pytest.raises(RuntimeError): @@ -392,6 +387,7 @@ def test_tuner_resume_unfinished(ray_start_2_cpus, tmpdir, monkeypatch): tuner = Tuner.restore( str(tmpdir / "test_tuner_resume_unfinished"), trainable=_train_fn_sometimes_failing, + param_space=param_space, ) tuner._local_tuner._run_config.callbacks = None @@ -405,9 +401,6 @@ def test_tuner_resume_errored_only(ray_start_2_cpus, tmpdir, monkeypatch): """Not resuming unfinished trials (but only errored and pending) should work""" monkeypatch.setenv("TUNE_GLOBAL_CHECKPOINT_S", "0") - # TODO(justinvyu): Same as above. - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmpdir)) - fail_marker = tmpdir / "fail_marker" fail_marker.write_text("", encoding="utf-8") @@ -419,6 +412,7 @@ def test_tuner_resume_errored_only(ray_start_2_cpus, tmpdir, monkeypatch): tune_config=TuneConfig(num_samples=1), run_config=RunConfig( name="test_tuner_resume_errored_only", + storage_path=str(tmpdir), failure_config=FailureConfig(fail_fast=False), callbacks=[_FailOnStats(num_trials=4, num_finished=2, delay=1)], ), @@ -466,11 +460,8 @@ def test_tuner_resume_errored_only(ray_start_2_cpus, tmpdir, monkeypatch): assert sorted([r.metrics.get("it", 0) for r in results]) == sorted([2, 1, 3, 0]) -def _test_tuner_restore_from_cloud( - tmpdir, configure_storage_path, storage_path, monkeypatch -): +def _test_tuner_restore_from_cloud(tmpdir, configure_storage_path, storage_path): """Check that restoring Tuner() objects from cloud storage works""" - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmpdir / "ray_results")) tuner = Tuner( _dummy_train_fn, run_config=RunConfig(name="exp_dir", storage_path=configure_storage_path), @@ -487,58 +478,43 @@ def _test_tuner_restore_from_cloud( prev_cp = _find_newest_experiment_checkpoint(str(check_path / "exp_dir")) prev_lstat = os.lstat(prev_cp) - (tmpdir / "ray_results").remove(ignore_errors=True) - tuner2 = Tuner.restore( str(URI(storage_path) / "exp_dir"), trainable=_dummy_train_fn ) results = tuner2.fit() assert results[0].metrics["_metric"] == 1 - local_contents = os.listdir(tmpdir / "ray_results" / "exp_dir") - assert "tuner.pkl" in local_contents - after_cp = _find_newest_experiment_checkpoint( - str(tmpdir / "ray_results" / "exp_dir") - ) + check_path_2 = tmpdir / "check_save_2" + _download_from_fs_path(fs=fs, fs_path=fs_path, local_path=str(check_path_2)) + after_cp = _find_newest_experiment_checkpoint(str(check_path_2 / 
"exp_dir")) after_lstat = os.lstat(after_cp) # Experiment checkpoint was updated assert os.path.basename(prev_cp) != os.path.basename(after_cp) # Old experiment checkpoint still exists in dir - assert os.path.basename(prev_cp) in local_contents + assert os.path.basename(prev_cp) in os.listdir(check_path_2 / "exp_dir") # Contents changed assert prev_lstat.st_size != after_lstat.st_size - # Overwriting should work - tuner3 = Tuner.restore( - str(URI(storage_path) / "exp_dir"), trainable=_dummy_train_fn - ) - tuner3.fit() - def test_tuner_restore_from_cloud_manual_path( - ray_start_2_cpus, tmpdir, mock_s3_bucket_uri, monkeypatch + ray_start_2_cpus, tmpdir, mock_s3_bucket_uri ): _test_tuner_restore_from_cloud( tmpdir, configure_storage_path=mock_s3_bucket_uri, storage_path=mock_s3_bucket_uri, - monkeypatch=monkeypatch, ) -@pytest.mark.skip("Hanging due to some problem with ray storage.") -def test_tuner_restore_from_cloud_ray_storage( - ray_shutdown, tmpdir, mock_s3_bucket_uri, monkeypatch -): +def test_tuner_restore_from_cloud_ray_storage(ray_shutdown, tmpdir, mock_s3_bucket_uri): ray.init(num_cpus=2, configure_logging=False, storage=mock_s3_bucket_uri) _test_tuner_restore_from_cloud( tmpdir / "local", configure_storage_path=None, storage_path=mock_s3_bucket_uri, - monkeypatch=monkeypatch, ) @@ -555,7 +531,7 @@ def test_tuner_restore_latest_available_checkpoint( @pytest.mark.parametrize("retry_num", [0, 2]) -def test_restore_retry(ray_start_2_cpus, tmpdir, monkeypatch, retry_num): +def test_restore_retry(ray_start_2_cpus, tmpdir, retry_num): """Test retrying restore on a trial level by setting `TUNE_RESTORE_RETRY_NUM`.""" class MockTrainable(Trainable): @@ -743,14 +719,13 @@ def create_trainable_with_params(): assert not results.errors -@pytest.mark.parametrize("use_tune_run", [True]) +@pytest.mark.parametrize("use_tune_run", [True, False]) def test_tuner_restore_from_moved_experiment_path( ray_start_2_cpus, tmp_path, use_tune_run ): """Check that restoring a Tuner from a moved experiment directory works.""" # Create a fail_marker dummy file that causes the first Tune run to fail and # the second run to succeed - os.environ["RAY_AIR_LOCAL_CACHE_DIR"] = str(tmp_path / "local_dir") fail_marker = tmp_path / "fail_marker" fail_marker.write_text("", encoding="utf-8") @@ -983,24 +958,22 @@ def get_checkpoints(experiment_dir): assert [load_dict_checkpoint(ckpt)["it"] for ckpt in checkpoints] == [2, 3, 4, 5] -def test_tuner_can_restore(tmp_path, monkeypatch): +def test_tuner_can_restore(tmp_path): """Make sure that `can_restore` detects an existing experiment at a path and only returns True if it's at the experiment dir root. 
""" - monkeypatch.setenv("RAY_AIR_LOCAL_CACHE_DIR", str(tmp_path)) - name = "exp_name" - Tuner(lambda _: print("dummy"), run_config=RunConfig(name=name)) - - fs, fs_path = get_fs_and_path("mock:///bucket/exp_name") - _upload_to_fs_path(local_path=str(tmp_path / name), fs=fs, fs_path=fs_path) + Tuner( + lambda _: print("dummy"), + run_config=RunConfig(name=name, storage_path=str(tmp_path)), + ) assert Tuner.can_restore(tmp_path / name) + assert Tuner.can_restore( + tmp_path / name, storage_filesystem=pyarrow.fs.LocalFileSystem() + ) assert not Tuner.can_restore(tmp_path) assert not Tuner.can_restore(tmp_path / name / "other") - assert Tuner.can_restore("/bucket/exp_name", storage_filesystem=fs) - assert not Tuner.can_restore("/bucket", storage_filesystem=fs) - assert not Tuner.can_restore("/bucket/exp_name/other", storage_filesystem=fs) def testParamSpaceOverwriteValidation(ray_start_4_cpus, tmp_path): diff --git a/python/ray/tune/tests/test_var.py b/python/ray/tune/tests/test_var.py index 053fee569838d4..d814af70797de6 100644 --- a/python/ray/tune/tests/test_var.py +++ b/python/ray/tune/tests/test_var.py @@ -7,7 +7,7 @@ from ray.rllib import _register_all from ray import tune -from ray.tune.result import DEFAULT_RESULTS_DIR +from ray.train.constants import DEFAULT_STORAGE_PATH from ray.tune.search import grid_search, BasicVariantGenerator from ray.tune.search.variant_generator import ( RecursiveDependencyError, @@ -54,7 +54,8 @@ def testParseToTrials(self): self.assertEqual(trials[0].max_failures, 5) self.assertEqual(trials[0].evaluated_params, {}) self.assertEqual( - trials[0].local_dir, os.path.join(DEFAULT_RESULTS_DIR, "tune-pong") + trials[0].storage.experiment_fs_path, + os.path.join(DEFAULT_STORAGE_PATH, "tune-pong"), ) self.assertEqual(trials[1].experiment_tag, "1") diff --git a/python/ray/tune/tests/tutorial.py b/python/ray/tune/tests/tutorial.py index 42ac835e9ce60c..ac5667a183b69d 100644 --- a/python/ray/tune/tests/tutorial.py +++ b/python/ray/tune/tests/tutorial.py @@ -78,6 +78,11 @@ def test_func(model, data_loader): # __train_func_begin__ +import os +import tempfile + +from ray.train import Checkpoint + def train_mnist(config): # Data Setup mnist_transforms = transforms.Compose( @@ -104,12 +109,20 @@ def train_mnist(config): train_func(model, optimizer, train_loader) acc = test_func(model, test_loader) - # Send the current training result back to Tune - train.report({"mean_accuracy": acc}) - if i % 5 == 0: - # This saves the model to the trial directory - torch.save(model.state_dict(), "./model.pth") + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + checkpoint = None + if (i + 1) % 5 == 0: + # This saves the model to the trial directory + torch.save( + model.state_dict(), + os.path.join(temp_checkpoint_dir, "model.pth") + ) + checkpoint = Checkpoint.from_directory(temp_checkpoint_dir) + + # Send the current training result back to Tune + train.report({"mean_accuracy": acc}, checkpoint=checkpoint) + # __train_func_end__ # fmt: on @@ -188,10 +201,9 @@ def train_mnist(config): # __run_searchalg_end__ # __run_analysis_begin__ -import os - -logdir = results.get_best_result("mean_accuracy", mode="max").path -state_dict = torch.load(os.path.join(logdir, "model.pth")) +best_result = results.get_best_result("mean_accuracy", mode="max") +with best_result.checkpoint.as_directory() as checkpoint_dir: + state_dict = torch.load(os.path.join(checkpoint_dir, "model.pth")) model = ConvNet() model.load_state_dict(state_dict) diff --git 
a/python/ray/tune/trainable/function_trainable.py b/python/ray/tune/trainable/function_trainable.py index f74d2a8c87853f..2f7cb989614058 100644 --- a/python/ray/tune/trainable/function_trainable.py +++ b/python/ray/tune/trainable/function_trainable.py @@ -55,7 +55,7 @@ def setup(self, config): name=self.trial_name, id=self.trial_id, resources=self.trial_resources, - logdir=self._storage.trial_local_path, + logdir=self._storage.trial_driver_staging_path, driver_ip=None, experiment_name=self._storage.experiment_dir_name, ), @@ -189,7 +189,7 @@ def reset_config(self, new_config): name=self.trial_name, id=self.trial_id, resources=self.trial_resources, - logdir=self._storage.trial_local_path, + logdir=self._storage.trial_working_directory, driver_ip=None, experiment_name=self._storage.experiment_dir_name, ), diff --git a/python/ray/tune/tune.py b/python/ray/tune/tune.py index 53720e2e17fb55..59a1f93ad36860 100644 --- a/python/ray/tune/tune.py +++ b/python/ray/tune/tune.py @@ -978,7 +978,7 @@ def run( ) break - experiment_local_path = runner._storage.experiment_local_path + experiment_local_path = runner._storage.experiment_driver_staging_path experiment_dir_name = runner._storage.experiment_dir_name if any(isinstance(cb, TBXLoggerCallback) for cb in callbacks): diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 55bce7e40b69a0..fbd7cbc4b1146e 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1217,38 +1217,6 @@ ####################### # Tune cloud tests ####################### -- name: tune_cloud_durable_upload - group: Tune cloud tests - working_dir: tune_tests/cloud_tests - frequency: nightly - team: ml - - cluster: - byod: {} - cluster_compute: tpl_aws_4x2.yaml - - run: - timeout: 600 - script: python workloads/run_cloud_test.py durable_upload --bucket s3://tune-cloud-tests/durable_upload - - wait_for_nodes: - num_nodes: 4 - - variations: - - __suffix__: aws - - __suffix__: gce - env: gce - frequency: manual - cluster: - cluster_compute: tpl_gce_4x8.yaml - run: - timeout: 600 - script: python workloads/run_cloud_test.py durable_upload --bucket gs://tune-cloud-tests/durable_upload - wait_for_nodes: - num_nodes: 4 - - alert: tune_tests - - name: tune_cloud_long_running_cloud_storage group: Tune cloud tests working_dir: tune_tests/cloud_tests diff --git a/release/tune_tests/cloud_tests/app_config.yaml b/release/tune_tests/cloud_tests/app_config.yaml deleted file mode 100755 index e5cd8a82ad0990..00000000000000 --- a/release/tune_tests/cloud_tests/app_config.yaml +++ /dev/null @@ -1,20 +0,0 @@ -base_image: {{ env["RAY_IMAGE_NIGHTLY_CPU"] }} -env_vars: {} -debian_packages: - - curl - -python: - pip_packages: - - pytest - - awscli - - gsutil - - gym>=0.21.0,<0.24.1 - - gcsfs<=2022.7.1 - - pyarrow>=6.0.1,<7.0.0 - - tensorboardX - conda_packages: [] - -post_build_cmds: - # Install Ray - - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} - - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/release/tune_tests/cloud_tests/app_config_ml.yaml b/release/tune_tests/cloud_tests/app_config_ml.yaml deleted file mode 100755 index 75bbc1938db282..00000000000000 --- a/release/tune_tests/cloud_tests/app_config_ml.yaml +++ /dev/null @@ -1,20 +0,0 @@ -base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] }} -env_vars: {} -debian_packages: - - curl - -python: - pip_packages: - - pytest - - awscli - - gsutil - - gym>=0.21.0,<0.24.1 - - gcsfs<=2022.7.1 - - pyarrow>=6.0.1,<7.0.0 - - 
tensorboardX - conda_packages: [] - -post_build_cmds: - # Install Ray - - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} - - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} diff --git a/release/tune_tests/cloud_tests/tpl_aws_4x2.yaml b/release/tune_tests/cloud_tests/tpl_aws_4x2.yaml deleted file mode 100644 index 507c0040f751a6..00000000000000 --- a/release/tune_tests/cloud_tests/tpl_aws_4x2.yaml +++ /dev/null @@ -1,15 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west-2 - -max_workers: 3 - -head_node_type: - name: head_node - instance_type: m5.large - -worker_node_types: - - name: worker_node - instance_type: m5.large - min_workers: 3 - max_workers: 3 - use_spot: false diff --git a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml deleted file mode 100644 index 1c62f09bb7e06d..00000000000000 --- a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml +++ /dev/null @@ -1,17 +0,0 @@ -cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} -region: us-west1 -allowed_azs: - - us-west1-c - -max_workers: 3 - -head_node_type: - name: head_node - instance_type: n2-standard-2 - -worker_node_types: - - name: worker_node - instance_type: n2-standard-2 - min_workers: 3 - max_workers: 3 - use_spot: false diff --git a/release/tune_tests/cloud_tests/workloads/_tune_script.py b/release/tune_tests/cloud_tests/workloads/_tune_script.py deleted file mode 100644 index bd1e14cfc4e249..00000000000000 --- a/release/tune_tests/cloud_tests/workloads/_tune_script.py +++ /dev/null @@ -1,155 +0,0 @@ -from typing import Optional - -import argparse -import os -import pickle -import tempfile -import time - -import ray -from ray import train, tune -from ray.train import Checkpoint -from ray.rllib.algorithms.callbacks import DefaultCallbacks -from ray.rllib.algorithms.ppo import PPO - -from run_cloud_test import ARTIFACT_FILENAME, CHECKPOINT_DATA_FILENAME - - -def fn_trainable(config): - checkpoint = train.get_checkpoint() - if checkpoint: - with checkpoint.as_directory() as checkpoint_dir: - with open( - os.path.join(checkpoint_dir, CHECKPOINT_DATA_FILENAME), "rb" - ) as f: - checkpoint_dict = pickle.load(f) - state = {"internal_iter": checkpoint_dict["internal_iter"] + 1} - else: - state = {"internal_iter": 1} - - for i in range(state["internal_iter"], config["max_iterations"] + 1): - state["internal_iter"] = i - time.sleep(config["sleep_time"]) - - # Log artifacts to the trial dir. - trial_dir = train.get_context().get_trial_dir() - with open(os.path.join(trial_dir, ARTIFACT_FILENAME), "a") as f: - f.write(f"{config['id']},") - - metrics = dict( - score=i * 10 * config["score_multiplied"], - internal_iter=state["internal_iter"], - ) - if i % config["checkpoint_freq"] == 0: - with tempfile.TemporaryDirectory() as tmpdir: - with open(os.path.join(tmpdir, CHECKPOINT_DATA_FILENAME), "wb") as f: - pickle.dump({"internal_iter": i}, f) - train.report(metrics, checkpoint=Checkpoint.from_directory(tmpdir)) - else: - train.report(metrics) - - -class RLlibCallback(DefaultCallbacks): - def on_train_result(self, *, algorithm, result: dict, **kwargs) -> None: - result["internal_iter"] = result["training_iteration"] - - # Log artifacts to the trial dir. 
- with open(os.path.join(algorithm.logdir, ARTIFACT_FILENAME), "a") as f: - f.write(f"{algorithm.config['id']},") - - -class IndicatorCallback(tune.Callback): - def __init__(self, indicator_file): - self.indicator_file = indicator_file - - def on_step_begin(self, iteration, trials, **info): - with open(self.indicator_file, "wt") as fp: - fp.write("1") - - -def run_tune( - storage_path: Optional[str] = None, - experiment_name: str = "cloud_test", - indicator_file: str = "/tmp/tune_cloud_indicator", - trainable: str = "function", - num_cpus_per_trial: int = 2, -): - if trainable == "function": - train_fn = fn_trainable - config = { - "max_iterations": 100, - "sleep_time": 5, - "checkpoint_freq": 2, - "score_multiplied": tune.randint(0, 100), - "id": tune.grid_search([0, 1, 2, 3]), - } - kwargs = {"resources_per_trial": {"cpu": num_cpus_per_trial}} - elif trainable == "rllib_str" or trainable == "rllib_trainer": - if trainable == "rllib_str": - train_fn = "PPO" - else: - train_fn = PPO - - config = { - "env": "CartPole-v1", - "num_workers": 1, - "num_envs_per_worker": 1, - "callbacks": RLlibCallback, - "id": tune.grid_search([0, 1, 2, 3]), - } - kwargs = { - "stop": {"training_iteration": 100}, - "checkpoint_freq": 2, - "checkpoint_at_end": True, - } - else: - raise RuntimeError(f"Unknown trainable: {trainable}") - - tune.run( - train_fn, - name=experiment_name, - resume="AUTO", - num_samples=1, # 4 trials from the grid search - config=config, - storage_path=storage_path, - sync_config=train.SyncConfig( - sync_period=0.5, - sync_artifacts=True, - ), - keep_checkpoints_num=2, - callbacks=[IndicatorCallback(indicator_file=indicator_file)], - verbose=2, - **kwargs, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--storage-path", required=False, default=None, type=str) - parser.add_argument("--experiment-name", required=False, default=None, type=str) - parser.add_argument( - "--indicator-file", - required=False, - default="/tmp/tune_cloud_indicator", - type=str, - ) - args = parser.parse_args() - - trainable = str(os.environ.get("TUNE_TRAINABLE", "function")) - num_cpus_per_trial = int(os.environ.get("TUNE_NUM_CPUS_PER_TRIAL", "2")) - - run_kwargs = dict( - storage_path=args.storage_path or None, - experiment_name=args.experiment_name or "cloud_test", - indicator_file=args.indicator_file, - trainable=trainable, - num_cpus_per_trial=num_cpus_per_trial, - ) - - if not ray.is_initialized: - ray.init(address="auto") - - print(f"Running on node {ray.util.get_node_ip_address()} with settings:") - print(run_kwargs) - - run_tune(**run_kwargs) diff --git a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py b/release/tune_tests/cloud_tests/workloads/run_cloud_test.py deleted file mode 100644 index ba750228a8d433..00000000000000 --- a/release/tune_tests/cloud_tests/workloads/run_cloud_test.py +++ /dev/null @@ -1,1141 +0,0 @@ -"""Run cloud checkpointing tests. - -This script provides utilities and end to end tests for cloud checkpointing. - -Generally the flow is as follows: - -A Tune run is started in a separate process. It is terminated after some -time. It is then restarted for another period of time. - -We also ensure that checkpoints are properly deleted. - -The Tune run is kicked off in _tune_script.py. Trials write a checkpoint -every 2 iterations, and take 5 seconds per iteration. - -More details on the expected results can be found in the scenario descriptions. 
-""" - -import argparse -import csv -import io -import tarfile -from dataclasses import dataclass -import json -import os -import pickle -import platform -import re -import shutil -import signal -import subprocess -import tempfile -import time -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple - -import ray -from ray.train._internal.storage import StorageContext -from ray.tune.execution.experiment_state import _find_newest_experiment_checkpoint -from ray.tune.result import _get_defaults_results_dir -from ray.tune.utils.serialization import TuneFunctionDecoder - -TUNE_SCRIPT = os.path.join(os.path.dirname(__file__), "_tune_script.py") -ARTIFACT_FILENAME = "artifact.txt" -CHECKPOINT_DATA_FILENAME = "dict_checkpoint.pkl" - -# Classes to hold data from experiment checkpoints - - -class ExperimentStateCheckpoint: - def __init__( - self, dir: str, runner_data: Dict[str, Any], trials: List["TrialStub"] - ): - self.dir = dir - self.runner_data = runner_data - self.trials = trials - - -class ExperimentDirCheckpoint: - def __init__( - self, dir: str, trial_to_cps: Dict["TrialStub", "TrialCheckpointData"] - ): - self.dir = dir - self.trial_to_cps = trial_to_cps - - -class TrialStub: - def __init__( - self, - trainable_name: str, - trial_id: str, - status: str, - config: Dict[str, Any], - experiment_tag: str, - last_result: Dict[str, Any], - relative_logdir: str, - storage: StorageContext, - *args, - **kwargs, - ): - self.trainable_name = trainable_name - self.trial_id = trial_id - self.status = status - self.config = config - self.storage = storage - self.experiment_tag = experiment_tag - self.last_result = last_result - self.relative_logdir = relative_logdir - - # Ignore remaining arguments - - @property - def hostname(self): - return self.last_result.get("hostname") - - @property - def node_ip(self): - return self.last_result.get("node_ip") - - @property - def dirname(self): - return os.path.basename(self.relative_logdir) - - @property - def was_on_driver_node(self): - return self.hostname == platform.node() - - def __hash__(self): - return hash(self.trial_id) - - def __repr__(self): - return f"" - - -@dataclass -class TrialCheckpointData: - params: Dict[str, Any] - results: List[Dict[str, Any]] - progress: List[Dict[str, Any]] - checkpoints: List[Tuple[str, Dict[Any, Any]]] - num_skipped: int - artifact_data: str - - -# Utility functions - - -def delete_file_if_exists(filename: str): - if os.path.exists(filename): - os.remove(filename) - - -def cleanup_driver_experiment_dir(experiment_name: str): - experiment_dir = os.path.join(os.path.expanduser("~/ray_results"), experiment_name) - if os.path.exists(experiment_dir): - print("Removing existing experiment dir:", experiment_dir) - shutil.rmtree(experiment_dir) - - -def cleanup_remote_node_experiment_dir(experiment_name: str): - experiment_dir = os.path.join(os.path.expanduser("~/ray_results"), experiment_name) - - @ray.remote - def _remove_on_remove_node(path: str): - return shutil.rmtree(path, ignore_errors=True) - - futures = [] - for node in ray.nodes(): - if not node["Alive"]: - continue - - hostname = node["NodeManagerHostname"] - ip = node["NodeManagerAddress"] - - if hostname == platform.node(): - # Skip on driver - continue - - rfn = _remove_on_remove_node.options(resources={f"node:{ip}": 0.01}) - futures.append(rfn.remote(experiment_dir)) - ray.get(futures) - - -# Cluster utility -def wait_for_nodes( - num_nodes: int, timeout: float = 300.0, feedback_interval: float = 10.0 -): - start = time.time() - - max_time = start 
+ timeout - next_feedback = start + feedback_interval - - curr_nodes = len(ray.nodes()) - while curr_nodes < num_nodes: - now = time.time() - - if now >= max_time: - raise RuntimeError( - f"Maximum wait time reached, but only " - f"{curr_nodes}/{num_nodes} nodes came up. Aborting." - ) - - if now >= next_feedback: - passed = now - start - print( - f"Waiting for more nodes to come up: " - f"{curr_nodes}/{num_nodes} " - f"({passed:.0f} seconds passed)" - ) - next_feedback = now + feedback_interval - - time.sleep(5) - curr_nodes = len(ray.nodes()) - - -# Run tune script in different process - - -def start_run( - storage_path: Optional[str] = None, - experiment_name: str = "cloud_test", - indicator_file: str = "/tmp/tune_cloud_indicator", -) -> subprocess.Popen: - args = [] - if storage_path: - args.extend(["--storage-path", storage_path]) - - if experiment_name: - args.extend(["--experiment-name", experiment_name]) - - if indicator_file: - args.extend(["--indicator-file", indicator_file]) - - env = os.environ.copy() - env["TUNE_RESULT_BUFFER_LENGTH"] = "1" - env["TUNE_GLOBAL_CHECKPOINT_S"] = "10" - - tune_script = os.environ.get("OVERWRITE_TUNE_SCRIPT", TUNE_SCRIPT) - - full_command = ["python", tune_script] + args - - print(f"Running command: {' '.join(full_command)}") - process = subprocess.Popen(full_command, env=env) - - return process - - -def wait_for_run_or_raise( - process: subprocess.Popen, indicator_file: str, timeout: int = 30 -): - print( - f"Waiting up to {timeout} seconds until trials have been started " - f"(indicated by existence of `{indicator_file}`)" - ) - - timeout = time.monotonic() + timeout - while ( - process.poll() is None - and time.monotonic() < timeout - and not os.path.exists(indicator_file) - ): - time.sleep(1) - - if not os.path.exists(indicator_file): - process.terminate() - - raise RuntimeError( - f"Indicator file `{indicator_file}` still doesn't exist, " - f"indicating that trials have not been started. " - f"Please check the process output." - ) - - print("Process started, trials are running") - - -def send_signal_after_wait(process: subprocess.Popen, signal: int, wait: int = 30): - print( - f"Waiting {wait} seconds until sending signal {signal} " - f"to process {process.pid}" - ) - - time.sleep(wait) - - if process.poll() is not None: - raise RuntimeError( - f"Process {process.pid} already terminated. This usually means " - f"that some of the trials ERRORed (e.g. because they couldn't be " - f"restored. Try re-running this test to see if this fixes the " - f"issue." - ) - - print(f"Sending signal {signal} to process {process.pid}") - process.send_signal(signal) - - -def wait_until_process_terminated(process: subprocess.Popen, timeout: int = 60): - print(f"Waiting up to {timeout} seconds until process " f"{process.pid} terminates") - - timeout = time.monotonic() + timeout - while process.poll() is None and time.monotonic() < timeout: - time.sleep(1) - - if process.poll() is None: - process.terminate() - - print( - f"Warning: Process {process.pid} did not terminate within " - f"timeout, terminating forcefully instead." 
-        )
-    else:
-        print(f"Process {process.pid} terminated gracefully.")
-
-
-def run_tune_script_for_time(
-    run_time: int,
-    experiment_name: str,
-    indicator_file: str,
-    storage_path: Optional[str],
-    run_start_timeout: int = 30,
-):
-    # Start run
-    process = start_run(
-        storage_path=storage_path,
-        experiment_name=experiment_name,
-        indicator_file=indicator_file,
-    )
-    try:
-        # Wait until indicator file exists
-        wait_for_run_or_raise(
-            process, indicator_file=indicator_file, timeout=run_start_timeout
-        )
-        # Stop experiment (with checkpoint) after some time
-        send_signal_after_wait(process, signal=signal.SIGUSR1, wait=run_time)
-        # Wait until process gracefully terminated
-        wait_until_process_terminated(process, timeout=45)
-    finally:
-        process.terminate()
-
-
-# Run full flow
-
-
-def run_resume_flow(
-    experiment_name: str,
-    indicator_file: str,
-    storage_path: Optional[str],
-    first_run_time: int = 33,
-    second_run_time: int = 33,
-    run_start_timeout: int = 30,
-    before_experiments_callback: Optional[Callable[[], None]] = None,
-    between_experiments_callback: Optional[Callable[[], None]] = None,
-    after_experiments_callback: Optional[Callable[[], None]] = None,
-):
-    """Run full flow, i.e.
-
-    - Clean up existing experiment dir
-    - Call before experiment callback
-    - Run tune script for `first_run_time` seconds
-    - Call between experiment callback
-    - Run tune script for another `second_run_time` seconds
-    - Call after experiment callback
-    """
-    # Cleanup ~/ray_results/ folder
-    cleanup_driver_experiment_dir(experiment_name)
-
-    # Cleanup experiment folder on remote nodes
-    cleanup_remote_node_experiment_dir(experiment_name)
-
-    # Run before experiment callbacks
-    if before_experiments_callback:
-        print("Before experiments: Invoking callback")
-        before_experiments_callback()
-        print("Before experiments: Callback completed")
-
-    # Delete indicator file
-    delete_file_if_exists(indicator_file)
-
-    # Run tune script for `first_run_time` seconds
-    run_tune_script_for_time(
-        run_time=first_run_time,
-        experiment_name=experiment_name,
-        indicator_file=indicator_file,
-        storage_path=storage_path,
-        run_start_timeout=run_start_timeout,
-    )
-
-    # Before we restart, run a couple of checks
-    # Run before experiment callbacks
-    if between_experiments_callback:
-        print("Between experiments: Invoking callback")
-        between_experiments_callback()
-        print("Between experiments: Callback completed")
-
-    # Restart. First, clean up indicator file
-    delete_file_if_exists(indicator_file)
-
-    # Start run again, run for another `second_run_time` seconds
-    run_tune_script_for_time(
-        run_time=second_run_time,
-        experiment_name=experiment_name,
-        indicator_file=indicator_file,
-        storage_path=storage_path,
-    )
-
-    if after_experiments_callback:
-        print("After experiments: Invoking callback")
-        after_experiments_callback()
-        print("After experiments: Callback completed")
-
-
-# Download data from remote nodes
-
-
-def fetch_remote_directory_content(
-    node_ip: str,
-    remote_dir: str,
-    local_dir: str,
-):
-    def _pack(dir: str):
-        stream = io.BytesIO()
-        with tarfile.open(
-            fileobj=stream, mode="w:gz", format=tarfile.PAX_FORMAT
-        ) as tar:
-            tar.add(dir, arcname="")
-
-        return stream.getvalue()
-
-    def _unpack(stream: str, dir: str):
-        with tarfile.open(fileobj=io.BytesIO(stream)) as tar:
-            tar.extractall(dir)
-
-    try:
-        packed = ray.get(
-            ray.remote(resources={f"node:{node_ip}": 0.01})(_pack).remote(remote_dir)
-        )
-        _unpack(packed, local_dir)
-    except Exception as e:
-        print(
-            f"Warning: Could not fetch remote directory contents. Message: " f"{str(e)}"
-        )
-
-
-def send_local_file_to_remote_file(local_path: str, remote_path: str, ip: str):
-    def _write(stream: bytes, path: str):
-        with open(path, "wb") as f:
-            f.write(stream)
-
-    with open(local_path, "rb") as f:
-        stream = f.read()
-
-    _remote_write = ray.remote(resources={f"node:{ip}": 0.01})(_write)
-    return ray.get(_remote_write.remote(stream, remote_path))
-
-
-def fetch_remote_file_to_local_file(remote_path: str, ip: str, local_path: str):
-    def _read(path: str):
-        with open(path, "rb") as f:
-            return f.read()
-
-    _remote_read = ray.remote(resources={f"node:{ip}": 0.01})(_read)
-    stream = ray.get(_remote_read.remote(remote_path))
-
-    with open(local_path, "wb") as f:
-        f.write(stream)
-
-
-def fetch_trial_node_dirs_to_tmp_dir(trials: List[TrialStub]) -> Dict[TrialStub, str]:
-    dirmap = {}
-
-    for trial in trials:
-        tmpdir = tempfile.mkdtemp(prefix="tune_cloud_test")
-
-        if trial.was_on_driver_node:
-            # Trial was run on driver
-            shutil.rmtree(tmpdir)
-            shutil.copytree(trial.storage.experiment_local_path, tmpdir)
-            print(
-                "Copied local node experiment dir",
-                trial.storage.experiment_local_path,
-                "to",
-                tmpdir,
-                "for trial",
-                trial.trial_id,
-            )
-
-        else:
-            # Trial was run on remote node
-            fetch_remote_directory_content(
-                trial.node_ip,
-                remote_dir=trial.storage.experiment_local_path,
-                local_dir=tmpdir,
-            )
-
-        dirmap[trial] = tmpdir
-
-    return dirmap
-
-
-# Bucket interaction
-
-
-def clear_bucket_contents(bucket: str):
-    if bucket.startswith("s3://"):
-        print("Clearing bucket contents:", bucket)
-        subprocess.check_call(["aws", "s3", "rm", "--recursive", "--quiet", bucket])
-    elif bucket.startswith("gs://"):
-        print("Clearing bucket contents:", bucket)
-        try:
-            subprocess.check_call(["gsutil", "-m", "rm", "-f", "-r", bucket])
-        except subprocess.CalledProcessError:
-            # If empty, ignore error
-            pass
-    else:
-        raise ValueError(f"Invalid bucket URL: {bucket}")
-
-
-def fetch_bucket_contents_to_tmp_dir(bucket: str) -> str:
-    tmpdir = tempfile.mkdtemp(prefix="tune_cloud_test")
-    subfolder = None
-
-    if bucket.startswith("s3://"):
-        subprocess.check_call(
-            ["aws", "s3", "cp", "--recursive", "--quiet", bucket, tmpdir]
-        )
-    elif bucket.startswith("gs://"):
-        try:
-            subprocess.check_call(["gsutil", "-m", "cp", "-r", bucket, tmpdir])
-        except subprocess.CalledProcessError as e:
-            # Sometimes single files cannot be processed
-            if len(os.listdir(tmpdir)) == 0:
-                raise RuntimeError(
-                    f"Local dir {tmpdir} empty after trying to fetch bucket data."
-                ) from e
-        pattern = re.compile("gs://[^/]+/(.+)")
-        subfolder = re.match(pattern, bucket).group(1).split("/")[-1]
-    else:
-        raise ValueError(f"Invalid bucket URL: {bucket}")
-
-    if subfolder:
-        tmpdir = os.path.join(tmpdir, subfolder)
-
-    print("Copied bucket data from", bucket, "to", tmpdir)
-
-    return tmpdir
-
-
-# Load data from local dirs into objects
-
-
-def load_experiment_checkpoint_from_state_file(
-    experiment_dir: str,
-) -> ExperimentStateCheckpoint:
-    newest_ckpt_path = _find_newest_experiment_checkpoint(experiment_dir)
-    with open(newest_ckpt_path, "r") as f:
-        runner_state = json.load(f, cls=TuneFunctionDecoder)
-
-    trials = []
-    for trial_cp_str, trial_runtime_str in runner_state["trial_data"]:
-        trial_state = json.loads(trial_cp_str, cls=TuneFunctionDecoder)
-        runtime = json.loads(trial_runtime_str, cls=TuneFunctionDecoder)
-        trial_state.update(runtime)
-        trial = TrialStub(**trial_state)
-        trials.append(trial)
-
-    runner_data = runner_state["runner_data"]
-
-    return ExperimentStateCheckpoint(experiment_dir, runner_data, trials)
-
-
-def load_experiment_checkpoint_from_dir(
-    trials: Iterable[TrialStub], experiment_dir: str
-) -> ExperimentDirCheckpoint:
-    trial_to_cps = {}
-    for f in sorted(os.listdir(experiment_dir)):
-        full_path = os.path.join(experiment_dir, f)
-        if os.path.isdir(full_path):
-            # Map to TrialStub object
-            trial_stub = None
-            for trial in trials:
-                if trial.dirname == f:
-                    trial_stub = trial
-                    break
-
-            if not trial_stub:
-                raise RuntimeError(f"Trial with dirname {f} not found.")
-
-            trial_checkpoint_data = load_trial_checkpoint_data(full_path)
-
-            trial_to_cps[trial_stub] = trial_checkpoint_data
-
-    return ExperimentDirCheckpoint(experiment_dir, trial_to_cps)
-
-
-def load_trial_checkpoint_data(trial_dir: str) -> TrialCheckpointData:
-    params_file = os.path.join(trial_dir, "params.json")
-    if os.path.exists(params_file):
-        with open(params_file, "rt") as f:
-            params = json.load(f)
-    else:
-        params = {}
-
-    result_file = os.path.join(trial_dir, "result.json")
-    if os.path.exists(result_file):
-        results = []
-        with open(result_file, "rt") as f:
-            for line in f.readlines():
-                results.append(json.loads(line))
-    else:
-        results = []
-
-    progress_file = os.path.join(trial_dir, "progress.csv")
-    if os.path.exists(progress_file):
-        with open(progress_file, "rt") as f:
-            reader = csv.DictReader(f)
-            progress = list(reader)
-    else:
-        progress = []
-
-    checkpoints = []
-    num_skipped = 0
-    for cp_dir in sorted(os.listdir(trial_dir)):
-        if not cp_dir.startswith("checkpoint_"):
-            continue
-
-        cp_full_dir = os.path.join(trial_dir, cp_dir)
-        json_path = os.path.join(cp_full_dir, CHECKPOINT_DATA_FILENAME)
-
-        if os.path.exists(json_path):
-            with open(json_path, "rb") as f:
-                checkpoint_data = pickle.load(f)
-        else:
-            # If neither file exists, this means the checkpoint got only synced half,
-            # so we should skip it
-            continue
-        checkpoints.append((cp_dir, checkpoint_data))
-
-    # Load the artifact data
-    trial_artifact_path = os.path.join(trial_dir, ARTIFACT_FILENAME)
-    artifact_data = None
-    if os.path.exists(trial_artifact_path):
-        with open(trial_artifact_path, "r") as f:
-            artifact_data = f.read()
-
-    return TrialCheckpointData(
-        params=params,
-        results=results,
-        progress=progress,
-        checkpoints=checkpoints,
-        num_skipped=num_skipped,
-        artifact_data=artifact_data,
-    )
-
-
-def load_data_from_trial_exp_checkpoints(
-    trial_to_exp_dir: Dict[TrialStub, str]
-) -> Dict[TrialStub, ExperimentDirCheckpoint]:
-    trial_to_checkpoint_data = {}
-    for trial, dirname in trial_to_exp_dir.items():
-        trial_to_checkpoint_data[trial] = load_experiment_checkpoint_from_dir(
-            trial_to_exp_dir.keys(), dirname
-        )
-
-    return trial_to_checkpoint_data
-
-
-# Load all relevant data
-
-
-def get_experiment_and_trial_data(
-    experiment_name: str,
-) -> Tuple[
-    ExperimentStateCheckpoint,
-    ExperimentDirCheckpoint,
-    Dict[TrialStub, ExperimentDirCheckpoint],
-]:
-    experiment_dir = assert_experiment_dir_exists(experiment_name=experiment_name)
-
-    experiment_state = load_experiment_checkpoint_from_state_file(
-        experiment_dir=experiment_dir
-    )
-
-    assert_experiment_checkpoint_validity(experiment_state)
-
-    driver_dir_cp = load_experiment_checkpoint_from_dir(
-        experiment_state.trials, experiment_dir
-    )
-
-    # Fetch experiment dirs from remote nodes to a local temp dir
-    trial_to_exp_dir = fetch_trial_node_dirs_to_tmp_dir(experiment_state.trials)
-
-    # Load data stored in these experiment dirs
-    trial_exp_checkpoint_data = load_data_from_trial_exp_checkpoints(trial_to_exp_dir)
-
-    return experiment_state, driver_dir_cp, trial_exp_checkpoint_data
-
-
-def get_bucket_data(
-    bucket: str,
-    experiment_name: str,
-) -> Tuple[ExperimentStateCheckpoint, ExperimentDirCheckpoint]:
-    local_bucket_dir = fetch_bucket_contents_to_tmp_dir(bucket)
-    local_experiment_dir = os.path.join(local_bucket_dir, experiment_name)
-
-    bucket_state_cp = load_experiment_checkpoint_from_state_file(local_experiment_dir)
-
-    bucket_dir_cp = load_experiment_checkpoint_from_dir(
-        bucket_state_cp.trials, local_experiment_dir
-    )
-
-    return bucket_state_cp, bucket_dir_cp
-
-
-# Assertions
-
-
-def assert_experiment_dir_exists(experiment_name: str) -> str:
-    experiment_dir = os.path.join(_get_defaults_results_dir(), experiment_name)
-
-    if not os.path.exists(experiment_dir):
-        raise RuntimeError(
-            f"Check failed: Experiment dir {experiment_dir} does not exist."
-        )
-
-    return experiment_dir
-
-
-def assert_experiment_checkpoint_validity(experiment_state: ExperimentStateCheckpoint):
-    assert len(experiment_state.trials) == 4, "Not all trials have been created."
-
-
-def assert_min_num_trials(
-    trials: Iterable[TrialStub], on_driver: int, on_worker: int
-) -> Tuple[int, int]:
-    num_trials_on_driver = len([trial for trial in trials if trial.was_on_driver_node])
-
-    num_trials_not_on_driver = len(trials) - num_trials_on_driver
-
-    assert num_trials_on_driver >= on_driver, (
-        f"Not enough trials were scheduled on the driver node "
-        f"({num_trials_on_driver} < {on_driver})."
-    )
-
-    assert num_trials_not_on_driver >= on_worker, (
-        f"Not enough trials were scheduled on remote nodes."
-        f"({num_trials_on_driver} < {on_worker})."
-    )
-
-    return num_trials_on_driver, len(trials) - num_trials_on_driver
-
-
-def assert_checkpoint_count(
-    experiment_dir_cp: ExperimentDirCheckpoint,
-    for_driver_trial: int,
-    for_worker_trial: int,
-    max_additional: int = 0,
-):
-    for trial, trial_cp in experiment_dir_cp.trial_to_cps.items():
-        cps = len(trial_cp.checkpoints)
-        num_skipped = trial_cp.num_skipped
-        if trial.was_on_driver_node:
-            assert (
-                cps >= for_driver_trial and cps <= for_driver_trial + max_additional
-            ), (
-                f"Trial {trial.trial_id} was on driver, "
-                f"but did not observe the expected amount of checkpoints "
-                f"({cps} != {for_driver_trial}, "
-                f"skipped={num_skipped}, max_additional={max_additional}). "
-                f"Directory: {experiment_dir_cp.dir}"
-            )
-        else:
-            assert (
-                cps >= for_worker_trial and cps <= for_worker_trial + max_additional
-            ), (
-                f"Trial {trial.trial_id} was not on the driver, "
-                f"but did not observe the expected amount of checkpoints "
-                f"({cps} != {for_worker_trial}, "
-                f"skipped={num_skipped}, max_additional={max_additional}). "
-                f"Directory: {experiment_dir_cp.dir}"
-            )
-
-
-def assert_artifact_existence_and_validity(
-    experiment_dir_cp: ExperimentDirCheckpoint,
-    exists_for_driver_trials: bool,
-    exists_for_worker_trials: bool,
-    skip_validation: bool = False,
-):
-    for trial, trial_cp in experiment_dir_cp.trial_to_cps.items():
-        artifact_data = trial_cp.artifact_data
-        artifact_exists = artifact_data is not None
-        # exists_for_xxx_trials == artifact_exists covers 2 cases:
-        # 1. Artifact should not exist for driver trial + it actually doesn't exist
-        # 2. Artifact should exist + it actually exists
-        if trial.was_on_driver_node:
-            assert exists_for_driver_trials == artifact_exists, (
-                "Trial {trial.trial_id} was ON THE DRIVER, where the artifact "
-                f"SHOULD {'' if exists_for_driver_trials else 'NOT'} exist, "
-                f"but found that it DOES {'' if artifact_exists else 'NOT'} exist.\n"
-                f"Directory: {experiment_dir_cp.dir}"
-            )
-        else:
-            assert exists_for_worker_trials == artifact_exists, (
-                "Trial {trial.trial_id} was NOT ON THE DRIVER, where the artifact "
-                f"SHOULD {'' if exists_for_driver_trials else 'NOT'} exist, "
-                f"but found that it DOES {'' if artifact_exists else 'NOT'} exist.\n"
-                f"Directory: {experiment_dir_cp.dir}"
-            )
-
-        if not artifact_exists or skip_validation:
-            continue
-
-        # NOTE: This expects `artifact_data` to be comma-separated string
-        # with the trial.config["id"] showing up as many times as the
-        # latest checkpoint iteration.
-        # Ex: checkpoint_5 is the latest checkpoint for trial with id=0
-        # -> Expect artifact_data == "0,0,0,0,0,"
-        artifact_data_list = artifact_data.split(",")[:-1]  # Account for extra ,
-        artifact_iter = len(artifact_data_list)
-        checkpoint_iters = sorted(
-            [
-                checkpoint_data["internal_iter"]
-                for _, checkpoint_data in trial_cp.checkpoints
-            ],
-            reverse=True,
-        )
-        # TODO(ml-team): Compare to latest checkpoint only after
-        # checkpoint+artifact saving is done atomically.
-        top_two = checkpoint_iters[:2]
-        print(
-            f"\nGot artifact_iter = {artifact_iter}, "
-            f"and top 2 checkpoint iters were {top_two}"
-        )
-        trial_id = trial.config["id"]
-        assert all(
-            id == str(trial_id) for id in artifact_data_list
-        ), f"The artifact data should contain only {trial_id}: {artifact_data_list}"
-        assert artifact_iter >= min(top_two), (
-            "The artifact data is not synced with respect to the latest checkpoint! "
-            f"Expected the artifact to contain at least {min(top_two)} "
-            f"iterations of data, but only got {artifact_iter}."
-        )
-
-
-def assert_trial_progressed_training(trial: TrialStub):
-    assert (
-        trial.last_result["training_iteration"]
-        > trial.last_result["iterations_since_restore"]
-    ), (
-        f"Trial {trial.trial_id} had a checkpoint but did not continue "
-        f"on resume (training iteration: "
-        f"{trial.last_result['training_iteration']} <="
-        f"{trial.last_result['iterations_since_restore']}). "
-        f"This probably means the checkpoint has not been synced "
-        f"to the node correctly."
-    )
-
-
-def test_durable_upload(bucket: str):
-    """
-    Sync trial and experiment checkpoints to cloud, so:
-
-        storage_path="s3://"
-
-    Expected results after first checkpoint:
-
-    - 4 trials are running
-    - At least one trial ran on the head node
-    - At least one trial ran remotely
-    - Driver has NO trial checkpoints from head node trial
-      (since they're uploaded directly to storage instead)
-    - Driver has trial artifacts from head node trial
-    - Driver has NO trial checkpoints from remote node trials
-    - Driver has NO trial artifacts from remote node trials
-    - Remote trial dirs only have data for one trial
-    - Remote trial dirs have NO checkpoints for node-local trials
-      (since they're uploaded directly to storage instead)
-    - Remote trial dirs have trial artifacts for node-local trials
-    - Cloud checkpoint is valid
-    - Cloud checkpoint has checkpoints from ALL trials
-    - Cloud checkpoint has artifacts from ALL trials (NOT IMPLEMENTED)
-
-    Then, remote checkpoint directories are cleaned up.
-
-    Expected results after second checkpoint:
-
-    - 4 trials are running
-    - All trials progressed with training
-    - Cloud checkpoint is valid
-    - Cloud checkpoint has checkpoints from all trials
-    - Cloud checkpoint has updated synced artifacts for all trials (NOT IMPLEMENTED)
-
-    """
-    if not bucket:
-        raise ValueError(
-            "The `durable_upload` test requires a `--bucket` argument to be set."
-        )
-
-    experiment_name = "cloud_durable_upload"
-    indicator_file = f"/tmp/{experiment_name}_indicator"
-
-    def before_experiments():
-        clear_bucket_contents(bucket)
-
-    def between_experiments():
-        (
-            experiment_state,
-            driver_dir_cp,
-            trial_exp_checkpoint_data,
-        ) = get_experiment_and_trial_data(experiment_name=experiment_name)
-
-        # Req: 4 trials are running
-        assert all(
-            trial.status == "RUNNING" for trial in experiment_state.trials
-        ), "Not all trials are RUNNING"
-
-        # Req: At least one trial ran on driver
-        # Req: At least one trial ran remotely
-        assert_min_num_trials(
-            driver_dir_cp.trial_to_cps.keys(), on_driver=1, on_worker=1
-        )
-
-        # Req: Driver has NO trial checkpoints from head node trial
-        # Req: Driver has NO trial checkpoints from remote node trials
-        assert_checkpoint_count(
-            driver_dir_cp, for_driver_trial=0, for_worker_trial=0, max_additional=0
-        )
-
-        # Req: Driver has trial artifacts from head node trial
-        # Req: Driver has no trial artifacts from remote node trials
-        # assert_artifact_existence_and_validity(
-        #     driver_dir_cp,
-        #     exists_for_driver_trials=True,
-        #     exists_for_worker_trials=False,
-        # )
-
-        for trial, exp_dir_cp in trial_exp_checkpoint_data.items():
-            # Req: Remote trial dirs only have data for one trial
-
-            seen = len(exp_dir_cp.trial_to_cps)
-
-            if trial.was_on_driver_node:
-                assert seen == 4, (
-                    f"Trial {trial.trial_id} was on driver, "
-                    f"but observed too few trials ({seen}) "
-                    f"in experiment dir."
-                )
-            else:
-                assert seen == 1, (
-                    f"Trial {trial.trial_id} was not on driver, "
-                    f"but observed not exactly 1 trials ({seen}) "
-                    f"in experiment dir."
-                )
-
-            assert_checkpoint_count(
-                exp_dir_cp, for_driver_trial=0, for_worker_trial=0, max_additional=0
-            )
-
-            # Req: Remote trial dirs have artifacts for node-local trials
-            # assert_artifact_existence_and_validity(
-            #     exp_dir_cp,
-            #     exists_for_driver_trials=False,
-            #     exists_for_worker_trials=True,
-            # )
-
-        bucket_state_cp, bucket_dir_cp = get_bucket_data(bucket, experiment_name)
-
-        # Req: Cloud checkpoint is valid
-        assert_experiment_checkpoint_validity(bucket_state_cp)
-
-        # Req: Cloud checkpoint has checkpoints from all trials
-        assert_checkpoint_count(
-            bucket_dir_cp, for_driver_trial=2, for_worker_trial=2, max_additional=2
-        )
-
-        # Req: Cloud checkpoint has artifacts from all trials
-        # assert_artifact_existence_and_validity(
-        #     bucket_dir_cp,
-        #     exists_for_driver_trials=True,
-        #     exists_for_worker_trials=True,
-        # )
-
-        # Delete remote checkpoints before resume
-        print("Deleting remote checkpoints before resume")
-        cleanup_remote_node_experiment_dir(experiment_name)
-
-    def after_experiments():
-        (experiment_state, _, _) = get_experiment_and_trial_data(
-            experiment_name=experiment_name
-        )
-
-        # Req: 4 trials are running
-        assert all(
-            trial.status == "RUNNING" for trial in experiment_state.trials
-        ), "Not all trials are RUNNING"
-
-        for trial in experiment_state.trials:
-            assert_trial_progressed_training(trial)
-
-        bucket_state_cp, bucket_dir_cp = get_bucket_data(bucket, experiment_name)
-
-        # Req: Cloud checkpoint is valid
-        assert_experiment_checkpoint_validity(bucket_state_cp)
-
-        # Req: Cloud checkpoint has checkpoints from all trials
-        assert_checkpoint_count(
-            bucket_dir_cp, for_driver_trial=2, for_worker_trial=2, max_additional=2
-        )
-
-        # assert_artifact_existence_and_validity(
-        #     bucket_dir_cp,
-        #     exists_for_driver_trials=True,
-        #     exists_for_worker_trials=True,
-        #     # TODO(ml-team): Set this flag to True after restoration w/
-        #     # artifacts is supported.
-        #     skip_validation=True,
-        # )
-
-        # clear_bucket_contents(bucket)
-
-    run_time = int(os.getenv("TUNE_RUN_TIME", "180")) or 180
-
-    run_start_timeout = 600 if "rllib" in os.environ["TUNE_TRAINABLE"] else 30
-
-    run_resume_flow(
-        experiment_name=experiment_name,
-        indicator_file=indicator_file,
-        storage_path=bucket,
-        first_run_time=run_time,
-        second_run_time=run_time,
-        run_start_timeout=run_start_timeout,
-        before_experiments_callback=before_experiments,
-        between_experiments_callback=between_experiments,
-        after_experiments_callback=after_experiments,
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "variant", choices=["no_sync_down", "ssh_sync", "durable_upload"]
-    )
-    parser.add_argument("--trainable", type=str, default="function")
-    parser.add_argument("--bucket", type=str, default=None)
-    parser.add_argument("--cpus-per-trial", required=False, default=2, type=int)
-
-    args = parser.parse_args()
-
-    # Check if test should be run using Ray client
-    addr = os.environ.get("RAY_ADDRESS", "")
-    job_name = os.environ.get("RAY_JOB_NAME", "client_cloud_test")
-    if addr.startswith("anyscale://"):
-        uses_ray_client = True
-        ray.init(
-            address=addr,
-            job_name=job_name,
-            runtime_env={"working_dir": os.path.abspath(os.path.dirname(__file__))},
-        )
-    else:
-        uses_ray_client = False
-        ray.init(address="auto")
-
-    print(f"Running cloud test variant: {args.variant}")
-
-    release_test_out = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
-
-    def _run_test(
-        variant: str,
-        trainable: str = "function",
-        run_time: int = 180,
-        bucket: str = "",
-        cpus_per_trial: int = 2,
-        overwrite_tune_script: Optional[str] = None,
-    ) -> Dict:
-        start_time = time.monotonic()
-        print(
-            f"Running test variant `{variant}` on "
-            f"node {ray.util.get_node_ip_address()} with "
-            f"{cpus_per_trial} CPUs per trial."
-        )
-
-        os.environ["TUNE_TRAINABLE"] = str(trainable)
-        os.environ["TUNE_RUN_TIME"] = str(run_time)
-        os.environ["TUNE_NUM_CPUS_PER_TRIAL"] = str(cpus_per_trial)
-
-        if overwrite_tune_script:
-            os.environ["OVERWRITE_TUNE_SCRIPT"] = overwrite_tune_script
-            print(
-                f"The test script has been overwritten with " f"{overwrite_tune_script}"
-            )
-
-        if variant == "durable_upload":
-            test_durable_upload(bucket)
-        else:
-            raise NotImplementedError(f"Unknown variant: {variant}")
-
-        time_taken = time.monotonic() - start_time
-
-        result = {"time_taken": time_taken, "last_update": time.time()}
-        return result
-
-    run_time = 180 if "rllib" in args.trainable else 90
-
-    bucket = None
-    if args.bucket:
-        bucket = os.path.join(args.bucket, f"test_{int(time.time())}")
-
-    err = None
-    try:
-        if not uses_ray_client:
-            print("This test will *not* use Ray client.")
-            result = _run_test(
-                args.variant, args.trainable, run_time, bucket, args.cpus_per_trial
-            )
-        else:
-            print("This test will run using Ray client.")
-
-            wait_for_nodes(num_nodes=4, timeout=300.0)
-
-            # This will usually run on the head node
-            @ray.remote
-            def _get_head_ip():
-                return ray.util.get_node_ip_address()
-
-            ip = ray.get(_get_head_ip.remote())
-
-            remote_tune_script = "/tmp/_tune_script.py"
-
-            print(f"Sending tune script to remote node {ip} ({remote_tune_script})")
-            send_local_file_to_remote_file(TUNE_SCRIPT, remote_tune_script, ip)
-            print("Starting remote cloud test using Ray client")
-
-            _run_test_remote = ray.remote(resources={f"node:{ip}": 0.01}, num_cpus=0)(
-                _run_test
-            )
-            result = ray.get(
-                _run_test_remote.remote(
-                    args.variant,
-                    args.trainable,
-                    run_time,
-                    bucket,
-                    args.cpus_per_trial,
-                    remote_tune_script,
-                )
-            )
-    except Exception as e:
-        err = e
-        result = {}
-
-    if bucket:
-        try:
-            # clear_bucket_contents(bucket)
-            pass
-        except Exception as be:
-            print(f"Error during cleanup of bucket: {be}")
-
-    with open(release_test_out, "wt") as f:
-        json.dump(result, f)
-
-    if err:
-        raise err
-
-    print(f"Test for variant {args.variant} SUCCEEDED")