[docs/train] Add Result guide #38359

Merged
merged 15 commits on Aug 17, 2023
3 changes: 3 additions & 0 deletions doc/source/data/batch_inference.rst
@@ -420,6 +420,9 @@ Suppose your cluster has 4 nodes, each with 16 CPUs. To limit to at most
predictions.show(limit=1)


.. _batch_inference_ray_train:


Using models from Ray Train
---------------------------

58 changes: 52 additions & 6 deletions doc/source/train/doc_code/key_concepts.py
@@ -70,7 +70,8 @@ def train_fn(config):
    for i in range(state["step"], 10):
        state["step"] += 1
        train.report(
            metrics={"step": state["step"]}, checkpoint=Checkpoint.from_dict(state)
            metrics={"step": state["step"], "loss": (100 - i) / 100},
            checkpoint=Checkpoint.from_dict(state),
        )


@@ -160,12 +161,57 @@ def train_fn(config):
# __checkpoint_config_ckpt_freq_end__


# __results_start__
# __result_metrics_start__
result = trainer.fit()

# Print metrics
print("Observed metrics:", result.metrics)
# __result_metrics_end__

checkpoint_data = result.checkpoint.to_dict()
print("Checkpoint data:", checkpoint_data["step"])
# __results_end__

# __result_dataframe_start__
df = result.metrics_dataframe
print("Minimum loss", min(df["loss"]))
# __result_dataframe_end__


# __result_checkpoint_start__
print("Last checkpoint:", result.checkpoint)

with result.checkpoint.as_directory() as tmpdir:
    # Load model from directory
    ...
# __result_checkpoint_end__

# __result_best_checkpoint_start__
# Print available checkpoints
for checkpoint, metrics in result.best_checkpoints:
    print("Loss", metrics["loss"], "checkpoint", checkpoint)

# Get checkpoint with minimal loss
best_checkpoint = min(result.best_checkpoints, key=lambda bc: bc[1]["loss"])[0]

with best_checkpoint.as_directory() as tmpdir:
    # Load model from directory
    ...
# __result_best_checkpoint_end__

# __result_path_start__
result_path = result.path
print("Results location", result_path)
# __result_path_end__


# __result_restore_start__
from ray.train import Result

restored_result = Result.from_path(result_path)
print("Restored loss", result.metrics["loss"])
# __result_restore_end__


# __result_error_start__
if result.error:
    assert isinstance(result.error, Exception)

    print("Got exception:", result.error)
# __result_error_end__
2 changes: 1 addition & 1 deletion doc/source/train/key-concepts.rst
@@ -73,7 +73,7 @@ The latter contains subconfigurations, such as the :class:`FailureConfig <ray.ai
Train Checkpoints
-----------------

Calling ``Trainer.fit()`` returns a :class:`Result <ray.air.result.Result>` object, which includes
Calling ``Trainer.fit()`` returns a :class:`Result <ray.train.result.Result>` object, which includes
information about the run such as the reported metrics and the saved checkpoints.

Checkpoints have the following purposes:
1 change: 1 addition & 0 deletions doc/source/train/user-guides.rst
@@ -12,5 +12,6 @@ Ray Train User Guides
user-guides/monitoring-logging
user-guides/checkpoints
user-guides/experiment-tracking
user-guides/results
user-guides/fault-tolerance
user-guides/advanced
125 changes: 125 additions & 0 deletions doc/source/train/user-guides/results.rst
Contributor:

This is awesome. One high-level suggestion for extending this is to add some more color about what the user should do with these attributes.

As a user, what can/should I do with the metrics, checkpoints, etc.? We can guide them toward common steps, such as visualizing metrics with TensorBoard, or using the checkpoint for prediction.

Contributor Author:

Added a bit - keeping it concise for now to not clutter the page, but happy to add more references.

@@ -0,0 +1,125 @@
Inspecting Training Results
===========================

The return value of your :meth:`Trainer.fit() <ray.train.trainer.BaseTrainer.fit>`
call is a :class:`~ray.train.Result` object.

The :class:`~ray.train.Result` object contains, among other information:

- The last reported metrics (e.g. the loss)
- The last reported checkpoint (to load the model)
- Error messages, if any errors occurred
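
At a glance, accessing these attributes looks like the following
(a minimal sketch; each attribute is covered in detail below):

.. code-block:: python

    result = trainer.fit()

    result.metrics     # last reported metrics
    result.checkpoint  # last reported checkpoint
    result.error       # the raised exception, if training failed
    result.path        # storage location of the run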

Viewing metrics
---------------
You can retrieve metrics reported to Ray Train from the :class:`~ray.train.Result`
object.

Common metrics include the training or validation loss, or the prediction accuracy.

The metrics retrieved from the :class:`~ray.train.Result` object
correspond to those you passed to :func:`train.report <ray.train.report>`
as an argument :ref:`in your training function <train-monitoring-and-logging>`.
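
For example, a training function might report a toy loss on every step,
which later surfaces in the :class:`~ray.train.Result`
(a minimal sketch; the metric names and values are illustrative):

.. code-block:: python

    from ray import train

    def train_fn(config):
        for step in range(10):
            # Toy loss that decreases over time; substitute your training logic.
            loss = (10 - step) / 10
            train.report(metrics={"step": step, "loss": loss})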


Last reported metrics
~~~~~~~~~~~~~~~~~~~~~

Use :attr:`Result.metrics <ray.train.Result.metrics>` to retrieve the
latest reported metrics.

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_metrics_start__
:end-before: __result_metrics_end__

DataFrame of all reported metrics
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Use :attr:`Result.metrics_dataframe <ray.train.Result.metrics_dataframe>` to retrieve
a pandas DataFrame of all reported metrics.

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_dataframe_start__
:end-before: __result_dataframe_end__
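
Because this is a regular pandas DataFrame, you can also aggregate or plot it,
for example to inspect the loss curve (a minimal sketch, assuming matplotlib
is installed for pandas plotting):

.. code-block:: python

    df = result.metrics_dataframe
    df.plot(x="step", y="loss")  # plot the reported loss over training steps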


Retrieving checkpoints
----------------------
You can retrieve checkpoints reported to Ray Train from the :class:`~ray.train.Result`
object.

:ref:`Checkpoints <train-checkpointing>` contain all the information that is needed
to restore the training state. This usually includes the trained model.

You can use checkpoints for common downstream tasks such as
:ref:`offline batch inference with Ray Data <batch_inference_ray_train>`,
or :doc:`online model serving with Ray Serve </serve/index>`.

The checkpoints retrieved from the :class:`~ray.train.Result` object
correspond to those you passed to :func:`train.report <ray.train.report>`
as an argument :ref:`in your training function <train-monitoring-and-logging>`.

Last saved checkpoint
~~~~~~~~~~~~~~~~~~~~~
Use :attr:`Result.checkpoint <ray.train.Result.checkpoint>` to retrieve the
last checkpoint.

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_checkpoint_start__
:end-before: __result_checkpoint_end__
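
If the checkpoint was created from a dictionary with ``Checkpoint.from_dict()``,
you can also recover the stored state directly (a minimal sketch; the
``"step"`` key is illustrative and assumes it was stored in the checkpoint):

.. code-block:: python

    checkpoint_data = result.checkpoint.to_dict()
    print("Checkpoint data:", checkpoint_data["step"])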


Other checkpoints
~~~~~~~~~~~~~~~~~
Sometimes you want to access an earlier checkpoint. For instance, if your loss increased
after more training due to overfitting, you may want to retrieve the checkpoint with
the lowest loss.

You can retrieve a list of all available checkpoints and their metrics with
:attr:`Result.best_checkpoints <ray.train.Result.best_checkpoints>`.

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_best_checkpoint_start__
:end-before: __result_best_checkpoint_end__
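
How many checkpoints are retained here depends on your checkpoint configuration,
for example (a minimal sketch; the values are illustrative, and this assumes
``CheckpointConfig`` is importable from ``ray.train`` like ``RunConfig``):

.. code-block:: python

    from ray.train import CheckpointConfig, RunConfig

    # Keep the two checkpoints with the lowest reported loss.
    run_config = RunConfig(
        checkpoint_config=CheckpointConfig(
            num_to_keep=2,
            checkpoint_score_attribute="loss",
            checkpoint_score_order="min",
        )
    )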

Accessing storage location
---------------------------
If you need to retrieve the results later, you can get the storage location
of the training run with :attr:`Result.path <ray.train.Result.path>`.

This path corresponds to the :ref:`storage_path <train-log-dir>` you configured
in the :class:`~ray.train.RunConfig`. The run is stored in a
(nested) subdirectory within that path, usually
of the form ``TrainerName_date-string/TrainerName_id_00000_0_...``.


.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_path_start__
:end-before: __result_path_end__


You can restore a result with :meth:`Result.from_path <ray.train.Result.from_path>`:

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_restore_start__
:end-before: __result_restore_end__



Viewing errors
--------------
If an error occurred during training,
:attr:`Result.error <ray.train.Result.error>` will be set and contain the exception
that was raised.

.. literalinclude:: ../doc_code/key_concepts.py
:language: python
:start-after: __result_error_start__
:end-before: __result_error_end__
