[RLlib] Metrics do-over 03: Switch over Learner to new MetricsLogger API. #44995

Merged
Changes from all commits
128 commits
6f1b505
wip
sven1977 Mar 13, 2024
b6e2714
wip
sven1977 Mar 13, 2024
4dfb2ce
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 2, 2024
e6402c6
wip
sven1977 Apr 3, 2024
33487cc
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 4, 2024
a02abbd
doctest fix
sven1977 Apr 4, 2024
d9f3e6e
wip
sven1977 Apr 4, 2024
e909a73
wip
sven1977 Apr 4, 2024
bdaa04c
wip
sven1977 Apr 6, 2024
52d9e12
wip
sven1977 Apr 6, 2024
f77ffdb
wip
sven1977 Apr 8, 2024
81c4c79
Merge branch 'master' of https://github.com/ray-project/ray into clea…
sven1977 Apr 8, 2024
1672675
wip
sven1977 Apr 8, 2024
adf9e8c
wip
sven1977 Apr 8, 2024
c9e5c2f
wip
sven1977 Apr 8, 2024
683bc4b
wip
sven1977 Apr 8, 2024
5bd220f
wip
sven1977 Apr 8, 2024
d931945
LINT
sven1977 Apr 8, 2024
e9888de
wip
sven1977 Apr 8, 2024
0e97d8f
fixes
sven1977 Apr 8, 2024
7584cce
fixes
sven1977 Apr 8, 2024
5ba69af
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 8, 2024
36bfa57
wip
sven1977 Apr 8, 2024
5565d4f
fixes
sven1977 Apr 8, 2024
27c793d
fixes
sven1977 Apr 8, 2024
d75b31a
Merge branch 'master' of https://github.com/ray-project/ray into clea…
sven1977 Apr 8, 2024
bf9cef0
fixes
sven1977 Apr 9, 2024
872b49b
Apply suggestions from code review
sven1977 Apr 9, 2024
743dabd
fixes
sven1977 Apr 9, 2024
d50a39c
Merge remote-tracking branch 'origin/cleanup_examples_folder_03' into…
sven1977 Apr 9, 2024
d1a18c6
fix
sven1977 Apr 9, 2024
a092ea3
Merge branch 'master' of https://github.com/ray-project/ray into clea…
sven1977 Apr 9, 2024
2780a3b
Merge branch 'cleanup_examples_folder_03' into complete_metrics_and_s…
sven1977 Apr 9, 2024
97cb5c3
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 9, 2024
96daa91
fix
sven1977 Apr 9, 2024
56f6bd6
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 15, 2024
e7105c3
wip
sven1977 Apr 15, 2024
a782d24
wip
sven1977 Apr 15, 2024
dcfbaa0
wip
sven1977 Apr 16, 2024
f014844
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 16, 2024
e8957f5
wip
sven1977 Apr 16, 2024
83ff35b
wip
sven1977 Apr 17, 2024
f89a796
wip
sven1977 Apr 17, 2024
429bfab
wip
sven1977 Apr 17, 2024
f6aad0c
wip
sven1977 Apr 17, 2024
96fefb7
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 17, 2024
72b6cd3
WandB logging of videos working!
sven1977 Apr 17, 2024
8133447
wip
sven1977 Apr 18, 2024
2d5b0dd
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 19, 2024
5f0b776
Merge branch 'master' of https://github.com/ray-project/ray into comp…
sven1977 Apr 20, 2024
5d746db
wip
sven1977 Apr 20, 2024
4d910a5
wip
sven1977 Apr 20, 2024
856cb18
Merge branch 'master' into metrics_do_over_02_algo_and_ppo_training_step
sven1977 Apr 20, 2024
10bc567
wip
sven1977 Apr 20, 2024
ae4d83d
Merge remote-tracking branch 'origin/metrics_do_over_02_algo_and_ppo_…
sven1977 Apr 20, 2024
b446c67
test_ppo_w_envrunner passing
sven1977 Apr 21, 2024
7274850
wip
sven1977 Apr 21, 2024
a7871fe
Merge branches 'master' and 'master' of https://github.com/ray-projec…
sven1977 Apr 21, 2024
b93ec75
wip
sven1977 Apr 21, 2024
f47ff61
wip
sven1977 Apr 22, 2024
da91a28
wip
sven1977 Apr 22, 2024
ef8ff4d
wip
sven1977 Apr 22, 2024
49d9cde
wip
sven1977 Apr 22, 2024
7c8647e
fixes
sven1977 Apr 22, 2024
3fe1f99
LINT
sven1977 Apr 22, 2024
709c281
wip
sven1977 Apr 23, 2024
cc8737c
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 23, 2024
0fd8ab9
wip
sven1977 Apr 23, 2024
cd70956
wip
sven1977 Apr 23, 2024
afb01bc
wip
sven1977 Apr 23, 2024
21cd445
wip
sven1977 Apr 23, 2024
a6939f6
wip
sven1977 Apr 23, 2024
7f23206
wip
sven1977 Apr 23, 2024
12df29a
wip
sven1977 Apr 23, 2024
6a01602
wip
sven1977 Apr 23, 2024
481043f
wip
sven1977 Apr 23, 2024
772a7c0
wip
sven1977 Apr 23, 2024
fad02f4
SAC Learning Pendulum (careful about the prio replay buffer setting, …
sven1977 Apr 23, 2024
1713ac6
SAC Learning Pendulum (after sharing its training_step code with DQN,…
sven1977 Apr 23, 2024
16283d7
fixes
sven1977 Apr 23, 2024
6807c1c
fixes
sven1977 Apr 23, 2024
6a07183
fixes
sven1977 Apr 24, 2024
4ca0e44
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 24, 2024
7d6f098
fixes
sven1977 Apr 24, 2024
76fe68b
fixes
sven1977 Apr 24, 2024
9b96ce7
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 24, 2024
81b600e
Merge branch 'metrics_do_over_02_algo_and_ppo_training_step' into met…
sven1977 Apr 24, 2024
9df4c3b
wip
sven1977 Apr 24, 2024
6a4dcb1
wip
sven1977 Apr 24, 2024
812479f
wip
sven1977 Apr 25, 2024
b91f819
wip
sven1977 Apr 25, 2024
36f47c0
merge
sven1977 Apr 26, 2024
38fd4bc
LINT
sven1977 Apr 26, 2024
a2bdd3a
wip
sven1977 Apr 26, 2024
820fe92
wip
sven1977 Apr 26, 2024
1c9da35
wip
sven1977 Apr 27, 2024
52a93de
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 27, 2024
c14cb25
wip
sven1977 Apr 27, 2024
fd5f2cf
wip
sven1977 Apr 27, 2024
7a925b0
learns SAC pendulum in 5000ts ~60sec upto better than -300
sven1977 Apr 28, 2024
5ae4218
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 28, 2024
a152d62
still learns SAC pendulum in 5000ts ~60sec upto better than -300
sven1977 Apr 28, 2024
637a398
still learns SAC pendulum in 5000ts ~60sec upto better than -300
sven1977 Apr 28, 2024
6d7c013
still learns SAC pendulum in 7000ts ~60sec upto better than -200
sven1977 Apr 28, 2024
0485ee7
LINT
sven1977 Apr 28, 2024
e96efdc
fixes
sven1977 Apr 28, 2024
6c5627d
doctest fix
sven1977 Apr 28, 2024
dea6a34
wip
sven1977 Apr 28, 2024
f7ae9ab
wip
sven1977 Apr 29, 2024
e3ff44f
DQN and SAC learn again
sven1977 Apr 29, 2024
1c8cbea
doctest fix
sven1977 Apr 29, 2024
e21b5f2
fix
sven1977 Apr 29, 2024
ea38d03
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 29, 2024
b4e5a79
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 29, 2024
6089c10
wip
sven1977 Apr 29, 2024
de50b67
fix
sven1977 Apr 30, 2024
6ba560d
fixes most test cases
sven1977 Apr 30, 2024
245f9bd
merge
sven1977 Apr 30, 2024
ac7470c
LINT
sven1977 Apr 30, 2024
902818d
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 30, 2024
0f19aa1
wip
sven1977 Apr 30, 2024
3a677c5
fixes
sven1977 Apr 30, 2024
2fae0ca
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 Apr 30, 2024
720340c
Merge branch 'master' of https://github.com/ray-project/ray into metr…
sven1977 May 1, 2024
9242c24
fixes
sven1977 May 1, 2024
4fa2edb
fixes
sven1977 May 1, 2024
3110a56
fixed multi-gpu test cases.
sven1977 May 1, 2024
9ae2204
fixes
sven1977 May 1, 2024
12 changes: 0 additions & 12 deletions doc/source/rllib/package_ref/learner.rst
@@ -158,15 +158,3 @@ Adding and Removing Modules

Learner.add_module
Learner.remove_module

Managing Results
----------------

.. autosummary::
:nosignatures:
:toctree: doc/

Learner.compile_results
Learner.register_metric
Learner.register_metrics
Learner._check_result
40 changes: 5 additions & 35 deletions doc/source/rllib/rllib-learner.rst
@@ -244,12 +244,10 @@ Updates
results = learner_group.update_from_batch(
batch=DUMMY_BATCH, async_update=True
)
# `results` is a list of results dict. The items in the list represent the different
# remote results from the different calls to
# `update_from_batch(..., async_update=True)`.
assert len(results) > 0
# Each item is a results dict, already reduced over the n Learner workers.
assert isinstance(results[0], dict), results[0]
# `results` is an already reduced dict, which is the result of
# reducing over the individual async `update_from_batch(..., async_update=True)`
# calls.
assert isinstance(results, dict), results

# This is an additional non-gradient based update.
learner_group.additional_update(**ADDITIONAL_UPDATE_KWARGS)
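
A minimal usage sketch of the new return behavior documented above (assuming a `learner_group` and `DUMMY_BATCH` set up as in the surrounding docs example; iterating the dict like this is illustrative, not a documented guarantee):

    # Async update now returns one dict, already reduced over all remote Learners
    # (previously: a list of per-call result dicts).
    results = learner_group.update_from_batch(batch=DUMMY_BATCH, async_update=True)
    assert isinstance(results, dict)
    # Per-module entries can be read directly from the single dict.
    for module_id, module_results in results.items():
        print(module_id, module_results)
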
@@ -373,9 +371,7 @@ Implementation
- calculate the loss for gradient based update to a module.
* - :py:meth:`~ray.rllib.core.learner.learner.Learner.additional_update_for_module()`
- do any non gradient based updates to a RLModule, e.g. target network updates.
* - :py:meth:`~ray.rllib.core.learner.learner.Learner.compile_results()`
- compute training statistics and format them for downstream use.


Starter Example
---------------

@@ -417,30 +413,4 @@ A :py:class:`~ray.rllib.core.learner.learner.Learner` that implements behavior c

return loss

@override(Learner)
def compile_results(
self,
*,
batch: MultiAgentBatch,
fwd_out: Dict[str, Any],
loss_per_module: Dict[str, TensorType],
metrics_per_module: DefaultDict[ModuleID, Dict[str, Any]],
) -> Dict[str, Any]:

results = super().compile_results(
batch=batch,
fwd_out=fwd_out,
loss_per_module=loss_per_module,
metrics_per_module=metrics_per_module,
)
# report the mean weight of each
mean_ws = {}
for module_id in self.module.keys():
m = self.module[module_id]
parameters = convert_to_numpy(self.get_parameters(m))
mean_ws[module_id] = np.mean([w.mean() for w in parameters])
results[module_id]["mean_weight"] = mean_ws[module_id]

return results
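
Since `compile_results()` is removed, custom per-module stats such as the "mean_weight" above would instead be logged through the Learner's MetricsLogger. A hedged sketch of the equivalent (the class and helper names are illustrative; `get_parameters()` and `convert_to_numpy()` are taken from the removed code):

    import numpy as np
    from ray.rllib.core.learner.learner import Learner
    from ray.rllib.utils.numpy import convert_to_numpy

    class MyBCLearner(Learner):  # name is illustrative
        def _log_mean_weights(self) -> None:
            # Log the mean weight of each RLModule via the MetricsLogger,
            # instead of returning it from a `compile_results()` override.
            for module_id in self.module.keys():
                parameters = convert_to_numpy(self.get_parameters(self.module[module_id]))
                mean_w = float(np.mean([w.mean() for w in parameters]))
                # window=1: log as a single latest value (no mean/ema over time).
                self.metrics.log_value((module_id, "mean_weight"), mean_w, window=1)
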


21 changes: 12 additions & 9 deletions rllib/algorithms/algorithm.py
@@ -91,6 +91,7 @@
from ray.rllib.utils.metrics import (
ALL_MODULES,
ENV_RUNNER_RESULTS,
ENV_RUNNER_SAMPLING_TIMER,
EVALUATION_ITERATION_TIMER,
EVALUATION_RESULTS,
FAULT_TOLERANCE_STATS,
@@ -117,7 +118,6 @@
TIMERS,
TRAINING_ITERATION_TIMER,
TRAINING_STEP_TIMER,
SAMPLE_TIMER,
STEPS_TRAINED_THIS_ITER_COUNTER,
)
from ray.rllib.utils.metrics.learner_info import LEARNER_INFO
@@ -1608,9 +1608,9 @@ def training_step(self) -> ResultDict:
)

# Collect SampleBatches from sample workers until we have a full batch.
with self.metrics.log_time((TIMERS, SAMPLE_TIMER)):
with self.metrics.log_time((TIMERS, ENV_RUNNER_SAMPLING_TIMER)):
if self.config.count_steps_by == "agent_steps":
train_batch, env_runner_metrics = synchronous_parallel_sample(
train_batch, env_runner_results = synchronous_parallel_sample(
worker_set=self.workers,
max_agent_steps=self.config.train_batch_size,
sample_timeout_s=self.config.sample_timeout_s,
@@ -1620,7 +1620,7 @@
_return_metrics=True,
)
else:
train_batch, env_runner_metrics = synchronous_parallel_sample(
train_batch, env_runner_results = synchronous_parallel_sample(
worker_set=self.workers,
max_env_steps=self.config.train_batch_size,
sample_timeout_s=self.config.sample_timeout_s,
@@ -1632,7 +1632,7 @@
train_batch = train_batch.as_multi_agent()

# Reduce EnvRunner metrics over the n EnvRunners.
self.metrics.log_n_dicts(env_runner_metrics, key=ENV_RUNNER_RESULTS)
self.metrics.log_n_dicts(env_runner_results, key=ENV_RUNNER_RESULTS)

# Only train if train_batch is not empty.
# In an extreme situation, all rollout workers die during the
@@ -3115,13 +3115,16 @@ def _create_local_replay_buffer_if_necessary(
return from_config(ReplayBuffer, config["replay_buffer_config"])

def _run_one_training_iteration(self) -> Tuple[ResultDict, "TrainIterCtx"]:
"""Runs one training iteration (self.iteration will be +1 after this).
"""Runs one training iteration (`self.iteration` will be +1 after this).

Calls `self.training_step()` repeatedly until the minimum time (sec),
sample- or training steps have been reached.
Calls `self.training_step()` repeatedly until the configured minimum time (sec),
minimum sample- or minimum training steps have been reached.

Returns:
The results dict from the training iteration.
The ResultDict from the last call to `training_step()`. Note that even
though we only return the last ResultDict, the user still has full control
over the history and reduce behavior of individual metrics at the time these
metrics are logged with `self.metrics.log_...()`.
"""
with self._timers[TRAINING_ITERATION_TIMER]:
# In case we are training (in a thread) parallel to evaluation,
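
The `log_time()` / `log_n_dicts()` calls above are part of the new MetricsLogger API this PR switches the Learner stack over to. A small standalone sketch of the core calls (the import path, direct construction, and `reduce()` returning the nested result dict are assumptions; inside RLlib the logger is owned by the Algorithm/Learner as `self.metrics`):

    import time
    from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

    logger = MetricsLogger()
    # Time a code block under a nested (tuple) key.
    with logger.log_time(("timers", "sampling")):
        time.sleep(0.01)
    # Scalar value; by default reduced as a (windowed/EMA) mean over time.
    logger.log_value("loss", 0.25)
    # Lifetime counter: accumulate via reduce="sum".
    logger.log_value(("counters", "env_steps"), 4000, reduce="sum")
    # Reduce everything logged so far into a nested results dict.
    results = logger.reduce()
    print(results)
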
32 changes: 16 additions & 16 deletions rllib/algorithms/appo/appo_learner.py
@@ -1,5 +1,4 @@
import abc
from typing import Any, Dict

from ray.rllib.algorithms.appo.appo import APPOConfig
from ray.rllib.algorithms.impala.impala_learner import ImpalaLearner
@@ -46,7 +45,7 @@ def additional_update_for_module(
last_update: int,
mean_kl_loss_per_module: dict,
**kwargs,
) -> Dict[str, Any]:
) -> None:
"""Updates the target networks and KL loss coefficients (per module).

Args:
@@ -63,27 +62,28 @@
# updates.
# We should instead have the target / kl threshold update be based off
# of the train_batch_size * some target update frequency * num_sgd_iter.
results = super().additional_update_for_module(
super().additional_update_for_module(
module_id=module_id, config=config, timestep=timestep
)

if (timestep - last_update) >= config.target_update_frequency:
# TODO (Sven): DQN uses `config.target_network_update_freq`. Can we
# choose a standard here?
last_update_ts_key = (module_id, LAST_TARGET_UPDATE_TS)
if (
timestep - self.metrics.peek(last_update_ts_key, default=0)
>= config.target_update_frequency
):
self._update_module_target_networks(module_id, config)
results[NUM_TARGET_UPDATES] = 1
results[LAST_TARGET_UPDATE_TS] = timestep
else:
results[NUM_TARGET_UPDATES] = 0
results[LAST_TARGET_UPDATE_TS] = last_update
# Increase lifetime target network update counter by one.
self.metrics.log_value((module_id, NUM_TARGET_UPDATES), 1, reduce="sum")
# Update the (single-value -> window=1) last updated timestep metric.
self.metrics.log_value(last_update_ts_key, timestep, window=1)

if config.use_kl_loss and module_id in mean_kl_loss_per_module:
results.update(
self._update_module_kl_coeff(
module_id, config, mean_kl_loss_per_module[module_id]
)
self._update_module_kl_coeff(
module_id, config, mean_kl_loss_per_module[module_id]
)

return results

@abc.abstractmethod
def _update_module_target_networks(
self, module_id: ModuleID, config: APPOConfig
@@ -100,7 +100,7 @@ def _update_module_target_networks(
@abc.abstractmethod
def _update_module_kl_coeff(
self, module_id: ModuleID, config: APPOConfig, sampled_kl: float
) -> Dict[str, Any]:
) -> None:
"""Dynamically update the KL loss coefficients of each module with.

The update is completed using the mean KL divergence between the action
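
The `peek()` / `log_value()` combination above turns the MetricsLogger into the Learner's bookkeeping store for "when did I last update the target network". A tiny sketch of just that round trip (key names are illustrative; the `default=` fallback and `window=1` semantics are as used in the diff, the direct construction is an assumption):

    from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

    metrics = MetricsLogger()
    last_update_ts_key = ("default_policy", "last_target_update_ts")

    # Nothing logged yet -> `default` is returned.
    print(metrics.peek(last_update_ts_key, default=0))  # expected: 0
    # Log the current timestep; window=1 keeps only the most recent value.
    metrics.log_value(last_update_ts_key, 2000, window=1)
    # A later call (e.g. in the next `additional_update_for_module()`) can read
    # it back to decide whether another target update is due.
    print(metrics.peek(last_update_ts_key, default=0))  # expected: 2000
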
20 changes: 14 additions & 6 deletions rllib/algorithms/appo/tf/appo_tf_learner.py
@@ -1,4 +1,4 @@
from typing import Any, Dict
from typing import Dict

from ray.rllib.algorithms.appo.appo import (
APPOConfig,
@@ -159,9 +159,11 @@ def compute_loss_for_module(
+ (mean_kl_loss * self.curr_kl_coeffs_per_module[module_id])
)

# Register important loss stats.
self.register_metrics(
module_id,
# Register all important loss stats.
# Note that our MetricsLogger (self.metrics) is currently in tensor-mode,
# meaning that it allows us to even log in-graph/compiled tensors through
# its `log_...()` APIs.
self.metrics.log_dict(
{
Collaborator review comment: Nice. This is now much cleaner. Learner only learns and metrics logger logs.

POLICY_LOSS_KEY: mean_pi_loss,
VF_LOSS_KEY: mean_vf_loss,
@@ -171,6 +173,8 @@
self.curr_kl_coeffs_per_module[module_id]
),
},
key=module_id,
window=1, # <- single items (should not be mean/ema-reduced over time).
)
# Return the total loss.
return total_loss
Expand All @@ -194,7 +198,7 @@ def _update_module_target_networks(
@override(AppoLearner)
def _update_module_kl_coeff(
self, module_id: ModuleID, config: APPOConfig, sampled_kl: float
) -> Dict[str, Any]:
) -> None:
# Update the current KL value based on the recently measured value.
# Increase.
kl_coeff_var = self.curr_kl_coeffs_per_module[module_id]
@@ -206,4 +210,8 @@
elif sampled_kl < 0.5 * config.kl_target:
kl_coeff_var.assign(kl_coeff_var * 0.5)

return {LEARNER_RESULTS_CURR_KL_COEFF_KEY: kl_coeff_var.numpy()}
self.metrics.log_value(
(module_id, LEARNER_RESULTS_CURR_KL_COEFF_KEY),
kl_coeff_var.numpy(),
window=1,
)
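
The `log_dict()` call in `compute_loss_for_module()` above groups several related stats under a common `key=` prefix. A hedged, standalone sketch of the same pattern (stat and module names are illustrative; the import path and direct construction are assumptions):

    from ray.rllib.utils.metrics.metrics_logger import MetricsLogger

    metrics = MetricsLogger()
    # Log several loss stats at once under the module's key.
    metrics.log_dict(
        {"policy_loss": 0.12, "vf_loss": 0.34},
        key="default_policy",
        window=1,  # single items; don't mean/ema-reduce over time
    )
    # Nested (tuple) keys address the individual values.
    print(metrics.peek(("default_policy", "policy_loss")))  # expected: 0.12
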
17 changes: 11 additions & 6 deletions rllib/algorithms/appo/torch/appo_torch_learner.py
@@ -1,4 +1,4 @@
from typing import Any, Dict
from typing import Dict

from ray.rllib.algorithms.appo.appo import (
APPOConfig,
@@ -163,9 +163,8 @@ def compute_loss_for_module(
+ (mean_kl_loss * self.curr_kl_coeffs_per_module[module_id])
)

# Register important loss stats.
self.register_metrics(
module_id,
# Log important loss stats.
self.metrics.log_dict(
{
POLICY_LOSS_KEY: mean_pi_loss,
VF_LOSS_KEY: mean_vf_loss,
@@ -175,6 +174,8 @@
self.curr_kl_coeffs_per_module[module_id]
),
},
key=module_id,
window=1, # <- single items (should not be mean/ema-reduced over time).
)
# Return the total loss.
return total_loss
@@ -231,7 +232,7 @@ def _update_module_target_networks(
@override(AppoLearner)
def _update_module_kl_coeff(
self, module_id: ModuleID, config: APPOConfig, sampled_kl: float
) -> Dict[str, Any]:
) -> None:
# Update the current KL value based on the recently measured value.
# Increase.
kl_coeff_var = self.curr_kl_coeffs_per_module[module_id]
@@ -243,4 +244,8 @@
elif sampled_kl < 0.5 * config.kl_target:
kl_coeff_var.data *= 0.5

return {LEARNER_RESULTS_CURR_KL_COEFF_KEY: kl_coeff_var.item()}
self.metrics.log_value(
(module_id, LEARNER_RESULTS_CURR_KL_COEFF_KEY),
kl_coeff_var.item(),
window=1,
)
6 changes: 4 additions & 2 deletions rllib/algorithms/bc/tf/bc_tf_learner.py
@@ -57,11 +57,13 @@ def possibly_masked_mean(t):

policy_loss = -possibly_masked_mean(log_probs)

self.register_metrics(
module_id,
# Log important loss stats.
self.metrics.log_dict(
{
POLICY_LOSS_KEY: policy_loss,
},
key=module_id,
window=1, # <- single items (should not be mean/ema-reduced over time).
)

# Return total loss which is for BC simply the policy loss.
7 changes: 6 additions & 1 deletion rllib/algorithms/bc/torch/bc_torch_learner.py
@@ -57,7 +57,12 @@ def possibly_masked_mean(t):

policy_loss = -possibly_masked_mean(log_probs)

self.register_metrics(module_id, {POLICY_LOSS_KEY: policy_loss})
# Log important loss stats.
self.metrics.log_dict(
{POLICY_LOSS_KEY: policy_loss},
key=module_id,
window=1, # <- single items (should not be mean/ema-reduced over time).
)

# Return the total loss which is for BC simply the policy loss.
return policy_loss