From 8f85a70265293f20df1fdef48ae112a5da936097 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 2 May 2024 12:08:39 +0200
Subject: [PATCH 1/4] wip

Signed-off-by: sven1977
---
 rllib/env/multi_agent_env.py                  |  12 +-
 .../examples/envs/custom_env_render_method.py | 203 ++++++++++++++++++
 2 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 rllib/examples/envs/custom_env_render_method.py

diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py
index 19554ff24444..78be0ec26db8 100644
--- a/rllib/env/multi_agent_env.py
+++ b/rllib/env/multi_agent_env.py
@@ -2,6 +2,8 @@
 import logging
 from typing import Callable, Dict, List, Tuple, Optional, Union, Set, Type
 
+import numpy as np
+
 from ray.rllib.env.base_env import BaseEnv
 from ray.rllib.env.env_context import EnvContext
 from ray.rllib.utils.annotations import (
@@ -554,7 +556,15 @@ def step(self, action_dict):
 
         @override(MultiAgentEnv)
         def render(self):
-            return self.envs[0].render(self.render_mode)
+            # This render method simply renders all n underlying individual single-agent
+            # envs and concatenates their images (on top of each other if the returned
+            # images have dims where [width] > [height], otherwise next to each other).
+            render_images = [e.render() for e in self.envs]
+            if render_images[0].shape[1] > render_images[0].shape[0]:
+                concat_dim = 0
+            else:
+                concat_dim = 1
+            return np.concatenate(render_images, axis=concat_dim)
 
     return MultiEnv
 
diff --git a/rllib/examples/envs/custom_env_render_method.py b/rllib/examples/envs/custom_env_render_method.py
new file mode 100644
index 000000000000..ee75b1863a73
--- /dev/null
+++ b/rllib/examples/envs/custom_env_render_method.py
@@ -0,0 +1,203 @@
+"""Example of implementing a custom `render()` method for your gymnasium RL environment.
+
+This example:
+    - shows how to write a simple gym.Env class yourself, in this case a corridor env,
+      in which the agent starts at the left side of the corridor and has to reach the
+      goal state all the way at the right.
+    - in particular, the new class overrides the Env's `render()` method to show how
+      you can write your own rendering logic.
+    - furthermore, we use the RLlib callbacks class introduced in this example here:
+      https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py  # noqa
+      in order to compile videos of the worst and best performing episodes in each
+      iteration and log these videos to your WandB account, so you can view them.
+
+
+How to run this script
+----------------------
+`python [script file name].py --enable-new-api-stack
+--wandb-key=[your WandB API key] --wandb-project=[some WandB project name]
+--wandb-run-name=[optional: WandB run name within --wandb-project]`
+
+In order to see the actual videos, you need to have a WandB account and provide your
+API key and a project name on the command line (see above).
+
+Use the `--num-agents` argument to set up the env as a multi-agent env. If
+`--num-agents` > 0, RLlib will simply run as many of the defined single-agent
+environments in parallel and with different policies to be trained for each agent.
+
+For debugging, use the following additional command line options
+`--no-tune --num-env-runners=0`
+which should allow you to set breakpoints anywhere in the RLlib code and
+have the execution stop there for inspection and debugging.
+
+
+Results to expect
+-----------------
+After the first training iteration, you should see the videos in your WandB account
+under the provided `--wandb-project` name.
Filter for "videos_best" or "videos_worst". + +Note that the default Tune TensorboardX (TBX) logger might complain about the videos +being logged. This is ok, the TBX logger will simply ignore these. The WandB logger, +however, will recognize the video tensors shaped +(1 [batch], T [video len], 3 [rgb], [height], [width]) and properly create a WandB video +object to be sent to their server. + +Your terminal output should look similar to this (the following is for a +`--num-agents=2` run; expect similar results for the other `--num-agents` +settings): ++---------------------+------------+----------------+--------+------------------+ +| Trial name | status | loc | iter | total time (s) | +|---------------------+------------+----------------+--------+------------------+ +| PPO_env_fb1c0_00000 | TERMINATED | 127.0.0.1:8592 | 3 | 21.1876 | ++---------------------+------------+----------------+--------+------------------+ ++-------+-------------------+-------------+-------------+ +| ts | combined return | return p1 | return p0 | +|-------+-------------------+-------------+-------------| +| 12000 | 12.7655 | 7.3605 | 5.4095 | ++-------+-------------------+-------------+-------------+ +""" + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Discrete +from PIL import Image, ImageDraw + +from ray.rllib.algorithms.ppo import PPOConfig +from ray.rllib.env.multi_agent_env import make_multi_agent +from ray.rllib.examples.envs.env_rendering_and_recording import EnvRenderCallback +from ray.rllib.utils.test_utils import ( + add_rllib_example_script_args, + run_rllib_example_script_experiment, +) +from ray import tune + +parser = add_rllib_example_script_args( + default_iters=10, + default_reward=9.0, + default_timesteps=10000, +) + + +class CustomRenderedCorridorEnv(gym.Env): + """Example of a custom env, for which we specify rendering behavior.""" + + def __init__(self, config): + self.end_pos = config.get("corridor_length", 10) + self.max_steps = config.get("max_steps", 100) + self.cur_pos = 0 + self.steps = 0 + self.action_space = Discrete(2) + self.observation_space = Box(0.0, 999.0, shape=(1,), dtype=np.float32) + + def reset(self, *, seed=None, options=None): + self.cur_pos = 0.0 + self.steps = 0 + return np.array([self.cur_pos], np.float32), {} + + def step(self, action): + self.steps += 1 + assert action in [0, 1], action + if action == 0 and self.cur_pos > 0: + self.cur_pos -= 1.0 + elif action == 1: + self.cur_pos += 1.0 + truncated = self.steps >= self.max_steps + terminated = self.cur_pos >= self.end_pos + return ( + np.array([self.cur_pos], np.float32), + 10.0 if terminated else -0.1, + terminated, + truncated, + {}, + ) + + def render(self) -> np._typing.NDArray[np.uint8]: + """Implements rendering logic for this env (given the current observation). + + You should return a numpy RGB image like so: + np.array([height, width, 3], dtype=np.uint8). + + Returns: + np.ndarray: A numpy uint8 3D array (image) to render. + """ + # Image dimensions. + # Each position in the corridor is 50 pixels wide. + width = (self.end_pos + 2) * 50 + # Fixed height of the image. + height = 100 + + # Create a new image with white background + image = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(image) + + # Draw the corridor walls + # Grey rectangle for the corridor. + draw.rectangle([50, 30, width - 50, 70], fill="grey") + + # Draw the agent. + # Calculate the x coordinate of the agent. + agent_x = (self.cur_pos + 1) * 50 + # Blue rectangle for the agent. 
+        draw.rectangle([agent_x + 10, 40, agent_x + 40, 60], fill="blue")
+
+        # Draw the goal state.
+        # Calculate the x coordinate of the goal.
+        goal_x = self.end_pos * 50
+        # Green rectangle for the goal state.
+        draw.rectangle([goal_x + 10, 40, goal_x + 40, 60], fill="green")
+
+        # Convert the image to a uint8 numpy array.
+        return np.array(image, dtype=np.uint8)
+
+
+# Create a simple multi-agent version of the above Env by duplicating the single-agent
+# env n (n=num agents) times and having the agents act independently, each one in a
+# different corridor.
+MultiAgentCustomRenderedCorridorEnv = make_multi_agent(
+    lambda config: CustomRenderedCorridorEnv(config)
+)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    assert (
+        args.enable_new_api_stack
+    ), "Must set --enable-new-api-stack when running this script!"
+
+    # The `config` arg passed into our Env's constructor (see the class' __init__ method
+    # above). Feel free to change these.
+    env_options = {
+        "corridor_length": 10,
+        "max_steps": 100,
+        "num_agents": args.num_agents,  # <- only used by the multi-agent version.
+    }
+
+    env_cls_to_use = (
+        CustomRenderedCorridorEnv
+        if args.num_agents == 0
+        else MultiAgentCustomRenderedCorridorEnv
+    )
+
+    tune.register_env("env", lambda _: env_cls_to_use(env_options))
+
+    # Example config switching on rendering.
+    base_config = (
+        PPOConfig()
+        # Configure our env to be the above-registered one.
+        .environment("env")
+        # Plug in our env-rendering (and logging) callback. This callback class allows
+        # you to fully customize your rendering behavior (which workers should render,
+        # which episodes, which (vector) env indices, etc.). We refer to this example
+        # script here for further details:
+        # https://github.com/ray-project/ray/blob/master/rllib/examples/envs/env_rendering_and_recording.py  # noqa
+        .callbacks(EnvRenderCallback)
+    )
+
+    if args.num_agents > 0:
+        base_config.multi_agent(
+            policies={f"p{i}" for i in range(args.num_agents)},
+            policy_mapping_fn=lambda aid, eps, **kw: f"p{aid}",
+        )
+
+    run_rllib_example_script_experiment(base_config, args)

From 918f4a37ed4c97f3b7372bd07e3035a63931fe47 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 2 May 2024 12:11:55 +0200
Subject: [PATCH 2/4] wip

Signed-off-by: sven1977
---
 rllib/BUILD | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/rllib/BUILD b/rllib/BUILD
index d1209d8a6227..da81761726a9 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2278,6 +2278,24 @@ py_test(
     args = ["--enable-new-api-stack", "--as-test"]
 )
 
+py_test(
+    name = "examples/envs/custom_env_render_method",
+    main = "examples/envs/custom_env_render_method.py",
+    tags = ["team:rllib", "exclusive", "examples"],
+    size = "small",
+    srcs = ["examples/envs/custom_env_render_method.py"],
+    args = ["--enable-new-api-stack", "--num-agents=0"]
+)
+
+py_test(
+    name = "examples/envs/custom_env_render_method_multi_agent",
+    main = "examples/envs/custom_env_render_method.py",
+    tags = ["team:rllib", "exclusive", "examples"],
+    size = "small",
+    srcs = ["examples/envs/custom_env_render_method.py"],
+    args = ["--enable-new-api-stack", "--num-agents=2"]
+)
+
 #@OldAPIStack
 py_test(
     name = "examples/envs/greyscale_env",

From e85886979796d231c011f3e48800f8742b68b2f6 Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 2 May 2024 19:00:03 +0200
Subject: [PATCH 3/4] fixes

Signed-off-by: sven1977
---
 rllib/BUILD                                   | 10 +++++----
 rllib/algorithms/algorithm.py                 | 22 +++++++++++++------
 .../tests/test_callbacks_on_algorithm.py      |  2 +-
 .../tests/test_callbacks_on_env_runner.py     |  1 +
 .../algorithms/tests/test_worker_failures.py  |  2 +-
 rllib/env/multi_agent_env_runner.py           |  1 +
 rllib/env/single_agent_env_runner.py          |  1 +
 .../self_play_with_policy_checkpoint.py       |  2 +-
 .../curriculum/curriculum_learning.py         |  1 +
 .../envs/env_rendering_and_recording.py       |  2 ++
 .../evaluation_parallel_to_training.py        | 12 +++++++++-
 .../multi_agent/utils/self_play_callback.py   |  2 +-
 .../utils/self_play_league_based_callback.py  |  2 +-
 13 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/rllib/BUILD b/rllib/BUILD
index 29e022ade72e..e956d5ea116a 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -2280,17 +2280,19 @@ py_test(
 
 py_test(
     name = "examples/envs/custom_env_render_method",
-    srcs = ["examples/envs/custom_env_render_method.py"],
+    main = "examples/envs/custom_env_render_method.py",
     tags = ["team:rllib", "exclusive", "examples"],
-    size = "small",
+    size = "medium",
+    srcs = ["examples/envs/custom_env_render_method.py"],
     args = ["--enable-new-api-stack", "--num-agents=0"]
 )
 
 py_test(
     name = "examples/envs/custom_env_render_method_multi_agent",
-    srcs = ["examples/envs/custom_env_render_method.py"],
+    main = "examples/envs/custom_env_render_method.py",
     tags = ["team:rllib", "exclusive", "examples"],
-    size = "small",
+    size = "medium",
+    srcs = ["examples/envs/custom_env_render_method.py"],
     args = ["--enable-new-api-stack", "--num-agents=2"]
 )
 
diff --git a/rllib/algorithms/algorithm.py b/rllib/algorithms/algorithm.py
index 72be65d07cfb..c1d30284bd06 100644
--- a/rllib/algorithms/algorithm.py
+++ b/rllib/algorithms/algorithm.py
@@ -796,7 +796,7 @@ def setup(self, config: AlgorithmConfig) -> None:
         self.workers.sync_weights(inference_only=True)
 
         # Run `on_algorithm_init` callback after initialization is done.
-        self.callbacks.on_algorithm_init(algorithm=self)
+        self.callbacks.on_algorithm_init(algorithm=self, metrics_logger=self.metrics)
 
     @OverrideToImplementCustomLogic
     @classmethod
@@ -999,7 +999,7 @@ def evaluate(
                 config=self.evaluation_config,
             )
 
-        self.callbacks.on_evaluate_start(algorithm=self)
+        self.callbacks.on_evaluate_start(algorithm=self, metrics_logger=self.metrics)
 
         env_steps = agent_steps = 0
         batches = []
@@ -1097,7 +1097,11 @@ def evaluate(
                 eval_results["off_policy_estimator"][name] = avg_estimate
 
         # Trigger `on_evaluate_end` callback.
-        self.callbacks.on_evaluate_end(algorithm=self, evaluation_metrics=eval_results)
+        self.callbacks.on_evaluate_end(
+            algorithm=self,
+            metrics_logger=self.metrics,
+            evaluation_metrics=eval_results,
+        )
 
         # Also return the results here for convenience.
         return eval_results
@@ -2447,9 +2451,13 @@ def load_checkpoint(self, checkpoint_dir: str) -> None:
     def log_result(self, result: ResultDict) -> None:
         # Log after the callback is invoked, so that the user has a chance
        # to mutate the result.
-        # TODO: Remove `algorithm` arg at some point to fully deprecate the old
-        # signature.
-        self.callbacks.on_train_result(algorithm=self, result=result)
+        # TODO (sven): It might not make sense to pass in the MetricsLogger at this late
+        # point in time. In here, the result dict has already been "compiled" (reduced)
+        # by the MetricsLogger and there is probably no point in adding more Stats
+        # here.
+        self.callbacks.on_train_result(
+            algorithm=self, metrics_logger=self.metrics, result=result
+        )
         # Then log according to Trainable's logging logic.
         Trainable.log_result(self, result)
 
@@ -3264,7 +3272,7 @@ def _run_one_training_iteration_and_evaluation_in_parallel_wo_thread(
                 config=self.evaluation_config,
             )
 
-        self.callbacks.on_evaluate_start(algorithm=self)
+        self.callbacks.on_evaluate_start(algorithm=self, metrics_logger=self.metrics)
 
         env_steps = agent_steps = 0
 
diff --git a/rllib/algorithms/tests/test_callbacks_on_algorithm.py b/rllib/algorithms/tests/test_callbacks_on_algorithm.py
index c3533ab6ac8b..9a07da3850fc 100644
--- a/rllib/algorithms/tests/test_callbacks_on_algorithm.py
+++ b/rllib/algorithms/tests/test_callbacks_on_algorithm.py
@@ -35,7 +35,7 @@ def on_workers_recreated(
 
 
 class InitAndCheckpointRestoredCallbacks(DefaultCallbacks):
-    def on_algorithm_init(self, *, algorithm, **kwargs):
+    def on_algorithm_init(self, *, algorithm, metrics_logger, **kwargs):
         self._on_init_was_called = True
 
     def on_checkpoint_loaded(self, *, algorithm, **kwargs):
diff --git a/rllib/algorithms/tests/test_callbacks_on_env_runner.py b/rllib/algorithms/tests/test_callbacks_on_env_runner.py
index 062f39a99f01..34329c20bf41 100644
--- a/rllib/algorithms/tests/test_callbacks_on_env_runner.py
+++ b/rllib/algorithms/tests/test_callbacks_on_env_runner.py
@@ -49,6 +49,7 @@ def on_episode_created(
         episode,
         worker=None,
         env_runner=None,
+        metrics_logger=None,
         base_env=None,
         env=None,
         policies=None,
diff --git a/rllib/algorithms/tests/test_worker_failures.py b/rllib/algorithms/tests/test_worker_failures.py
index 1548aa2b4291..adaa80dc675e 100644
--- a/rllib/algorithms/tests/test_worker_failures.py
+++ b/rllib/algorithms/tests/test_worker_failures.py
@@ -225,7 +225,7 @@ class AddModuleCallback(DefaultCallbacks):
     def __init__(self):
         super().__init__()
 
-    def on_algorithm_init(self, *, algorithm, **kwargs):
+    def on_algorithm_init(self, *, algorithm, metrics_logger, **kwargs):
         # Add a custom module to algorithm.
         spec = algorithm.config.get_default_rl_module_spec()
         spec.observation_space = gym.spaces.Box(low=0, high=1, shape=(8,))
diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py
index 7b9042d013f6..e26968a7d36c 100644
--- a/rllib/env/multi_agent_env_runner.py
+++ b/rllib/env/multi_agent_env_runner.py
@@ -749,6 +749,7 @@ def make_env(self):
         # Call the `on_environment_created` callback.
         self._callbacks.on_environment_created(
             env_runner=self,
+            metrics_logger=self.metrics,
             env=self.env,
             env_context=env_ctx,
         )
diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py
index 5250cddadcb5..23407629835e 100644
--- a/rllib/env/single_agent_env_runner.py
+++ b/rllib/env/single_agent_env_runner.py
@@ -702,6 +702,7 @@ def make_env(self) -> None:
         # Call the `on_environment_created` callback.
         self._callbacks.on_environment_created(
             env_runner=self,
+            metrics_logger=self.metrics,
             env=self.env,
             env_context=env_ctx,
         )
diff --git a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py
index 38531c626b5f..26d663cc7f2e 100644
--- a/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py
+++ b/rllib/examples/_old_api_stack/connectors/self_play_with_policy_checkpoint.py
@@ -46,7 +46,7 @@ def __init__(self, checkpoint_dir):
         self._checkpoint_dir = checkpoint_dir
         super().__init__()
 
-    def on_algorithm_init(self, *, algorithm, **kwargs):
+    def on_algorithm_init(self, *, algorithm, metrics_logger, **kwargs):
         policy = Policy.from_checkpoint(
             self._checkpoint_dir, policy_ids=[OPPONENT_POLICY_ID]
         )
diff --git a/rllib/examples/curriculum/curriculum_learning.py b/rllib/examples/curriculum/curriculum_learning.py
index b0cb6865e98a..5529138e1024 100644
--- a/rllib/examples/curriculum/curriculum_learning.py
+++ b/rllib/examples/curriculum/curriculum_learning.py
@@ -149,6 +149,7 @@ def on_train_result(
         self,
         *,
         algorithm: Algorithm,
+        metrics_logger=None,
         result: dict,
         **kwargs,
     ) -> None:
diff --git a/rllib/examples/envs/env_rendering_and_recording.py b/rllib/examples/envs/env_rendering_and_recording.py
index b7a3e743c93b..60b7a44606d5 100644
--- a/rllib/examples/envs/env_rendering_and_recording.py
+++ b/rllib/examples/envs/env_rendering_and_recording.py
@@ -103,6 +103,7 @@ def on_episode_step(
         *,
         episode,
         env_runner,
+        metrics_logger,
         env,
         env_index,
         rl_module,
@@ -137,6 +138,7 @@ def on_episode_end(
         *,
         episode,
         env_runner,
+        metrics_logger,
         env,
         env_index,
         rl_module,
diff --git a/rllib/examples/evaluation/evaluation_parallel_to_training.py b/rllib/examples/evaluation/evaluation_parallel_to_training.py
index d1e45bed5624..e7b6af7ed3e4 100644
--- a/rllib/examples/evaluation/evaluation_parallel_to_training.py
+++ b/rllib/examples/evaluation/evaluation_parallel_to_training.py
@@ -66,6 +66,8 @@
 |          81.7371 | 100000 |   494.68 |             494.68 |
 +------------------+--------+----------+--------------------+
 """
+from typing import Optional
+
 from ray.rllib.algorithms.algorithm import Algorithm
 from ray.rllib.algorithms.callbacks import DefaultCallbacks
 from ray.rllib.examples.envs.classes.multi_agent import MultiAgentCartPole
@@ -75,6 +77,7 @@
     NUM_EPISODES,
     NUM_ENV_STEPS_SAMPLED,
 )
+from ray.rllib.utils.metrics.metrics_logger import MetricsLogger
 from ray.rllib.utils.test_utils import (
     add_rllib_example_script_args,
     run_rllib_example_script_experiment,
@@ -124,7 +127,14 @@
 
 
 class AssertEvalCallback(DefaultCallbacks):
-    def on_train_result(self, *, algorithm: Algorithm, result: ResultDict, **kwargs):
+    def on_train_result(
+        self,
+        *,
+        algorithm: Algorithm,
+        metrics_logger: Optional[MetricsLogger] = None,
+        result: ResultDict,
+        **kwargs,
+    ):
         # The eval results can be found inside the main `result` dict
         # (old API stack: "evaluation").
         eval_results = result.get(EVALUATION_RESULTS, result.get("evaluation", {}))
diff --git a/rllib/examples/multi_agent/utils/self_play_callback.py b/rllib/examples/multi_agent/utils/self_play_callback.py
index 3554cebcff90..c9dd443ae517 100644
--- a/rllib/examples/multi_agent/utils/self_play_callback.py
+++ b/rllib/examples/multi_agent/utils/self_play_callback.py
@@ -18,7 +18,7 @@ def __init__(self, win_rate_threshold):
         # Report the matchup counters (who played against whom?).
         self._matching_stats = defaultdict(int)
 
-    def on_train_result(self, *, algorithm, result, **kwargs):
+    def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs):
         # Get the win rate for the train batch.
         # Note that normally, one should set up a proper evaluation config,
         # such that evaluation always happens on the already updated policy,
diff --git a/rllib/examples/multi_agent/utils/self_play_league_based_callback.py b/rllib/examples/multi_agent/utils/self_play_league_based_callback.py
index 68c2880ac741..a8f1c74daf86 100644
--- a/rllib/examples/multi_agent/utils/self_play_league_based_callback.py
+++ b/rllib/examples/multi_agent/utils/self_play_league_based_callback.py
@@ -32,7 +32,7 @@ def __init__(self, win_rate_threshold):
         # Report the matchup counters (who played against whom?).
         self._matching_stats = defaultdict(int)
 
-    def on_train_result(self, *, algorithm, result, **kwargs):
+    def on_train_result(self, *, algorithm, metrics_logger=None, result, **kwargs):
         local_worker = algorithm.workers.local_worker()
 
         # Avoid `self` being pickled into the remote function below.

From 7bc37128657d30b1949b0600c58a3cab291c13bb Mon Sep 17 00:00:00 2001
From: sven1977
Date: Thu, 2 May 2024 19:53:06 +0200
Subject: [PATCH 4/4] fixes

Signed-off-by: sven1977
---
 rllib/env/multi_agent_env_runner.py  |  8 ++++----
 rllib/env/single_agent_env_runner.py | 14 +++++++-------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/rllib/env/multi_agent_env_runner.py b/rllib/env/multi_agent_env_runner.py
index e26968a7d36c..e342c2ded892 100644
--- a/rllib/env/multi_agent_env_runner.py
+++ b/rllib/env/multi_agent_env_runner.py
@@ -60,6 +60,10 @@ def __init__(self, config: AlgorithmConfig, **kwargs):
         # Get the worker index on which this instance is running.
         self.worker_index: int = kwargs.get("worker_index")
 
+        # Set up all metrics-related structures and counters.
+        self.metrics: Optional[MetricsLogger] = None
+        self._setup_metrics()
+
         # Create our callbacks object.
         self._callbacks: DefaultCallbacks = self.config.callbacks_class()
 
@@ -86,10 +90,6 @@ def __init__(self, config: AlgorithmConfig, **kwargs):
         # Create the two connector pipelines: env-to-module and module-to-env.
         self._module_to_env = self.config.build_module_to_env_connector(self.env)
 
-        # Set up all metrics-related structures and counters.
-        self.metrics: Optional[MetricsLogger] = None
-        self._setup_metrics()
-
         self._needs_initial_reset: bool = True
         self._episode: Optional[MultiAgentEpisode] = None
         self._shared_data = None
diff --git a/rllib/env/single_agent_env_runner.py b/rllib/env/single_agent_env_runner.py
index 23407629835e..1bac56bf9dab 100644
--- a/rllib/env/single_agent_env_runner.py
+++ b/rllib/env/single_agent_env_runner.py
@@ -52,11 +52,16 @@ def __init__(self, config: AlgorithmConfig, **kwargs):
         """
         super().__init__(config=config)
 
+        self.worker_index = kwargs.get("worker_index")
+
+        # Create a MetricsLogger object for logging custom stats.
+        self.metrics = MetricsLogger()
+        # Initialize lifetime counts.
+        self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0, reduce="sum")
+
         # Create our callbacks object.
         self._callbacks: DefaultCallbacks = self.config.callbacks_class()
 
-        self.worker_index = kwargs.get("worker_index")
-
         # Create the vectorized gymnasium env.
         self.env: Optional[gym.Wrapper] = None
         self.num_envs: int = 0
@@ -98,11 +103,6 @@ def __init__(self, config: AlgorithmConfig, **kwargs):
         # Create the two connector pipelines: env-to-module and module-to-env.
         self._module_to_env = self.config.build_module_to_env_connector(self.env)
 
-        # Create a MetricsLogger object for logging custom stats.
-        self.metrics = MetricsLogger()
-        # Initialize lifetime counts.
-        self.metrics.log_value(NUM_ENV_STEPS_SAMPLED_LIFETIME, 0, reduce="sum")
-
         # This should be the default.
         self._needs_initial_reset: bool = True
         self._episodes: List[Optional[SingleAgentEpisode]] = [