## Setup

In [1]:
!pip install -q wheel setuptools pip swig --upgrade
!pip install -q blinker --ignore-installed

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradient-utils 0.5.0 requires wheel<0.36.0,>=0.35.1, but you have wheel 0.44.0 which is incompatible.
gradient 2.0.6 requires attrs<=19, but you have attrs 23.1.0 which is incompatible.[0m[31m
[0m

In [10]:
!python --version

Python 3.11.7


In [12]:
!pip install nvidia-pyindex
!pip install nvidia-tensorrt

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25ldone
[?25h  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8419 sha256=110c8d93639fc0b60ca21d0596e1b788694d7b5b48b30077437a6d9cbfebcb1d
  Stored in directory: /root/.cache/pip/wheels/49/d0/7d/b68b3665d16ee20355e65fb7ef48b7ca26533217d9f09924fe
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nvidia-tensorrt
  Downloading nvidia_tensorrt-99.0.0-py3-none-manylinux_2_17_x86_64.whl.metadata (596 bytes)
Collecting tensorrt (from nvidia-tensorrt)
  Downloading tensorrt-10.4.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ten

In [7]:
!pip install -q stable-baselines3 "gymnasium[box2d]" tensorflow[and-cuda] dagshub mlflow hyperopt

[0m

In [3]:
!pip install -q stable-baselines3[extra]

[0m

In [13]:
import tensorrt as trt
print("TensorRT version:", trt.__version__)

TensorRT version: 10.4.0


In [4]:
import warnings
warnings.filterwarnings('ignore')
import gymnasium as gym

In [6]:
import dagshub
dagshub.init(repo_owner='smileynet', repo_name='gymnasium_experiments', mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=c869c66b-6acc-4ca1-8169-b36be1a9854b&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=87855edcb7374861d92e2dae45e173d19c68bf5a92c0b455109f151286e5af7a




## Determine Best Device

In [14]:
import time
import torch
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

TOTAL_TIMESTEPS = 100000  # Short benchmark to measure FPS

def measure_fps(device):
    """
    Measure FPS on the given device (cpu or cuda).
    """
    # Create environment
    env = make_vec_env("LunarLander-v2", n_envs=1)

    # Initialize the model on the specified device
    model = PPO('MlpPolicy', env, device=device)

    # Start timer
    start_time = time.time()

    # Train for a small number of timesteps (benchmark)
    model.learn(total_timesteps=TOTAL_TIMESTEPS)

    # End timer
    end_time = time.time()

    # Calculate FPS (frames per second)
    elapsed_time = end_time - start_time
    fps = TOTAL_TIMESTEPS / elapsed_time

    # Clean up
    env.close()

    return fps

# Check if GPU is available
gpu_available = torch.cuda.is_available()

# Measure FPS on CPU
cpu_fps = measure_fps(device="cpu")
print(f"CPU FPS: {cpu_fps:.2f}")

# Measure FPS on GPU (if available)
if gpu_available:
    gpu_fps = measure_fps(device="cuda")
    print(f"GPU FPS: {gpu_fps:.2f}")
else:
    gpu_fps = 0
    print("GPU is not available.")

# Select the device with the higher FPS
if gpu_fps > cpu_fps:
    print("Using GPU for training.")
    chosen_device = "cuda"
else:
    print("Using CPU for training.")
    chosen_device = "cpu"



CPU FPS: 385.38
GPU FPS: 435.07
Using GPU for training.


In [23]:
import os
import sys
import pickle
from typing import Any, Dict, Union, Tuple
import torch
import mlflow
import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import stable_baselines3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger

def ensure_directory_exists(new_dir):
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)
    return new_dir

# Custom logging format to send metrics to MLflow
class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.
    """
    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:

        for (key, value), (_, excluded) in zip(
            sorted(key_values.items()), sorted(key_excluded.items())
        ):
            if excluded is not None and "mlflow" in excluded:
                continue
            if isinstance(value, np.ScalarType):
                if not isinstance(value, str):
                    mlflow.log_metric(key, value, step)
                    

def set_mlflow_tags(hparams, trial_number):
    mlflow.set_tag("trial_number", trial_number)
    mlflow.set_tag("optimizer", "hyperopt")
    mlflow.set_tag("model_type", "PPO")
    mlflow.set_tag("policy_type", "MlpPolicy")
    mlflow.set_tag("environment_name", "LunarLander-v2")
    mlflow.set_tag("total_timesteps", TOTAL_TIMESTEPS)
    mlflow.set_tag("python_version", sys.version)
    mlflow.set_tag("stable_baselines3_version", stable_baselines3.__version__)
    mlflow.set_tag("device", DEVICE)
    mlflow.log_params(hparams)
    
def create_model(hparams, env):
    return PPO(
        policy="MlpPolicy",
        env=env,
        device=DEVICE,
        verbose=2,
        **hparams
    )

def evaluate_model(model, eval_env, n_eval_episodes=10):
    all_episode_rewards = []
    for _ in range(n_eval_episodes):
        episode_rewards = 0
        done = False
        obs = eval_env.reset()
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = eval_env.step(action)
            episode_rewards += reward
        all_episode_rewards.append(episode_rewards)
    mean_reward = np.mean(all_episode_rewards)
    return mean_reward

def train_and_evaluate(hparams, trial_number):
    global best_mean_reward
    global models_dir
    global best_model_path
    
    # Set up environments
    env = make_vec_env("LunarLander-v2", n_envs=16)
    eval_env = make_vec_env("LunarLander-v2", n_envs=1)

    try:
        # Initialize the model
        model = create_model(hparams, env)

        # Set up logging
        loggers = Logger(
            folder="logs",
            output_formats=[MLflowOutputFormat()], # HumanOutputFormat(sys.stdout), 
        )
        model.set_logger(loggers)

        # Train the model
        model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=1)

        # Evaluate the model
        mean_reward = evaluate_model(model, eval_env)

        # If best model, save it
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            model_save_path = os.path.join(models_dir, f"model_trial_{trial_number}.zip")
            model.save(model_save_path)
            model.save(best_model_path)
            mlflow.log_artifact(model_save_path)
        
        # Log the evaluation metric
        mlflow.log_metric("mean_reward", mean_reward)

        # Return the loss (negative reward to minimize)
        return {'loss': -mean_reward, 'status': STATUS_OK}

    finally:
        # Ensure environments are properly closed
        env.reset()
        env.close()
        eval_env.reset()
        eval_env.close()
        
def save_trials(trials):
    with open("trials.pkl", "wb") as f:
        pickle.dump(trials, f)        

def objective(hparams, trial_number):
    with mlflow.start_run(nested=True):  # Nested run for each trial
        # Set MLflow tags
        set_mlflow_tags(hparams, trial_number)

        # Train, evaluate, and get the result
        result = train_and_evaluate(hparams, trial_number)

        # Save updated trials.pkl after each trial
        save_trials(trials)

        return result

In [24]:
# Constants
TOTAL_TIMESTEPS = 1000
if chosen_device:
    DEVICE = chosen_device
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_EVALS = 20 
ADDITIONAL_EVALS = 10
MODELS_DIR = "models"

# Global variables
models_dir = ensure_directory_exists(MODELS_DIR)
best_model_path = os.path.join(models_dir, f"best_model.zip")
best_mean_reward = -float("inf")

# Define the hyperparameter search space for Hyperopt
search_space = {
    'n_steps': hp.choice('n_steps', [512, 1024, 2048]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'n_epochs': hp.choice('n_epochs', [3, 4, 5]),
    'gamma': hp.uniform('gamma', 0.9, 0.999),
    'gae_lambda': hp.uniform('gae_lambda', 0.8, 1.0),
    'ent_coef': hp.uniform('ent_coef', 0.0001, 0.01),
}

# Set up MLflow experiment tracking
mlflow.set_experiment("reinforcement_learning/ppo/LunarLander")

# Load or initialize the Trials object
if os.path.exists("trials.pkl"):
    with open("trials.pkl", "rb") as f:
        trials = pickle.load(f)
        # Number of total trials you want to run (existing trials + new)
    MAX_EVALS = len(trials) + ADDITIONAL_EVALS  # e.g., adding 10 more trials to the previous
else:
    trials = Trials()



# Start a single MLflow run to track the entire optimization process
with mlflow.start_run(run_name="Hyperopt_Search"):
    best_hparams = fmin(
        fn=lambda hparams: objective(hparams, len(trials)),  # Pass trial number
        space=search_space,
        algo=tpe.suggest,
        max_evals=20,  # Number of evaluations
        trials=trials
    )

    # Save the updated Trials object for future use
    with open("trials.pkl", "wb") as f:
        pickle.dump(trials, f)
    mlflow.log_artifact("trials.pkl")
    
    # Log the best hyperparameters found after optimization
    mlflow.log_params({"best_hparams": best_hparams})
    
    # Log the best model as an artifact
    mlflow.log_artifact(best_model_path)

print("Best hyperparameters found:", best_hparams)
print("Best model saved at:", best_model_path)

100%|██████████| 20/20 [00:00<?, ?trial/s, best loss=?]


2024/09/16 23:20:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run Hyperopt_Search at: https://dagshub.com/smileynet/gymnasium_experiments.mlflow/#/experiments/2/runs/09f24adde7ae4e589e0f1d3e05afc0fa.
2024/09/16 23:20:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/smileynet/gymnasium_experiments.mlflow/#/experiments/2.


Best hyperparameters found: {'batch_size': 0, 'ent_coef': 0.008583590383175264, 'gae_lambda': 0.9421110523120086, 'gamma': 0.9207209601828463, 'n_epochs': 1, 'n_steps': 0}
Best model saved at: models/best_model.zip


In [None]:
import mlflow
import os

# Specify the run ID for the best model (retrieve this from the MLflow UI or API)
run_id = "<run_id_for_best_model>"
local_dir = "downloaded_model"
if not os.path.exists(local_dir):
    os.makedirs(local_dir)

# Download the best model artifact
mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="model_trial_5.zip", dst_path=local_dir)

In [None]:
import mlflow
client = mlflow.tracking.MlflowClient()

# Get the best run based on the `mean_reward` metric
runs = client.search_runs(
    experiment_ids="0",
    filter_string="",
    order_by=["metrics.mean_reward DESC"],
    max_results=1,
)
best_run = runs[0]
best_hparams = best_run.data.params

## Previous

In [None]:
import sys
from typing import Any, Dict, Union, Tuple

import mlflow
import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger

# Custom logging format to send metrics to MLflow
class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.
    """
    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:

        for (key, value), (_, excluded) in zip(
            sorted(key_values.items()), sorted(key_excluded.items())
        ):
            if excluded is not None and "mlflow" in excluded:
                continue
            if isinstance(value, np.ScalarType):
                if not isinstance(value, str):
                    mlflow.log_metric(key, value, step)


# Define the hyperparameter search space for Hyperopt
search_space = {
    'n_steps': hp.choice('n_steps', [512, 1024, 2048]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'n_epochs': hp.choice('n_epochs', [3, 4, 5]),
    'gamma': hp.uniform('gamma', 0.9, 0.999),
    'gae_lambda': hp.uniform('gae_lambda', 0.8, 1.0),
    'ent_coef': hp.uniform('ent_coef', 0.0001, 0.01),
}


# Objective function for Hyperopt
def objective(hparams, trial_number):
    with mlflow.start_run(nested=True):  # Nested run for each trial
        # Set metadata for the trial
        mlflow.set_tag("trial_number", trial_number)
        mlflow.set_tag("optimizer", "hyperopt")

        # Log the hyperparameters for this trial
        mlflow.log_params(hparams)

        # Create the environment
        env = make_vec_env("LunarLander-v2", n_envs=16)

        # Initialize the PPO model with the current hyperparameters
        model = PPO(
            policy="MlpPolicy",  # You can change this depending on the policy type you want
            env=env,
            **hparams
        )

        # Custom logger to log metrics to MLflow
        loggers = Logger(
            folder="logs",
            output_formats=[HumanOutputFormat(sys.stdout), MLflowOutputFormat()],
        )
        model.set_logger(loggers)

        # Train the model
        model.learn(total_timesteps=10000, log_interval=1)

        # Evaluation to calculate the score (mean reward)
        eval_env = make_vec_env("LunarLander-v2", n_envs=1)
        mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10)

        # Log the evaluation result as a metric
        mlflow.log_metric("mean_reward", mean_reward)

        # Close environments
        env.reset()
        env.close()
        eval_env.reset()
        eval_env.close()

        # Return loss (negative reward to minimize)
        return {'loss': -mean_reward, 'status': STATUS_OK}


# Helper function to evaluate the policy
def evaluate_policy(model, env, n_eval_episodes=10):
    all_episode_rewards = []
    for _ in range(n_eval_episodes):
        episode_rewards = 0
        done = False
        obs = env.reset()
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_rewards += reward
        all_episode_rewards.append(episode_rewards)
    mean_reward = np.mean(all_episode_rewards)
    return mean_reward, all_episode_rewards


# Set up MLflow experiment tracking
mlflow.set_experiment("LunarLander_Hyperparameter_Optimization")

# Run Hyperopt optimization
trials = Trials()  # Keep track of all trials

# Start a single MLflow run to track the entire optimization process
with mlflow.start_run(run_name="Hyperopt_Search"):
    best_hparams = fmin(
        fn=lambda hparams: objective(hparams, len(trials)),  # Pass trial number
        space=search_space,
        algo=tpe.suggest,
        max_evals=20,  # Number of evaluations
        trials=trials
    )

    # Log the best hyperparameters found after optimization
    mlflow.log_params({"best_hparams": best_hparams})

print("Best hyperparameters found:", best_hparams)

In [7]:
import sys
from typing import Any, Dict, Tuple, Union

import mlflow
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger

hparams = {
    'n_steps': 1024,
    'batch_size': 64,
    'n_epochs': 4,
    'gamma': 0.999,
    'gae_lambda': 0.98,
    'ent_coef': 0.01,
}

class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.
    """

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:

        for (key, value), (_, excluded) in zip(
            sorted(key_values.items()), sorted(key_excluded.items())
        ):

            if excluded is not None and "mlflow" in excluded:
                continue

            if isinstance(value, np.ScalarType):
                if not isinstance(value, str):
                    mlflow.log_metric(key, value, step)


loggers = Logger(
    folder="logs",
    output_formats=[HumanOutputFormat(sys.stdout), MLflowOutputFormat()],
)

with mlflow.start_run():
    
    # Log the hyperparameters to MLflow
    mlflow.log_params(hparams)
    
    env = make_vec_env("LunarLander-v2", n_envs=16)
    
    model = PPO(
        policy="MlpPolicy",
        env=env,
        verbose=2,
        **hparams
    )
    # Set custom logger
    model.set_logger(loggers)
    model.learn(total_timesteps=10000, log_interval=1)
    
    env.reset()
    env.close()


Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93.9     |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 2567     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 16384    |
---------------------------------


2024/09/16 19:13:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-wasp-717 at: https://dagshub.com/smileynet/gymnasium_experiments.mlflow/#/experiments/0/runs/411a68d0dfce4e4885fe5ecdc42eb648.
2024/09/16 19:13:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/smileynet/gymnasium_experiments.mlflow/#/experiments/0.


In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Create environment
env = make_vec_env("LunarLander-v2", n_envs=16)

# Instantiate the agent
# We use MultiLayerPerceptron (MLPPolicy) because the input is a vector,
# if we had frames as input we would use CnnPolicy
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)

# Train it for 1,000,000 timesteps
model.learn(total_timesteps=1000000)

# Save the model
model_name = "ppo-LunarLander-v2"

model.save(model_name)

## Examples

In [19]:
# Saving/ Resuming Trials
import os
import pickle
import mlflow
from hyperopt import fmin, tpe, hp, Trials
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Define the hyperparameter search space
search_space = {
    'n_steps': hp.choice('n_steps', [512, 1024, 2048]),
    'batch_size': hp.choice('batch_size', [32, 64, 128]),
    'n_epochs': hp.choice('n_epochs', [3, 4, 5]),
    'gamma': hp.uniform('gamma', 0.9, 0.999),
    'gae_lambda': hp.uniform('gae_lambda', 0.8, 1.0),
    'ent_coef': hp.uniform('ent_coef', 0.0001, 0.01),
}

# Objective function for Hyperopt
def objective(hparams):
    with mlflow.start_run(nested=True):
        mlflow.log_params(hparams)
        
        # Create the environment
        env = make_vec_env("LunarLander-v2", n_envs=16)

        # Initialize PPO model with hyperparameters
        model = PPO(policy="MlpPolicy", env=env, **hparams)
        model.learn(total_timesteps=10000)

        # Evaluate the model
        eval_env = make_vec_env("LunarLander-v2", n_envs=1)
        mean_reward = evaluate_policy(model, eval_env)

        mlflow.log_metric("mean_reward", mean_reward)
        return {'loss': -mean_reward, 'status': 'ok'}

# Helper function to evaluate the model
def evaluate_policy(model, env, n_eval_episodes=10):
    # Evaluate the model and return the average reward
    total_reward = 0
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        while not done:
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
    mean_reward = total_reward / n_eval_episodes
    return mean_reward

# Load or initialize the Trials object
if os.path.exists("trials.pkl"):
    with open("trials.pkl", "rb") as f:
        trials = pickle.load(f)
else:
    trials = Trials()

# Set up MLflow experiment
mlflow.set_experiment("LunarLander_Hyperparam_Optimization")

# Number of total trials you want to run (existing trials + new)
new_max_evals = len(trials) + 10  # e.g., adding 10 more trials to the previous

# Run Hyperopt optimization
best_hparams = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=new_max_evals,  # Total number of trials
    trials=trials  # Resume from previous trials
)

# Save the updated Trials object for future use
with open("trials.pkl", "wb") as f:
    pickle.dump(trials, f)

print("Best hyperparameters found:", best_hparams)

In [None]:
# We create our environment with gym.make("<name_of_the_environment>")
env = gym.make("LunarLander-v2")
env.reset()
print("Observation Space Shape", env.observation_space.shape)
print("Action Space Shape", env.action_space.n)
env.close()

In [None]:
import gymnasium as gym

from stable_baselines3 import A2C

env = gym.make("CartPole-v1", render_mode="rgb_array")

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)

vec_env = model.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    vec_env.render("human")

In [None]:
# Evaluate the agent

# Create a new environment for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Evaluate the model with 10 evaluation episodes and deterministic=True
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

# Print the results
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# PLACE the variables you've just defined two cells above
# Define the name of the environment
env_id = "LunarLander-v2"

# TODO: Define the model architecture we used
model_architecture = "PPO"

## Define a repo_id
## repo_id is the id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
## CHANGE WITH YOUR REPO ID
repo_id = "ThomasSimonini/ppo-LunarLander-v2"  # Change with your repo id, you can't push with mine 😄

## Define the commit message
commit_message = "Upload PPO LunarLander-v2 trained agent"

# Create the evaluation env and set the render_mode="rgb_array"
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

# PLACE the package_to_hub function you've just filled here
package_to_hub(
    model=model,  # Our trained model
    model_name=model_name,  # The name of our trained model
    model_architecture=model_architecture,  # The model architecture we used: in our case PPO
    env_id=env_id,  # Name of the environment
    eval_env=eval_env,  # Evaluation Environment
    repo_id=repo_id,  # id of the model repository from the Hugging Face Hub (repo_id = {organization}/{repo_name} for instance ThomasSimonini/ppo-LunarLander-v2
    commit_message=commit_message,
)

In [None]:
import sys
from typing import Any, Dict, Tuple, Union

import mlflow
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger


class MLflowOutputFormat(KVWriter):
    """
    Dumps key/value pairs into MLflow's numeric format.
    """

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:

        for (key, value), (_, excluded) in zip(
            sorted(key_values.items()), sorted(key_excluded.items())
        ):

            if excluded is not None and "mlflow" in excluded:
                continue

            if isinstance(value, np.ScalarType):
                if not isinstance(value, str):
                    mlflow.log_metric(key, value, step)


loggers = Logger(
    folder=None,
    output_formats=[HumanOutputFormat(sys.stdout), MLflowOutputFormat()],
)

with mlflow.start_run():
    model = SAC("MlpPolicy", "Pendulum-v1", verbose=2)
    # Set custom logger
    model.set_logger(loggers)
    model.learn(total_timesteps=10000, log_interval=1)