In [None]:
from pathlib import Path

import gymnasium as gym

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import (
    notebook_login,
)  # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# Setting up Stable-Baselines3 for Training

## PPO

In [None]:
# Not referenced below; importing it here makes a missing TensorBoard install
# fail immediately instead of after the model has been configured.
import tensorboard

# Path prefix used both to load the previous best PPO checkpoint and to save a new one.
ppo_model_path = "models/ppo_LunarLander-v3_model"

# Training environment: 16 copies of LunarLander-v3 stepped together.
env = make_vec_env("LunarLander-v3", n_envs=16)

# Instantiate the agent
model = PPO(
    "MlpPolicy",
    env=env,
    tensorboard_log="../LunarLander-v3_tensorboard/",
    learning_rate=0.0005,
    n_steps=2048,
    batch_size=256,
    n_epochs=10,
    gamma=0.999,
    gae_lambda=0.98,
    clip_range=0.2,
    clip_range_vf=None,
    normalize_advantage=True,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_sde=False,
    sde_sample_freq=-1,
    rollout_buffer_class=None,
    rollout_buffer_kwargs=None,
    target_kl=None,
    stats_window_size=100,
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device="auto",
    _init_setup_model=True,
)

# Train the agent
model.learn(total_timesteps=int(2e6), progress_bar=True)

# Save the model to file only if its reward beats the previously saved one.
# Both agents are scored on the SAME single monitored environment so the
# comparison is like-for-like (the training VecEnv is not used for scoring).
eval_env = Monitor(gym.make("LunarLander-v3"))

ppo_new_model = model
mean_reward_new_ppo, std_new_reward_ppo = evaluate_policy(
    ppo_new_model, eval_env, n_eval_episodes=10, deterministic=True
)

# A previous checkpoint may not exist yet (first run). Loading it unconditionally
# would raise after training and the freshly trained agent would never be saved.
# The checkpoint is looked up with and without the ".zip" suffix.
old_checkpoint_exists = any(
    Path(candidate).is_file() for candidate in (ppo_model_path, f"{ppo_model_path}.zip")
)

if old_checkpoint_exists:
    ppo_old_model = PPO.load(ppo_model_path, env=eval_env)
    mean_reward_old_ppo, std_reward_old_ppo = evaluate_policy(
        ppo_old_model, eval_env, n_eval_episodes=10, deterministic=True
    )
else:
    ppo_old_model = None
    mean_reward_old_ppo, std_reward_old_ppo = None, None

# First line reports the newly trained agent, second line the previously saved one.
print(f"mean_reward ppo={mean_reward_new_ppo:.2f} +/- {std_new_reward_ppo}")
if ppo_old_model is not None:
    print(f"mean_reward ppo={mean_reward_old_ppo:.2f} +/- {std_reward_old_ppo}")
else:
    print("No previously saved PPO model found.")

if ppo_old_model is None or mean_reward_new_ppo > mean_reward_old_ppo:
    print("Better PPO model trained! Saving...")
    model.save(ppo_model_path)

Wrapping the env in a DummyVecEnv.
mean_reward ppo=271.73 +/- 20.961501937338195
mean_reward ppo=257.58 +/- 23.40161484122245
Better PPO model trained! Saving...


## DQN

In [9]:
# Train a DQN agent on LunarLander-v3 and write the result to disk.
# (DQN is imported in the notebook's import cell.)

# Training environment: 16 copies of LunarLander-v3 stepped together.
env = make_vec_env("LunarLander-v3", n_envs=16)

model = DQN(
    "MlpPolicy",
    env=env,
    learning_rate=0.001,
    # Replay capacity (5,000,000) is larger than the 1,000,000 steps requested below.
    buffer_size=5000000,
    learning_starts=100,
    batch_size=256,
    tau=1.0,
    gamma=0.999,
    train_freq=50,
    gradient_steps=10,
    replay_buffer_class=None,
    replay_buffer_kwargs=None,
    optimize_memory_usage=False,
    n_steps=1,
    target_update_interval=10000,
    exploration_fraction=0.2,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    max_grad_norm=10,
    stats_window_size=100,
    tensorboard_log="../LunarLander-v3_tensorboard/",
    policy_kwargs=None,
    verbose=0,
    seed=None,
    device="auto",
    _init_setup_model=True,
)

# The step budget is passed as an int, matching the PPO cell (`int(2e6)`).
model.learn(total_timesteps=int(1e6))

# The checkpoint is only written once training has run to completion;
# interrupting `learn` leaves any existing file on disk untouched.
model.save("models/dqn_LunarLander-v3_model")

KeyboardInterrupt: 

## PPO vs DQN

In [6]:
# Compare the saved PPO and DQN agents on one shared, monitored environment.
eval_env = Monitor(gym.make("LunarLander-v3"))

# Reload both checkpoints from disk, attaching each to the evaluation environment.
ppo_trained_model = PPO.load("models/ppo_LunarLander-v3_model", env=eval_env)
dqn_trained_model = DQN.load("models/dqn_LunarLander-v3_model", env=eval_env)

# Identical evaluation settings for both agents: 10 episodes, deterministic actions.
eval_settings = dict(n_eval_episodes=10, deterministic=True)

# PPO first ...
mean_reward_ppo, std_reward_ppo = evaluate_policy(
    ppo_trained_model,
    ppo_trained_model.get_env(),
    **eval_settings,
)
print(f"mean_reward ppo={mean_reward_ppo:.2f} +/- {std_reward_ppo}")

# ... then DQN.
mean_reward_dqn, std_reward_dqn = evaluate_policy(
    dqn_trained_model,
    dqn_trained_model.get_env(),
    **eval_settings,
)
print(f"mean_reward dqn={mean_reward_dqn:.2f} +/- {std_reward_dqn}")

Wrapping the env in a DummyVecEnv.
mean_reward ppo=258.52 +/- 16.865362307268374
mean_reward dqn=-14.42 +/- 73.22439129315008


# Evaluation

In [None]:
# Evaluate a single saved agent on a monitored LunarLander-v3 environment.
eval_env = Monitor(gym.make("LunarLander-v3"))

# The DQN checkpoint is evaluated here. To score the PPO agent instead, load it with
# PPO.load("models/ppo_LunarLander-v3_model", env=eval_env).
trained_model = DQN.load("models/dqn_LunarLander-v3_model", env=eval_env)

# 10 episodes with deterministic actions, run on the environment attached at load time.
mean_reward, std_reward = evaluate_policy(
    trained_model,
    trained_model.get_env(),
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=-113.29 +/- 38.6918938208943


# Graphical Visualisation

In [None]:
# Watch a saved agent play one episode (capped at 500 steps) in a window.

# Creating the environment; "human" render mode draws to the screen.
env = gym.make("LunarLander-v3", render_mode="human")

# Only `predict` is used below, so the model is loaded without attaching an
# environment. This keeps the cell independent of variables from earlier cells.
# To watch the PPO agent instead: PPO.load("models/ppo_LunarLander-v3_model")
trained_model = DQN.load("models/dqn_LunarLander-v3_model")

try:
    # Resetting the environment
    observation, info = env.reset()

    for _ in range(500):
        # Ask the trained policy for its action; deterministic to match the
        # settings used in the evaluation cells.
        action, state = trained_model.predict(observation, deterministic=True)

        # Apply this action in the env. With render_mode="human" the frame is
        # drawn as part of `step`, so no explicit `env.render()` call is made.
        observation, reward, terminated, truncated, info = env.step(action)

        if terminated or truncated:
            print("Episode is finished")
            break
finally:
    # Always release the rendering window, even if the loop raises or is interrupted.
    env.close()

In [None]:
# Package the saved PPO agent and push it to the Hugging Face Hub.
# (gym, DummyVecEnv, Monitor, PPO and package_to_hub come from the import cell.)

repo_id = "Pucciland95/ppo-LunarLander-v3"
env_id = "LunarLander-v3"

# Vectorised evaluation environment producing RGB frames.
# The inner env is wrapped in Monitor, as in the other evaluation cells of this notebook.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

model_architecture = "PPO"
commit_message = "Uploaded PPO LunarLander-v3 trained agent"

# Reload the checkpoint written by the PPO training cell.
model = PPO.load("models/ppo_LunarLander-v3_model", env=eval_env)

# NOTE: `model_name` becomes the uploaded file's base name; the download cell
# further down requests "<model_name>.zip", so the two must stay in sync.
package_to_hub(
    model=model,
    model_name="ChopChopMotherFucker",
    model_architecture=model_architecture,
    env_id=env_id,
    eval_env=eval_env,
    repo_id=repo_id,
    commit_message=commit_message,
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m




Saving video to /tmp/tmpu93ohzk_/-step-0-to-step-1000.mp4
MoviePy - Building video /tmp/tmpu93ohzk_/-step-0-to-step-1000.mp4.
MoviePy - Writing video /tmp/tmpu93ohzk_/-step-0-to-step-1000.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /tmp/tmpu93ohzk_/-step-0-to-step-1000.mp4
[38;5;1m✘ 'DummyVecEnv' object has no attribute 'video_recorder'[0m
[38;5;1m✘ We are unable to generate a replay of your agent, the package_to_hub
process continues[0m
[38;5;1m✘ Please open an issue at
https://github.com/huggingface/huggingface_sb3/issues[0m
[38;5;4mℹ Pushing repo Pucciland95/ppo-LunarLander-v3 to the Hugging Face
Hub[0m


Processing Files (4 / 4): 100%|██████████|  282kB /  282kB,  149kB/s  
New Data Upload: 100%|██████████|  109kB /  109kB,  109kB/s  


[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/Pucciland95/ppo-LunarLander-v3/tree/main/[0m


CommitInfo(commit_url='https://huggingface.co/Pucciland95/ppo-LunarLander-v3/commit/c7c678b9c912771347195671c06d64ccfbed56aa', commit_message='Uploaded PPO LunarLander-v3 trained agent', commit_description='', oid='c7c678b9c912771347195671c06d64ccfbed56aa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Pucciland95/ppo-LunarLander-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='Pucciland95/ppo-LunarLander-v3'), pr_revision=None, pr_num=None)

# Cloning and Evaluating the Model you just pushed

In [25]:
from huggingface_sb3 import load_from_hub

# Download the checkpoint that was pushed to the Hub and score it locally.
repo_id = "Pucciland95/ppo-LunarLander-v3"
filename = "ChopChopMotherFucker.zip"

# Replacement values handed to PPO.load for the stored learning-rate and
# clip-range entries. The model is only evaluated in this cell, never trained.
# NOTE(review): presumably this avoids deserialising the original schedules - confirm.
custom_objects = {
    "learning_rate": 0.0,
    "lr_schedule": lambda _: 0.0,
    "clip_range": lambda _: 0.0,
}

# Fetch the zip from the Hub, then rebuild the agent (also prints system info).
checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(
    checkpoint,
    custom_objects=custom_objects,
    print_system_info=True,
)

# 10 deterministic episodes on a fresh monitored environment.
eval_env = gym.make("LunarLander-v3")
eval_env = Monitor(eval_env)
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

== CURRENT SYSTEM INFO ==
- OS: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39 # 1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
- Python: 3.12.3
- Stable-Baselines3: 2.7.0
- PyTorch: 2.9.0+cu128
- GPU Enabled: False
- Numpy: 2.2.6
- Cloudpickle: 3.1.1
- Gymnasium: 1.2.1

== SAVED MODEL SYSTEM INFO ==
- OS: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39 # 1 SMP PREEMPT_DYNAMIC Thu Jun  5 18:30:46 UTC 2025
- Python: 3.12.3
- Stable-Baselines3: 2.7.0
- PyTorch: 2.9.0+cu128
- GPU Enabled: False
- Numpy: 2.2.6
- Cloudpickle: 3.1.1
- Gymnasium: 1.2.1

mean_reward=268.40 +/- 17.698211256933895
