In [1]:
import gymnasium as gym
import time
import numpy as np

In [2]:
from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [4]:
# Sanity-check the environment with a single random-action episode.
# The environment is LunarLander-v3 (discrete actions), rendered in a window.
# Wind is disabled here; per the Gymnasium docs, wind_power and
# turbulence_power only matter when enable_wind=True.
env = gym.make(
    "LunarLander-v3",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="human",
)

try:
    # Reset environment to start a new episode
    observation, info = env.reset()
    print(f"Starting observation: {observation}")

    episode_over = False
    total_reward = 0

    while not episode_over:
        # Random action for now - a trained agent replaces this later.
        action = env.action_space.sample()

        # Take the action and see what happens
        observation, reward, terminated, truncated, info = env.step(action)

        total_reward += reward
        # The episode ends on either a terminal state or a truncation.
        episode_over = terminated or truncated
        time.sleep(0.2)  # Slow down the loop to see the action

    print(f"Episode finished! Total reward: {total_reward}")
finally:
    # Always release the environment (and its render window), even when the
    # cell is interrupted or a step raises.
    env.close()

Starting observation: [-7.3537824e-04  1.4168624e+00 -7.4500605e-02  2.6409853e-01
  8.5887959e-04  1.6875466e-02  0.0000000e+00  0.0000000e+00]
Episode finished! Total reward: -129.28694039649886


In [5]:
# Vectorized training environment: 5 copies of LunarLander-v3.
env = make_vec_env("LunarLander-v3", n_envs=5)

# PPO hyperparameters, grouped in one place so they are easy to scan and tune.
ppo_hyperparams = {
    "n_steps": 1024,     # rollout length per environment between updates
    "batch_size": 64,    # minibatch size for each gradient step
    "n_epochs": 4,       # optimisation passes over each rollout
    "gamma": 0.999,      # discount factor
    "gae_lambda": 0.98,  # GAE bias/variance trade-off
    "ent_coef": 0.01,    # entropy coefficient in the loss
}

# MLP actor-critic policy; verbose=1 prints the training log tables.
model = PPO("MlpPolicy", env, verbose=1, **ppo_hyperparams)

Using cpu device


In [6]:
# Name under which the trained agent is written to disk.
model_name = "ppo-LunarLander-v1"

# Train for one million timesteps (long-running cell), then save the result.
model.learn(total_timesteps=1_000_000)
model.save(model_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 93.9     |
|    ep_rew_mean     | -155     |
| time/              |          |
|    fps             | 6000     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 5120     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 96           |
|    ep_rew_mean          | -169         |
| time/                   |              |
|    fps                  | 4865         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0012887996 |
|    clip_fraction        | 0.000342     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.00093      |
|    learning_r

In [3]:
# Path of the saved PPO archive; it matches the "ppo-LunarLander-v1" name
# passed to model.save() in the training cell.
model_path = "ppo-LunarLander-v1.zip"
# Load the model from disk so training does not have to be re-run in this kernel.
# No env is passed here; the evaluation cells supply their own environment.
model = PPO.load(model_path)

In [4]:
# Evaluate the loaded policy on a Monitor-wrapped LunarLander-v3 that renders
# to a window, using 2 episodes with deterministic actions.
eval_env = Monitor(gym.make("LunarLander-v3", render_mode="human"))
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=2,
    deterministic=True,
)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=243.88 +/- 18.258336999999997


In [5]:
# Evaluation is done: release the evaluation environment, which was created
# with render_mode='human'.
eval_env.close()


In [12]:
# from huggingface_hub import notebook_login
# notebook_login()  # paste your HF token with 'write' permission


In [6]:
# Display the object's repr to confirm that `model` is defined in this kernel.
model

<stable_baselines3.ppo.ppo.PPO at 0x74ff911c8190>

In [7]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from huggingface_sb3 import package_to_hub
from stable_baselines3.common.monitor import Monitor

env_id = "LunarLander-v3"


def _make_eval_env():
    """Build one Monitor-wrapped environment that renders to RGB arrays."""
    return Monitor(gym.make(env_id, render_mode="rgb_array"))


# Single-environment vectorized wrapper for the upload step. It uses
# render_mode="rgb_array" (not "human"); package_to_hub reports that it
# generates a video of the agent.
eval_env = DummyVecEnv([_make_eval_env])

# Everything package_to_hub needs: the trained model, how to label it, the
# environment to evaluate and record on, and the target Hub repository.
hub_kwargs = {
    "model": model,
    "model_name": "PPO-LunarLander-v3",
    "model_architecture": "PPO",
    "env_id": env_id,
    "eval_env": eval_env,
    "repo_id": "eequalsmcsquared/ppo-LunarLander-v3",
    "commit_message": "First upload from local VS Code",
}

# Kept as the cell's last expression so the returned value is displayed.
package_to_hub(**hub_kwargs)

ℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.
Saving video to /tmp/tmprpu0kml9/-step-0-to-step-1000.mp4
MoviePy - Building video /tmp/tmprpu0kml9/-step-0-to-step-1000.mp4.
MoviePy - Writing video /tmp/tmprpu0kml9/-step-0-to-step-1000.mp4



sh: 1: ffmpeg: not found                                                   


MoviePy - Done !
MoviePy - video ready /tmp/tmprpu0kml9/-step-0-to-step-1000.mp4
ℹ Pushing repo eequalsmcsquared/ppo-LunarLander-v3 to the Hugging Face
Hub


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

ℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/eequalsmcsquared/ppo-LunarLander-v3/tree/main/


CommitInfo(commit_url='https://huggingface.co/eequalsmcsquared/ppo-LunarLander-v3/commit/e6857d26e90e45388a8063f1729e1b76857e5a66', commit_message='First upload from local VS Code', commit_description='', oid='e6857d26e90e45388a8063f1729e1b76857e5a66', pr_url=None, repo_url=RepoUrl('https://huggingface.co/eequalsmcsquared/ppo-LunarLander-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='eequalsmcsquared/ppo-LunarLander-v3'), pr_revision=None, pr_num=None)

In [11]:
import numpy as np
# Scratch check of np.argmax: draw a 3x4 grid of random integers in [0, 10),
# print the last row, and display the index of that row's largest value.
arr = np.random.randint(low=0, high=10, size=(3, 4))
last_row = arr[2]
print(last_row)
last_row.argmax()

[9 1 2 5]


np.int64(0)