In [None]:
import os

# Send signal 9 to the process running this kernel, terminating it immediately.
# NOTE(review): presumably a leftover "restart the runtime" trick (e.g. after
# installing packages) — confirm. A Restart & Run All pass will stop at this cell.
os.kill(os.getpid(), 9)


In [None]:
# Virtual display
# Starts a hidden (visible=0) 1400x900 display through pyvirtualdisplay.
# NOTE(review): presumably needed so environment rendering works on a headless
# machine — confirm it is still required for this notebook.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()


In [1]:
import gymnasium as gym

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor


In [2]:
# Gymnasium environment id used by the exploration cell below
env_name = "LunarLander-v2"


## 1. Explore the environment

- Horizontal pad coordinate (x)
- Vertical pad coordinate (y)
- Horizontal speed (x)
- Vertical speed (y)
- Angle
- Angular speed
- If the left leg contact point has touched the land (boolean)
- If the right leg contact point has touched the land (boolean)

In [3]:
# Create the environment and reset it before inspecting its observation space
env = gym.make(env_name)
env.reset()
print("___OBSERVATION SPACES___\n")
print("Observation space shape: ", env.observation_space.shape)
# sample() draws a random point from the observation space; it is not an
# observation returned by the environment itself
print("Sample Observation: ", env.observation_space.sample())


___OBSERVATION SPACES___

Observation space shape:  (8,)
Sample Observation:  [-1.2280285   0.20994906  3.332678   -0.8801509   0.28032592  1.5014582
  0.0671205   0.73256373]


- Action 0: Do nothing,
- Action 1: Fire left orientation engine,
- Action 2: Fire the main engine,
- Action 3: Fire right orientation engine.

In [4]:
# `action_space.n` is the number of discrete actions (the label says "Shape",
# but the value printed is a count), followed by one randomly sampled action
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())  # Take a random action



 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 1


## 2. Test Random Environment

For each step, the reward:

- Is increased/decreased the closer/further the lander is to the landing pad.
- Is increased/decreased the slower/faster the lander is moving.
- Is decreased the more the lander is tilted (angle not horizontal).
- Is increased by 10 points for each leg that is in contact with the ground.
- Is decreased by 0.03 points each frame a side engine is firing.
- Is decreased by 0.3 points each frame the main engine is firing.

The episode receives an additional reward of -100 or +100 points for crashing or landing safely, respectively.

An episode is considered a solution if it scores at least 200 points.

In [5]:
# Play `episodes` episodes with uniformly random actions and print the total
# reward of each one, as a baseline before training.
episodes = 10
for episode in range(1, episodes + 1):
    # Gymnasium's reset() returns an (observation, info) pair, so unpack it
    # instead of binding the whole tuple to `state`.
    state, info = env.reset()
    terminated = False
    truncated = False
    score = 0

    # The episode is over as soon as the env reports terminated or truncated
    while not (terminated or truncated):
        # env.render()
        action = env.action_space.sample()
        state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()


Episode:1 Score:-436.84224465071185
Episode:2 Score:-356.94994447644444
Episode:3 Score:-17.90229250543662
Episode:4 Score:28.898347516818433
Episode:5 Score:-285.74424420130737
Episode:6 Score:-112.2601736794308
Episode:7 Score:-6.743835313815154
Episode:8 Score:-68.37793493290286
Episode:9 Score:-117.41510322592843
Episode:10 Score:-129.1089082368268


## 3. Build And Train The Model

Strategies

- Usual env
- Vectorized
    - increase no. of envs
- Increase training steps
- Increase MLP hidden units and layers

#### Trial 1: Usual Env

In [6]:
# Trial 1: a single, non-vectorized LunarLander environment
env = gym.make("LunarLander-v2")

# PPO agent with an MLP policy and the baseline hyperparameters of this notebook
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [7]:
# Training budget: one million environment timesteps
model.learn(total_timesteps=1_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -168     |
| time/              |          |
|    fps             | 791      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 165          |
|    ep_rew_mean          | -135         |
| time/                   |              |
|    fps                  | 838          |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0007345719 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -0.00588     |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x76043040f280>

In [11]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=263.18 +/- 13.291454837671413

mean_reward=263.18 +/- 13.291454837671413


In [12]:
# Save the Trial 1 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
model_name = "trial1_usual_env"
model.save(model_name)


#### Trial 2: Vectorized Env

In [13]:
# Trial 2: vectorized environment made of 16 LunarLander copies
env = make_vec_env("LunarLander-v2", n_envs=16)

# Same PPO hyperparameters as Trial 1; only the environment differs
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device


In [14]:
# Training budget: one million environment timesteps
model.learn(total_timesteps=1_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.4     |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 8490     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 93.5         |
|    ep_rew_mean          | -147         |
| time/                   |              |
|    fps                  | 5203         |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0067488346 |
|    clip_fraction        | 0.0615       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -0.000864    |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7906c88a7fd0>

In [15]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=242.99 +/- 18.700068823714055

mean_reward=242.99 +/- 18.700068823714055


In [16]:
# Save the Trial 2 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
model_name = "trial2_vectorized_env"
model.save(model_name)


#### Trial 3: Change MLP hidden units and layers (4 × 32 instead of the default 2 × 64)

In [17]:
# Reference: https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
# Per that guide, the default for a 1D observation space with PPO/A2C/DQN is a
# fully connected net of 2 layers x 64 units. This trial uses 4 layers x 32 units.

import torch as th

# Vectorized environment made of 16 LunarLander copies
env = make_vec_env("LunarLander-v2", n_envs=16)

# ReLU activations; identical layer sizes for the policy (pi) and value (vf) networks
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [32, 32, 32, 32], "vf": [32, 32, 32, 32]},
}

model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device


In [18]:
# Training budget: one million environment timesteps
model.learn(total_timesteps=1_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 90.7     |
|    ep_rew_mean     | -177     |
| time/              |          |
|    fps             | 8345     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 16384    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 90.6         |
|    ep_rew_mean          | -169         |
| time/                   |              |
|    fps                  | 4864         |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 32768        |
| train/                  |              |
|    approx_kl            | 0.0025238849 |
|    clip_fraction        | 0.000854     |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.00142      |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7905ba1b8a60>

In [19]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=272.01 +/- 7.049585968643097

mean_reward=272.01 +/- 7.049585968643097


In [20]:
# Save the Trial 3 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
model_name = "trial3_mlp_hid_32"
model.save(model_name)


#### Trial 4: Increase training steps

In [21]:
# Reference: https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
# Per that guide, the default for a 1D observation space with PPO/A2C/DQN is a
# fully connected net of 2 layers x 64 units. This trial uses 4 layers x 32 units.
# The model setup is the same as in Trial 3.

import torch as th

# Vectorized environment made of 16 LunarLander copies
env = make_vec_env("LunarLander-v2", n_envs=16)

# ReLU activations; identical layer sizes for the policy (pi) and value (vf) networks
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [32, 32, 32, 32], "vf": [32, 32, 32, 32]},
}

model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device


In [22]:
# Train it for 2,000,000 timesteps (the increased budget this trial is testing)
model.learn(total_timesteps=2_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.1     |
|    ep_rew_mean     | -175     |
| time/              |          |
|    fps             | 8576     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 16384    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 89          |
|    ep_rew_mean          | -151        |
| time/                   |             |
|    fps                  | 4789        |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.001167241 |
|    clip_fraction        | 0.000244    |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.00133     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x79057d40c850>

In [23]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=289.92 +/- 20.413844327051642

mean_reward=289.92 +/- 20.413844327051642


In [24]:
# Save the Trial 4 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
model_name = "trial4_training_steps_2M"
model.save(model_name)


#### Trial 5: Increase n_envs to 32

In [25]:
# Reference: https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
# Per that guide, the default for a 1D observation space with PPO/A2C/DQN is a
# fully connected net of 2 layers x 64 units. This trial uses 4 layers x 32 units.

import torch as th

# Vectorized environment made of 32 LunarLander copies (up from 16)
env = make_vec_env("LunarLander-v2", n_envs=32)

# ReLU activations; identical layer sizes for the policy (pi) and value (vf) networks
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [32, 32, 32, 32], "vf": [32, 32, 32, 32]},
}

model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device


In [26]:
# Train it for 2,000,000 timesteps
model.learn(total_timesteps=2_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.3     |
|    ep_rew_mean     | -174     |
| time/              |          |
|    fps             | 10122    |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 32768    |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 88.4         |
|    ep_rew_mean          | -158         |
| time/                   |              |
|    fps                  | 5115         |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0038576364 |
|    clip_fraction        | 0.0322       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | 0.00121      |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x79057d3e1900>

In [27]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=281.41 +/- 21.365325985827038

mean_reward=281.41 +/- 21.365325985827038


In [None]:
# Save the Trial 5 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
# NOTE(review): this cell has no execution count, so the model may not have
# been saved in the recorded run — confirm.
model_name = "trial5_nenv_32"
model.save(model_name)


#### Trial 6: Try diff combinations of hidden units

In [31]:
# Reference: https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html
# Per that guide, the default for a 1D observation space with PPO/A2C/DQN is a
# fully connected net of 2 layers x 64 units. This trial uses four layers of
# 32, 32, 48 and 48 units.

import torch as th

# Vectorized environment made of 32 LunarLander copies
env = make_vec_env("LunarLander-v2", n_envs=32)

# ReLU activations; identical layer sizes for the policy (pi) and value (vf) networks
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [32, 32, 48, 48], "vf": [32, 32, 48, 48]},
}

model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
)


Using cuda device


In [32]:
# Train it for 4,000,000 timesteps
model.learn(total_timesteps=4_000_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 91.4     |
|    ep_rew_mean     | -190     |
| time/              |          |
|    fps             | 8135     |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 32768    |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 106        |
|    ep_rew_mean          | -144       |
| time/                   |            |
|    fps                  | 4685       |
|    iterations           | 2          |
|    time_elapsed         | 13         |
|    total_timesteps      | 65536      |
| train/                  |            |
|    approx_kl            | 0.00943584 |
|    clip_fraction        | 0.0262     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.38      |
|    explained_variance   | -0.00012   |
|    learning_rate        | 0.0003     |
|   

<stable_baselines3.ppo.ppo.PPO at 0x79057c96bbb0>

In [33]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=295.27 +/- 12.753986687040257

mean_reward=295.27 +/- 12.753986687040257


In [34]:
# Save the Trial 6 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
model_name = "trial6_MLP_hid_units_32_48_4M"
model.save(model_name)


#### Trial 7: Usual env with all other parameters kept the same

In [37]:
# Imported in this cell, like the other custom-architecture trial cells, so the
# cell does not depend on an earlier trial cell having been run first.
import torch as th

# Trial 7: single, non-vectorized environment
env = gym.make('LunarLander-v2')

# ReLU activations; 4 layers x 32 units for both the policy (pi) and value (vf) networks
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=dict(pi=[32, 32, 32, 32], vf=[32, 32, 32, 32]))

model = PPO(
    policy="MlpPolicy",
    env=env,
    policy_kwargs=policy_kwargs,
    n_steps=1024,
    batch_size=64,
    n_epochs=4,
    gamma=0.999,
    gae_lambda=0.98,
    ent_coef=0.01,
    verbose=1,
)


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [38]:
# Train it for 4,000,000 timesteps
# Takes twice the time to train (56m)
model.learn(total_timesteps=4_000_000)



---------------------------------
| rollout/           |          |
|    ep_len_mean     | 92.2     |
|    ep_rew_mean     | -163     |
| time/              |          |
|    fps             | 1534     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 92           |
|    ep_rew_mean          | -168         |
| time/                   |              |
|    fps                  | 1283         |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0002738806 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -0.00288     |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x79057cb5f910>

In [39]:
# Fresh Monitor-wrapped environment used only for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
# Recorded result: mean_reward=285.77 +/- 14.376375214600117

mean_reward=285.77 +/- 14.376375214600117


In [None]:
# Save the Trial 7 model under a trial-specific name
# (generic name used previously, kept for reference: "ppo-LunarLander-v2")
# NOTE(review): this cell has no execution count, so the model may not have
# been saved in the recorded run — confirm.
model_name = "trial7_usual_MLP_hid_units_32_4M"
model.save(model_name)


## 4. Publish on HF Hub

In [8]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login


In [10]:
# Print the current working directory via a shell escape.
# NOTE(review): the saved output exposes an absolute local path — consider
# clearing it before sharing the notebook.
!pwd


/home/raghu/DL/topics/RL/unit1


In [11]:
# Path (without extension) of the saved model to publish
model_name = "models/ppo-LunarLander-v2"

# Load the saved PPO model and attach a fresh LunarLander environment to it
env = gym.make("LunarLander-v2")
model = PPO.load(model_name, env=env)


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [12]:
# Log in to the Hugging Face Hub from inside the notebook (renders a login widget)
notebook_login()
# Configure git globally to use the "store" credential helper.
# NOTE(review): per git's documentation this helper keeps credentials
# unencrypted on disk — confirm that is acceptable on this machine.
!git config --global credential.helper store


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
# Environment id and algorithm name recorded with the uploaded model
env_id = "LunarLander-v2"
model_architecture = "PPO"

# Target repository on the Hugging Face Hub, written as {organization}/{repo_name}
# (for instance ThomasSimonini/ppo-LunarLander-v2). Change it to a repository
# you are allowed to push to.
repo_id = "ra9hu/UT1-LunarLander-v2"

# Commit message attached to the upload
commit_message = "Upload PPO LunarLander-v2 trained agent"

# Evaluation environment created with render_mode="rgb_array" and wrapped in a DummyVecEnv
eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])

# Save, evaluate, record a video, build a model card and push everything to the Hub.
# `model` and `model_name` come from the cell that loaded the saved model.
package_to_hub(
    model=model,                            # trained model
    model_name=model_name,                  # name of the trained model
    model_architecture=model_architecture,  # algorithm used: PPO
    env_id=env_id,                          # name of the environment
    eval_env=eval_env,                      # evaluation environment
    repo_id=repo_id,                        # Hub repository id
    commit_message=commit_message,
)


[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m




Saving video to /tmp/tmpgfhbbmp4/-step-0-to-step-1000.mp4
Moviepy - Building video /tmp/tmpgfhbbmp4/-step-0-to-step-1000.mp4.
Moviepy - Writing video /tmp/tmpgfhbbmp4/-step-0-to-step-1000.mp4



ffmpeg version 9c33b2f Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 9.3.0 (crosstool-NG 1.24.0.133_b0863d8_dirty)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/x86_64-conda-linux-gnu-cc --disable-doc --disable-openssl --enable-avresample --enable-gnutls --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-libx264 --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/home/conda/feedstock_root/build_artifacts/ffmpeg_1627813612080/_build_env/bin/pkg-config
  libavutil      56. 51.100 / 56. 51.100
  libavcodec     58. 91.100 / 58. 91.100
  libavfor

Moviepy - Done !
Moviepy - video ready /tmp/tmpgfhbbmp4/-step-0-to-step-1000.mp4


frame= 1001 fps=0.0 q=-1.0 Lsize=     166kB time=00:00:19.96 bitrate=  68.3kbits/s speed=46.4x    
video:154kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 7.955625%
[libx264 @ 0x5fadced5fa80] frame I:5     Avg QP: 9.44  size:  1894
[libx264 @ 0x5fadced5fa80] frame P:279   Avg QP:21.35  size:   226
[libx264 @ 0x5fadced5fa80] frame B:717   Avg QP:23.23  size:   118
[libx264 @ 0x5fadced5fa80] consecutive B-frames:  1.2%  6.4% 10.5% 81.9%
[libx264 @ 0x5fadced5fa80] mb I  I16..4: 89.2%  4.7%  6.1%
[libx264 @ 0x5fadced5fa80] mb P  I16..4:  0.2%  0.4%  0.1%  P16..4:  1.9%  0.5%  0.2%  0.0%  0.0%    skip:96.7%
[libx264 @ 0x5fadced5fa80] mb B  I16..4:  0.0%  0.0%  0.0%  B16..8:  2.5%  0.3%  0.0%  direct: 0.1%  skip:97.0%  L0:54.9% L1:43.9% BI: 1.1%
[libx264 @ 0x5fadced5fa80] 8x8 transform intra:19.8% inter:16.8%
[libx264 @ 0x5fadced5fa80] coded y,uvDC,uvAC intra: 7.1% 9.9% 8.6% inter: 0.2% 0.3% 0.2%
[libx264 @ 0x5fadced5fa80] i16 v,h,dc,p: 84% 11%  5%  0%
[libx2

[38;5;4mℹ Pushing repo ra9hu/UT1-LunarLander-v2 to the Hugging Face Hub[0m


policy.optimizer.pth:   0%|          | 0.00/103k [00:00<?, ?B/s]

policy.pth:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_variables.pth:   0%|          | 0.00/864 [00:00<?, ?B/s]

ppo-LunarLander-v2.zip:   0%|          | 0.00/173k [00:00<?, ?B/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/ra9hu/UT1-LunarLander-v2/tree/main/[0m


CommitInfo(commit_url='https://huggingface.co/ra9hu/UT1-LunarLander-v2/commit/f6f72a0e694797f2186550426f6a063dc03541c9', commit_message='Upload PPO LunarLander-v2 trained agent', commit_description='', oid='f6f72a0e694797f2186550426f6a063dc03541c9', pr_url=None, pr_revision=None, pr_num=None)

## 5. Load a model from the Hub

In [5]:
from huggingface_sb3 import load_from_hub

# Hub repository and the name of the zipped model file inside it
repo_id = "satcos/ppo-LunarLander-v2"
filename = "ppo-LunarLander-v2.zip"

# Compatibility workaround carried over from the original notes: models pickled
# on Python 3.8 use pickle protocol 5 while Python 3.6/3.7 use protocol 4, so
# these replacement objects are handed to PPO.load().
# NOTE(review): the original note also mentions installing pickle5 at the start
# of the notebook; no such install is visible here — confirm it is still needed.
custom_objects = dict(
    learning_rate=0.0,
    lr_schedule=lambda _: 0.0,
    clip_range=lambda _: 0.0,
)

# Download the checkpoint, then load it and print the current vs. saved system info
checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(checkpoint, custom_objects=custom_objects, print_system_info=True)


ppo-LunarLander-v2.zip:   0%|          | 0.00/147k [00:00<?, ?B/s]

== CURRENT SYSTEM INFO ==
- OS: Linux-6.5.0-35-generic-x86_64-with-glibc2.35 # 35~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Tue May  7 09:00:52 UTC 2
- Python: 3.10.14
- Stable-Baselines3: 2.3.2
- PyTorch: 2.3.0+cu121
- GPU Enabled: True
- Numpy: 1.26.4
- Cloudpickle: 3.0.0
- Gymnasium: 0.29.1

== SAVED MODEL SYSTEM INFO ==
- OS: macOS-13.4.1-arm64-i386-64bit Darwin Kernel Version 22.5.0: Thu Jun  8 22:22:20 PDT 2023; root:xnu-8796.121.3~7/RELEASE_ARM64_T6000
- Python: 3.11.0
- Stable-Baselines3: 2.0.0a5
- PyTorch: 2.1.1
- GPU Enabled: False
- Numpy: 1.26.2
- Cloudpickle: 3.0.0
- Gymnasium: 0.28.1
- OpenAI Gym: 0.26.2



In [6]:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy

# Fresh Monitor-wrapped environment for evaluating the model loaded from the Hub
eval_env = Monitor(gym.make("LunarLander-v2"))

# Run 10 evaluation episodes with deterministic actions
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

# Report the mean and standard deviation of the episode reward
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")


mean_reward=265.97 +/- 25.634189946014846
