<a target="_blank" href="https://colab.research.google.com/github/rcpaffenroth/dac_raghu/blob/main/LunarLander.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [11]:
! apt-get install build-essential swig
! pip install stable-baselines3[extra] gymnasium[box2d]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
swig is already the newest version (4.0.2-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.


In [12]:
import gymnasium as gym
import matplotlib.pylab as py
import numpy as np

import imageio
from stable_baselines3 import PPO

from IPython import display
%matplotlib inline

In [27]:
from IPython.display import HTML
from base64 import b64encode

In [13]:
# Make the environment. With render_mode='rgb_array', env.render() returns each
# frame as a numpy array instead of opening a window.
env = gym.make("LunarLander-v2", render_mode='rgb_array')

# Gymnasium's reset() returns an (observation, info) pair, so unpack it rather
# than binding the whole tuple to `observation`.
observation, info = env.reset()
# TODO: there are warnings here that should be fixed at some point.

In [14]:
# Build a PPO agent with the "MlpPolicy" network on `env`. It starts out untrained;
# learn() then trains it for the requested number of environment timesteps.
TRAIN_TIMESTEPS = 10000
model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(total_timesteps=TRAIN_TIMESTEPS)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.7     |
|    ep_rew_mean     | -203     |
| time/              |          |
|    fps             | 606      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 95.1         |
|    ep_rew_mean          | -164         |
| time/                   |              |
|    fps                  | 255          |
|    iterations           | 2            |
|    time_elapsed         | 16           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0064943526 |
|    clip_fraction        | 0.0209       |
|    clip_range           | 0.2          |
|    en

<stable_baselines3.ppo.ppo.PPO at 0x79dbd07304c0>

In [15]:
# Make a movie of a random agent: roll out one episode with random actions,
# collecting one rendered frame per step in `images`.
env.reset()
images = []
done = False
while not done:
    # This rendering mode puts an image into a numpy array
    images.append(env.render())
    # Take a random action. Gymnasium signals the end of an episode with two
    # separate flags: `terminated` (the task itself ended) and `truncated`
    # (cut off externally, e.g. by a step limit). Either one must stop the loop,
    # otherwise a truncated episode would keep being stepped forever.
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    done = terminated or truncated
    print("reward: %s" % reward)
env.close()

reward: -0.16165973136647266
reward: 0.8174315557882437
reward: -0.09909448332470447
reward: -0.4150950672877787
reward: -0.27355681499028495
reward: -1.3824400091275766
reward: -2.178687945787658
reward: -1.9630088988246814
reward: 2.6660139159556993
reward: -2.8090356494018054
reward: -2.1273212453837687
reward: 2.287352547355579
reward: -2.1344436391873103
reward: -1.9805159364798033
reward: -2.2222730599861777
reward: 0.25448410908962843
reward: -2.5074475257705444
reward: 3.3416489906774247
reward: -2.6524175532905745
reward: 0.6954126535724299
reward: -2.3379139050771016
reward: -2.444184063958487
reward: -1.9755620649938226
reward: -2.2567618812619785
reward: -2.0814106808823567
reward: -1.9599470541435255
reward: 3.323521183947901
reward: -1.7828083635559995
reward: -1.4557473512290926
reward: 1.8674272635922022
reward: -1.4807992690944616
reward: -1.4485542735450554
reward: -1.188771142178722
reward: -1.0115902555706373
reward: -0.8723748595380119
reward: -0.9197773358556833
r

In [16]:
# Number of frames captured during the rollout above
len(images)

84

In [17]:
# Array shape of the first captured frame
images[0].shape

(400, 600, 3)

In [29]:
# imageio is a nice library for taking a sequence of images and making a movie
name = 'tmp.mp4'
imageio.mimsave(name, images, fps=15)
# Read the encoded file back through a context manager so the handle is closed
# as soon as the bytes are in memory.
with open(name, 'rb') as video_file:
    mp4 = video_file.read()
# Inline the video as a base64 data URL so the <video> tag carries its own data.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)



In [19]:
# Make a movie of a trained agent: roll out one episode with the policy in `model`,
# collecting one rendered frame per step in `images`.
obs, info = env.reset()  # reset() returns (observation, info)
images = []
done = False
while not done:
    # This rendering mode puts an image into a numpy array
    images.append(env.render())
    print(obs)
    action, _states = model.predict(obs, deterministic=True)
    # The episode is over when it either terminates or is truncated (e.g. by a
    # step limit); checking only the first flag could leave the loop running.
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print("Reward:", reward)
    # No reset here: the loop exits as soon as the episode ends, and the old
    # `obs = env.reset()` bound an (observation, info) tuple to `obs`.
env.close()

[ 0.00546808  1.4221886   0.55384046  0.50080824 -0.00632933 -0.12545316
  0.          0.        ]
Reward: -6.16419261968274
[ 0.01110649  1.4344361   0.56931484  0.5442987  -0.01174042 -0.10823049
  0.          0.        ]
Reward: -4.000323352689679
[ 0.01677427  1.4472262   0.5721475   0.5683963  -0.01705114 -0.10622434
  0.          0.        ]
Reward: -3.7624728120467408
[ 0.02259159  1.4602325   0.5864284   0.57799625 -0.0216884  -0.09275381
  0.          0.        ]
Reward: -3.0036627134857836
[ 0.02840357  1.4735334   0.58597285  0.5910741  -0.02641362 -0.0945129
  0.          0.        ]
Reward: -5.724814096495822
[ 0.03433638  1.4876995   0.5975944   0.62952095 -0.03068372 -0.08540999
  0.          0.        ]
Reward: -2.1795707735202425
[ 0.0401063   1.5021591   0.5821842   0.64253515 -0.03581773 -0.10268966
  0.          0.        ]
Reward: -4.854808537260271
[ 0.04605827  1.5170557   0.59962416  0.6619585  -0.04020175 -0.0876882
  0.          0.        ]
Reward: -5.06927166

In [20]:
# Encode the collected frames as an mp4 and display it in the notebook.
imageio.mimsave('tmp2.mp4', images, fps=15)
# embed=True stores the video bytes in the cell output itself; without it only a
# link to the local file is emitted, which hosted front ends (e.g. Colab) may
# not be able to resolve.
display.Video('tmp2.mp4', embed=True)



In [21]:
from huggingface_sb3 import load_from_hub

# NOTE: this cell needs the `huggingface_sb3` package; the saved run of it
# failed with ModuleNotFoundError.
# Fetch a PPO LunarLander checkpoint (presumably already trained) from the
# Hugging Face Hub and load it, replacing the `model` trained earlier.
HUB_REPO_ID = "MalarzDawid/ppo-LunarLandar-v2"
HUB_FILENAME = "ppo-LunarLander-v2.zip"
checkpoint = load_from_hub(repo_id=HUB_REPO_ID, filename=HUB_FILENAME)

model = PPO.load(checkpoint)

ModuleNotFoundError: ignored

In [None]:
# Make a movie of a trained agent: roll out one episode with the policy in `model`,
# collecting one rendered frame per step in `images`.
obs, info = env.reset()  # reset() returns (observation, info)
images = []
done = False
while not done:
    # This rendering mode puts an image into a numpy array
    images.append(env.render())
    print(obs)
    action, _states = model.predict(obs, deterministic=True)
    # The episode is over when it either terminates or is truncated (e.g. by a
    # step limit); checking only the first flag could leave the loop running.
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    print("Reward:", reward)
    # No reset here: the loop exits as soon as the episode ends, and the old
    # `obs = env.reset()` bound an (observation, info) tuple to `obs`.
env.close()

In [None]:
# Encode the collected frames as an mp4 and display it in the notebook.
imageio.mimsave('tmp3.mp4', images, fps=15)
# embed=True stores the video bytes in the cell output itself; without it only a
# link to the local file is emitted, which hosted front ends (e.g. Colab) may
# not be able to resolve.
display.Video('tmp3.mp4', embed=True)

# Recording trajectory

In [None]:
# Run 10 episodes with the policy in `model`, record the lander's path in each,
# and save one trajectory plot per episode as movie<i>.png.
for i in range(10):
    obs, info = env.reset()  # reset() returns (observation, info)
    done = False
    x = []
    y = []
    while not done:
        # The first two observation entries are the lander's x and y coordinates
        x.append(obs[0])
        y.append(obs[1])
        action, _states = model.predict(obs, deterministic=True)
        # Stop on either end-of-episode flag; ignoring `truncated` could leave
        # the loop stepping a finished episode indefinitely.
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
    py.clf()
    py.plot(x, y)
    py.xlabel('x position')
    py.ylabel('y position')
    py.title('Episode %d trajectory' % i)
    py.savefig('movie'+str(i)+'.png')
# Close the environment once, after all episodes, instead of once per episode
# while it is still being reused.
env.close()

In [None]:
# Show the trajectory of the last episode recorded by the loop above
# (x and y still hold the values from its final iteration).
py.plot(x,y)