<a href="https://colab.research.google.com/github/rsglick/drl/blob/master/notebooks/Pendv0_stable_baselines3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
%%bash

apt-get install -y xvfb x11-utils python-opengl swig cmake ffmpeg freeglut3-dev

#pip install ray ray[rllib] ray[debug]
pip install Box2D box2d-py box2d-kengz gym[box2d] gym[Box_2D]\
            pyvirtualdisplay\
            PyOpenGL\
            piglet\
            piglet-templates\
            PyOpenGL-accelerate\
            stable-baselines3[extra]

Reading package lists...
Building dependency tree...
Reading state information...
x11-utils is already the newest version (7.7+3build1).
freeglut3-dev is already the newest version (2.8.1-3).
python-opengl is already the newest version (3.1.0+dfsg-1).
swig is already the newest version (3.0.12-1).
cmake is already the newest version (3.10.2-1ubuntu2.18.04.1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.


In [None]:
%matplotlib inline

In [None]:
import gym
import os
import matplotlib.pyplot as plt
import numpy as np
import torch

from stable_baselines3 import PPO, SAC, TD3, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor

# Create Continous Gym Environment




In [None]:
# LunarLanderContinuous-v2
# MountainCarContinuous-v0
# Pendulum-v0

env_name = "Pendulum-v0"
env = gym.make(env_name)

log_dir = f"./gym/{env_name}"
os.makedirs(log_dir, exist_ok=True)
env = Monitor(env, log_dir)
callback = EvalCallback(env, log_path=log_dir, eval_freq=1000, deterministic=False )

In [None]:
total_timesteps = 300000
eval_eps = 100

# SAC

In [51]:
%%time

sac_hyperparams = {
    'learning_rate': 3.0e-4,
    'buffer_size': 1000000,
    'gamma': 0.99,
    'batch_size':256,
    'tau': 0.005,
    'device':'cuda',
}


modelSAC = SAC('MlpPolicy', env, **sac_hyperparams)
modelSAC.learn(total_timesteps=total_timesteps, callback=callback)

# Save the agent

modelSAC.save(f"modelSAC_{env_name}")
del modelSAC

# Evaluate the trained agent
modelSAC = SAC.load(f"modelSAC_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelSAC, eval_env, n_eval_episodes=eval_eps, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=1000, episode_reward=-1624.39 +/- 116.92
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2000, episode_reward=-1218.19 +/- 138.33
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3000, episode_reward=-999.48 +/- 125.65
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4000, episode_reward=-678.43 +/- 53.70
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5000, episode_reward=-317.06 +/- 308.18
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6000, episode_reward=-444.85 +/- 642.90
Episode length: 200.00 +/- 0.00
Eval num_timesteps=7000, episode_reward=-299.90 +/- 417.47
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=8000, episode_reward=-167.36 +/- 120.11
Episode length: 200.00 +/- 0.00
New best mean reward!
Eval num_timesteps=9000, episode_reward=-242.61 +/- 205.02
Episode length: 200.00 +/- 0.00
Eval num_timesteps=10000, 

# PPO

In [52]:
%%time
modelPPO = PPO('MlpPolicy', env)
modelPPO.learn(total_timesteps=total_timesteps, callback=callback)

modelPPO.save(f"modelPPO_{env_name}")
del modelPPO

# Evaluate the trained agent
modelPPO = PPO.load(f"modelPPO_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelPPO, eval_env, n_eval_episodes=eval_eps, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=1000, episode_reward=-1126.78 +/- 266.82
Episode length: 200.00 +/- 0.00
Eval num_timesteps=2000, episode_reward=-1237.89 +/- 177.25
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3000, episode_reward=-999.61 +/- 200.37
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4000, episode_reward=-1184.12 +/- 335.59
Episode length: 200.00 +/- 0.00
Eval num_timesteps=5000, episode_reward=-1212.00 +/- 300.23
Episode length: 200.00 +/- 0.00
Eval num_timesteps=6000, episode_reward=-886.64 +/- 103.85
Episode length: 200.00 +/- 0.00
Eval num_timesteps=7000, episode_reward=-1100.02 +/- 218.16
Episode length: 200.00 +/- 0.00
Eval num_timesteps=8000, episode_reward=-1033.72 +/- 57.25
Episode length: 200.00 +/- 0.00
Eval num_timesteps=9000, episode_reward=-1116.48 +/- 179.61
Episode length: 200.00 +/- 0.00
Eval num_timesteps=10000, episode_reward=-1198.84 +/- 169.95
Episode length: 200.00 +/- 0.00
Eval num_timesteps=11000, episode_reward=-1118.08 +/- 152.43
Episode length: 200.0

# TD3

In [53]:
%%time
modelTD3 = TD3('MlpPolicy', env)
modelTD3.learn(total_timesteps=total_timesteps, callback=callback)

modelTD3.save(f"modelTD3_{env_name}")
del modelTD3


# Evaluate the trained agent
modelTD3 = TD3.load(f"modelTD3_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelTD3, eval_env, n_eval_episodes=eval_eps, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=944, episode_reward=-1565.20 +/- 60.47
Episode length: 200.00 +/- 0.00
Eval num_timesteps=1944, episode_reward=-1323.69 +/- 82.17
Episode length: 200.00 +/- 0.00
Eval num_timesteps=2944, episode_reward=-1190.94 +/- 185.98
Episode length: 200.00 +/- 0.00
Eval num_timesteps=3944, episode_reward=-1120.72 +/- 46.43
Episode length: 200.00 +/- 0.00
Eval num_timesteps=4944, episode_reward=-657.69 +/- 209.79
Episode length: 200.00 +/- 0.00
Eval num_timesteps=5944, episode_reward=-454.23 +/- 229.81
Episode length: 200.00 +/- 0.00
Eval num_timesteps=6944, episode_reward=-1028.32 +/- 139.24
Episode length: 200.00 +/- 0.00
Eval num_timesteps=7944, episode_reward=-513.98 +/- 511.17
Episode length: 200.00 +/- 0.00
Eval num_timesteps=8944, episode_reward=-76.46 +/- 99.26
Episode length: 200.00 +/- 0.00
Eval num_timesteps=9944, episode_reward=-283.24 +/- 254.53
Episode length: 200.00 +/- 0.00
Eval num_timesteps=10944, episode_reward=-430.64 +/- 598.63
Episode length: 200.00 +/- 0.00

# Record agents in action


In [None]:
# Set up fake display; otherwise rendering will fail
import os
os.system("Xvfb :1 -screen 0 1024x768x24 &")
os.environ['DISPLAY'] = ':1'

import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the video, showing only the only starting with this prefix
  """
  html = []
  for mp4 in Path(video_path).glob("{}*.mp4".format(prefix)):
      video_b64 = base64.b64encode(mp4.read_bytes())
      html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))


from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  :param env_id: (str)
  :param model: (RL model)
  :param video_length: (int)
  :param prefix: (str)
  :param video_folder: (str)
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Start the video at step=0 and record 500 steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  obs = eval_env.reset()
  for _ in range(video_length):
    action, _ = model.predict(obs)
    obs, _, _, _ = eval_env.step(action)

  # Close the video recorder
  eval_env.close()

In [55]:
video_folder = f"{env_name}_output"

record_video(env_name, modelTD3, video_length=1000, prefix=f'td3_{env_name}', video_folder=video_folder)
record_video(env_name, modelPPO, video_length=1000, prefix=f'ppo_{env_name}', video_folder=video_folder)
record_video(env_name, modelSAC, video_length=1000, prefix=f'sac_{env_name}', video_folder=video_folder)

Saving video to  /content/Pendulum-v0_output/td3_Pendulum-v0-step-0-to-step-1000.mp4
Saving video to  /content/Pendulum-v0_output/ppo_Pendulum-v0-step-0-to-step-1000.mp4
Saving video to  /content/Pendulum-v0_output/sac_Pendulum-v0-step-0-to-step-1000.mp4


In [56]:
show_videos(video_folder, prefix=f'sac_{env_name}')

In [57]:
show_videos(video_folder, prefix=f'td3_{env_name}')

In [58]:
show_videos(video_folder, prefix=f'ppo_{env_name}')

In [59]:
os.system(f"mv modelPPO_{env_name}.zip {video_folder}")
os.system(f"mv modelSAC_{env_name}.zip {video_folder}")
os.system(f"mv modelTD3_{env_name}.zip {video_folder}")

0

In [60]:
os.system(f"tar -cvzf {video_folder}.tar.gz {video_folder}")

0

In [None]:
import time
time.sleep(3)

In [62]:
from google.colab import files
files.download(f'/content/{video_folder}.tar.gz')

MessageError: ignored

In [None]:
#%%bash
#rm -rf /content/*