<a href="https://colab.research.google.com/github/rsglick/drl/blob/master/notebooks/LLCv2_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash

# System packages used for headless rendering and video recording
# (virtual X server, X11 utilities, OpenGL bindings, ffmpeg) plus build tools.
apt-get install -y xvfb x11-utils python-opengl swig cmake ffmpeg freeglut3-dev

#pip install ray ray[rllib] ray[debug]

# Requirement specifiers with extras are quoted so the shell can never treat
# the square brackets as a glob pattern. The former "gym[Box_2D]" entry was
# dropped: it is a misspelling of the "box2d" extra that is already requested.
# NOTE(review): "piglet" / "piglet-templates" look like a typo for "pyglet"
# (the rendering library) -- confirm before removing or renaming them.
pip install Box2D box2d-py box2d-kengz "gym[box2d]" \
            pyvirtualdisplay \
            PyOpenGL \
            piglet \
            piglet-templates \
            PyOpenGL-accelerate \
            "stable-baselines3[extra]"

Reading package lists...
Building dependency tree...
Reading state information...
freeglut3-dev is already the newest version (2.8.1-3).
freeglut3-dev set to manually installed.
cmake is already the newest version (3.10.2-1ubuntu2.18.04.1).
ffmpeg is already the newest version (7:3.4.6-0ubuntu0.18.04.1).
The following additional packages will be installed:
  libxxf86dga1 swig3.0
Suggested packages:
  libgle3 swig-doc swig-examples swig3.0-examples swig3.0-doc mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 python-opengl swig swig3.0 x11-utils xvfb
0 upgraded, 6 newly installed, 0 to remove and 31 not upgraded.
Need to get 2,590 kB of archives.
After this operation, 14.2 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import gym
import os
import matplotlib.pyplot as plt
import numpy as np
import torch

from stable_baselines3 import PPO, SAC, TD3, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor

# Create Continuous Gym Environment




In [None]:
# Continuous-control environments this notebook has been pointed at:
#   LunarLanderContinuous-v2
#   MountainCarContinuous-v0
#   Pendulum-v0
env_name = "LunarLanderContinuous-v2"

# Folder receiving the training Monitor logs and the EvalCallback results.
log_dir = f"./gym/{env_name}"
os.makedirs(log_dir, exist_ok=True)

# Training environment; Monitor records per-episode statistics into log_dir.
env = Monitor(gym.make(env_name), log_dir)

# The periodic evaluation must run on its own environment instance. Passing
# the training environment to EvalCallback would reset and step it in the
# middle of a training rollout (corrupting the transitions being collected)
# and would log evaluation episodes as if they were training episodes.
callback_eval_env = Monitor(gym.make(env_name))
callback = EvalCallback(callback_eval_env, log_path=log_dir, eval_freq=1000, deterministic=False)

In [None]:
# Training budget (environment steps) shared by every algorithm below.
total_timesteps = 300_000

# SAC

In [6]:
%%time

sac_hyperparams = {
    'learning_rate': 3.0e-4,
    'buffer_size': 1000000,
    'gamma': 0.99,
    'batch_size':256,
    'tau': 0.005,
    'device':'cuda',
}


modelSAC = SAC('MlpPolicy', env, **sac_hyperparams)
modelSAC.learn(total_timesteps=total_timesteps, callback=callback)

# Save the agent

modelSAC.save(f"modelSAC_{env_name}")
del modelSAC

# Evaluate the trained agent
modelSAC = SAC.load(f"modelSAC_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelSAC, eval_env, n_eval_episodes=100, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=1000, episode_reward=-186.71 +/- 129.50
Episode length: 222.00 +/- 95.43
New best mean reward!
Eval num_timesteps=2000, episode_reward=-235.66 +/- 74.55
Episode length: 209.40 +/- 54.00
Eval num_timesteps=3000, episode_reward=-148.82 +/- 81.35
Episode length: 323.60 +/- 164.29
New best mean reward!
Eval num_timesteps=4000, episode_reward=-202.91 +/- 108.46
Episode length: 473.60 +/- 217.84
Eval num_timesteps=5000, episode_reward=-202.64 +/- 106.57
Episode length: 501.20 +/- 130.05
Eval num_timesteps=6000, episode_reward=-153.32 +/- 171.29
Episode length: 762.80 +/- 245.25
Eval num_timesteps=7000, episode_reward=-191.05 +/- 48.83
Episode length: 876.20 +/- 131.97
Eval num_timesteps=8000, episode_reward=-209.86 +/- 24.00
Episode length: 795.80 +/- 36.93
Eval num_timesteps=9000, episode_reward=-2.77 +/- 100.25
Episode length: 511.40 +/- 158.11
New best mean reward!
Eval num_timesteps=10000, episode_reward=-60.11 +/- 74.60
Episode length: 929.80 +/- 140.40
Eval num_times

# PPO

In [7]:
%%time
modelPPO = PPO('MlpPolicy', env)
modelPPO.learn(total_timesteps=total_timesteps, callback=callback)

modelPPO.save(f"modelPPO_{env_name}")
del modelPPO

# Evaluate the trained agent
modelPPO = PPO.load(f"modelPPO_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelPPO, eval_env, n_eval_episodes=100, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=1000, episode_reward=-270.95 +/- 111.11
Episode length: 106.60 +/- 27.22
Eval num_timesteps=2000, episode_reward=-406.65 +/- 121.40
Episode length: 126.00 +/- 33.47
Eval num_timesteps=3000, episode_reward=-181.42 +/- 110.64
Episode length: 127.00 +/- 33.18
Eval num_timesteps=4000, episode_reward=-313.30 +/- 110.76
Episode length: 104.00 +/- 12.51
Eval num_timesteps=5000, episode_reward=-242.38 +/- 111.63
Episode length: 119.60 +/- 21.45
Eval num_timesteps=6000, episode_reward=-261.27 +/- 142.69
Episode length: 128.60 +/- 30.92
Eval num_timesteps=7000, episode_reward=-119.95 +/- 147.14
Episode length: 304.80 +/- 348.63
Eval num_timesteps=8000, episode_reward=-240.69 +/- 97.06
Episode length: 119.40 +/- 9.69
Eval num_timesteps=9000, episode_reward=-189.34 +/- 110.99
Episode length: 106.80 +/- 28.65
Eval num_timesteps=10000, episode_reward=-177.14 +/- 63.71
Episode length: 111.20 +/- 10.46
Eval num_timesteps=11000, episode_reward=-108.70 +/- 96.03
Episode length: 103.40

# TD3

In [8]:
%%time
modelTD3 = TD3('MlpPolicy', env)
modelTD3.learn(total_timesteps=total_timesteps, callback=callback)

modelTD3.save(f"modelTD3_{env_name}")
del modelTD3


# Evaluate the trained agent
modelTD3 = TD3.load(f"modelTD3_{env_name}")

eval_env = gym.make(env_name)

mean_reward, std_reward = evaluate_policy(modelTD3, eval_env, n_eval_episodes=100, deterministic=False)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

Eval num_timesteps=944, episode_reward=-975.78 +/- 362.51
Episode length: 178.20 +/- 69.36
Eval num_timesteps=1944, episode_reward=-918.11 +/- 418.00
Episode length: 150.60 +/- 53.84
Eval num_timesteps=2944, episode_reward=-385.26 +/- 128.08
Episode length: 106.20 +/- 12.84
Eval num_timesteps=3944, episode_reward=-290.24 +/- 166.50
Episode length: 155.00 +/- 69.66
Eval num_timesteps=4944, episode_reward=-190.16 +/- 94.77
Episode length: 207.80 +/- 46.87
Eval num_timesteps=5944, episode_reward=-361.33 +/- 43.29
Episode length: 168.20 +/- 41.44
Eval num_timesteps=6944, episode_reward=-274.75 +/- 40.40
Episode length: 194.20 +/- 33.99
Eval num_timesteps=7944, episode_reward=-247.46 +/- 23.51
Episode length: 173.40 +/- 68.99
Eval num_timesteps=8944, episode_reward=-312.34 +/- 49.83
Episode length: 208.40 +/- 46.16
Eval num_timesteps=9944, episode_reward=-267.02 +/- 59.33
Episode length: 240.20 +/- 95.50
Eval num_timesteps=10944, episode_reward=-262.81 +/- 53.16
Episode length: 250.60 +/- 1

# Record agents in action


In [None]:
# Set up a fake display; otherwise rendering will fail on a headless machine.
import os
# Start an Xvfb server for display :1 with a 1024x768, 24-bit screen. The
# trailing "&" backgrounds it, so os.system returns without waiting for it.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process (and anything it spawns) at that virtual display.
os.environ['DISPLAY'] = ':1'

import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Display every matching ``.mp4`` file inline as an autoplaying, looping HTML video.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only the ones whose file name starts with this prefix
  """
  html = []
  # Sort the matches: glob yields files in filesystem order, which would make
  # the order of the displayed videos vary from one machine/run to another.
  for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
      # Embed the file contents as a base64 data URI inside the <video> tag.
      video_b64 = base64.b64encode(mp4.read_bytes())
      html.append('''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''.format(mp4, video_b64.decode('ascii')))
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))


from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out ``model`` in a freshly created ``env_id`` environment while recording a video.

  :param env_id: (str) Gym environment id passed to ``gym.make``
  :param model: (RL model) Agent whose ``predict`` method selects the actions
  :param video_length: (int) Number of environment steps to roll out and record
  :param prefix: (str) Name prefix handed to the video recorder
  :param video_folder: (str) Folder handed to the video recorder for its output
  """
  eval_env = DummyVecEnv([lambda: gym.make(env_id)])
  # Trigger the recording at step 0 and capture ``video_length`` steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _ = model.predict(obs)
      obs, _, _, _ = eval_env.step(action)
  finally:
    # Close the video recorder even when the rollout raises, so the recorder
    # and the wrapped environment are always released.
    eval_env.close()

In [16]:
# Every recording (and, later, the saved models) ends up in this folder.
video_folder = f"{env_name}_output"

# One 1000-step recording per trained agent, in the order TD3, PPO, SAC.
for tag, agent in (("td3", modelTD3), ("ppo", modelPPO), ("sac", modelSAC)):
    record_video(env_name, agent, video_length=1000, prefix=f'{tag}_{env_name}', video_folder=video_folder)

Saving video to  /content/LunarLanderContinuous-v2_output/td3_LunarLanderContinuous-v2-step-0-to-step-1000.mp4
Saving video to  /content/LunarLanderContinuous-v2_output/ppo_LunarLanderContinuous-v2-step-0-to-step-1000.mp4
Saving video to  /content/LunarLanderContinuous-v2_output/sac_LunarLanderContinuous-v2-step-0-to-step-1000.mp4


In [17]:
# Play back the SAC agent's recording.
show_videos(video_path=video_folder, prefix=f'sac_{env_name}')

In [18]:
# Play back the TD3 agent's recording.
show_videos(video_path=video_folder, prefix=f'td3_{env_name}')

In [19]:
# Play back the PPO agent's recording.
show_videos(video_path=video_folder, prefix=f'ppo_{env_name}')

In [20]:
# Move each saved model archive into the output folder so that it is bundled
# with the recordings. os.replace is used instead of a shell "mv" so that a
# missing file raises an error here instead of being reported only through an
# ignored exit status, and so that no shell command is built from strings.
os.makedirs(video_folder, exist_ok=True)
for algo in ("PPO", "SAC", "TD3"):
    model_zip = f"model{algo}_{env_name}.zip"
    os.replace(model_zip, os.path.join(video_folder, model_zip))

0

In [21]:
# Pack the output folder into "<video_folder>.tar.gz". tarfile is used instead
# of a shell "tar" call so that failures raise instead of being reported only
# through an ignored exit status, and so that no shell command is built from
# strings.
import tarfile

with tarfile.open(f"{video_folder}.tar.gz", "w:gz") as archive:
    archive.add(video_folder)

0

In [None]:
import time
# Pause for three seconds before the next cell runs.
# NOTE(review): presumably meant to let the archive finish being written
# before it is downloaded -- confirm whether this wait is still needed.
time.sleep(3)

In [None]:
from google.colab import files

# The archive is created relative to the current working directory, so it is
# referenced the same way here instead of through a hard-coded "/content/"
# prefix that is only correct when the working directory is /content.
files.download(f'{video_folder}.tar.gz')