In [1]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1, 1))
display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [2]:
def evaluate_2(model, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes
    :return: (float) Mean reward for the given number of episodes
    """
    episode_rewards = []
    obs = env.reset()
    for i in range(num_episodes):
        episode_rewards.append(0.0)
        done = False
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # Stats
            episode_rewards[-1] += reward
            if done:
                obs = env.reset()
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward, episode_rewards

In [3]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import PPO2
from stable_baselines.common import make_vec_env, set_global_seeds

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])

In [5]:
%%time
model = PPO2(MlpPolicy, env, n_steps=2000, verbose=1, nminibatches=20)
model.learn(total_timesteps=50000, log_interval=10)





Instructions for updating:
Use keras.layers.flatten instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


--------------------------------------
| approxkl           | 0.0024928835  |
| clipfrac           | 0.019750003   |
| explained_variance | -0.0075       |
| fps                | 891           |
| n_updates          | 1             |
| policy_entropy     | 5.677415      |
| policy_loss        | -0.0052104895 |
| serial_timesteps   | 2000          |
| time_elapsed       | 2.15e-06      |
| total_timesteps    | 2000          |
| value_loss         | 44.8448       |
--------------------------------------
--------------------------------------
| approxkl           | 0.0033729482  |
| clipfrac           | 0.039249986   |
| explained_variance | 0.00541       |
| fps                | 1021          |
| n_updates          | 10            |
| policy_entropy     | 5.7244296     |
| policy_loss        | -0.0052582296 |
| serial_timesteps

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb5a6488c50>

In [6]:
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines import PPO2

def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the inital seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [7]:
env_id = "BipedalWalker-v2"
num_cpu = 20  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [8]:
%%time
model = PPO2(MlpPolicy, env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)
model.learn(total_timesteps=50000, log_interval=10)

--------------------------------------
| approxkl           | 0.0007212687  |
| clipfrac           | 0.001125      |
| explained_variance | 0.00179       |
| fps                | 2742          |
| n_updates          | 1             |
| policy_entropy     | 5.679295      |
| policy_loss        | -0.0019351933 |
| serial_timesteps   | 100           |
| time_elapsed       | 3.1e-06       |
| total_timesteps    | 2000          |
| value_loss         | 251.99869     |
--------------------------------------
-------------------------------------
| approxkl           | 0.0026664075 |
| clipfrac           | 0.022499995  |
| explained_variance | 0.0121       |
| fps                | 4428         |
| n_updates          | 10           |
| policy_entropy     | 5.7602496    |
| policy_loss        | -0.00424704  |
| serial_timesteps   | 1000         |
| time_elapsed       | 4.41         |
| total_timesteps    | 20000        |
| value_loss         | 27.207891    |
-------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb5a6474a50>

## mp train for 500k ts

In [9]:
env_id = "BipedalWalker-v2"
num_cpu = 20  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [10]:
%%time
model = PPO2(MlpPolicy, env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)
model.learn(total_timesteps=500000, log_interval=10)

-------------------------------------
| approxkl           | 0.0018225571 |
| clipfrac           | 0.011375     |
| explained_variance | 0.0013       |
| fps                | 2861         |
| n_updates          | 1            |
| policy_entropy     | 5.680061     |
| policy_loss        | -0.00500138  |
| serial_timesteps   | 100          |
| time_elapsed       | 3.81e-06     |
| total_timesteps    | 2000         |
| value_loss         | 296.62463    |
-------------------------------------
-------------------------------------
| approxkl           | 0.005336808  |
| clipfrac           | 0.063499995  |
| explained_variance | -0.0912      |
| fps                | 4373         |
| n_updates          | 10           |
| policy_entropy     | 5.635414     |
| policy_loss        | -0.007771574 |
| serial_timesteps   | 1000         |
| time_elapsed       | 4.35         |
| total_timesteps    | 20000        |
| value_loss         | 0.12786123   |
-------------------------------------
------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb5a646ebd0>

In [11]:
model.save("ppo2_parallel_test")

In [12]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test", env=env, nminibatches=16)

In [13]:
evaluate_2(model, 10)[0]

Mean reward: 134.0 Num episodes: 10


134.0

## mp train for 1M ts

In [14]:
env_id = "BipedalWalker-v2"
num_cpu = 20  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [15]:
model = PPO2.load("ppo2_parallel_test", env=env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)

In [16]:
%%time
model.learn(total_timesteps=500000, log_interval=10)

-------------------------------------
| approxkl           | 0.0050668074 |
| clipfrac           | 0.05674998   |
| explained_variance | 0.388        |
| fps                | 2879         |
| n_updates          | 1            |
| policy_entropy     | 6.283976     |
| policy_loss        | -0.004145724 |
| serial_timesteps   | 100          |
| time_elapsed       | 2.86e-06     |
| total_timesteps    | 2000         |
| value_loss         | 167.97058    |
-------------------------------------
--------------------------------------
| approxkl           | 0.006713338   |
| clipfrac           | 0.088374995   |
| explained_variance | 0.862         |
| fps                | 4692          |
| n_updates          | 10            |
| policy_entropy     | 6.3188453     |
| policy_loss        | -0.0050673233 |
| serial_timesteps   | 1000          |
| time_elapsed       | 4.13          |
| total_timesteps    | 20000         |
| value_loss         | 0.2930725     |
--------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb5387ab190>

In [17]:
model.save("ppo2_parallel_test_1m")

In [18]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test", env=env, nminibatches=16)

In [19]:
evaluate_2(model, 10)[0]

Mean reward: 141.8 Num episodes: 10


141.8

## mp train for 5M ts

In [20]:
env_id = "BipedalWalker-v2"
num_cpu = 20  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [21]:
model = PPO2.load("ppo2_parallel_test_1m", env=env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)

In [24]:
%%time
model.learn(total_timesteps=int(4e6), log_interval=10)

-------------------------------------
| approxkl           | 0.0055902456 |
| clipfrac           | 0.06312499   |
| explained_variance | 0.854        |
| fps                | 3084         |
| n_updates          | 1            |
| policy_entropy     | 7.398677     |
| policy_loss        | -0.003749938 |
| serial_timesteps   | 100          |
| time_elapsed       | 3.34e-06     |
| total_timesteps    | 2000         |
| value_loss         | 46.085274    |
-------------------------------------
-------------------------------------
| approxkl           | 0.008633566  |
| clipfrac           | 0.12087498   |
| explained_variance | 0.826        |
| fps                | 4606         |
| n_updates          | 10           |
| policy_entropy     | 7.467008     |
| policy_loss        | -0.008168411 |
| serial_timesteps   | 1000         |
| time_elapsed       | 4.3          |
| total_timesteps    | 20000        |
| value_loss         | 12.933058    |
-------------------------------------
------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb4d86a2910>

In [25]:
model.save("ppo2_parallel_test_5m")

In [28]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_5m", env=env, nminibatches=16)
evaluate_2(model, 10)[0]

Mean reward: 196.2 Num episodes: 10


196.2

In [32]:
rew_avg, rew_list = evaluate_2(model, 100)
print(rew_avg)

Mean reward: 190.4 Num episodes: 100
190.4


In [33]:
min(rew_list), max(rew_list)

(array([-119.22494], dtype=float32), array([277.06787], dtype=float32))

## mp train for 10M ts

In [34]:
env_id = "BipedalWalker-v2"
num_cpu = 20  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [35]:
model = PPO2.load("ppo2_parallel_test_5m", env=env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)

In [36]:
%%time
model.learn(total_timesteps=int(5e6), log_interval=10)

-------------------------------------
| approxkl           | 0.005068543  |
| clipfrac           | 0.058999985  |
| explained_variance | 0.962        |
| fps                | 3198         |
| n_updates          | 1            |
| policy_entropy     | 13.229212    |
| policy_loss        | -0.006625402 |
| serial_timesteps   | 100          |
| time_elapsed       | 3.34e-06     |
| total_timesteps    | 2000         |
| value_loss         | 0.67912114   |
-------------------------------------
--------------------------------------
| approxkl           | 0.005937907   |
| clipfrac           | 0.07212498    |
| explained_variance | 0.349         |
| fps                | 4652          |
| n_updates          | 10            |
| policy_entropy     | 13.235659     |
| policy_loss        | -0.0071927356 |
| serial_timesteps   | 1000          |
| time_elapsed       | 4.27          |
| total_timesteps    | 20000         |
| value_loss         | 4.030078      |
--------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb4f82fa6d0>

In [37]:
model.save("ppo2_parallel_test_10m")

In [38]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_10m", env=env, nminibatches=16)
evaluate_2(model, 10)[0]

Mean reward: 272.5 Num episodes: 10


272.5

In [39]:
rew_avg, rew_list = evaluate_2(model, 100)
print(rew_avg)

Mean reward: 235.3 Num episodes: 100
235.3


In [40]:
min(rew_list), max(rew_list)

(array([-120.74525], dtype=float32), array([278.25974], dtype=float32))

## mp train for 15M ts

In [43]:
env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [44]:
model = PPO2.load("ppo2_parallel_test_10m", env=env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)

In [45]:
%%time
model.learn(total_timesteps=int(5e6), log_interval=10)

-------------------------------------
| approxkl           | 0.005116429  |
| clipfrac           | 0.056999993  |
| explained_variance | 0.898        |
| fps                | 1645         |
| n_updates          | 1            |
| policy_entropy     | 18.211708    |
| policy_loss        | -0.008977929 |
| serial_timesteps   | 200          |
| time_elapsed       | 4.29e-06     |
| total_timesteps    | 2000         |
| value_loss         | 16.044277    |
-------------------------------------
--------------------------------------
| approxkl           | 0.004840561   |
| clipfrac           | 0.06024999    |
| explained_variance | 0.406         |
| fps                | 2685          |
| n_updates          | 10            |
| policy_entropy     | 18.242268     |
| policy_loss        | -0.0053683524 |
| serial_timesteps   | 2000          |
| time_elapsed       | 8.75          |
| total_timesteps    | 20000         |
| value_loss         | 5.288269      |
--------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb4d86cc750>

In [46]:
model.save("ppo2_parallel_test_15m")

In [47]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_15m", env=env, nminibatches=16)
evaluate_2(model, 100)[0]

Mean reward: 244.7 Num episodes: 100


244.7

## mp train for 16M ts

In [69]:
env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [70]:
model = PPO2.load("ppo2_parallel_test_15m", env=env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)

In [71]:
%%time
model.learn(total_timesteps=int(1e6), log_interval=10)

--------------------------------------
| approxkl           | 0.0035000294  |
| clipfrac           | 0.038499977   |
| explained_variance | 0.969         |
| fps                | 2775          |
| n_updates          | 1             |
| policy_entropy     | 21.102102     |
| policy_loss        | -0.0050650802 |
| serial_timesteps   | 200           |
| time_elapsed       | 2.86e-06      |
| total_timesteps    | 2000          |
| value_loss         | 0.43075734    |
--------------------------------------
--------------------------------------
| approxkl           | 0.0029164734  |
| clipfrac           | 0.036999986   |
| explained_variance | 0.873         |
| fps                | 3581          |
| n_updates          | 10            |
| policy_entropy     | 21.044209     |
| policy_loss        | -0.0014467472 |
| serial_timesteps   | 2000          |
| time_elapsed       | 5.18          |
| total_timesteps    | 20000         |
| value_loss         | 11.351488     |
-------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb3c3d137d0>

In [72]:
model.save("ppo2_parallel_test_16m")

In [73]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_16m", env=env, nminibatches=16)
evaluate_2(model, 100)[0]

Mean reward: 172.9 Num episodes: 100


172.9

## mp train for 16M ts

In [59]:
env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [60]:
model = PPO2.load("ppo2_parallel_test_15m", env=env, n_steps=int(2000/num_cpu), verbose=1, 
                  nminibatches=20, learning_rate=0.00005)

In [61]:
%%time
model.learn(total_timesteps=int(1e6), log_interval=10)

-------------------------------------
| approxkl           | 0.0018725982 |
| clipfrac           | 0.016874995  |
| explained_variance | 0.969        |
| fps                | 2777         |
| n_updates          | 1            |
| policy_entropy     | 21.098171    |
| policy_loss        | -0.002583321 |
| serial_timesteps   | 200          |
| time_elapsed       | 3.34e-06     |
| total_timesteps    | 2000         |
| value_loss         | 0.41565996   |
-------------------------------------
---------------------------------------
| approxkl           | 0.00014600657  |
| clipfrac           | 0.00024999998  |
| explained_variance | 0.893          |
| fps                | 3543           |
| n_updates          | 10             |
| policy_entropy     | 21.096237      |
| policy_loss        | -0.00078123814 |
| serial_timesteps   | 2000           |
| time_elapsed       | 5.2            |
| total_timesteps    | 20000          |
| value_loss         | 17.727943      |
--------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb3d2630c10>

In [62]:
model.save("ppo2_parallel_test_16m_1md")

In [63]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_16m_1md", env=env, nminibatches=16)
evaluate_2(model, 100)[0]

Mean reward: 218.3 Num episodes: 100


218.3

## mp train for 17M ts

In [64]:
env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [65]:
model = PPO2.load("ppo2_parallel_test_16m_1md", env=env, n_steps=int(2000/num_cpu), verbose=1, 
                  nminibatches=20, learning_rate=0.00005)

In [66]:
%%time
model.learn(total_timesteps=int(1e6), log_interval=10)

--------------------------------------
| approxkl           | 0.00049299083 |
| clipfrac           | 0.002125      |
| explained_variance | 0.898         |
| fps                | 2637          |
| n_updates          | 1             |
| policy_entropy     | 21.215794     |
| policy_loss        | -0.001052466  |
| serial_timesteps   | 200           |
| time_elapsed       | 3.58e-06      |
| total_timesteps    | 2000          |
| value_loss         | 23.414997     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00040829228 |
| clipfrac           | 0.003375      |
| explained_variance | 0.556         |
| fps                | 3687          |
| n_updates          | 10            |
| policy_entropy     | 21.207872     |
| policy_loss        | -0.0005521225 |
| serial_timesteps   | 2000          |
| time_elapsed       | 5.38          |
| total_timesteps    | 20000         |
| value_loss         | 5.012393      |
-------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fb3c3d16d90>

In [67]:
model.save("ppo2_parallel_test_17m_1md")

In [68]:
env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: env])
model = PPO2.load("ppo2_parallel_test_17m_1md", env=env, nminibatches=16)
evaluate_2(model, 100)[0]

Mean reward: 234.5 Num episodes: 100


234.5

In [74]:
rew, stats = evaluate_2(model, 10)

Mean reward: 164.7 Num episodes: 10


In [75]:
stats

[array([71.68468], dtype=float32),
 array([97.526], dtype=float32),
 array([276.0085], dtype=float32),
 array([275.3617], dtype=float32),
 array([273.0869], dtype=float32),
 array([268.27756], dtype=float32),
 array([18.793076], dtype=float32),
 array([82.45851], dtype=float32),
 array([19.01503], dtype=float32),
 array([264.55746], dtype=float32)]