In [1]:
from pyvirtualdisplay import Display
# Start a headless virtual display before anything that may need one is imported.
# It is kept under its own name: `from IPython import display` below rebinds
# `display`, which would otherwise discard the only handle to this object
# (needed later if the display is to be stopped).
virtual_display = Display(visible=0, size=(1, 1))
virtual_display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [2]:
def evaluate_2(model, num_episodes=100, eval_env=None):
    """
    Evaluate a RL agent by running it for a fixed number of episodes.

    :param model: (BaseRLModel object) the RL Agent; only ``model.predict(obs)`` is called
    :param num_episodes: (int) number of episodes to run
    :param eval_env: (optional) environment to evaluate on. When omitted, the
        notebook-level ``env`` variable is used, which was the only behaviour
        before this parameter existed.
    :return: (tuple) ``(mean_reward, episode_rewards)`` -- the mean of the
        per-episode reward totals rounded to one decimal, and the list of
        those per-episode totals
    """
    # Fall back to the notebook-global `env` so existing calls keep working.
    if eval_env is None:
        eval_env = env

    episode_rewards = []
    obs = eval_env.reset()
    for _ in range(num_episodes):
        episode_rewards.append(0.0)
        done = False
        while not done:
            # The second value returned by predict (states) is only useful
            # when using LSTM policies.
            action, _ = model.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            # Accumulate the reward of the episode currently being played.
            episode_rewards[-1] += reward
            if done:
                # NOTE(review): vectorized envs may already reset themselves
                # when an episode ends -- confirm whether this reset is needed.
                obs = eval_env.reset()

    # Mean over all episodes that were run (not a fixed window of 100).
    mean_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward, episode_rewards

In [3]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import PPO2
from stable_baselines.common import make_vec_env, set_global_seeds

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [15]:
# Single-process baseline: one BipedalWalker-v2 environment wrapped in a
# DummyVecEnv. The environment is created inside the factory itself, so the
# lambda does not close over a name that is rebound on the same line.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])

In [16]:
%%time
model = PPO2(MlpPolicy, env, n_steps=2000, verbose=1, nminibatches=20)
model.learn(total_timesteps=50000, log_interval=10)

--------------------------------------
| approxkl           | 0.0029015052  |
| clipfrac           | 0.020999998   |
| explained_variance | -0.00946      |
| fps                | 872           |
| n_updates          | 1             |
| policy_entropy     | 5.68657       |
| policy_loss        | -0.0063695135 |
| serial_timesteps   | 2000          |
| time_elapsed       | 1.91e-06      |
| total_timesteps    | 2000          |
| value_loss         | 92.702614     |
--------------------------------------
-------------------------------------
| approxkl           | 0.0027484435 |
| clipfrac           | 0.019374998  |
| explained_variance | 0.268        |
| fps                | 981          |
| n_updates          | 10           |
| policy_entropy     | 5.7017155    |
| policy_loss        | -0.002791639 |
| serial_timesteps   | 20000        |
| time_elapsed       | 18.6         |
| total_timesteps    | 20000        |
| value_loss         | 0.15041742   |
-------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fc8206bb990>

In [40]:
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines import PPO2

def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument factory that creates one seeded environment.

    :param env_id: (str) the environment ID passed to ``gym.make``
    :param rank: (int) index of the subprocess; it is added to ``seed`` so
        each environment is seeded with a different value
    :param seed: (int) the base seed for RNG
    :return: (callable) function that creates and seeds the environment when called
    """
    # Global seeding happens once, when the factory is built -- not each time
    # the returned function is called.
    set_global_seeds(seed)

    def _init():
        created = gym.make(env_id)
        created.seed(seed + rank)
        return created

    return _init


# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [48]:
# Parallel setup: one subprocess per environment, each seeded by its rank.
num_cpu = 20  # Number of processes to use
env_id = "BipedalWalker-v2"
env = SubprocVecEnv(
    [make_env(env_id, rank=worker) for worker in range(num_cpu)]
)

In [49]:
%%time
model = PPO2(MlpPolicy, env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)
model.learn(total_timesteps=50000, log_interval=10)

--------------------------------------
| approxkl           | 0.0032459039  |
| clipfrac           | 0.029124996   |
| explained_variance | 0.00101       |
| fps                | 2580          |
| n_updates          | 1             |
| policy_entropy     | 5.6822343     |
| policy_loss        | -0.0046145217 |
| serial_timesteps   | 100           |
| time_elapsed       | 3.1e-06       |
| total_timesteps    | 2000          |
| value_loss         | 338.62793     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0036413774  |
| clipfrac           | 0.03612499    |
| explained_variance | 0.257         |
| fps                | 4691          |
| n_updates          | 10            |
| policy_entropy     | 5.7550087     |
| policy_loss        | -0.0044008223 |
| serial_timesteps   | 1000          |
| time_elapsed       | 4.52          |
| total_timesteps    | 20000         |
| value_loss         | 0.13658616    |
-------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fc718af6390>

## Multiprocess training for 500k timesteps

In [None]:
# Fresh set of worker environments for the longer multiprocess run.
num_cpu = 20  # Number of processes to use
env_id = "BipedalWalker-v2"
env = SubprocVecEnv(
    [make_env(env_id, rank=worker) for worker in range(num_cpu)]
)

In [41]:
%%time
model = PPO2(MlpPolicy, env, n_steps=int(2000/num_cpu), verbose=1, nminibatches=20)
model.learn(total_timesteps=500000, log_interval=10)

--------------------------------------
| approxkl           | 0.0023981573  |
| clipfrac           | 0.016499996   |
| explained_variance | -0.000227     |
| fps                | 2894          |
| n_updates          | 1             |
| policy_entropy     | 5.671473      |
| policy_loss        | -0.0050848573 |
| serial_timesteps   | 100           |
| time_elapsed       | 3.58e-06      |
| total_timesteps    | 2000          |
| value_loss         | 249.08528     |
--------------------------------------
-------------------------------------
| approxkl           | 0.00409021   |
| clipfrac           | 0.04724998   |
| explained_variance | 0.146        |
| fps                | 4502         |
| n_updates          | 10           |
| policy_entropy     | 5.7100744    |
| policy_loss        | -0.004566309 |
| serial_timesteps   | 1000         |
| time_elapsed       | 4.69         |
| total_timesteps    | 20000        |
| value_loss         | 0.17812183   |
-------------------------------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x7fc72c09c8d0>

In [42]:
# Save the trained model under the name "ppo2_parallel_test" so it can be
# reloaded later with PPO2.load.
model.save("ppo2_parallel_test")

In [43]:
# Reload the saved model onto a single BipedalWalker-v2 environment wrapped
# in a DummyVecEnv, ready for evaluation.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
model = PPO2.load(
    "ppo2_parallel_test",
    env=env,
    nminibatches=16,
)

In [44]:
# Run 10 evaluation episodes; evaluate_2 returns (mean, per-episode list),
# so [0] displays only the rounded mean reward.
evaluate_2(model, 10)[0]

Mean reward: -36.5 Num episodes: 10


-36.5