In [1]:
from pyvirtualdisplay import Display

# Start a virtual display (visible=0, 1x1 pixels) before the other imports.
# The handle is bound to `virtual_display` rather than `display`, because the
# `from IPython import display` import further down this cell rebinds the
# name `display` and would otherwise make this object unreachable (so it could
# never be stopped).
virtual_display = Display(visible=0, size=(1, 1))
virtual_display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [2]:
from stable_baselines.common.policies import MlpLstmPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import ACKTR
from stable_baselines.common import make_vec_env, set_global_seeds

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def evaluate_2(model, num_episodes=100, env=None):
    """
    Evaluate a RL agent over a fixed number of episodes.

    The environment's ``step()`` must return indexable per-environment
    rewards and dones; only index 0 is tracked. The mean episode reward is
    printed and returned, rounded to one decimal place.

    :param model: (BaseRLModel object) the RL Agent; only ``model.predict(obs)`` is called
    :param num_episodes: (int) number of episodes
    :param env: (object or None) environment to evaluate on. When omitted, the
        notebook-level global ``env`` is used, which is what this function
        always did before the parameter existed.
    :return: (float) Mean reward for the given number of episodes
    :raises NameError: if ``env`` is omitted and no global ``env`` is defined
    """
    if env is None:
        # Backward-compatible fallback to the hidden notebook global.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError(
                "evaluate_2: no `env` argument given and no global `env` is defined"
            ) from None

    episode_rewards = []
    obs = env.reset()
    for _ in range(num_episodes):
        episode_rewards.append(0.0)
        done = False
        while not done:
            # The second return value (states) is only useful for LSTM policies
            action, _states = model.predict(obs)
            obs, rewards, dones, _infos = env.step(action)
            # Track only the first environment of the batch
            reward, done = rewards[0], dones[0]
            episode_rewards[-1] += reward
            if done:
                obs = env.reset()
    # Mean over every episode that was run (not just the last 100)
    mean_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward

In [4]:
def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument factory for one worker of a multiprocessed env.

    The global seed is set as soon as ``make_env`` is called. The gym
    environment itself is only created, and seeded with ``seed + rank``,
    when the returned callable is invoked.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates and seeds the environment
    """
    set_global_seeds(seed)

    def _build_env():
        worker_env = gym.make(env_id)
        # Offset the seed by the rank so each worker gets a distinct seed
        worker_env.seed(seed + rank)
        return worker_env

    return _build_env


# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [5]:
# Environment to train on and the number of worker processes to spread it over.
env_id = "BipedalWalker-v2"
num_cpu = 8
# One make_env factory per worker; each worker's index is passed as its rank.
env = SubprocVecEnv(
    [make_env(env_id, rank) for rank in range(num_cpu)]
)




In [6]:
%%time
model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=500000, log_interval=10)





Instructions for updating:
Use keras.layers.flatten instead.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


---------------------------------
| explained_variance | -0.0942  |
| fps                | 237      |
| nupdates           | 1        |
| policy_entropy     | 5.68     |
| policy_loss        | -0.34    |
| total_timesteps    | 0        |
| value_loss         | 3.06     |
---------------------------------
---------------------------------
| explained_variance | 0.000996 |
| fps                | 1352     |
| nupdates           | 10       |
| policy_entropy     | 5.56     |
| policy_loss        | -0.0192  |
| total_timesteps    | 1449     |
| value_loss         | 546      |
---------------------------------
---------------------------------
| explained_variance | 0.101    |
| fps                | 1828     |
| nupdates           | 20       |
| policy_entropy     | 5.54     |
| policy_loss        | -0.142   |
| total_timesteps    |

<stable_baselines.acktr.acktr.ACKTR at 0x7fb81aab7410>

In [7]:
# Save the model after the first training run; the same name is passed to
# ACKTR.load in the cells below.
model.save("acktr_parallel_test")

In [9]:
# Single-process environment for evaluation. The raw gym env gets its own name
# so the lambda does not close over `env`, which is rebound to the DummyVecEnv
# wrapper by the very next statement.
eval_gym_env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: eval_gym_env])
# `nminibatches` is a PPO2 hyperparameter that ACKTR does not define, so it is
# not passed to ACKTR.load.
model = ACKTR.load("acktr_parallel_test", env=env)

In [11]:
# Run 100 evaluation episodes; evaluate_2 prints and returns the mean episode reward.
evaluate_2(model, 100)

Mean reward: 164.4 Num episodes: 100


164.4

## Multiprocess training: continue to 1M total timesteps

In [13]:
# Environment to train on and the number of worker processes to spread it over.
env_id = "BipedalWalker-v2"
num_cpu = 10
# One make_env factory per worker; each worker's index is passed as its rank.
env = SubprocVecEnv(
    [make_env(env_id, rank) for rank in range(num_cpu)]
)

In [14]:
# Reload the model saved as "acktr_parallel_test" and attach the current `env`
# (the SubprocVecEnv created in the previous cell) so training can continue.
model = ACKTR.load("acktr_parallel_test", env=env, verbose=1)

In [15]:
%%time
# Train the reloaded model for a further 500000 timesteps, logging every 10 updates.
model.learn(total_timesteps=500000, log_interval=10)

---------------------------------
| explained_variance | 0.44     |
| fps                | 284      |
| nupdates           | 1        |
| policy_entropy     | 5.25     |
| policy_loss        | -0.00462 |
| total_timesteps    | 0        |
| value_loss         | 3.75     |
---------------------------------
---------------------------------
| explained_variance | 0.087    |
| fps                | 1624     |
| nupdates           | 10       |
| policy_entropy     | 5.26     |
| policy_loss        | -0.0719  |
| total_timesteps    | 1809     |
| value_loss         | 1.6      |
---------------------------------
---------------------------------
| explained_variance | 0.107    |
| fps                | 2298     |
| nupdates           | 20       |
| policy_entropy     | 5.26     |
| policy_loss        | -0.0849  |
| total_timesteps    | 3819     |
| value_loss         | 4.56     |
---------------------------------
---------------------------------
| explained_variance | 0.715    |
| fps         

<stable_baselines.acktr.acktr.ACKTR at 0x7fb7f40cc190>

In [16]:
# Save the model after the second training run; the same name is passed to
# ACKTR.load in the cells below.
model.save("acktr_parallel_1M")

In [17]:
# Single-process environment for evaluation. The raw gym env gets its own name
# so the lambda does not close over `env`, which is rebound to the DummyVecEnv
# wrapper by the very next statement.
eval_gym_env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: eval_gym_env])
# `nminibatches` is a PPO2 hyperparameter that ACKTR does not define, so it is
# not passed to ACKTR.load.
model = ACKTR.load("acktr_parallel_1M", env=env)

In [18]:
# Run 100 evaluation episodes; evaluate_2 prints and returns the mean episode reward.
evaluate_2(model, 100)

Mean reward: 152.7 Num episodes: 100


152.7

## Multiprocess training: continue to 5M total timesteps

In [19]:
# Environment to train on and the number of worker processes to spread it over.
env_id = "BipedalWalker-v2"
num_cpu = 10
# One make_env factory per worker; each worker's index is passed as its rank.
env = SubprocVecEnv(
    [make_env(env_id, rank) for rank in range(num_cpu)]
)

In [21]:
# Reload the model saved as "acktr_parallel_1M" and attach the current `env`
# (the SubprocVecEnv created in the previous cell) so training can continue.
model = ACKTR.load("acktr_parallel_1M", env=env, verbose=1)

In [None]:
%%time
# Train the reloaded model for a further 4e6 timesteps, logging every 10 updates.
model.learn(total_timesteps=int(4e6), log_interval=10)

---------------------------------
| explained_variance | 0.591    |
| fps                | 241      |
| nupdates           | 1        |
| policy_entropy     | 5.26     |
| policy_loss        | -0.105   |
| total_timesteps    | 0        |
| value_loss         | 1.84     |
---------------------------------
---------------------------------
| explained_variance | 0.355    |
| fps                | 1451     |
| nupdates           | 10       |
| policy_entropy     | 5.27     |
| policy_loss        | 0.0196   |
| total_timesteps    | 1809     |
| value_loss         | 1.91     |
---------------------------------
---------------------------------
| explained_variance | 0.612    |
| fps                | 1903     |
| nupdates           | 20       |
| policy_entropy     | 5.28     |
| policy_loss        | -0.0324  |
| total_timesteps    | 3819     |
| value_loss         | 315      |
---------------------------------
---------------------------------
| explained_variance | 0.772    |
| fps         

In [None]:
# Save the model after the third training run; the same name is passed to
# ACKTR.load in the next cell.
model.save("acktr_parallel_5M")

In [None]:
# Single-process environment for evaluation. The raw gym env gets its own name
# so the lambda does not close over `env`, which is rebound to the DummyVecEnv
# wrapper by the very next statement.
eval_gym_env = gym.make('BipedalWalker-v2')
env = DummyVecEnv([lambda: eval_gym_env])
# `nminibatches` is a PPO2 hyperparameter that ACKTR does not define, so it is
# not passed to ACKTR.load.
model = ACKTR.load("acktr_parallel_5M", env=env)

In [None]:
# Run 100 evaluation episodes; evaluate_2 prints and returns the mean episode reward.
evaluate_2(model, 100)