In [1]:
from pyvirtualdisplay import Display
# Start an invisible (visible=0) virtual display.
# Kept under `virtual_display` rather than `display`: the later
# `from IPython import display` rebinds `display`, which would otherwise drop
# the only reference to this object and make it impossible to stop it.
virtual_display = Display(visible=0, size=(1, 1))
virtual_display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [2]:
from stable_baselines.common.policies import MlpLstmPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import ACKTR
from stable_baselines.common import make_vec_env, set_global_seeds

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def evaluate_2(model, num_episodes=100, eval_env=None):
    """
    Evaluate a RL agent by running full episodes and averaging their returns.

    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to run
    :param eval_env: (VecEnv or None) vectorized environment to evaluate on.
        Only index 0 of the reward/done arrays returned by ``step`` is used.
        When None, the notebook-level ``env`` is used, so existing
        two-argument calls behave as before.
    :return: (float) Mean episode reward over ``num_episodes`` episodes,
        rounded to one decimal place
    """
    # Passing the environment explicitly avoids depending on whichever object
    # the global `env` happens to be bound to when this function is called.
    vec_env = env if eval_env is None else eval_env

    episode_rewards = []
    obs = vec_env.reset()
    for _ in range(num_episodes):
        episode_return = 0.0
        done = False
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            obs, reward, done, _info = vec_env.step(action)
            # step() returns one entry per sub-environment; keep the first.
            reward, done = reward[0], done[0]
            episode_return += reward
        episode_rewards.append(episode_return)
        obs = vec_env.reset()

    # Mean over all evaluated episodes (not just the last 100).
    mean_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward

In [4]:
def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument factory that creates one seeded gym environment.

    The global seed is set once, when this function is called; the
    environment itself is only created when the returned factory is invoked.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess; added to ``seed`` so each
        environment gets a different seed
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates and seeds the environment
    """
    set_global_seeds(seed)

    def _init():
        worker_env = gym.make(env_id)
        worker_env.seed(seed + rank)
        return worker_env

    return _init


# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [5]:
from stable_baselines.common.policies import FeedForwardPolicy, register_policy

# Custom MLP policy: policy head with layers [64, 64], value head with layers [64, 64, 32]
class CustomPolicy(FeedForwardPolicy):
    """
    FeedForwardPolicy with a fixed MLP architecture.

    ``net_arch`` is set to ``pi=[64, 64]`` and ``vf=[64, 64, 32]`` and
    ``feature_extraction`` to ``"mlp"``; all other arguments are forwarded
    unchanged to ``FeedForwardPolicy``.
    """

    def __init__(self, *args, **kwargs):
        layer_sizes = [dict(pi=[64, 64], vf=[64, 64, 32])]
        super().__init__(*args,
                         net_arch=layer_sizes,
                         feature_extraction="mlp",
                         **kwargs)

# Register the policy under the string name 'CustomPolicy'; it will check that
# the name is not already taken.
# NOTE(review): because of that check, re-running this cell in a live kernel
# presumably fails on the second registration — confirm, and restart the
# kernel before re-running if so.
register_policy('CustomPolicy', CustomPolicy)

# Because the policy is now registered, you can pass
# a string to the agent constructor instead of passing a class
# model = A2C(policy='CustomPolicy', env='LunarLander-v2', verbose=1).learn(total_timesteps=100000)


In [7]:
# Training environment: `num_cpu` copies of `env_id`, one per worker process,
# each seeded through make_env with its own rank.
env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv(
    [make_env(env_id, rank=worker_idx) for worker_idx in range(num_cpu)]
)

In [8]:
%%time
model = ACKTR("CustomPolicy", env, verbose=1)
model.learn(total_timesteps=500000, log_interval=10)





Instructions for updating:
Use keras.layers.flatten instead.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


---------------------------------
| explained_variance | 0.000485 |
| fps                | 217      |
| nupdates           | 1        |
| policy_entropy     | 5.68     |
| policy_loss        | -0.12    |
| total_timesteps    | 0        |
| value_loss         | 2.42     |
---------------------------------
---------------------------------
| explained_variance | 0.00111  |
| fps                | 1387     |
| nupdates           | 10       |
| policy_entropy     | 5.64     |
| policy_loss        | -0.2     |
| total_timesteps    | 1809     |
| value_loss         | 365      |
---------------------------------
---------------------------------
| explained_variance | -0.00215 |
| fps                | 1918     |
| nupdates           | 20       |
| policy_entropy     | 5.64     |
| policy_loss        | -0.116   |
| total_timesteps    |

<stable_baselines.acktr.acktr.ACKTR at 0x7fde8d2b4110>

In [9]:
# Checkpoint the model after the initial 500k-timestep training run.
model.save("acktr_custom_parallel")

In [10]:
# Shut down the training worker processes before `env` is rebound; otherwise
# they stay alive for the rest of the kernel session.
env.close()

# Single-environment wrapper for evaluation. The factory creates the
# environment itself instead of closing over the name `env`, which is rebound
# on this very line.
env = DummyVecEnv([lambda: gym.make(env_id)])
# `nminibatches` is not an ACKTR hyperparameter, so it is no longer passed.
model = ACKTR.load("acktr_custom_parallel", env=env)

In [11]:
# Mean reward over 100 evaluation episodes after 500k training timesteps.
evaluate_2(model, num_episodes=100)

Mean reward: 60.8 Num episodes: 100


60.8

## Multiprocess training: continue to 1M total timesteps

In [12]:
# Release the previous (evaluation) environment before `env` is rebound to a
# new set of worker processes.
env.close()

env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [13]:
# Resume from the 500k-timestep checkpoint, attached to the multiprocess env.
model = ACKTR.load("acktr_custom_parallel", env=env, verbose=1)

In [14]:
%%time
model.learn(total_timesteps=500000, log_interval=10)

---------------------------------
| explained_variance | 0.563    |
| fps                | 263      |
| nupdates           | 1        |
| policy_entropy     | 5.44     |
| policy_loss        | -0.205   |
| total_timesteps    | 0        |
| value_loss         | 158      |
---------------------------------
---------------------------------
| explained_variance | 0.462    |
| fps                | 1629     |
| nupdates           | 10       |
| policy_entropy     | 5.43     |
| policy_loss        | 0.0631   |
| total_timesteps    | 1809     |
| value_loss         | 216      |
---------------------------------
---------------------------------
| explained_variance | 0.866    |
| fps                | 2246     |
| nupdates           | 20       |
| policy_entropy     | 5.43     |
| policy_loss        | -0.124   |
| total_timesteps    | 3819     |
| value_loss         | 32.5     |
---------------------------------
---------------------------------
| explained_variance | 0.644    |
| fps         

<stable_baselines.acktr.acktr.ACKTR at 0x7fde0c366690>

In [15]:
# Checkpoint the model after 1M total training timesteps.
model.save("acktr_custom_parallel_1M")

In [16]:
# Shut down the training worker processes before `env` is rebound; otherwise
# they stay alive for the rest of the kernel session.
env.close()

# Single-environment wrapper for evaluation. The factory creates the
# environment itself instead of closing over the name `env`, which is rebound
# on this very line.
env = DummyVecEnv([lambda: gym.make(env_id)])
# `nminibatches` is not an ACKTR hyperparameter, so it is no longer passed.
model = ACKTR.load("acktr_custom_parallel_1M", env=env)

In [17]:
# Mean reward over 100 evaluation episodes after 1M training timesteps.
evaluate_2(model, num_episodes=100)

Mean reward: 123.8 Num episodes: 100


123.8

## Multiprocess training: continue to 2M total timesteps

In [18]:
# Release the previous (evaluation) environment before `env` is rebound to a
# new set of worker processes.
env.close()

env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [19]:
# Resume from the 1M-timestep checkpoint, attached to the multiprocess env.
model = ACKTR.load("acktr_custom_parallel_1M", env=env, verbose=1)

In [20]:
%%time
model.learn(total_timesteps=1000000, log_interval=10)

---------------------------------
| explained_variance | 0.625    |
| fps                | 233      |
| nupdates           | 1        |
| policy_entropy     | 5.38     |
| policy_loss        | -0.241   |
| total_timesteps    | 0        |
| value_loss         | 62.8     |
---------------------------------
---------------------------------
| explained_variance | 0.517    |
| fps                | 1290     |
| nupdates           | 10       |
| policy_entropy     | 5.4      |
| policy_loss        | -0.029   |
| total_timesteps    | 1809     |
| value_loss         | 4.2      |
---------------------------------
---------------------------------
| explained_variance | 0.281    |
| fps                | 1582     |
| nupdates           | 20       |
| policy_entropy     | 5.41     |
| policy_loss        | -0.193   |
| total_timesteps    | 3819     |
| value_loss         | 175      |
---------------------------------
---------------------------------
| explained_variance | 0.305    |
| fps         

<stable_baselines.acktr.acktr.ACKTR at 0x7fddb7d8e150>

In [21]:
# Checkpoint the model after 2M total training timesteps.
model.save("acktr_custom_parallel_2M")

In [22]:
# Shut down the training worker processes before `env` is rebound; otherwise
# they stay alive for the rest of the kernel session.
env.close()

# Single-environment wrapper for evaluation. The factory creates the
# environment itself instead of closing over the name `env`, which is rebound
# on this very line.
env = DummyVecEnv([lambda: gym.make(env_id)])
# `nminibatches` is not an ACKTR hyperparameter, so it is no longer passed.
model = ACKTR.load("acktr_custom_parallel_2M", env=env)

In [23]:
# Mean reward over 100 evaluation episodes after 2M training timesteps.
evaluate_2(model, num_episodes=100)

Mean reward: 216.2 Num episodes: 100


216.2

## Multiprocess training: continue to 5M total timesteps

In [24]:
# Release the previous (evaluation) environment before `env` is rebound to a
# new set of worker processes.
env.close()

env_id = "BipedalWalker-v2"
num_cpu = 10  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [25]:
# Resume from the 2M-timestep checkpoint, attached to the multiprocess env.
model = ACKTR.load("acktr_custom_parallel_2M", env=env, verbose=1)

In [26]:
%%time
# Another 3M timesteps on top of the loaded 2M checkpoint (5M in total).
model.learn(total_timesteps=int(3e6), log_interval=10)

---------------------------------
| explained_variance | 0.729    |
| fps                | 158      |
| nupdates           | 1        |
| policy_entropy     | 5.12     |
| policy_loss        | -0.0971  |
| total_timesteps    | 0        |
| value_loss         | 3.33     |
---------------------------------
---------------------------------
| explained_variance | 0.885    |
| fps                | 1031     |
| nupdates           | 10       |
| policy_entropy     | 5.14     |
| policy_loss        | -0.0987  |
| total_timesteps    | 1809     |
| value_loss         | 46       |
---------------------------------
---------------------------------
| explained_variance | -0.329   |
| fps                | 1458     |
| nupdates           | 20       |
| policy_entropy     | 5.15     |
| policy_loss        | 0.0503   |
| total_timesteps    | 3819     |
| value_loss         | 2.62     |
---------------------------------
---------------------------------
| explained_variance | -0.15    |
| fps         

<stable_baselines.acktr.acktr.ACKTR at 0x7fddb4e61e50>

In [30]:
# Checkpoint the model after 5M total training timesteps.
model.save("acktr_custom_parallel_5M")

In [31]:
# Shut down the training worker processes before `env` is rebound; otherwise
# they stay alive for the rest of the kernel session.
env.close()

# Single-environment wrapper for evaluation. The factory creates the
# environment itself instead of closing over the name `env`, which is rebound
# on this very line.
env = DummyVecEnv([lambda: gym.make(env_id)])
# `nminibatches` is not an ACKTR hyperparameter, so it is no longer passed.
model = ACKTR.load("acktr_custom_parallel_5M", env=env)

In [33]:
# Mean reward over 100 evaluation episodes after 5M training timesteps.
evaluate_2(model, num_episodes=100)

Mean reward: 132.0 Num episodes: 100


132.0