In [1]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1, 1))
display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [9]:
from stable_baselines.common.policies import MlpLstmPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines import ACKTR
from stable_baselines.common import make_vec_env, set_global_seeds

In [3]:
def evaluate_2(model, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes
    :return: (float) Mean reward for the given number of episodes
    """
    episode_rewards = []
    obs = env.reset()
    for i in range(num_episodes):
        episode_rewards.append(0.0)
        done = False
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            reward, done, info = reward[0], done[0], info[0]
            # Stats
            episode_rewards[-1] += reward
            if done:
                obs = env.reset()
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward

In [4]:
def make_env(env_id, rank, seed=0):
    """
    Utility function for multiprocessed env.

    :param env_id: (str) the environment ID
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the inital seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init


# Stable Baselines provides you with make_vec_env() helper
# which does exactly the previous steps for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [16]:
env_id = "BipedalWalker-v2"
num_cpu = 8  # Number of processes to use
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

In [17]:
%%time
model = ACKTR(MlpLstmPolicy, env, verbose=1)
model.learn(total_timesteps=500000, log_interval=10)

---------------------------------
| explained_variance | 0.00325  |
| fps                | 113      |
| nupdates           | 1        |
| policy_entropy     | 5.68     |
| policy_loss        | -0.115   |
| total_timesteps    | 0        |
| value_loss         | 3.3      |
---------------------------------
---------------------------------
| explained_variance | -0.00784 |
| fps                | 389      |
| nupdates           | 10       |
| policy_entropy     | 5.87     |
| policy_loss        | -0.115   |
| total_timesteps    | 1449     |
| value_loss         | 2.28     |
---------------------------------
---------------------------------
| explained_variance | 0.0663   |
| fps                | 165      |
| nupdates           | 20       |
| policy_entropy     | 5.91     |
| policy_loss        | -0.0227  |
| total_timesteps    | 3059     |
| value_loss         | 0.765    |
---------------------------------
---------------------------------
| explained_variance | 0.202    |
| fps         

<stable_baselines.acktr.acktr.ACKTR at 0x7fabcff43c50>

In [20]:
model.save("acktr_lstm_parallel_500k")

In [24]:
env = gym.make('BipedalWalker-v2')
env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
model = ACKTR.load("acktr_lstm_parallel_500k", env=env, nminibatches=16)

In [26]:
num_episodes = 100

In [27]:
episode_rewards = []
obs = env.reset()
for i in range(num_episodes):
    episode_rewards.append(0.0)
    done = False
    while not done:
        # _states are only useful when using LSTM policies
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        reward, done, info = reward[0], done[0], info[0]
        # Stats
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
    # Compute mean reward for the last 100 epis

In [31]:
np.mean(episode_rewards)

24.363377715390143

In [None]:
evaluate_2(model, 10)[0]

In [None]:
def evaluate_2(model, num_episodes=100):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes
    :return: (float) Mean reward for the given number of episodes
    """
    episode_rewards = []
    obs = env.reset()
    for i in range(num_episodes):
        episode_rewards.append(0.0)
        done = False
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            reward, done, info = reward[0], done[0], info[0]
            # Stats
            episode_rewards[-1] += reward
            if done:
                obs = env.reset()
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward