In [2]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1, 1))
display.start()

import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import gym
import numpy as np
import cv2

In [13]:
from stable_baselines.common.policies import MlpLstmPolicy, MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common import make_vec_env, set_global_seeds

In [14]:
def evaluate_2(model, num_episodes=100, eval_env=None):
    """
    Evaluate a RL agent by running full episodes and averaging their total rewards.

    :param model: (BaseRLModel object) the RL Agent
    :param num_episodes: (int) number of episodes to run (must be >= 1)
    :param eval_env: (vectorized env or None) environment to evaluate on; only the
        first sub-environment's reward/done values are used. When None, the
        notebook-level ``env`` variable is used (the original behaviour).
    :return: (float) Mean episode reward over ``num_episodes``, rounded to 1 decimal
    :raises ValueError: if ``num_episodes`` is smaller than 1
    """
    if num_episodes < 1:
        # Averaging zero episodes would only yield NaN plus a numpy warning.
        raise ValueError("num_episodes must be at least 1, got {}".format(num_episodes))

    # Prefer the explicitly supplied environment; fall back to the global one so
    # existing calls such as evaluate_2(model, 100) keep working.
    vec_env = env if eval_env is None else eval_env

    episode_rewards = []
    obs = vec_env.reset()
    for _ in range(num_episodes):
        episode_reward = 0.0
        done = False
        while not done:
            # _states are only useful when using LSTM policies
            action, _states = model.predict(obs)
            obs, reward, done, _info = vec_env.step(action)
            # The env is vectorized: unpack the values of its first sub-environment.
            reward, done = reward[0], done[0]
            episode_reward += reward
        episode_rewards.append(episode_reward)
        # Start the next episode from a fresh reset.
        obs = vec_env.reset()

    # Mean over all evaluated episodes
    mean_reward = round(np.mean(episode_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", len(episode_rewards))
    return mean_reward

In [15]:
def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument environment factory for a multiprocessed env.

    :param env_id: (str) the environment ID
    :param rank: (int) index of the subprocess; added to ``seed`` when seeding the env
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that creates the env and seeds it with ``seed + rank``
    """
    # Global seeding happens once, when the factory is built (not when it is called).
    set_global_seeds(seed)

    def _init():
        created_env = gym.make(env_id)
        created_env.seed(seed + rank)
        return created_env

    return _init


# Stable Baselines provides the make_vec_env() helper,
# which performs the steps above for you:
# env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

In [21]:
env_id = "BipedalWalker-v2"
num_cpu = 8  # Number of processes to use
# Create the gym env inside the factory instead of writing `lambda: env`: that lambda
# closed over the very name being rebound to the DummyVecEnv on the same line, and
# made `env` hold two different kinds of object in turn.
env = DummyVecEnv([lambda: gym.make(env_id)])

In [22]:
%%time
model = TRPO(MlpPolicy, env, verbose=1, n_cpu_tf_sess=8)
model.learn(total_timesteps=500000, log_interval=10)





Instructions for updating:
Use keras.layers.flatten instead.


********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 1.126 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.144 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0      0.131          0
         1     0.0298     0.0477
         2    0.00884      0.153
         3    0.00415      0.292
         4    0.00368      0.367
         5    0.00424      0.532
         6    0.00388      0.551
         7     0.0033       0.73
         8     0.0024       1.04
         9   0.000802       1.51
        10    0.00701       1.56
[35mdone in 0.221 seconds[0m
Expected: 0.044 Actual: 0.042
Stepsize OK!
[35mvf[0m
[35mdone in 0.070 seconds[0m
-----------------------------------------
| EpThisIter              | 0           |
| EpisodesSoFar           | 0           |
| TimeElapsed             | 1.65        |
| TimestepsSoFar          | 1024        |
| entloss            

<stable_baselines.trpo_mpi.trpo_mpi.TRPO at 0x7f4bc4d20ed0>

In [23]:
# Save the model after the first 500k-timestep run; later cells reload this file.
model.save("trpo_parallel_test")

In [24]:
# Fresh single-env vectorized environment for evaluation. The gym env is created
# inside the factory so the lambda does not close over the name being rebound.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
# `nminibatches` is a PPO2 hyperparameter that TRPO does not have, so it is no
# longer passed to TRPO.load.
model = TRPO.load("trpo_parallel_test", env=env)

In [25]:
# Mean episode reward over 100 evaluation episodes for the reloaded 500k-timestep model.
evaluate_2(model, 100)

Mean reward: 96.9 Num episodes: 100


96.9

## mp train: continue to 1M total timesteps (500k + 500k)

In [26]:
# Fresh single-env vectorized environment for the next training run. The gym env is
# created inside the factory so the lambda does not close over the name being rebound.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])

In [27]:
# Reload the 500k-timestep checkpoint (verbose=1 for training logs) to continue training.
model = TRPO.load("trpo_parallel_test", env=env, verbose=1)

In [28]:
%%time
# Train the reloaded model for another 500k timesteps.
model.learn(total_timesteps=500000, log_interval=10)

********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 1.072 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.107 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0       1.27          0
         1      0.695     0.0048
         2        1.6    0.00883
         3      0.998     0.0168
         4      0.847     0.0287
         5      0.348     0.0382
         6      0.587     0.0527
         7      0.237     0.0974
         8      0.285      0.165
         9      0.292      0.216
        10      0.173      0.218
[35mdone in 0.222 seconds[0m
Expected: 0.041 Actual: 0.041
Stepsize OK!
[35mvf[0m
[35mdone in 0.079 seconds[0m
-----------------------------------------
| EpThisIter              | 0           |
| EpisodesSoFar           | 0           |
| TimeElapsed             | 1.57        |
| TimestepsSoFar          | 1024        |
| entloss                 | 0.0         |
| entropy                 | 1.6578784   |
| ex

<stable_baselines.trpo_mpi.trpo_mpi.TRPO at 0x7f4bb43af2d0>

In [29]:
# Save the model after the second 500k-timestep run; later cells reload this file.
model.save("trpo_parallel_1M")

In [30]:
# Fresh single-env vectorized environment for evaluation. The gym env is created
# inside the factory so the lambda does not close over the name being rebound.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
# `nminibatches` is a PPO2 hyperparameter that TRPO does not have, so it is no
# longer passed to TRPO.load.
model = TRPO.load("trpo_parallel_1M", env=env)

In [31]:
# Mean episode reward over 100 evaluation episodes for the reloaded "trpo_parallel_1M" model.
evaluate_2(model, 100)

Mean reward: 133.9 Num episodes: 100


133.9

## mp train: continue to 2M total timesteps (1M + 1M)

In [32]:
# Fresh single-env vectorized environment for the next training run. The gym env is
# created inside the factory so the lambda does not close over the name being rebound.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])

In [33]:
# Reload the "trpo_parallel_1M" checkpoint (verbose=1 for training logs) to continue training.
model = TRPO.load("trpo_parallel_1M", env=env, verbose=1)

In [None]:
%%time
# Train the reloaded model for a further 1e6 timesteps.
model.learn(total_timesteps=int(1e6), log_interval=10)

********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 0.996 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.102 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0       4.03          0
         1       1.97    0.00209
         2        2.9     0.0055
         3       3.29     0.0204
         4      0.829     0.0302
         5       2.37     0.0534
         6       2.05     0.0789
         7       1.11     0.0947
         8      0.783      0.118
         9       11.6      0.141
        10      0.591      0.175
[35mdone in 0.205 seconds[0m
Expected: 0.048 Actual: 0.047
Stepsize OK!
[35mvf[0m
[35mdone in 0.068 seconds[0m
-----------------------------------------
| EpLenMean               | 116         |
| EpRewMean               | -105        |
| EpThisIter              | 1           |
| EpisodesSoFar           | 1           |
| TimeElapsed             | 1.45        |
| TimestepsSoFar          | 1024        |
| en

In [39]:
# Save the model after the additional 1e6-timestep run; the next cell reloads this file.
model.save("trpo_parallel_2M")

In [40]:
# Fresh single-env vectorized environment for evaluation. The gym env is created
# inside the factory so the lambda does not close over the name being rebound.
env = DummyVecEnv([lambda: gym.make('BipedalWalker-v2')])
# `nminibatches` is a PPO2 hyperparameter that TRPO does not have, so it is no
# longer passed to TRPO.load.
model = TRPO.load("trpo_parallel_2M", env=env)

In [41]:
# Mean episode reward over 100 evaluation episodes for the reloaded "trpo_parallel_2M" model.
evaluate_2(model, 100)

Mean reward: 188.3 Num episodes: 100


188.3