In [1]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

In [2]:
# Gym id of the Atari game every experiment in this notebook trains on.
environment_name = 'Breakout-v0'

# Root directory for this notebook's artefacts; both output locations are
# derived from it so the base path is spelled out only once.
training_dir = '/home/kchn/rlp/atari/multi/Training'
log_path = os.path.join(training_dir, 'Logs')            # tensorboard logs
save_path = os.path.join(training_dir, 'Saved Models')   # model checkpoints

In [None]:
# Reward threshold handed to StopTrainingOnRewardThreshold; named here so the
# target is easy to find and tune.
reward_goal = 200
stop_callback = StopTrainingOnRewardThreshold(
    reward_threshold=reward_goal,
    verbose=1,
)

In [None]:
# Train and evaluate one A2C agent per parallel-environment count (1..10).
#
# Fixes relative to the previous version of this cell:
#   * the EvalCallback construction sits inside the loop body (it was
#     dedented to column 0, which made the cell an IndentationError);
#   * the callback is actually passed to `learn`, so periodic evaluation,
#     best-model saving and the reward-threshold stop take effect;
#   * evaluation uses its own single-env VecEnv instead of the training env,
#     so evaluation resets cannot disturb an in-progress training rollout;
#   * every env is closed even if training or evaluation raises.
for i in range(1, 11):
    print("Environment number", i)
    env = make_atari_env(environment_name, n_envs=i, seed=0)
    # NOTE(review): n_stack follows the loop index, so the observation depth
    # changes together with the number of envs -- confirm this is intended
    # rather than a fixed stack size.
    env = VecFrameStack(env, n_stack=i)

    eval_env = None
    try:
        # Separate evaluator with the same frame stacking as the training env
        # so its observation space matches what the policy was built for.
        eval_env = make_atari_env(environment_name, n_envs=1, seed=1)
        eval_env = VecFrameStack(eval_env, n_stack=i)

        eval_callback = EvalCallback(
            eval_env,
            callback_on_new_best=stop_callback,
            # eval_freq is counted in callback calls (one per vectorised
            # step), not timesteps; divide by n_envs to evaluate roughly
            # every 10000 timesteps regardless of i.
            eval_freq=max(10000 // i, 1),
            # One sub-directory per run so a later run's best model does not
            # overwrite an earlier run's.
            best_model_save_path=os.path.join(save_path, f'n_envs_{i}'),
            verbose=1,
        )

        mdl = A2C('CnnPolicy', env, verbose=1, tensorboard_log=log_path)
        mdl.learn(total_timesteps=100000, callback=eval_callback)
        print(evaluate_policy(mdl, env, n_eval_episodes=10, render=True))
    finally:
        # Always release the emulator instances before the next iteration.
        if eval_env is not None:
            eval_env.close()
        env.close()

Environment number 1


A.L.E: Arcade Learning Environment (version 0.7.4+069f8bd)
[Powered by Stella]


Using cuda device
Wrapping the env in a VecTransposeImage.


2022-10-21 21:41:45.295171: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Logging to /home/kchn/rlp/atari/multi/Training/Logs/A2C_44
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 378      |
|    ep_rew_mean        | 3        |
| time/                 |          |
|    fps                | 133      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.39    |
|    explained_variance | -6.66    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0226  |
|    value_loss         | 0.000314 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 317      |
|    ep_rew_mean        | 2.19     |
| time/                 |          |
|    fps                | 157      |
|    iterations         | 200      |
|    time_elapsed       | 6        |
|    total_times

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 278      |
|    ep_rew_mean        | 1.53     |
| time/                 |          |
|    fps                | 190      |
|    iterations         | 1400     |
|    time_elapsed       | 36       |
|    total_timesteps    | 7000     |
| train/                |          |
|    entropy_loss       | -1.25    |
|    explained_variance | -3.7e+12 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.389   |
|    value_loss         | 0.1      |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 278       |
|    ep_rew_mean        | 1.54      |
| time/                 |           |
|    fps                | 190       |
|    iterations         | 1500      |
|    time_elapsed       | 39        |
|    total_timesteps    | 7500      |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 293      |
|    ep_rew_mean        | 1.73     |
| time/                 |          |
|    fps                | 195      |
|    iterations         | 2700     |
|    time_elapsed       | 69       |
|    total_timesteps    | 13500    |
| train/                |          |
|    entropy_loss       | -1.24    |
|    explained_variance | -170     |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | 0.289    |
|    value_loss         | 0.0986   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 292      |
|    ep_rew_mean        | 1.67     |
| time/                 |          |
|    fps                | 195      |
|    iterations         | 2800     |
|    time_elapsed       | 71       |
|    total_timesteps    | 14000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 311      |
|    ep_rew_mean        | 2.01     |
| time/                 |          |
|    fps                | 190      |
|    iterations         | 4100     |
|    time_elapsed       | 107      |
|    total_timesteps    | 20500    |
| train/                |          |
|    entropy_loss       | -1.07    |
|    explained_variance | 0.998    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4099     |
|    policy_loss        | -0.0482  |
|    value_loss         | 0.00491  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 315       |
|    ep_rew_mean        | 2.08      |
| time/                 |           |
|    fps                | 190       |
|    iterations         | 4200      |
|    time_elapsed       | 110       |
|    total_timesteps    | 21000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 301       |
|    ep_rew_mean        | 1.85      |
| time/                 |           |
|    fps                | 187       |
|    iterations         | 5400      |
|    time_elapsed       | 144       |
|    total_timesteps    | 27000     |
| train/                |           |
|    entropy_loss       | -0.00776  |
|    explained_variance | -2.22     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5399      |
|    policy_loss        | -2.78e-05 |
|    value_loss         | 0.00193   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 300       |
|    ep_rew_mean        | 1.82      |
| time/                 |           |
|    fps                | 187       |
|    iterations         | 5500      |
|    time_elapsed       | 146       |
|    total_timesteps    | 27500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 312      |
|    ep_rew_mean        | 2.05     |
| time/                 |          |
|    fps                | 188      |
|    iterations         | 6700     |
|    time_elapsed       | 178      |
|    total_timesteps    | 33500    |
| train/                |          |
|    entropy_loss       | -1.06    |
|    explained_variance | -1.29    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6699     |
|    policy_loss        | -0.0489  |
|    value_loss         | 0.00287  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 314      |
|    ep_rew_mean        | 2.09     |
| time/                 |          |
|    fps                | 188      |
|    iterations         | 6800     |
|    time_elapsed       | 180      |
|    total_timesteps    | 34000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 300      |
|    ep_rew_mean        | 1.88     |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 8000     |
|    time_elapsed       | 213      |
|    total_timesteps    | 40000    |
| train/                |          |
|    entropy_loss       | -0.797   |
|    explained_variance | -3.06    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7999     |
|    policy_loss        | -0.00713 |
|    value_loss         | 0.0041   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 296      |
|    ep_rew_mean        | 1.8      |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 8100     |
|    time_elapsed       | 215      |
|    total_timesteps    | 40500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 321      |
|    ep_rew_mean        | 2.25     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 9300     |
|    time_elapsed       | 248      |
|    total_timesteps    | 46500    |
| train/                |          |
|    entropy_loss       | -0.239   |
|    explained_variance | 0.921    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9299     |
|    policy_loss        | -0.00253 |
|    value_loss         | 0.0186   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 325      |
|    ep_rew_mean        | 2.32     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 9400     |
|    time_elapsed       | 251      |
|    total_timesteps    | 47000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 333       |
|    ep_rew_mean        | 2.41      |
| time/                 |           |
|    fps                | 186       |
|    iterations         | 10600     |
|    time_elapsed       | 283       |
|    total_timesteps    | 53000     |
| train/                |           |
|    entropy_loss       | -0.741    |
|    explained_variance | -1.11e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 10599     |
|    policy_loss        | -0.00511  |
|    value_loss         | 0.000682  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 328      |
|    ep_rew_mean        | 2.34     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 10700    |
|    time_elapsed       | 287      |
|    total_timesteps    | 53500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 362      |
|    ep_rew_mean        | 3.2      |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 11900    |
|    time_elapsed       | 319      |
|    total_timesteps    | 59500    |
| train/                |          |
|    entropy_loss       | -0.597   |
|    explained_variance | -18.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11899    |
|    policy_loss        | -0.0184  |
|    value_loss         | 0.0107   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 367      |
|    ep_rew_mean        | 3.31     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 12000    |
|    time_elapsed       | 321      |
|    total_timesteps    | 60000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 373       |
|    ep_rew_mean        | 3.38      |
| time/                 |           |
|    fps                | 186       |
|    iterations         | 13200     |
|    time_elapsed       | 353       |
|    total_timesteps    | 66000     |
| train/                |           |
|    entropy_loss       | -0.248    |
|    explained_variance | -1.76e+05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 13199     |
|    policy_loss        | 0.00454   |
|    value_loss         | 0.000328  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 374       |
|    ep_rew_mean        | 3.37      |
| time/                 |           |
|    fps                | 186       |
|    iterations         | 13300     |
|    time_elapsed       | 356       |
|    total_timesteps    | 66500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 364      |
|    ep_rew_mean        | 3.12     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 14500    |
|    time_elapsed       | 388      |
|    total_timesteps    | 72500    |
| train/                |          |
|    entropy_loss       | -0.354   |
|    explained_variance | -41.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 14499    |
|    policy_loss        | 0.398    |
|    value_loss         | 0.349    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 365      |
|    ep_rew_mean        | 3.16     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 14600    |
|    time_elapsed       | 391      |
|    total_timesteps    | 73000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 395      |
|    ep_rew_mean        | 3.95     |
| time/                 |          |
|    fps                | 186      |
|    iterations         | 15800    |
|    time_elapsed       | 422      |
|    total_timesteps    | 79000    |
| train/                |          |
|    entropy_loss       | -0.729   |
|    explained_variance | 0.196    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15799    |
|    policy_loss        | -0.00446 |
|    value_loss         | 0.00772  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 394       |
|    ep_rew_mean        | 3.92      |
| time/                 |           |
|    fps                | 186       |
|    iterations         | 15900     |
|    time_elapsed       | 425       |
|    total_timesteps    | 79500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 397      |
|    ep_rew_mean        | 3.84     |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 17100    |
|    time_elapsed       | 456      |
|    total_timesteps    | 85500    |
| train/                |          |
|    entropy_loss       | -0.196   |
|    explained_variance | 0.602    |
|    learning_rate      | 0.0007   |
|    n_updates          | 17099    |
|    policy_loss        | -0.0166  |
|    value_loss         | 0.0666   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 398      |
|    ep_rew_mean        | 3.85     |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 17200    |
|    time_elapsed       | 458      |
|    total_timesteps    | 86000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 420       |
|    ep_rew_mean        | 4.44      |
| time/                 |           |
|    fps                | 187       |
|    iterations         | 18400     |
|    time_elapsed       | 491       |
|    total_timesteps    | 92000     |
| train/                |           |
|    entropy_loss       | -0.719    |
|    explained_variance | -3.83e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 18399     |
|    policy_loss        | -0.0803   |
|    value_loss         | 0.0554    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 417      |
|    ep_rew_mean        | 4.4      |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 18500    |
|    time_elapsed       | 493      |
|    total_timesteps    | 92500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 427      |
|    ep_rew_mean        | 4.68     |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 19700    |
|    time_elapsed       | 525      |
|    total_timesteps    | 98500    |
| train/                |          |
|    entropy_loss       | -0.308   |
|    explained_variance | -1.01    |
|    learning_rate      | 0.0007   |
|    n_updates          | 19699    |
|    policy_loss        | -0.0179  |
|    value_loss         | 0.042    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 418      |
|    ep_rew_mean        | 4.57     |
| time/                 |          |
|    fps                | 187      |
|    iterations         | 19800    |
|    time_elapsed       | 527      |
|    total_timesteps    | 99000    |
| train/                |          |
|

  logger.warn(


(5.3, 2.6476404589747453)
Environment number 2
Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to /home/kchn/rlp/atari/multi/Training/Logs/A2C_45
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 289      |
|    ep_rew_mean        | 1.59     |
| time/                 |          |
|    fps                | 250      |
|    iterations         | 100      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0559   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0941  |
|    value_loss         | 0.00916  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 275      |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 297      |
|    ep_rew_mean        | 1.96     |
| time/                 |          |
|    fps                | 252      |
|    iterations         | 1400     |
|    time_elapsed       | 55       |
|    total_timesteps    | 14000    |
| train/                |          |
|    entropy_loss       | -1.03    |
|    explained_variance | 0.893    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | 0.0888   |
|    value_loss         | 0.0826   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 296      |
|    ep_rew_mean        | 1.98     |
| time/                 |          |
|    fps                | 254      |
|    iterations         | 1500     |
|    time_elapsed       | 59       |
|    total_timesteps    | 15000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 317      |
|    ep_rew_mean        | 2.23     |
| time/                 |          |
|    fps                | 258      |
|    iterations         | 2700     |
|    time_elapsed       | 104      |
|    total_timesteps    | 27000    |
| train/                |          |
|    entropy_loss       | -1.04    |
|    explained_variance | 0.928    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | -0.0461  |
|    value_loss         | 0.0231   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 316      |
|    ep_rew_mean        | 2.28     |
| time/                 |          |
|    fps                | 258      |
|    iterations         | 2800     |
|    time_elapsed       | 108      |
|    total_timesteps    | 28000    |
| train/                |          |
|