In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D, BatchNormalization, Dropout, LeakyReLU
from keras.optimizers import Adam

from rl.agents.sarsa import SARSAAgent
from rl.policy import BoltzmannQPolicy, BoltzmannGumbelQPolicy
from rl.memory import EpisodeParameterMemory

ENV_NAME = 'MsPacman-ram-v0'


# Create the Gym environment, then seed NumPy's global RNG and the environment
# with the same fixed value.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)

# Size of the discrete action set and length of the first observation axis.
nb_actions, obs_dim = env.action_space.n, env.observation_space.shape[0]

# Option 1: simple single-layer model (disabled, kept for reference).
# NOTE(review): the softmax head presumably dates from the CEM experiment that is
# commented out further down; it would not be a suitable Q-value head either.
#model = Sequential()
#model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
#model.add(Dense(nb_actions))
#model.add(Activation('softmax'))

# Option 2: deep Q-network. Takes a window of one observation and emits one
# Q-value estimate per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    # A wider first stage (Dense(512) + LeakyReLU(alpha=0.3)) was tried and disabled.
    Dense(64),
    LeakyReLU(alpha=0.1),
    Dense(64),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.1),
    Dense(32),
    LeakyReLU(alpha=0.1),
    Dense(32),
    LeakyReLU(alpha=0.1),
    Dense(32),
    LeakyReLU(alpha=0.1),
    Dense(nb_actions),
    # The head must be linear: Q-values estimate discounted returns, which are
    # unbounded. The previous tanh head clamped every prediction to [-1, 1], so
    # the network could never fit targets built from this game's rewards.
    Activation('linear'),
])


# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()


# Configure and compile the agent. Any built-in Keras optimizer and metric can be
# passed to compile().
# The earlier CEM experiment is kept here, disabled, for reference:
#memory = EpisodeParameterMemory(limit=1000, window_length=1)

#cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
#               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)

# NOTE(review): the variable is still named `cem` although it now holds a SARSAAgent.
# NOTE(review): nb_steps_warmup=1000000 -- the captured training log still shows
# "loss: --" past step 570000, presumably because no updates run during warm-up;
# confirm that a warm-up this long is intended.
exploration_policy = BoltzmannGumbelQPolicy()
cem = SARSAAgent(
    model=model,
    nb_actions=nb_actions,
    nb_steps_warmup=1000000,
    policy=exploration_policy,
)
optimizer = Adam(lr=1e-3)
cem.compile(optimizer, metrics=['mae'])

# Train the agent. Rendering is switched off (visualize=False) because it slows
# training down considerably. Training can be aborted early with Ctrl + C.
cem.fit(env, nb_steps=10000000, visualize=False, verbose=2)

# Once training ends, store the agent's current (final) weights, replacing any
# existing file of the same name.
weights_path = 'sarsa_{}_params.h5f'.format(ENV_NAME)
cem.save_weights(weights_path, overwrite=True)

# Finally, evaluate the trained agent for 10 rendered episodes.
cem.test(env, nb_episodes=10, visualize=True)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 64)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
__________

   15412/10000000: episode: 22, duration: 0.976s, episode steps: 446, steps per second: 457, episode reward: 230.000, mean reward: 0.516 [0.000, 10.000], mean action: 3.395 [1.000, 7.000], mean observation: 105.659 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   16295/10000000: episode: 23, duration: 1.929s, episode steps: 883, steps per second: 458, episode reward: 290.000, mean reward: 0.328 [0.000, 50.000], mean action: 3.983 [1.000, 7.000], mean observation: 109.552 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   16781/10000000: episode: 24, duration: 1.074s, episode steps: 486, steps per second: 453, episode reward: 210.000, mean reward: 0.432 [0.000, 10.000], mean action: 3.504 [1.000, 7.000], mean observation: 109.154 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   17412/10000000: episode: 25, duration: 1.372s, episode steps: 631, steps per second: 460, episode reward: 210.000, mean reward: 0.333 [0.000, 10.000], mean acti

   37545/10000000: episode: 52, duration: 2.026s, episode steps: 929, steps per second: 458, episode reward: 420.000, mean reward: 0.452 [0.000, 10.000], mean action: 3.602 [1.000, 7.000], mean observation: 107.118 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   38405/10000000: episode: 53, duration: 1.883s, episode steps: 860, steps per second: 457, episode reward: 470.000, mean reward: 0.547 [0.000, 50.000], mean action: 3.910 [1.000, 7.000], mean observation: 108.347 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   39006/10000000: episode: 54, duration: 1.311s, episode steps: 601, steps per second: 458, episode reward: 230.000, mean reward: 0.383 [0.000, 10.000], mean action: 3.872 [1.000, 7.000], mean observation: 109.539 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   39386/10000000: episode: 55, duration: 0.832s, episode steps: 380, steps per second: 457, episode reward: 100.000, mean reward: 0.263 [0.000, 10.000], mean acti

   57024/10000000: episode: 82, duration: 1.821s, episode steps: 834, steps per second: 458, episode reward: 290.000, mean reward: 0.348 [0.000, 50.000], mean action: 4.203 [1.000, 7.000], mean observation: 110.243 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   57888/10000000: episode: 83, duration: 1.889s, episode steps: 864, steps per second: 457, episode reward: 550.000, mean reward: 0.637 [0.000, 200.000], mean action: 3.824 [1.000, 7.000], mean observation: 108.914 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   58582/10000000: episode: 84, duration: 1.509s, episode steps: 694, steps per second: 460, episode reward: 230.000, mean reward: 0.331 [0.000, 10.000], mean action: 3.911 [1.000, 7.000], mean observation: 108.476 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   59559/10000000: episode: 85, duration: 2.140s, episode steps: 977, steps per second: 457, episode reward: 420.000, mean reward: 0.430 [0.000, 50.000], mean act

   77211/10000000: episode: 112, duration: 1.001s, episode steps: 456, steps per second: 456, episode reward: 150.000, mean reward: 0.329 [0.000, 10.000], mean action: 4.107 [1.000, 7.000], mean observation: 108.747 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   77912/10000000: episode: 113, duration: 1.543s, episode steps: 701, steps per second: 454, episode reward: 410.000, mean reward: 0.585 [0.000, 10.000], mean action: 3.906 [1.000, 7.000], mean observation: 107.914 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   78441/10000000: episode: 114, duration: 1.159s, episode steps: 529, steps per second: 456, episode reward: 190.000, mean reward: 0.359 [0.000, 10.000], mean action: 3.839 [1.000, 7.000], mean observation: 107.716 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   78889/10000000: episode: 115, duration: 0.984s, episode steps: 448, steps per second: 455, episode reward: 150.000, mean reward: 0.335 [0.000, 10.000], mean 

   95305/10000000: episode: 142, duration: 1.018s, episode steps: 464, steps per second: 456, episode reward: 180.000, mean reward: 0.388 [0.000, 10.000], mean action: 3.905 [1.000, 7.000], mean observation: 108.455 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   95806/10000000: episode: 143, duration: 1.097s, episode steps: 501, steps per second: 457, episode reward: 130.000, mean reward: 0.259 [0.000, 10.000], mean action: 3.978 [1.000, 7.000], mean observation: 109.563 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   96296/10000000: episode: 144, duration: 1.070s, episode steps: 490, steps per second: 458, episode reward: 120.000, mean reward: 0.245 [0.000, 10.000], mean action: 3.886 [1.000, 7.000], mean observation: 109.536 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
   97142/10000000: episode: 145, duration: 1.841s, episode steps: 846, steps per second: 459, episode reward: 380.000, mean reward: 0.449 [0.000, 50.000], mean 

  113728/10000000: episode: 172, duration: 1.856s, episode steps: 848, steps per second: 457, episode reward: 320.000, mean reward: 0.377 [0.000, 50.000], mean action: 3.721 [1.000, 7.000], mean observation: 110.399 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  114146/10000000: episode: 173, duration: 0.911s, episode steps: 418, steps per second: 459, episode reward: 110.000, mean reward: 0.263 [0.000, 10.000], mean action: 3.904 [1.000, 7.000], mean observation: 107.673 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  115038/10000000: episode: 174, duration: 1.961s, episode steps: 892, steps per second: 455, episode reward: 490.000, mean reward: 0.549 [0.000, 50.000], mean action: 3.821 [0.000, 7.000], mean observation: 101.573 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  115787/10000000: episode: 175, duration: 1.636s, episode steps: 749, steps per second: 458, episode reward: 290.000, mean reward: 0.387 [0.000, 50.000], mean 

  134412/10000000: episode: 202, duration: 1.836s, episode steps: 837, steps per second: 456, episode reward: 550.000, mean reward: 0.657 [0.000, 200.000], mean action: 4.078 [1.000, 7.000], mean observation: 109.989 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  135232/10000000: episode: 203, duration: 1.805s, episode steps: 820, steps per second: 454, episode reward: 400.000, mean reward: 0.488 [0.000, 10.000], mean action: 3.967 [0.000, 8.000], mean observation: 100.172 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  136346/10000000: episode: 204, duration: 2.421s, episode steps: 1114, steps per second: 460, episode reward: 530.000, mean reward: 0.476 [0.000, 50.000], mean action: 3.262 [0.000, 7.000], mean observation: 102.787 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  137273/10000000: episode: 205, duration: 2.017s, episode steps: 927, steps per second: 460, episode reward: 590.000, mean reward: 0.636 [0.000, 200.000], me

  154833/10000000: episode: 232, duration: 1.804s, episode steps: 816, steps per second: 452, episode reward: 330.000, mean reward: 0.404 [0.000, 50.000], mean action: 4.092 [1.000, 7.000], mean observation: 109.288 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  155716/10000000: episode: 233, duration: 1.940s, episode steps: 883, steps per second: 455, episode reward: 890.000, mean reward: 1.008 [0.000, 400.000], mean action: 3.812 [1.000, 7.000], mean observation: 110.880 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  156234/10000000: episode: 234, duration: 1.127s, episode steps: 518, steps per second: 460, episode reward: 130.000, mean reward: 0.251 [0.000, 10.000], mean action: 4.031 [1.000, 7.000], mean observation: 109.887 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  156784/10000000: episode: 235, duration: 1.200s, episode steps: 550, steps per second: 458, episode reward: 260.000, mean reward: 0.473 [0.000, 10.000], mean

  175178/10000000: episode: 262, duration: 1.272s, episode steps: 580, steps per second: 456, episode reward: 210.000, mean reward: 0.362 [0.000, 10.000], mean action: 3.671 [1.000, 7.000], mean observation: 109.224 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  175932/10000000: episode: 263, duration: 1.677s, episode steps: 754, steps per second: 450, episode reward: 310.000, mean reward: 0.411 [0.000, 50.000], mean action: 3.780 [1.000, 7.000], mean observation: 106.881 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  176605/10000000: episode: 264, duration: 1.468s, episode steps: 673, steps per second: 459, episode reward: 160.000, mean reward: 0.238 [0.000, 10.000], mean action: 3.795 [1.000, 7.000], mean observation: 109.284 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  177016/10000000: episode: 265, duration: 0.892s, episode steps: 411, steps per second: 461, episode reward: 120.000, mean reward: 0.292 [0.000, 10.000], mean 

  197533/10000000: episode: 292, duration: 1.884s, episode steps: 864, steps per second: 459, episode reward: 310.000, mean reward: 0.359 [0.000, 50.000], mean action: 3.525 [1.000, 7.000], mean observation: 104.862 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  198335/10000000: episode: 293, duration: 1.741s, episode steps: 802, steps per second: 461, episode reward: 440.000, mean reward: 0.549 [0.000, 10.000], mean action: 3.506 [1.000, 7.000], mean observation: 104.961 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  198845/10000000: episode: 294, duration: 1.110s, episode steps: 510, steps per second: 459, episode reward: 220.000, mean reward: 0.431 [0.000, 10.000], mean action: 4.257 [1.000, 7.000], mean observation: 108.931 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  199627/10000000: episode: 295, duration: 1.701s, episode steps: 782, steps per second: 460, episode reward: 340.000, mean reward: 0.435 [0.000, 50.000], mean 

  217595/10000000: episode: 322, duration: 2.023s, episode steps: 931, steps per second: 460, episode reward: 590.000, mean reward: 0.634 [0.000, 200.000], mean action: 3.738 [1.000, 7.000], mean observation: 108.193 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  218187/10000000: episode: 323, duration: 1.291s, episode steps: 592, steps per second: 459, episode reward: 150.000, mean reward: 0.253 [0.000, 10.000], mean action: 3.863 [1.000, 7.000], mean observation: 108.572 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  218752/10000000: episode: 324, duration: 1.235s, episode steps: 565, steps per second: 458, episode reward: 250.000, mean reward: 0.442 [0.000, 10.000], mean action: 3.363 [1.000, 7.000], mean observation: 108.722 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  219522/10000000: episode: 325, duration: 1.675s, episode steps: 770, steps per second: 460, episode reward: 280.000, mean reward: 0.364 [0.000, 10.000], mean

  239056/10000000: episode: 352, duration: 1.516s, episode steps: 676, steps per second: 446, episode reward: 380.000, mean reward: 0.562 [0.000, 10.000], mean action: 3.457 [0.000, 7.000], mean observation: 104.682 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  239708/10000000: episode: 353, duration: 1.465s, episode steps: 652, steps per second: 445, episode reward: 280.000, mean reward: 0.429 [0.000, 10.000], mean action: 3.710 [0.000, 7.000], mean observation: 107.695 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  240433/10000000: episode: 354, duration: 1.629s, episode steps: 725, steps per second: 445, episode reward: 370.000, mean reward: 0.510 [0.000, 50.000], mean action: 3.719 [1.000, 7.000], mean observation: 109.178 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  241091/10000000: episode: 355, duration: 1.466s, episode steps: 658, steps per second: 449, episode reward: 250.000, mean reward: 0.380 [0.000, 10.000], mean 

  259642/10000000: episode: 382, duration: 0.799s, episode steps: 366, steps per second: 458, episode reward: 60.000, mean reward: 0.164 [0.000, 10.000], mean action: 3.880 [1.000, 7.000], mean observation: 107.756 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  260228/10000000: episode: 383, duration: 1.277s, episode steps: 586, steps per second: 459, episode reward: 200.000, mean reward: 0.341 [0.000, 10.000], mean action: 3.906 [1.000, 7.000], mean observation: 109.963 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  260766/10000000: episode: 384, duration: 1.181s, episode steps: 538, steps per second: 456, episode reward: 120.000, mean reward: 0.223 [0.000, 10.000], mean action: 3.580 [1.000, 7.000], mean observation: 109.272 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  261654/10000000: episode: 385, duration: 1.932s, episode steps: 888, steps per second: 460, episode reward: 390.000, mean reward: 0.439 [0.000, 50.000], mean a

  279774/10000000: episode: 412, duration: 1.532s, episode steps: 705, steps per second: 460, episode reward: 220.000, mean reward: 0.312 [0.000, 10.000], mean action: 3.726 [1.000, 7.000], mean observation: 109.004 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  280455/10000000: episode: 413, duration: 1.485s, episode steps: 681, steps per second: 459, episode reward: 200.000, mean reward: 0.294 [0.000, 10.000], mean action: 3.840 [1.000, 7.000], mean observation: 108.483 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  281072/10000000: episode: 414, duration: 1.343s, episode steps: 617, steps per second: 459, episode reward: 310.000, mean reward: 0.502 [0.000, 10.000], mean action: 3.619 [1.000, 7.000], mean observation: 108.411 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  281799/10000000: episode: 415, duration: 1.585s, episode steps: 727, steps per second: 459, episode reward: 290.000, mean reward: 0.399 [0.000, 10.000], mean 

  300651/10000000: episode: 442, duration: 1.655s, episode steps: 703, steps per second: 425, episode reward: 270.000, mean reward: 0.384 [0.000, 10.000], mean action: 3.676 [1.000, 7.000], mean observation: 108.135 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  301320/10000000: episode: 443, duration: 1.560s, episode steps: 669, steps per second: 429, episode reward: 200.000, mean reward: 0.299 [0.000, 10.000], mean action: 3.904 [1.000, 7.000], mean observation: 109.016 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  302003/10000000: episode: 444, duration: 1.493s, episode steps: 683, steps per second: 457, episode reward: 90.000, mean reward: 0.132 [0.000, 10.000], mean action: 3.988 [1.000, 7.000], mean observation: 111.161 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  302607/10000000: episode: 445, duration: 1.315s, episode steps: 604, steps per second: 459, episode reward: 250.000, mean reward: 0.414 [0.000, 10.000], mean a

  322817/10000000: episode: 472, duration: 1.697s, episode steps: 777, steps per second: 458, episode reward: 210.000, mean reward: 0.270 [0.000, 10.000], mean action: 3.855 [1.000, 7.000], mean observation: 109.372 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  323384/10000000: episode: 473, duration: 1.244s, episode steps: 567, steps per second: 456, episode reward: 220.000, mean reward: 0.388 [0.000, 10.000], mean action: 3.834 [1.000, 7.000], mean observation: 109.515 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  324059/10000000: episode: 474, duration: 1.481s, episode steps: 675, steps per second: 456, episode reward: 300.000, mean reward: 0.444 [0.000, 10.000], mean action: 4.086 [0.000, 7.000], mean observation: 108.446 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  324815/10000000: episode: 475, duration: 1.664s, episode steps: 756, steps per second: 454, episode reward: 300.000, mean reward: 0.397 [0.000, 50.000], mean 

  343031/10000000: episode: 502, duration: 1.779s, episode steps: 817, steps per second: 459, episode reward: 390.000, mean reward: 0.477 [0.000, 50.000], mean action: 3.755 [1.000, 7.000], mean observation: 109.809 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  343634/10000000: episode: 503, duration: 1.317s, episode steps: 603, steps per second: 458, episode reward: 270.000, mean reward: 0.448 [0.000, 10.000], mean action: 3.900 [1.000, 7.000], mean observation: 108.387 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  344363/10000000: episode: 504, duration: 1.593s, episode steps: 729, steps per second: 458, episode reward: 300.000, mean reward: 0.412 [0.000, 10.000], mean action: 3.852 [1.000, 7.000], mean observation: 108.057 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  344780/10000000: episode: 505, duration: 0.916s, episode steps: 417, steps per second: 455, episode reward: 140.000, mean reward: 0.336 [0.000, 10.000], mean 

  364241/10000000: episode: 532, duration: 0.979s, episode steps: 440, steps per second: 450, episode reward: 150.000, mean reward: 0.341 [0.000, 10.000], mean action: 3.855 [1.000, 7.000], mean observation: 108.915 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  364978/10000000: episode: 533, duration: 1.651s, episode steps: 737, steps per second: 446, episode reward: 300.000, mean reward: 0.407 [0.000, 10.000], mean action: 3.365 [1.000, 7.000], mean observation: 105.120 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  365518/10000000: episode: 534, duration: 1.226s, episode steps: 540, steps per second: 441, episode reward: 190.000, mean reward: 0.352 [0.000, 10.000], mean action: 4.035 [1.000, 7.000], mean observation: 109.022 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  366126/10000000: episode: 535, duration: 1.565s, episode steps: 608, steps per second: 388, episode reward: 180.000, mean reward: 0.296 [0.000, 10.000], mean 

  384788/10000000: episode: 562, duration: 1.819s, episode steps: 831, steps per second: 457, episode reward: 540.000, mean reward: 0.650 [0.000, 200.000], mean action: 3.945 [1.000, 7.000], mean observation: 109.775 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  385509/10000000: episode: 563, duration: 1.569s, episode steps: 721, steps per second: 460, episode reward: 250.000, mean reward: 0.347 [0.000, 10.000], mean action: 3.627 [1.000, 7.000], mean observation: 108.819 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  385991/10000000: episode: 564, duration: 1.052s, episode steps: 482, steps per second: 458, episode reward: 150.000, mean reward: 0.311 [0.000, 10.000], mean action: 4.037 [1.000, 7.000], mean observation: 109.921 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  386633/10000000: episode: 565, duration: 1.406s, episode steps: 642, steps per second: 457, episode reward: 320.000, mean reward: 0.498 [0.000, 10.000], mean

  405672/10000000: episode: 592, duration: 1.599s, episode steps: 733, steps per second: 458, episode reward: 490.000, mean reward: 0.668 [0.000, 200.000], mean action: 3.907 [1.000, 7.000], mean observation: 111.329 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  406351/10000000: episode: 593, duration: 1.480s, episode steps: 679, steps per second: 459, episode reward: 300.000, mean reward: 0.442 [0.000, 10.000], mean action: 3.377 [1.000, 7.000], mean observation: 109.187 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  406825/10000000: episode: 594, duration: 1.034s, episode steps: 474, steps per second: 459, episode reward: 170.000, mean reward: 0.359 [0.000, 10.000], mean action: 3.700 [1.000, 7.000], mean observation: 109.598 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  407508/10000000: episode: 595, duration: 1.481s, episode steps: 683, steps per second: 461, episode reward: 260.000, mean reward: 0.381 [0.000, 10.000], mean

  425493/10000000: episode: 622, duration: 1.075s, episode steps: 490, steps per second: 456, episode reward: 220.000, mean reward: 0.449 [0.000, 10.000], mean action: 4.173 [1.000, 7.000], mean observation: 107.834 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  425997/10000000: episode: 623, duration: 1.104s, episode steps: 504, steps per second: 457, episode reward: 180.000, mean reward: 0.357 [0.000, 10.000], mean action: 4.024 [1.000, 7.000], mean observation: 109.062 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  426649/10000000: episode: 624, duration: 1.461s, episode steps: 652, steps per second: 446, episode reward: 160.000, mean reward: 0.245 [0.000, 10.000], mean action: 3.664 [1.000, 7.000], mean observation: 108.911 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  427178/10000000: episode: 625, duration: 1.163s, episode steps: 529, steps per second: 455, episode reward: 240.000, mean reward: 0.454 [0.000, 10.000], mean 

  445547/10000000: episode: 652, duration: 1.362s, episode steps: 626, steps per second: 459, episode reward: 200.000, mean reward: 0.319 [0.000, 10.000], mean action: 3.513 [1.000, 7.000], mean observation: 109.549 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  446184/10000000: episode: 653, duration: 1.388s, episode steps: 637, steps per second: 459, episode reward: 270.000, mean reward: 0.424 [0.000, 10.000], mean action: 4.022 [1.000, 7.000], mean observation: 108.895 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  446979/10000000: episode: 654, duration: 1.770s, episode steps: 795, steps per second: 449, episode reward: 410.000, mean reward: 0.516 [0.000, 10.000], mean action: 3.283 [1.000, 7.000], mean observation: 104.116 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  447420/10000000: episode: 655, duration: 0.993s, episode steps: 441, steps per second: 444, episode reward: 100.000, mean reward: 0.227 [0.000, 10.000], mean 

  467144/10000000: episode: 682, duration: 1.305s, episode steps: 596, steps per second: 457, episode reward: 220.000, mean reward: 0.369 [0.000, 10.000], mean action: 4.076 [1.000, 7.000], mean observation: 109.639 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  467683/10000000: episode: 683, duration: 1.180s, episode steps: 539, steps per second: 457, episode reward: 240.000, mean reward: 0.445 [0.000, 10.000], mean action: 4.032 [1.000, 7.000], mean observation: 108.633 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  468195/10000000: episode: 684, duration: 1.116s, episode steps: 512, steps per second: 459, episode reward: 150.000, mean reward: 0.293 [0.000, 10.000], mean action: 4.307 [1.000, 7.000], mean observation: 108.335 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  468716/10000000: episode: 685, duration: 1.139s, episode steps: 521, steps per second: 458, episode reward: 140.000, mean reward: 0.269 [0.000, 10.000], mean 

  487957/10000000: episode: 712, duration: 1.515s, episode steps: 689, steps per second: 455, episode reward: 350.000, mean reward: 0.508 [0.000, 10.000], mean action: 3.820 [1.000, 7.000], mean observation: 108.629 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  488794/10000000: episode: 713, duration: 1.837s, episode steps: 837, steps per second: 456, episode reward: 510.000, mean reward: 0.609 [0.000, 200.000], mean action: 4.143 [1.000, 7.000], mean observation: 109.703 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  489352/10000000: episode: 714, duration: 1.227s, episode steps: 558, steps per second: 455, episode reward: 190.000, mean reward: 0.341 [0.000, 10.000], mean action: 3.590 [1.000, 7.000], mean observation: 108.261 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  489954/10000000: episode: 715, duration: 1.317s, episode steps: 602, steps per second: 457, episode reward: 250.000, mean reward: 0.415 [0.000, 10.000], mean

  507836/10000000: episode: 742, duration: 1.031s, episode steps: 456, steps per second: 442, episode reward: 130.000, mean reward: 0.285 [0.000, 10.000], mean action: 3.743 [1.000, 7.000], mean observation: 108.468 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  508562/10000000: episode: 743, duration: 1.625s, episode steps: 726, steps per second: 447, episode reward: 280.000, mean reward: 0.386 [0.000, 50.000], mean action: 3.682 [1.000, 7.000], mean observation: 109.008 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  509131/10000000: episode: 744, duration: 1.254s, episode steps: 569, steps per second: 454, episode reward: 140.000, mean reward: 0.246 [0.000, 10.000], mean action: 3.794 [1.000, 7.000], mean observation: 109.275 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  510008/10000000: episode: 745, duration: 1.915s, episode steps: 877, steps per second: 458, episode reward: 520.000, mean reward: 0.593 [0.000, 200.000], mean

  527153/10000000: episode: 772, duration: 1.392s, episode steps: 635, steps per second: 456, episode reward: 250.000, mean reward: 0.394 [0.000, 10.000], mean action: 3.499 [1.000, 7.000], mean observation: 105.217 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  527783/10000000: episode: 773, duration: 1.382s, episode steps: 630, steps per second: 456, episode reward: 220.000, mean reward: 0.349 [0.000, 10.000], mean action: 3.816 [1.000, 7.000], mean observation: 109.506 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  528453/10000000: episode: 774, duration: 1.471s, episode steps: 670, steps per second: 456, episode reward: 480.000, mean reward: 0.716 [0.000, 200.000], mean action: 3.858 [1.000, 7.000], mean observation: 110.193 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  529323/10000000: episode: 775, duration: 1.903s, episode steps: 870, steps per second: 457, episode reward: 490.000, mean reward: 0.563 [0.000, 200.000], mea

  547575/10000000: episode: 802, duration: 1.397s, episode steps: 640, steps per second: 458, episode reward: 190.000, mean reward: 0.297 [0.000, 10.000], mean action: 3.372 [1.000, 7.000], mean observation: 108.391 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  548241/10000000: episode: 803, duration: 1.453s, episode steps: 666, steps per second: 458, episode reward: 270.000, mean reward: 0.405 [0.000, 50.000], mean action: 4.147 [1.000, 7.000], mean observation: 109.226 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  548781/10000000: episode: 804, duration: 1.176s, episode steps: 540, steps per second: 459, episode reward: 270.000, mean reward: 0.500 [0.000, 10.000], mean action: 4.044 [1.000, 7.000], mean observation: 108.856 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  549653/10000000: episode: 805, duration: 1.916s, episode steps: 872, steps per second: 455, episode reward: 340.000, mean reward: 0.390 [0.000, 50.000], mean 

  569287/10000000: episode: 832, duration: 2.582s, episode steps: 910, steps per second: 352, episode reward: 600.000, mean reward: 0.659 [0.000, 200.000], mean action: 4.010 [0.000, 7.000], mean observation: 108.842 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  569765/10000000: episode: 833, duration: 1.343s, episode steps: 478, steps per second: 356, episode reward: 120.000, mean reward: 0.251 [0.000, 10.000], mean action: 3.996 [1.000, 7.000], mean observation: 109.309 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  570270/10000000: episode: 834, duration: 1.432s, episode steps: 505, steps per second: 353, episode reward: 90.000, mean reward: 0.178 [0.000, 10.000], mean action: 3.814 [1.000, 7.000], mean observation: 110.230 [0.000, 255.000], loss: --, mean_absolute_error: --, mean_q: --
  570871/10000000: episode: 835, duration: 1.698s, episode steps: 601, steps per second: 354, episode reward: 190.000, mean reward: 0.316 [0.000, 10.000], mean 