# Deep Q-Learning (DQN) using Keras and keras-rl

In [6]:
import numpy as np
import gym


from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory

# Gym environment and action space

In [7]:
ENV_NAME = 'CartPole-v0'
SEED = 1  # shared by NumPy's global RNG and the environment

# Build the CartPole environment.
env = gym.make(ENV_NAME)

# Seed NumPy and the environment with the same fixed value.
np.random.seed(SEED)
env.seed(SEED)

# Size of the discrete action space; used as the width of the network's output layer.
nb_actions = env.action_space.n

In [8]:
# Network input shape: the observation shape with a leading axis of size 1 prepended.
input_shape = (1, *env.observation_space.shape)
print(input_shape)

(1, 4)


# Simple NN model

In [9]:
# Small fully connected Q-network: one hidden layer of 16 ReLU units and a
# linear output layer with one unit per action.
model = Sequential()
# Collapse the (1, observation_dim) input into a flat vector for the Dense layers.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
# Linear output activation leaves the per-action outputs unbounded.
model.add(Activation('linear'))
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() -- doing so emitted a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# Exploration schedule: epsilon-greedy action selection whose `eps` attribute is
# annealed from 1.0 down to 0.05 over 10000 steps; 0.05 is used when testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.00,
    value_min=.05,
    value_test=.05,
    nb_steps=10000,
)

# Replay memory capped at 10000 entries; window_length=1 matches the leading
# axis of size 1 in the model's input shape.
memory = SequentialMemory(limit=10000, window_length=1)

# Assemble the agent from the Q-network, replay memory and exploration policy.
# NOTE(review): see the keras-rl DQNAgent docs for the exact semantics of
# nb_steps_warmup and target_model_update.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=100,
    policy=policy,
)

# Compile with Adam and track mean absolute error as an extra metric.
optimizer = Adam(lr=0.001)
dqn.compile(optimizer, metrics=['mae'])

W1118 20:02:53.499825 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:159: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1118 20:02:53.500586 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:164: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W1118 20:02:53.598199 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/optimizers.py:711: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1118 20:02:53.634010 4514584000 deprecation.py:506] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1247: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with

In [11]:
# Train the agent for 50000 environment steps, logging one line per episode.
# The returned History object is bound to a name so the training metrics stay
# available to later cells, rather than being discarded after the notebook
# shows only its repr.
# NOTE(review): visualize=True renders the environment while training, which
# presumably slows the run; consider visualize=False for long or headless runs.
train_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

Training for 50000 steps ...


W1118 20:02:56.636869 4514584000 deprecation.py:506] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:680: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


    16/50000: episode: 1, duration: 1.493s, episode steps: 16, steps per second: 11, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.084 [-0.788, 1.345], loss: 1.070938, mean_absolute_error: 0.918653, mean_q: 0.610389, mean_eps: 0.998765




    34/50000: episode: 2, duration: 0.299s, episode steps: 18, steps per second: 60, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.098 [-1.294, 0.559], loss: 0.756213, mean_absolute_error: 0.769966, mean_q: 0.515999, mean_eps: 0.997672
    54/50000: episode: 3, duration: 0.333s, episode steps: 20, steps per second: 60, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.115 [-0.614, 1.473], loss: 0.556590, mean_absolute_error: 0.707005, mean_q: 0.479894, mean_eps: 0.995867
    66/50000: episode: 4, duration: 0.198s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.093 [-0.843, 1.525], loss: 0.463143, mean_absolute_error: 0.706444, mean_q: 0.582751, mean_eps: 0.994347
    81/50000: episode: 5, duration: 0.252s, episode steps: 15, steps per second: 60, ep

   671/50000: episode: 30, duration: 0.052s, episode steps: 20, steps per second: 385, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.059 [-1.362, 2.145], loss: 1.055057, mean_absolute_error: 3.564481, mean_q: 6.910261, mean_eps: 0.937253
   686/50000: episode: 31, duration: 0.039s, episode steps: 15, steps per second: 387, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.098 [-0.960, 1.563], loss: 0.918074, mean_absolute_error: 3.532938, mean_q: 6.975838, mean_eps: 0.935590
   695/50000: episode: 32, duration: 0.025s, episode steps: 9, steps per second: 361, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.154 [-1.382, 2.271], loss: 1.315276, mean_absolute_error: 3.625801, mean_q: 7.026266, mean_eps: 0.934450
   716/50000: episode: 33, duration: 0.054s, episode steps: 21, steps per second: 39

  1325/50000: episode: 59, duration: 0.040s, episode steps: 15, steps per second: 377, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.094 [-1.416, 0.645], loss: 2.469263, mean_absolute_error: 6.456985, mean_q: 11.966770, mean_eps: 0.874885
  1336/50000: episode: 60, duration: 0.029s, episode steps: 11, steps per second: 376, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.120 [-0.993, 1.795], loss: 1.549565, mean_absolute_error: 6.418336, mean_q: 12.230840, mean_eps: 0.873650
  1369/50000: episode: 61, duration: 0.080s, episode steps: 33, steps per second: 411, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.424 [0.000, 1.000], mean observation: 0.050 [-1.149, 1.912], loss: 2.266544, mean_absolute_error: 6.492097, mean_q: 12.400084, mean_eps: 0.871560
  1382/50000: episode: 62, duration: 0.033s, episode steps: 13, steps per seco

  1867/50000: episode: 88, duration: 0.027s, episode steps: 10, steps per second: 366, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.148 [-1.529, 2.495], loss: 2.242883, mean_absolute_error: 7.341004, mean_q: 13.986285, mean_eps: 0.823157
  1892/50000: episode: 89, duration: 0.064s, episode steps: 25, steps per second: 389, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.101 [-1.316, 0.566], loss: 3.242002, mean_absolute_error: 7.432673, mean_q: 13.984216, mean_eps: 0.821495
  1906/50000: episode: 90, duration: 0.037s, episode steps: 14, steps per second: 374, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.105 [-1.412, 0.759], loss: 2.763043, mean_absolute_error: 7.446013, mean_q: 13.875972, mean_eps: 0.819642
  1935/50000: episode: 91, duration: 0.073s, episode steps: 29, steps per sec

  2653/50000: episode: 115, duration: 0.190s, episode steps: 77, steps per second: 406, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.040 [-1.497, 1.319], loss: 3.744190, mean_absolute_error: 8.378994, mean_q: 15.751702, mean_eps: 0.751670
  2669/50000: episode: 116, duration: 0.050s, episode steps: 16, steps per second: 321, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.562 [0.000, 1.000], mean observation: -0.107 [-1.211, 0.586], loss: 3.366600, mean_absolute_error: 8.417544, mean_q: 15.972502, mean_eps: 0.747252
  2690/50000: episode: 117, duration: 0.056s, episode steps: 21, steps per second: 375, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.101 [-0.754, 1.281], loss: 2.555066, mean_absolute_error: 8.370342, mean_q: 16.054666, mean_eps: 0.745495
  2762/50000: episode: 118, duration: 0.195s, episode steps: 72, steps per 

  3469/50000: episode: 142, duration: 0.530s, episode steps: 34, steps per second: 64, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.588 [0.000, 1.000], mean observation: -0.069 [-2.184, 1.185], loss: 4.698697, mean_absolute_error: 9.807246, mean_q: 18.638775, mean_eps: 0.672108
  3502/50000: episode: 143, duration: 0.549s, episode steps: 33, steps per second: 60, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.576 [0.000, 1.000], mean observation: -0.041 [-1.788, 1.004], loss: 4.379073, mean_absolute_error: 9.909130, mean_q: 18.945297, mean_eps: 0.668925
  3536/50000: episode: 144, duration: 0.189s, episode steps: 34, steps per second: 180, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.028 [-1.304, 0.996], loss: 4.266679, mean_absolute_error: 10.050197, mean_q: 19.228282, mean_eps: 0.665742
  3561/50000: episode: 145, duration: 0.070s, episode steps: 25, steps per

  4386/50000: episode: 169, duration: 0.532s, episode steps: 32, steps per second: 60, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.085 [-0.625, 1.072], loss: 5.979150, mean_absolute_error: 11.601592, mean_q: 22.286766, mean_eps: 0.584897
  4426/50000: episode: 170, duration: 0.668s, episode steps: 40, steps per second: 60, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.070 [-1.461, 0.654], loss: 6.188643, mean_absolute_error: 11.655389, mean_q: 22.279720, mean_eps: 0.581477
  4482/50000: episode: 171, duration: 0.932s, episode steps: 56, steps per second: 60, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.160 [-0.930, 0.993], loss: 5.442793, mean_absolute_error: 11.745052, mean_q: 22.537432, mean_eps: 0.576918
  4521/50000: episode: 172, duration: 0.654s, episode steps: 39, steps per 

  5648/50000: episode: 197, duration: 0.119s, episode steps: 48, steps per second: 403, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.127 [-0.925, 0.306], loss: 5.618119, mean_absolute_error: 13.614251, mean_q: 26.443961, mean_eps: 0.465768
  5673/50000: episode: 198, duration: 0.061s, episode steps: 25, steps per second: 413, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.099 [-0.763, 1.189], loss: 5.227707, mean_absolute_error: 13.850038, mean_q: 26.942370, mean_eps: 0.462300
  5700/50000: episode: 199, duration: 0.066s, episode steps: 27, steps per second: 409, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.407 [0.000, 1.000], mean observation: 0.042 [-1.020, 1.746], loss: 6.039533, mean_absolute_error: 13.874074, mean_q: 26.994674, mean_eps: 0.459830
  5743/50000: episode: 200, duration: 0.108s, episode steps: 43, steps p

  7108/50000: episode: 224, duration: 0.114s, episode steps: 48, steps per second: 420, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.139 [-0.457, 0.970], loss: 7.638253, mean_absolute_error: 15.628015, mean_q: 30.318678, mean_eps: 0.327068
  7166/50000: episode: 225, duration: 0.134s, episode steps: 58, steps per second: 432, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.534 [0.000, 1.000], mean observation: 0.129 [-0.464, 0.863], loss: 7.583666, mean_absolute_error: 15.591644, mean_q: 30.246383, mean_eps: 0.322033
  7216/50000: episode: 226, duration: 0.118s, episode steps: 50, steps per second: 423, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.108 [-0.406, 0.933], loss: 8.708817, mean_absolute_error: 15.654356, mean_q: 30.230140, mean_eps: 0.316903
  7266/50000: episode: 227, duration: 0.117s, episode steps: 50, steps pe

  9076/50000: episode: 252, duration: 0.532s, episode steps: 196, steps per second: 368, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.036 [-1.080, 0.437], loss: 9.170905, mean_absolute_error: 17.280087, mean_q: 33.624010, mean_eps: 0.147138
  9147/50000: episode: 253, duration: 0.191s, episode steps: 71, steps per second: 372, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: 0.139 [-0.223, 0.919], loss: 8.481370, mean_absolute_error: 17.447579, mean_q: 33.992944, mean_eps: 0.134455
  9229/50000: episode: 254, duration: 0.239s, episode steps: 82, steps per second: 343, episode reward: 82.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.127 [-0.439, 1.045], loss: 8.055125, mean_absolute_error: 17.369850, mean_q: 33.943235, mean_eps: 0.127188
  9317/50000: episode: 255, duration: 0.233s, episode steps: 88, steps

 12620/50000: episode: 279, duration: 0.326s, episode steps: 140, steps per second: 430, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: 0.409 [-0.326, 2.213], loss: 8.048780, mean_absolute_error: 21.479718, mean_q: 42.715449, mean_eps: 0.050000
 12779/50000: episode: 280, duration: 0.365s, episode steps: 159, steps per second: 435, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.410 [-0.404, 2.426], loss: 7.939690, mean_absolute_error: 21.650706, mean_q: 43.100695, mean_eps: 0.050000
 12930/50000: episode: 281, duration: 0.348s, episode steps: 151, steps per second: 434, episode reward: 151.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.536 [0.000, 1.000], mean observation: 0.419 [-0.509, 2.404], loss: 8.810925, mean_absolute_error: 21.887704, mean_q: 43.525856, mean_eps: 0.050000
 13095/50000: episode: 282, duration: 0.381s, episode steps: 165, s

 17623/50000: episode: 306, duration: 0.458s, episode steps: 200, steps per second: 436, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.324 [-0.504, 2.137], loss: 7.025070, mean_absolute_error: 27.917204, mean_q: 55.941006, mean_eps: 0.050000
 17791/50000: episode: 307, duration: 0.389s, episode steps: 168, steps per second: 432, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.530 [0.000, 1.000], mean observation: 0.408 [-0.485, 2.411], loss: 4.960592, mean_absolute_error: 28.210642, mean_q: 56.584929, mean_eps: 0.050000
 17983/50000: episode: 308, duration: 0.441s, episode steps: 192, steps per second: 435, episode reward: 192.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.368 [-0.411, 2.401], loss: 4.393656, mean_absolute_error: 28.266367, mean_q: 56.724321, mean_eps: 0.050000
 18183/50000: episode: 309, duration: 0.460s, episode steps: 200, s

 22639/50000: episode: 333, duration: 0.502s, episode steps: 195, steps per second: 388, episode reward: 195.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.371 [-0.575, 2.426], loss: 0.340699, mean_absolute_error: 29.832926, mean_q: 59.922887, mean_eps: 0.050000
 22839/50000: episode: 334, duration: 0.516s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.192 [-0.554, 1.283], loss: 0.352265, mean_absolute_error: 30.058738, mean_q: 60.379183, mean_eps: 0.050000
 23023/50000: episode: 335, duration: 0.431s, episode steps: 184, steps per second: 427, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.387 [-0.411, 2.413], loss: 0.885253, mean_absolute_error: 29.885035, mean_q: 59.965458, mean_eps: 0.050000
 23223/50000: episode: 336, duration: 0.540s, episode steps: 200, s

 27994/50000: episode: 360, duration: 0.460s, episode steps: 200, steps per second: 435, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.293 [-0.461, 1.970], loss: 1.776791, mean_absolute_error: 31.700526, mean_q: 63.232234, mean_eps: 0.050000
 28194/50000: episode: 361, duration: 0.466s, episode steps: 200, steps per second: 429, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.338 [-0.481, 2.262], loss: 1.323324, mean_absolute_error: 31.611619, mean_q: 63.030886, mean_eps: 0.050000
 28394/50000: episode: 362, duration: 0.468s, episode steps: 200, steps per second: 427, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.358 [-0.382, 2.406], loss: 1.556123, mean_absolute_error: 31.447305, mean_q: 62.669944, mean_eps: 0.050000
 28594/50000: episode: 363, duration: 0.469s, episode steps: 200, s

 33394/50000: episode: 387, duration: 0.512s, episode steps: 200, steps per second: 391, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.229 [-0.586, 1.501], loss: 2.003212, mean_absolute_error: 33.073890, mean_q: 65.893011, mean_eps: 0.050000
 33594/50000: episode: 388, duration: 0.496s, episode steps: 200, steps per second: 403, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.196 [-0.586, 1.254], loss: 1.581313, mean_absolute_error: 32.837881, mean_q: 65.385617, mean_eps: 0.050000
 33794/50000: episode: 389, duration: 0.489s, episode steps: 200, steps per second: 409, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.207 [-0.809, 1.352], loss: 1.962887, mean_absolute_error: 32.769087, mean_q: 65.190084, mean_eps: 0.050000
 33994/50000: episode: 390, duration: 0.471s, episode steps: 200, s

 37470/50000: episode: 415, duration: 0.245s, episode steps: 99, steps per second: 404, episode reward: 99.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.566 [0.000, 1.000], mean observation: 0.543 [-0.941, 2.430], loss: 2.648755, mean_absolute_error: 33.851203, mean_q: 67.676108, mean_eps: 0.050000
 37590/50000: episode: 416, duration: 0.304s, episode steps: 120, steps per second: 395, episode reward: 120.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: 0.489 [-0.988, 2.436], loss: 4.131662, mean_absolute_error: 34.352838, mean_q: 68.370049, mean_eps: 0.050000
 37713/50000: episode: 417, duration: 0.304s, episode steps: 123, steps per second: 404, episode reward: 123.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.474 [-0.629, 2.403], loss: 3.470675, mean_absolute_error: 34.170238, mean_q: 67.765627, mean_eps: 0.050000
 37849/50000: episode: 418, duration: 0.336s, episode steps: 136, ste

 41308/50000: episode: 442, duration: 0.424s, episode steps: 175, steps per second: 413, episode reward: 175.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.385 [-0.492, 2.403], loss: 4.698818, mean_absolute_error: 32.336135, mean_q: 64.321606, mean_eps: 0.050000
 41508/50000: episode: 443, duration: 0.506s, episode steps: 200, steps per second: 396, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.319 [-0.592, 2.127], loss: 6.703786, mean_absolute_error: 32.357930, mean_q: 64.293802, mean_eps: 0.050000
 41708/50000: episode: 444, duration: 0.504s, episode steps: 200, steps per second: 397, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.105 [-0.423, 0.692], loss: 6.411222, mean_absolute_error: 32.404331, mean_q: 64.453184, mean_eps: 0.050000
 41908/50000: episode: 445, duration: 0.494s, episode steps: 200, s

 46680/50000: episode: 469, duration: 0.536s, episode steps: 192, steps per second: 358, episode reward: 192.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.378 [-0.488, 2.403], loss: 6.465344, mean_absolute_error: 35.547250, mean_q: 71.187964, mean_eps: 0.050000
 46869/50000: episode: 470, duration: 0.465s, episode steps: 189, steps per second: 406, episode reward: 189.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.392 [-0.850, 2.405], loss: 7.953630, mean_absolute_error: 35.589570, mean_q: 71.241807, mean_eps: 0.050000
 47049/50000: episode: 471, duration: 0.426s, episode steps: 180, steps per second: 423, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.381 [-0.416, 2.408], loss: 5.437929, mean_absolute_error: 35.716042, mean_q: 71.590968, mean_eps: 0.050000
 47238/50000: episode: 472, duration: 0.486s, episode steps: 189, s

<keras.callbacks.History at 0x14310fdd8>

In [12]:
# Evaluate the trained agent for 100 episodes with rendering enabled.
# The History object is kept so the per-episode rewards can be summarised,
# instead of leaving the reader to eyeball 100 separate log lines.
test_history = dqn.test(env, nb_episodes=100, visualize=True)

# keras-rl logs each test episode's total reward under 'episode_reward'.
test_rewards = test_history.history['episode_reward']
print('Mean test reward over {} episodes: {:.1f}'.format(len(test_rewards), np.mean(test_rewards)))

Testing for 100 episodes ...
Episode 1: reward: 177.000, steps: 177
Episode 2: reward: 189.000, steps: 189
Episode 3: reward: 172.000, steps: 172
Episode 4: reward: 161.000, steps: 161
Episode 5: reward: 177.000, steps: 177
Episode 6: reward: 157.000, steps: 157
Episode 7: reward: 159.000, steps: 159
Episode 8: reward: 162.000, steps: 162
Episode 9: reward: 171.000, steps: 171
Episode 10: reward: 165.000, steps: 165
Episode 11: reward: 193.000, steps: 193
Episode 12: reward: 179.000, steps: 179
Episode 13: reward: 183.000, steps: 183
Episode 14: reward: 199.000, steps: 199
Episode 15: reward: 183.000, steps: 183
Episode 16: reward: 169.000, steps: 169
Episode 17: reward: 154.000, steps: 154
Episode 18: reward: 166.000, steps: 166
Episode 19: reward: 181.000, steps: 181
Episode 20: reward: 192.000, steps: 192
Episode 21: reward: 161.000, steps: 161
Episode 22: reward: 170.000, steps: 170
Episode 23: reward: 172.000, steps: 172
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

<keras.callbacks.History at 0x14bbb16d8>

In [13]:
# Shut the environment down now that training and testing are finished.
# NOTE(review): presumably this also closes the render window opened by
# visualize=True -- confirm against the Gym documentation.
env.close()