# Deep Q-Learning on CartPole using Keras and keras-rl

In [1]:
import gym
import numpy as np

from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy

# Gym environment and action space

In [2]:
ENV_NAME = 'CartPole-v0'
SEED = 1  # one seed shared by NumPy and the environment

# Build the CartPole environment, seed both random sources, then read off
# how many discrete actions the agent can choose from.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

In [3]:
# Network input shape: a leading axis of length 1 followed by the
# environment's observation shape.
input_shape = (1, *env.observation_space.shape)
print(input_shape)

(1, 4)


# Simple NN model

In [4]:
# Q-network: flatten the (1, obs_dim) input, pass it through one hidden
# ReLU layer of 16 units, and emit one linear output per available action.
# `input_shape` is reused from the previous cell so the shape is defined in
# exactly one place.
model = Sequential([
    Flatten(input_shape=input_shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the table itself and returns None, so it is called
# directly rather than wrapped in print() (which would also print "None").
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Exploration policy: epsilon-greedy action selection whose `eps` attribute
# is scheduled by LinearAnnealedPolicy between value_max and value_min over
# nb_steps; value_test is the setting supplied for test runs.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.00,
    value_min=.05,
    value_test=.05,
    nb_steps=10000,
)

# Replay memory. window_length is taken from the leading axis of
# `input_shape` so the memory and the network's Flatten layer cannot
# disagree about how many observations make up one state.
memory = SequentialMemory(limit=10000, window_length=input_shape[0])

# Assemble the DQN agent from the Q-network, replay memory and policy.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=100,
    policy=policy,
)

# Compile with Adam and track mean absolute error as an extra metric.
dqn.compile(Adam(lr=0.001), metrics=['mae'])





In [11]:
# Train the agent for 50,000 environment steps, rendering the environment
# and logging one line per episode (verbose=2). The returned History object
# is kept so training metrics can be inspected afterwards, instead of the
# cell echoing an opaque `<keras.callbacks.History at 0x...>` repr.
training_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

Training for 50000 steps ...
   200/50000: episode: 1, duration: 0.454s, episode steps: 200, steps per second: 441, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.261 [-1.667, 0.530], loss: 7.963720, mean_absolute_error: 21.350580, mean_q: 42.705916
   400/50000: episode: 2, duration: 0.468s, episode steps: 200, steps per second: 428, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.139 [-0.859, 0.807], loss: 9.009680, mean_absolute_error: 21.738863, mean_q: 43.411633
   600/50000: episode: 3, duration: 0.469s, episode steps: 200, steps per second: 426, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.083 [-0.738, 0.829], loss: 7.174608, mean_absolute_error: 21.914589, mean_q: 43.932236
   800/50000: episode: 4, duration: 0.464s, episode steps: 200, steps per second: 431, episode reward

  6000/50000: episode: 30, duration: 0.483s, episode steps: 200, steps per second: 414, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.171 [-1.079, 0.624], loss: 10.947314, mean_absolute_error: 28.351948, mean_q: 57.125492
  6200/50000: episode: 31, duration: 0.491s, episode steps: 200, steps per second: 407, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.208 [-1.358, 0.594], loss: 10.362570, mean_absolute_error: 28.573654, mean_q: 57.595177
  6400/50000: episode: 32, duration: 0.506s, episode steps: 200, steps per second: 395, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.103 [-0.711, 0.607], loss: 12.674486, mean_absolute_error: 28.679949, mean_q: 57.744164
  6600/50000: episode: 33, duration: 0.506s, episode steps: 200, steps per second: 395, episode reward: 200.000, mean reward

 11791/50000: episode: 59, duration: 0.488s, episode steps: 200, steps per second: 410, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.098 [-1.128, 1.124], loss: 10.417451, mean_absolute_error: 32.685566, mean_q: 66.067879
 11991/50000: episode: 60, duration: 0.477s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.173 [-0.563, 1.132], loss: 11.421991, mean_absolute_error: 32.811859, mean_q: 66.317726
 12191/50000: episode: 61, duration: 0.496s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.102 [-0.665, 0.498], loss: 14.488133, mean_absolute_error: 33.241486, mean_q: 67.004433
 12391/50000: episode: 62, duration: 0.491s, episode steps: 200, steps per second: 407, episode reward: 200.000, mean reward:

 17591/50000: episode: 88, duration: 0.463s, episode steps: 200, steps per second: 432, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.097 [-0.800, 0.545], loss: 12.381862, mean_absolute_error: 35.651646, mean_q: 71.971214
 17791/50000: episode: 89, duration: 0.463s, episode steps: 200, steps per second: 432, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.036 [-0.684, 0.787], loss: 10.990753, mean_absolute_error: 35.617886, mean_q: 71.994080
 17991/50000: episode: 90, duration: 0.461s, episode steps: 200, steps per second: 433, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.014 [-0.614, 0.526], loss: 14.793518, mean_absolute_error: 35.885632, mean_q: 72.353989
 18191/50000: episode: 91, duration: 0.466s, episode steps: 200, steps per second: 429, episode reward: 200.000, mean reward

 23391/50000: episode: 117, duration: 0.476s, episode steps: 200, steps per second: 420, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.029 [-0.618, 0.558], loss: 11.787702, mean_absolute_error: 37.821808, mean_q: 76.332062
 23591/50000: episode: 118, duration: 0.471s, episode steps: 200, steps per second: 425, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.014 [-0.509, 0.513], loss: 12.561560, mean_absolute_error: 38.055836, mean_q: 76.811188
 23791/50000: episode: 119, duration: 0.478s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.196 [-1.377, 0.601], loss: 10.998963, mean_absolute_error: 37.974815, mean_q: 76.755440
 23991/50000: episode: 120, duration: 0.463s, episode steps: 200, steps per second: 432, episode reward: 200.000, mean rew

 28991/50000: episode: 145, duration: 0.481s, episode steps: 200, steps per second: 416, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.116 [-1.066, 0.708], loss: 13.332803, mean_absolute_error: 38.931355, mean_q: 78.556396
 29191/50000: episode: 146, duration: 0.485s, episode steps: 200, steps per second: 413, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.225 [-1.645, 0.532], loss: 12.385360, mean_absolute_error: 38.817436, mean_q: 78.316292
 29391/50000: episode: 147, duration: 0.467s, episode steps: 200, steps per second: 428, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.094 [-0.674, 0.548], loss: 13.429565, mean_absolute_error: 38.981640, mean_q: 78.683929
 29591/50000: episode: 148, duration: 0.473s, episode steps: 200, steps per second: 422, episode reward: 200.000, mean re

 34791/50000: episode: 174, duration: 0.502s, episode steps: 200, steps per second: 398, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.308 [-2.083, 0.494], loss: 16.683767, mean_absolute_error: 39.497501, mean_q: 79.389793
 34991/50000: episode: 175, duration: 0.503s, episode steps: 200, steps per second: 398, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.027 [-0.598, 0.568], loss: 12.176950, mean_absolute_error: 39.352669, mean_q: 79.182396
 35191/50000: episode: 176, duration: 0.491s, episode steps: 200, steps per second: 407, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.000 [-0.388, 0.537], loss: 12.397534, mean_absolute_error: 39.280987, mean_q: 79.129463
 35391/50000: episode: 177, duration: 0.481s, episode steps: 200, steps per second: 416, episode reward: 200.000, mean rew

 40379/50000: episode: 202, duration: 0.507s, episode steps: 200, steps per second: 395, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.017 [-0.693, 0.534], loss: 6.150489, mean_absolute_error: 39.867970, mean_q: 80.293617
 40579/50000: episode: 203, duration: 0.480s, episode steps: 200, steps per second: 416, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.259 [-1.806, 0.557], loss: 8.563495, mean_absolute_error: 39.919163, mean_q: 80.383781
 40779/50000: episode: 204, duration: 0.477s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.036 [-0.616, 0.619], loss: 12.039043, mean_absolute_error: 39.721535, mean_q: 79.746368
 40979/50000: episode: 205, duration: 0.472s, episode steps: 200, steps per second: 424, episode reward: 200.000, mean rewar

 46179/50000: episode: 231, duration: 0.477s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.019 [-0.586, 0.823], loss: 9.598829, mean_absolute_error: 38.933224, mean_q: 77.827850
 46379/50000: episode: 232, duration: 0.472s, episode steps: 200, steps per second: 424, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.099 [-0.772, 0.576], loss: 13.475163, mean_absolute_error: 38.981224, mean_q: 77.818367
 46579/50000: episode: 233, duration: 0.474s, episode steps: 200, steps per second: 422, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.061 [-0.724, 0.567], loss: 12.164748, mean_absolute_error: 38.756676, mean_q: 77.265228
 46779/50000: episode: 234, duration: 0.500s, episode steps: 200, steps per second: 400, episode reward: 200.000, mean rewa

<keras.callbacks.History at 0x14bdee240>

In [12]:
# Evaluate the trained agent over 100 rendered episodes. The History object
# is kept and summarised, instead of the cell echoing an opaque object repr.
test_history = dqn.test(env, nb_episodes=100, visualize=True)

# Per-episode rewards recorded by the History callback during testing.
episode_rewards = test_history.history['episode_reward']
print('Mean reward over {} test episodes: {:.1f}'.format(
    len(episode_rewards), np.mean(episode_rewards)))

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

<keras.callbacks.History at 0x14bdee630>