In [2]:
import random

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory


Using TensorFlow backend.


In [3]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single seed shared by every random source seeded below

# Create the environment, then seed the random sources used during training so
# that a fresh "Restart & Run All" is as repeatable as possible.
env = gym.make(ENV_NAME)
random.seed(SEED)     # keras-rl's replay memory samples batch indices with Python's `random`
np.random.seed(SEED)  # NumPy's global RNG
env.seed(SEED)        # the environment's own RNG
# NOTE(review): the Keras/TensorFlow weight initialisation is not seeded here,
# so runs can still differ slightly — seed the backend too if exact repeatability matters.

# Number of discrete actions the agent can choose from; sizes the network's output layer.
nb_actions = env.action_space.n

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [5]:
# Shape fed to the network: a leading axis of length 1 followed by the
# dimensions of a single environment observation.
input_shape = (1, *env.observation_space.shape)
print(input_shape)

(1, 4)


In [19]:
# Q-network: flatten the (window, observation) input, pass it through one hidden
# ReLU layer of 16 units, and emit one linear output per available action.
model = Sequential()
# The leading 1 is the observation window length; it has to agree with the
# `window_length` given to the replay memory.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_5 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_6 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# Replay buffer: keeps up to 50,000 transitions; window_length=1 means each
# state handed to the network is a single observation.
memory = SequentialMemory(limit=50000, window_length=1)

# Exploration strategy: epsilon-greedy with the library's default settings.
policy = EpsGreedyQPolicy()

# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,        # steps collected before learning starts
    target_model_update=1e-2,  # a value below 1 selects keras-rl's soft target-network update
    policy=policy,
)

# Adam optimiser; mean absolute error is tracked as an extra training metric.
dqn.compile(Adam(lr=1e-2), metrics=['mae'])


In [21]:
# Train for 15,000 environment steps; verbose=2 logs one line per episode.
# visualize=True renders the environment while training (presumably this needs
# a display and slows the run — set it to False for headless or faster training).
# Keep the returned History so per-episode metrics can be inspected afterwards,
# instead of leaving only its repr in the cell output.
train_history = dqn.fit(env, nb_steps=15000, visualize=True, verbose=2)

Training for 15000 steps ...




    62/15000: episode: 1, duration: 0.837s, episode steps: 62, steps per second: 74, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.037 [-0.402, 0.639], loss: 0.144909, mean_absolute_error: 0.420378, mean_q: 0.625707
    83/15000: episode: 2, duration: 0.080s, episode steps: 21, steps per second: 261, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: 0.132 [-0.368, 1.203], loss: 0.008572, mean_absolute_error: 0.553426, mean_q: 1.223205
   119/15000: episode: 3, duration: 0.151s, episode steps: 36, steps per second: 239, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.071 [-1.138, 2.191], loss: 0.008687, mean_absolute_error: 0.673664, mean_q: 1.394034
   145/15000: episode: 4, duration: 0.102s, episode steps: 26, steps per second: 254, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], m

  2850/15000: episode: 30, duration: 0.510s, episode steps: 140, steps per second: 275, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.247 [-1.399, 0.408], loss: 0.440944, mean_absolute_error: 11.060410, mean_q: 22.272280
  2921/15000: episode: 31, duration: 0.259s, episode steps: 71, steps per second: 274, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.181 [-0.940, 0.337], loss: 0.805725, mean_absolute_error: 11.408578, mean_q: 22.915760
  3046/15000: episode: 32, duration: 0.451s, episode steps: 125, steps per second: 277, episode reward: 125.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: -0.162 [-1.030, 0.483], loss: 0.519456, mean_absolute_error: 11.798341, mean_q: 23.597139
  3246/15000: episode: 33, duration: 0.729s, episode steps: 200, steps per second: 274, episode reward: 200.000, mean reward: 1.0

  8219/15000: episode: 59, duration: 0.721s, episode steps: 200, steps per second: 277, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.005 [-0.619, 0.638], loss: 2.904309, mean_absolute_error: 24.556650, mean_q: 49.335400
  8419/15000: episode: 60, duration: 0.721s, episode steps: 200, steps per second: 277, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.143 [-0.493, 0.911], loss: 3.788210, mean_absolute_error: 24.846403, mean_q: 49.849003
  8619/15000: episode: 61, duration: 0.721s, episode steps: 200, steps per second: 277, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.000 [-0.582, 0.521], loss: 3.406561, mean_absolute_error: 25.239449, mean_q: 50.716148
  8819/15000: episode: 62, duration: 0.723s, episode steps: 200, steps per second: 277, episode reward: 200.000, mean reward: 1.00

 14019/15000: episode: 88, duration: 0.719s, episode steps: 200, steps per second: 278, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.027 [-0.639, 0.540], loss: 9.149343, mean_absolute_error: 33.441250, mean_q: 66.844971
 14219/15000: episode: 89, duration: 0.715s, episode steps: 200, steps per second: 280, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.083 [-0.504, 0.574], loss: 8.004816, mean_absolute_error: 33.627632, mean_q: 67.330170
 14419/15000: episode: 90, duration: 0.719s, episode steps: 200, steps per second: 278, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.020 [-0.742, 0.618], loss: 9.892067, mean_absolute_error: 33.717098, mean_q: 67.481094
 14619/15000: episode: 91, duration: 0.723s, episode steps: 200, steps per second: 277, episode reward: 200.000, mean reward: 1.00

<keras.callbacks.History at 0x7f1003b44e80>

In [22]:
# Evaluate the trained agent over 100 episodes, rendering each one.
# Keep the returned History so the evaluation results can be summarised later,
# instead of leaving only its repr in the cell output.
test_history = dqn.test(env, nb_episodes=100, visualize=True)

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

<keras.callbacks.History at 0x7f1003b44da0>

In [23]:
# Shut the environment down once training and evaluation are finished
# (presumably this also closes the render window opened by visualize=True).
env.close()