In [4]:
import random

import gym
import numpy as np


In [5]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
ENV_NAME = 'CartPole-v0'
SEED = 123  # single source of truth for every RNG seeded below

# Create the environment before seeding so the env's own RNG can be seeded too.
env = gym.make(ENV_NAME)

# Seed every random source this notebook can reach:
#   * stdlib `random` -- keras-rl's replay memory samples minibatch indices with it
#   * NumPy's global RNG
#   * the environment's own RNG
# NOTE(review): the backend's weight-initialisation RNG is not seeded here, so
# runs may still differ slightly -- confirm whether that matters for this demo.
random.seed(SEED)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions available in the CartPole problem.
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [7]:
# Q-network: flatten the (window, observation) input, pass it through one
# 16-unit ReLU hidden layer, and emit one linear output per action.
# The leading (1,) in the input shape is the observation-window axis.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Replay memory and exploration policy handed to the agent.
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()

# Assemble the DQN agent around the Q-network and compile it with Adam,
# tracking mean absolute error as an extra metric.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train the agent. Rendering the environment during training is only for
# show and slows training down quite a lot.
# NOTE(review): the recorded test episodes last only ~9-13 steps, which
# suggests 1000 training steps is too few -- consider raising nb_steps.
dqn.fit(env, nb_steps=1000, visualize=True, verbose=2)

Training for 1000 steps ...




  79/1000: episode: 1, duration: 2.397s, episode steps: 79, steps per second: 33, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.426330, mean_absolute_error: 0.495163, mean_q: 0.053828
 113/1000: episode: 2, duration: 0.565s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.350759, mean_absolute_error: 0.444407, mean_q: 0.193362
 163/1000: episode: 3, duration: 0.832s, episode steps: 50, steps per second: 60, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.311264, mean_absolute_error: 0.462657, mean_q: 0.321871
 197/1000: episode: 4, duration: 0.565s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.5

 684/1000: episode: 30, duration: 0.183s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.120 [-1.134, 1.941], loss: 0.502934, mean_absolute_error: 2.302229, mean_q: 4.359598
 697/1000: episode: 31, duration: 0.213s, episode steps: 13, steps per second: 61, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.100 [-1.017, 1.716], loss: 0.457887, mean_absolute_error: 2.297588, mean_q: 4.373185
 708/1000: episode: 32, duration: 0.184s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.105 [-1.023, 1.554], loss: 0.563538, mean_absolute_error: 2.400669, mean_q: 4.474579
 720/1000: episode: 33, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action:

<keras.callbacks.History at 0x2b0421e4710>

In [10]:
# Evaluate the trained agent for 50 rendered episodes; the per-episode
# reward and step count are printed as each episode finishes.
dqn.test(
    env,
    nb_episodes=50,
    visualize=True,
)

Testing for 50 episodes ...
Episode 1: reward: 12.000, steps: 12
Episode 2: reward: 13.000, steps: 13
Episode 3: reward: 12.000, steps: 12
Episode 4: reward: 13.000, steps: 13
Episode 5: reward: 10.000, steps: 10
Episode 6: reward: 12.000, steps: 12
Episode 7: reward: 12.000, steps: 12
Episode 8: reward: 10.000, steps: 10
Episode 9: reward: 9.000, steps: 9
Episode 10: reward: 11.000, steps: 11
Episode 11: reward: 10.000, steps: 10
Episode 12: reward: 10.000, steps: 10
Episode 13: reward: 11.000, steps: 11
Episode 14: reward: 13.000, steps: 13
Episode 15: reward: 9.000, steps: 9
Episode 16: reward: 11.000, steps: 11
Episode 17: reward: 12.000, steps: 12
Episode 18: reward: 11.000, steps: 11
Episode 19: reward: 12.000, steps: 12
Episode 20: reward: 12.000, steps: 12
Episode 21: reward: 11.000, steps: 11
Episode 22: reward: 11.000, steps: 11
Episode 23: reward: 11.000, steps: 11
Episode 24: reward: 12.000, steps: 12
Episode 25: reward: 13.000, steps: 13
Episode 26: reward: 12.000, steps: 

<keras.callbacks.History at 0x2b0410d2b00>