In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


In [2]:
ENV_NAME = 'TimePilot-v0'

# Get the environment and extract the number of actions available in the Cartpole problem
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

In [3]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 100800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10080100  
_________________________________________________________________
activation_1 (Activation)    (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1010      
_________________________________________________________________
activation_2 (Activation)    (None, 10)                0         
Total params: 10,081,110
Trainable params: 10,081,110
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot. 
dqn.fit(env, nb_steps=10000, visualize=True, verbose=2)

Training for 10000 steps ...




 1518/10000: episode: 1, duration: 101.618s, episode steps: 1518, steps per second: 15, episode reward: 500.000, mean reward: 0.329 [0.000, 100.000], mean action: 8.562 [0.000, 9.000], mean observation: 88.799 [0.000, 228.000], loss: 13.617092, mean_absolute_error: 0.043236, mean_q: 1.000000
 3989/10000: episode: 2, duration: 164.774s, episode steps: 2471, steps per second: 15, episode reward: 700.000, mean reward: 0.283 [0.000, 100.000], mean action: 8.573 [0.000, 9.000], mean observation: 90.012 [0.000, 228.000], loss: 15.927675, mean_absolute_error: 0.049971, mean_q: 1.000000
 6013/10000: episode: 3, duration: 134.959s, episode steps: 2024, steps per second: 15, episode reward: 500.000, mean reward: 0.247 [0.000, 100.000], mean action: 8.597 [0.000, 9.000], mean observation: 89.555 [0.000, 228.000], loss: 12.561594, mean_absolute_error: 0.043120, mean_q: 1.000000
 7810/10000: episode: 4, duration: 119.831s, episode steps: 1797, steps per second: 15, episode reward: 600.000, mean rew

<keras.callbacks.History at 0x7f704c850eb8>

In [8]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 300.000, steps: 2408


KeyboardInterrupt: 

In [6]:
dqn.fit(env, nb_steps=10000, visualize=True)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 666.452 seconds


<keras.callbacks.History at 0x7f705264dac8>

In [7]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 300.000, steps: 2667
Episode 2: reward: 300.000, steps: 2417
Episode 3: reward: 300.000, steps: 2425
Episode 4: reward: 300.000, steps: 2410
Episode 5: reward: 300.000, steps: 2445


<keras.callbacks.History at 0x7f700c0324a8>