In [5]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [3]:
# Set the relevant variables:
ENV_NAME = 'CartPole-v0'

# Get the environment and extract the number of actions in Cartpole game
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

In [4]:
# Build a simple hidden layer
model = Sequential()
model.add(Flatten(input_shape = (1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# Use the epsilon greedy policy improvment
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit = 50000, window_length = 1)
dqn = DQNAgent(model = model, nb_actions = nb_actions, memory = memory, nb_steps_warmup = 10, target_model_update = 1e-2, policy = policy)
dqn.compile(Adam(lr = 1e-3), metrics = ['mae'])
dqn.fit(env, nb_steps = 5000, visualize = True, verbose = 2)

Training for 5000 steps ...
Instructions for updating:
Use tf.cast instead.




   79/5000: episode: 1, duration: 2.793s, episode steps: 79, steps per second: 28, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.428186, mean_absolute_error: 0.496044, mean_q: 0.053062
  113/5000: episode: 2, duration: 0.568s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.351612, mean_absolute_error: 0.445402, mean_q: 0.193012
  165/5000: episode: 3, duration: 0.865s, episode steps: 52, steps per second: 60, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.088 [-0.295, 0.673], loss: 0.314531, mean_absolute_error: 0.467520, mean_q: 0.322337
  199/5000: episode: 4, duration: 0.568s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action:

  675/5000: episode: 31, duration: 0.238s, episode steps: 13, steps per second: 55, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.087 [-1.207, 1.871], loss: 0.423647, mean_absolute_error: 2.193084, mean_q: 4.225003
  684/5000: episode: 32, duration: 0.165s, episode steps: 9, steps per second: 54, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.124 [-1.418, 2.297], loss: 0.355577, mean_absolute_error: 2.276168, mean_q: 4.358374
  694/5000: episode: 33, duration: 0.162s, episode steps: 10, steps per second: 62, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.144 [-1.348, 2.185], loss: 0.334126, mean_absolute_error: 2.305633, mean_q: 4.487020
  703/5000: episode: 34, duration: 0.150s, episode steps: 9, steps per second: 60, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action:

  983/5000: episode: 61, duration: 0.199s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.083 [-1.184, 1.769], loss: 1.060042, mean_absolute_error: 3.501330, mean_q: 6.475317
  997/5000: episode: 62, duration: 0.236s, episode steps: 14, steps per second: 59, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.092 [-0.788, 1.489], loss: 0.833642, mean_absolute_error: 3.500277, mean_q: 6.528677
 1013/5000: episode: 63, duration: 0.269s, episode steps: 16, steps per second: 59, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.093 [-0.944, 1.642], loss: 0.960523, mean_absolute_error: 3.573193, mean_q: 6.695931
 1026/5000: episode: 64, duration: 0.210s, episode steps: 13, steps per second: 62, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean act

 1491/5000: episode: 90, duration: 0.783s, episode steps: 45, steps per second: 57, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.096 [-0.544, 0.818], loss: 1.311603, mean_absolute_error: 4.562609, mean_q: 8.422343
 1540/5000: episode: 91, duration: 0.833s, episode steps: 49, steps per second: 59, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.120 [-0.759, 0.259], loss: 1.364419, mean_absolute_error: 4.674416, mean_q: 8.651674
 1574/5000: episode: 92, duration: 0.466s, episode steps: 34, steps per second: 73, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.101 [-0.964, 0.221], loss: 1.460110, mean_absolute_error: 4.786120, mean_q: 8.817448
 1598/5000: episode: 93, duration: 0.557s, episode steps: 24, steps per second: 43, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean a

 2184/5000: episode: 119, duration: 0.414s, episode steps: 14, steps per second: 34, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.116 [-1.197, 0.608], loss: 2.000751, mean_absolute_error: 6.568384, mean_q: 12.614542
 2199/5000: episode: 120, duration: 0.473s, episode steps: 15, steps per second: 32, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.102 [-1.071, 0.454], loss: 3.439864, mean_absolute_error: 6.669377, mean_q: 12.565231
 2219/5000: episode: 121, duration: 0.644s, episode steps: 20, steps per second: 31, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.121 [-0.889, 0.364], loss: 2.519728, mean_absolute_error: 6.639435, mean_q: 12.573806
 2280/5000: episode: 122, duration: 1.924s, episode steps: 61, steps per second: 32, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000]

 3510/5000: episode: 148, duration: 1.183s, episode steps: 71, steps per second: 60, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.076 [-0.716, 0.481], loss: 3.010715, mean_absolute_error: 8.853054, mean_q: 17.117516
 3548/5000: episode: 149, duration: 0.650s, episode steps: 38, steps per second: 59, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: -0.126 [-0.721, 0.210], loss: 3.468884, mean_absolute_error: 8.969676, mean_q: 17.296041
 3587/5000: episode: 150, duration: 0.648s, episode steps: 39, steps per second: 60, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.132 [-0.656, 0.383], loss: 2.801829, mean_absolute_error: 9.033515, mean_q: 17.556305
 3641/5000: episode: 151, duration: 0.901s, episode steps: 54, steps per second: 60, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000]

<keras.callbacks.History at 0xb25027b70>

In [9]:
dqn.test(env, nb_episodes = 5, visualize = True)

Testing for 5 episodes ...
Episode 1: reward: 66.000, steps: 66
Episode 2: reward: 66.000, steps: 66
Episode 3: reward: 67.000, steps: 67
Episode 4: reward: 81.000, steps: 81
Episode 5: reward: 74.000, steps: 74


<keras.callbacks.History at 0xb222359e8>

In [8]:
Env.algorithmic

NameError: name 'Env' is not defined