In [1]:
import random

import numpy as np
import gym
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [2]:
# Gym environment id; also embedded in the weights file name further down.
ENV_NAME = "CartPole-v1"

<br/>**Get the environment and extract the number of actions.**

In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 123

env = gym.make(ENV_NAME)

# Seed every RNG the training stack may draw from, not just NumPy and the
# environment: the network weights are initialised by TensorFlow, and library
# code may use Python's built-in `random` module. Leaving either unseeded makes
# results differ between otherwise identical runs.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
env.seed(SEED)

# Size of the discrete action space; used as the width of the model's output layer.
nb_actions = env.action_space.n

In [4]:
# Sanity check: display the shape of what reset() returns (shown below as (4,)).
# Note that this call also resets the environment as a side effect.
env.reset().shape

(4,)

In [5]:
# Preview the input shape passed to the model's Flatten layer: the observation
# shape with a leading axis of length 1 (presumably the memory's window_length
# of 1 -- confirm against the keras-rl docs).
(1, *env.observation_space.shape)

(1, 4)

<br/> **Next, we build a very simple model.**

In [6]:
# Small fully connected Q-network: flatten the (1, observation) input, pass it
# through three 16-unit ReLU hidden layers, and emit one linear output per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 16)                80        
                                                                 
 activation (Activation)     (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 activation_1 (Activation)   (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                272       
                                                                 
 activation_2 (Activation)   (None, 16)                0

<br/> **Next, we configure and compile our agent. Any built-in tensorflow.keras optimizer, and even the Keras metrics, can be used.**

In [7]:
# Replay memory capped at 50,000 entries, with a window length of 1.
memory = SequentialMemory(window_length=1, limit=50000)

# Action-selection policy used by the agent.
policy = BoltzmannQPolicy()

# Assemble the DQN agent from the network, memory and policy built above.
# NOTE(review): target_model_update=1e-2 is presumably interpreted by keras-rl
# as a soft-update coefficient because it is below 1 -- confirm in the docs.
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=20,
    target_model_update=1e-2,
)

# Compile with Adam and report mean absolute error as an additional metric.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

<br/> **Train the agent**

In [8]:
# Train for 50,000 steps without rendering. Keeping the returned History
# object makes the training record available to later cells and stops the
# notebook from echoing a raw object repr as the cell output.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 7:48 - reward: 1.0000

  updates=self.state_updates,


94 episodes - episode_reward: 104.606 [10.000, 313.000] - loss: 0.973 - mae: 19.422 - mean_q: 39.419

Interval 2 (10000 steps performed)
49 episodes - episode_reward: 206.367 [149.000, 310.000] - loss: 1.618 - mae: 36.965 - mean_q: 74.533

Interval 3 (20000 steps performed)
40 episodes - episode_reward: 247.600 [148.000, 375.000] - loss: 1.719 - mae: 40.542 - mean_q: 81.572

Interval 4 (30000 steps performed)
34 episodes - episode_reward: 295.824 [189.000, 453.000] - loss: 2.215 - mae: 44.614 - mean_q: 89.798

Interval 5 (40000 steps performed)
done, took 185.572 seconds


<keras.callbacks.History at 0x16473d58a90>

<br/> **Save the final weights after training**

In [9]:
# Persist the trained weights to a file named after the environment,
# replacing any file left over from a previous run.
dqn.save_weights(
    f'dqn_{ENV_NAME}_weights.h5f',
    overwrite=True,
)

<br/>**Finally, evaluate our algorithm for 5 episodes.**

In [8]:
# Restore the weights from the file written by the save step, so evaluation
# runs against the persisted parameters.
dqn.load_weights(
    f'dqn_{ENV_NAME}_weights.h5f'
)

In [9]:
# Run 5 evaluation episodes with rendering enabled. The environment is closed
# in a `finally` clause so it is released even if testing or rendering raises.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    env.close()

Testing for 5 episodes ...


  updates=self.state_updates,


Episode 1: reward: 443.000, steps: 443
Episode 2: reward: 389.000, steps: 389
Episode 3: reward: 398.000, steps: 398
Episode 4: reward: 364.000, steps: 364
Episode 5: reward: 400.000, steps: 400
