In [1]:
import os
import gym
import random
import numpy as np
import tensorflow as tf
from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize




In [2]:
ENV_NAME = 'Breakout-v0'  # Environment name
FRAME_WIDTH = 84  # Resized frame width
FRAME_HEIGHT = 84  # Resized frame height

NUM_EPISODES = 12000  # Number of episodes the agent plays
STATE_LENGTH = 4  # Number of most recent frames to produce the input to the network

GAMMA = 0.99  # Discount factor

EXPLORATION_STEPS = 1000000  # Number of steps over which the initial value of epsilon is linearly annealed to its final value
INITIAL_EPSILON = 1.0  # Initial value of epsilon in epsilon-greedy
FINAL_EPSILON = 0.1  # Final value of epsilon in epsilon-greedy

INITIAL_REPLAY_SIZE = 20000  # Number of steps to populate the replay memory before training starts
NUM_REPLAY_MEMORY = 400000  # Number of replay memory the agent uses for training

BATCH_SIZE = 32  # Mini batch size
TARGET_UPDATE_INTERVAL = 10000  # The frequency with which the target network is updated
TRAIN_INTERVAL = 4  # The agent selects 4 actions between successive updates

NO_OP_STEPS = 30  # Maximum number of "do nothing" actions to be performed by the agent at the start of an episode

TRAIN = True
SAVE_SUMMARY_PATH = 'graphs/dqn/kerastestnew/2'
NUM_EPISODES_AT_TEST = 30  # Number of episodes the agent plays at test time

In [3]:

def preprocess(observation, last_observation):
    processed_observation = np.maximum(observation, last_observation)
    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
    return processed_observation


In [4]:

env = gym.make(ENV_NAME)
num_actions = env.action_space.n
epsilon = INITIAL_EPSILON
epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS
t = 0

# Parameters used for summary
total_reward = 0
total_q_max = 0
total_loss = 0
duration = 0
episode = 0

# Create replay memory
replay_memory = deque()


In [5]:
def build_network(name):
    with tf.variable_scope(name):
        s = tf.placeholder(tf.float32, [None, FRAME_WIDTH, FRAME_HEIGHT, STATE_LENGTH])
        conv1 = tf.layers.conv2d(s, 32, (8,8), strides=(4,4), activation=tf.nn.relu)
        conv2 = tf.layers.conv2d(conv1, 64, (4,4), strides=(2,2), activation=tf.nn.relu)
        conv3 = tf.layers.conv2d(conv2, 64, (3,3), strides=(1,1), activation=tf.nn.relu)
        flattened = tf.contrib.layers.flatten(conv3)
        print(flattened)
        dense1 = tf.layers.dense(flattened, 512, activation=tf.nn.relu)
        q_values = tf.layers.dense(dense1, num_actions)
    trainable_parameters = [v for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) if v.name.startswith(name)]
    print([v.name for v in trainable_parameters])
    return s, q_values, trainable_parameters

### Updating weights

In [6]:

# Create q network
s, q_values, q_network_weights = build_network("q_network")

# Create target network
st, target_q_values, target_network_weights = build_network("target_network")

# Define target network update operation
update_target_network = [target_network_weights[i].assign(q_network_weights[i]) for i in range(len(target_network_weights))]



Tensor("q_network/Flatten/Reshape:0", shape=(?, 3136), dtype=float32)
['q_network/conv2d/kernel:0', 'q_network/conv2d/bias:0', 'q_network/conv2d_1/kernel:0', 'q_network/conv2d_1/bias:0', 'q_network/conv2d_2/kernel:0', 'q_network/conv2d_2/bias:0', 'q_network/dense/kernel:0', 'q_network/dense/bias:0', 'q_network/dense_1/kernel:0', 'q_network/dense_1/bias:0']
Tensor("target_network/Flatten/Reshape:0", shape=(?, 3136), dtype=float32)
['target_network/conv2d/kernel:0', 'target_network/conv2d/bias:0', 'target_network/conv2d_1/kernel:0', 'target_network/conv2d_1/bias:0', 'target_network/conv2d_2/kernel:0', 'target_network/conv2d_2/bias:0', 'target_network/dense/kernel:0', 'target_network/dense/bias:0', 'target_network/dense_1/kernel:0', 'target_network/dense_1/bias:0']


### Input and output conversions

In [7]:

a = tf.placeholder(tf.int64, [None])
y = tf.placeholder(tf.float32, [None])

# Convert action to one hot vector
a_one_hot = tf.one_hot(a, num_actions, 1.0, 0.0)

## Use the network we created before to determine the expected value for the specified action
q_value = tf.reduce_sum(tf.multiply(q_values, a_one_hot), reduction_indices=1)


### Loss function
As loss function you can use mean-squared error, but note that this error metric can give use values if the error becomes large. 

One error metric that prevents this is te so-called Huber-loss. 

In [8]:

# Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
error = tf.abs(y - q_value)
quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
linear_part = error - quadratic_part
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)


### Determine the optimizer
Normally we always use the ADAM optimizer. A downside we did not yet encounter by Adam is that if you encounter a very very sparse situation you can mess up your gradients. It turns out that this is possible using this environment. This is why we use another optimizer: the RMSPropOptimizer. 

In [9]:
LEARNING_RATE = 0.00025  # Learning rate used by RMSProp
MOMENTUM = 0.95  # Momentum used by RMSProp
MIN_GRAD = 0.01  # Constant added to the squared gradient in the denominator of the RMSProp update

optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=MIN_GRAD)
grads_update = optimizer.minimize(loss, var_list=q_network_weights)



In [10]:
def get_initial_state(observation, last_observation):
    processed_observation = np.maximum(observation, last_observation)
    processed_observation = np.uint8(resize(rgb2gray(processed_observation), (FRAME_WIDTH, FRAME_HEIGHT)) * 255)
    state = [processed_observation for _ in range(STATE_LENGTH)]
    return np.dstack(state)


### How do we determine our action? 
At the start of our program we always want to select a random action. We thus set epsilon to 1.0 as parameter. 

In [11]:

def get_action(state, epsilon):
    if epsilon >= random.random() or t < INITIAL_REPLAY_SIZE:
        action = random.randrange(num_actions)
    else:
        action = np.argmax(q_values.eval(feed_dict={s: [np.float32(state / 255.0)]}))

    # Anneal epsilon linearly over time
    if epsilon > FINAL_EPSILON and t >= INITIAL_REPLAY_SIZE:
        epsilon -= epsilon_step

    return action

In [12]:
### SUMMARY STUFF
episode_total_reward = tf.placeholder(tf.float32)
tf.summary.scalar('total_reward', episode_total_reward)
episode_avg_max_q = tf.placeholder(tf.float32)
tf.summary.scalar('average_maxq', episode_avg_max_q)
episode_duration = tf.placeholder(tf.float32)
tf.summary.scalar('duration', episode_duration)
episode_avg_loss = tf.placeholder(tf.float32)
tf.summary.scalar('loss', episode_avg_loss)

summary_op = tf.summary.merge_all()
summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, tf.get_default_graph())


In [13]:
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
sess.run(update_target_network)

[array([[[[  2.85475105e-02,   1.52664185e-02,   3.96659225e-03, ...,
            -3.00774779e-02,  -9.21535119e-03,  -9.70891863e-03],
          [ -7.62239471e-03,   4.42822427e-02,  -2.86701750e-02, ...,
             4.97484803e-02,   4.59947959e-02,   4.71892506e-02],
          [ -3.00905481e-03,   7.86687434e-03,   1.08856969e-02, ...,
            -3.12535018e-02,  -3.03224791e-02,  -6.50117919e-03],
          [  4.51983809e-02,   3.35274562e-02,   3.46411541e-02, ...,
             4.86962646e-02,   3.00429463e-02,   1.60857812e-02]],
 
         [[ -3.47745232e-02,   9.09455121e-05,   2.88466513e-02, ...,
             4.50282097e-02,   4.26991656e-02,   1.16887763e-02],
          [  4.48082313e-02,  -4.60370146e-02,  -2.80579794e-02, ...,
             2.69188806e-02,  -7.38650560e-05,  -2.00920943e-02],
          [ -3.65154520e-02,  -1.22794099e-02,   3.99484113e-03, ...,
            -3.25303301e-02,  -4.31570411e-03,  -3.93614955e-02],
          [  3.62463221e-02,   1.97795555e-02

In [None]:

    


def run(self, state, action, reward, terminal, observation):
    next_state = np.dstack((state[:, :, 1:], observation))

    # Clip all positive rewards at 1 and all negative rewards at -1, leaving 0 rewards unchanged
    reward = np.clip(reward, -1, 1)

    # Store transition in replay memory
    self.replay_memory.append((state, action, reward, next_state, terminal))
    if len(self.replay_memory) > NUM_REPLAY_MEMORY:
        self.replay_memory.popleft()

    if self.t >= INITIAL_REPLAY_SIZE:
        # Train network
        if self.t % TRAIN_INTERVAL == 0:
            self.train_network()

        # Update target network
        if self.t % TARGET_UPDATE_INTERVAL == 0:
            self.sess.run(self.update_target_network)
            
    self.total_reward += reward
    self.total_q_max += np.max(self.q_values.eval(feed_dict={self.s: [np.float32(state / 255.0)]}))
    self.duration += 1

    if terminal:
        # Write summary
        if self.t >= INITIAL_REPLAY_SIZE:
            stats = [self.total_reward, self.total_q_max / float(self.duration),
                    self.duration, self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL))]
            for i in range(len(stats)):
                self.sess.run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
                
            summary_str = sess.run(summary_op, feed_dict={episode_total_reward: total_reward, episode_avg_max_q: total_q_max/float(duration), episode_duration: duration , episode_avg_loss: total_loss/duration})
            summary_writer.add_summary(summary_str, self.episode + 1)

        # Debug
        if t < INITIAL_REPLAY_SIZE:
            mode = 'random'
        elif INITIAL_REPLAY_SIZE <= self.t < INITIAL_REPLAY_SIZE + EXPLORATION_STEPS:
            mode = 'explore'
        else:
            mode = 'exploit'
        print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} / AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f} / MODE: {7}'.format(
            self.episode + 1, self.t, self.duration, self.epsilon,
            self.total_reward, self.total_q_max / float(self.duration),
            self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL)), mode))

        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.episode += 1

    self.t += 1

    return next_state

def train_network(replay_memory):
    state_batch = []
    action_batch = []
    reward_batch = []
    next_state_batch = []
    terminal_batch = []
    y_batch = []

    # Sample random minibatch of transition from replay memory
    minibatch = random.sample(replay_memory, BATCH_SIZE)
    for data in minibatch:
        state_batch.append(data[0])
        action_batch.append(data[1])
        reward_batch.append(data[2])
        next_state_batch.append(data[3])
        terminal_batch.append(data[4])

    # Convert True to 1, False to 0
    terminal_batch = np.array(terminal_batch) + 0

    target_q_values_batch = target_q_values.eval(feed_dict={st: np.float32(np.array(next_state_batch) / 255.0)})
    y_batch = reward_batch + (1 - terminal_batch) * GAMMA * np.max(target_q_values_batch, axis=1)

    local_loss, _ = sess.run([loss, grads_update], feed_dict={
        s: np.float32(np.array(state_batch) / 255.0),
        a: action_batch,
        y: y_batch
    })
    global total_loss
    
    total_loss += local_loss

In [None]:



for _ in range(NUM_EPISODES):
    terminal = False
    observation = env.reset()
    for _ in range(random.randint(1, NO_OP_STEPS)):
        last_observation = observation
        observation, _, _, _ = env.step(0)  # Do nothing
    state = get_initial_state(observation, last_observation)
    while not terminal:
        last_observation = observation
        action = get_action(state, epsilon)
        observation, reward, terminal, _ = env.step(action)
        # env.render()
        processed_observation = preprocess(observation, last_observation)

#         state = agent.run(state, action, reward, terminal, processed_observation)
        next_state = np.dstack((state[:, :, 1:], processed_observation))

        # Clip all positive rewards at 1 and all negative rewards at -1, leaving 0 rewards unchanged
        reward = np.clip(reward, -1, 1)

        # Store transition in replay memory
        replay_memory.append((state, action, reward, next_state, terminal))
        if len(replay_memory) > NUM_REPLAY_MEMORY:
            replay_memory.popleft()

        if t >= INITIAL_REPLAY_SIZE:
            # Train network
            if t % TRAIN_INTERVAL == 0:
                train_network(replay_memory)

            # Update target network
            if t % TARGET_UPDATE_INTERVAL == 0:
                sess.run(update_target_network)

        total_reward += reward
        total_q_max += np.max(q_values.eval(feed_dict={s: [np.float32(state / 255.0)]}))
        duration += 1

        if terminal:
            # Write summary
            if t >= INITIAL_REPLAY_SIZE:
                summary_str = sess.run(summary_op, feed_dict={episode_total_reward: total_reward, episode_avg_max_q: total_q_max/float(duration), episode_duration: duration , episode_avg_loss: total_loss/duration})
                summary_writer.add_summary(summary_str, self.episode + 1)

            # Debug
            if t < INITIAL_REPLAY_SIZE:
                mode = 'random'
            elif INITIAL_REPLAY_SIZE <= t < INITIAL_REPLAY_SIZE + EXPLORATION_STEPS:
                mode = 'explore'
            else:
                mode = 'exploit'
            print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} / AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f} / MODE: {7}'.format(
                episode + 1, t, duration, epsilon,
                total_reward, total_q_max / float(duration),
                total_loss / (float(duration) / float(TRAIN_INTERVAL)), mode))

            total_reward = 0
            total_q_max = 0
            total_loss = 0
            duration = 0
            episode += 1

        t += 1

        state = next_state

  warn("The default mode, 'constant', will be changed to 'reflect' in "


EPISODE:      1 / TIMESTEP:      170 / DURATION:   171 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0936 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      2 / TIMESTEP:      395 / DURATION:   225 / EPSILON: 1.00000 / TOTAL_REWARD:   1 / AVG_MAX_Q: 0.0906 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      3 / TIMESTEP:      574 / DURATION:   179 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0935 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      4 / TIMESTEP:      797 / DURATION:   223 / EPSILON: 1.00000 / TOTAL_REWARD:   1 / AVG_MAX_Q: 0.0927 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      5 / TIMESTEP:      964 / DURATION:   167 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0928 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      6 / TIMESTEP:     1132 / DURATION:   168 / EPSILON: 1.00000 / TOTAL_REWARD:   0 / AVG_MAX_Q: 0.0934 / AVG_LOSS: 0.00000 / MODE: random
EPISODE:      7 / TIMESTEP:     1447 / DURATION:   315 / EPSILON: 1.00000 / TOTAL_REWARD:   2 / AVG_MAX_Q:

In [None]:
MAX_FRAMES_TEST = 300
frames = list()

observation = env.reset()

last_observation = observation
observation, _, _, _ = env.step(1) 
state = get_initial_state(observation, last_observation)
for _ in range(MAX_FRAMES_TEST):
    last_observation = observation

    action = np.argmax(q_values.eval(feed_dict={s: [np.float32(state / 255.0)]}))
    observation, _, terminal, _ = env.step(action)
    
    frames.append(observation)

   # env.render()
    processed_observation = preprocess(observation, last_observation)
    state = np.dstack((state[:, :, 1:], processed_observation))
    #state = np.append(state[1:, :, :], processed_observation, axis=0)

In [None]:
from matplotlib import animation
from JSAnimation.IPython_display import display_animation
import matplotlib.pyplot as plt
def display_frames_as_gif(frames, filename_gif = None):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    if filename_gif: 
        anim.save(filename_gif, writer = 'imagemagick', fps=20)
    display(display_animation(anim, default_mode='loop'))
display_frames_as_gif(frames[:300])

In [None]:

obs, _, _, _ = env.step(1)
%matplotlib inline
plt.imshow(obs)