In [1]:
"""
The code below implements an agent that learns using the Deep Q-Network (DQN) algorithm.
The script is currently set up to play Breakout.
"""
import gym
from gym.wrappers import Monitor
import random
import numpy as np
import tensorflow as tf
from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize
import pickle
import matplotlib.pyplot as plt

In [2]:
"""
The OpenAI Gym environment outputs 210x160x3 RGB frames. Each frame is preprocessed by taking
the maximum of every pixel colour value over the frame being encoded and the previous frame.
The result is then converted to grayscale and resized to an 84x84 pixel frame.
"""
def preprocess_input(frame, last_frame):
    """Collapse two consecutive raw frames into one grayscale input slice.

    The frames are merged with an element-wise maximum, converted with
    ``rgb2gray``, resized to ``(image_width, image_height)`` and scaled by 255
    into ``uint8``.

    Args:
        frame: Current environment observation.
        last_frame: Previous environment observation, merged with ``frame``
            through ``np.maximum``.

    Returns:
        ``uint8`` array of shape ``(image_height, image_width, 1)``. Because
        axes 0 and 2 of the ``(1, image_width, image_height)`` view are
        swapped, the resized image is stored transposed.
    """
    merged = np.maximum(frame, last_frame)
    gray = resize(rgb2gray(merged), (image_width, image_height))
    gray_u8 = np.uint8(gray * 255)

    # Add a leading singleton axis, then move it to the back so the result can
    # be appended to a stacked state along axis 2.
    with_channel = np.reshape(gray_u8, (1, image_width, image_height))
    return np.swapaxes(with_channel, 0, 2)

In [3]:
"""
Four consecutive frames are stacked together in order to help determine 
the velocity of the ball.
"""
def first_state(frame, last_frame):
    """Build the initial stacked state for an episode from one frame pair.

    The frames are merged, gray-scaled, resized and converted to ``uint8``;
    that single image is then repeated ``agent_history_length`` times.

    Args:
        frame: Current environment observation.
        last_frame: Previous environment observation, merged with ``frame``
            through ``np.maximum``.

    Returns:
        ``uint8`` array of shape
        ``(image_height, image_width, agent_history_length)`` in which every
        slice along the last axis holds the same (transposed) image.
    """
    merged = np.maximum(frame, last_frame)
    gray = resize(rgb2gray(merged), (image_width, image_height))
    gray_u8 = np.uint8(gray * 255)

    # Stack the copies on a leading axis, then swap that axis to the back so
    # the history index becomes the last dimension.
    history = np.stack([gray_u8] * agent_history_length, axis=0)
    return np.swapaxes(history, 0, 2)

In [4]:
"""
CNN architecture that is used to create both the Q-network and the target network. 
The error term was clipped between -1 and 1 in order to improve the stability of the algorithm.
Reference: https://github.com/dennybritz/reinforcement-learning
"""
class Network():
    """Convolutional Q-value estimator used for both the Q-network and the target network.

    Every graph node is created inside ``tf.variable_scope(scope)``, so the
    variables of two instances can be told apart by their name prefix.

    Attributes set by ``build_network``:
        s: float32 placeholder for a batch of stacked frames, shape
            ``[None, image_width, image_height, agent_history_length]``.
        y: float32 placeholder, shape ``[None]``, for the training targets.
        a: int32 placeholder, shape ``[None]``, for the chosen action indices.
        predict: tensor of per-action outputs, one row per input state.
        q_value: entries of ``predict`` selected by ``a``.
        loss: mean of the squared, clipped difference between ``y`` and ``q_value``.
        train_op: RMSProp step minimising ``loss``.
    """

    def __init__(self, num_actions, scope="estimator"):
        """Create the network graph under the variable scope ``scope``.

        Args:
            num_actions: Number of outputs of the final layer.
            scope: Variable scope name used as prefix for all variables.
        """
        self.scope = scope
        self.num_actions = num_actions

        with tf.variable_scope(scope):
            self.build_network()

    def build_network(self):
        """Define placeholders, layers, loss and the training op."""

        self.s = tf.placeholder(tf.float32, [None, image_width, image_height, agent_history_length], name="s")
        # Inputs are fed as 0-255 pixel values; rescale them to [0, 1].
        s = tf.to_float(self.s) / 255.0
        self.y = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        self.a = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        # Three convolutional layers
        conv1 = tf.contrib.layers.conv2d(
            s, 64, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        # Fully connected layers
        flat = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flat, 512)
        # NOTE(review): no activation_fn is passed here, so the library default
        # is applied to the output layer as well (ReLU according to the
        # tf.contrib docs), which would keep the outputs non-negative. Confirm
        # this is intended before changing it: existing checkpoints were
        # trained with this layer as written.
        self.predict = tf.contrib.layers.fully_connected(fc1, self.num_actions)

        # Get the predictions for the chosen actions only. The row count is
        # read from the tensor itself so any fed batch size works, rather than
        # only batches of exactly the global ``batch_size``.
        rows = tf.shape(self.predict)[0]
        indices = tf.range(rows) * tf.shape(self.predict)[1] + self.a
        self.q_value = tf.reshape(self.predict, [-1])
        self.q_value = tf.gather(self.q_value, indices)

        # Calculate the loss
        self.error = self.y - self.q_value
        # NOTE(review): the error is clipped *before* squaring, so samples
        # whose error lies outside [-1, 1] contribute a constant to the loss.
        self.error_clip = tf.clip_by_value(self.error, -1.0, 1.0)
        self.loss = tf.reduce_mean(tf.square(self.error_clip))

        self.optimizer = tf.train.RMSPropOptimizer(0.0001, momentum=0.95, epsilon=0.01)
        self.train_op = self.optimizer.minimize(self.loss)

    def run(self, sess, state):
        """Evaluate the network on a batch of states.

        Args:
            sess: TensorFlow session.
            state: Batch of stacked frames fed to ``self.s``.

        Returns:
            A one-element list whose item is the evaluated ``predict`` tensor.
        """
        return sess.run([self.predict], { self.s: state})

    def update(self, sess, state, action, y_l):
        """Run one optimisation step and return the resulting loss.

        Args:
            sess: TensorFlow session.
            state: Batch of stacked frames fed to ``self.s``.
            action: Action index per sample, fed to ``self.a``.
            y_l: Training target per sample, fed to ``self.y``.

        Returns:
            The loss value computed for this batch.
        """
        feed_dict = { self.s: state, self.y: y_l, self.a: action }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)

        return loss
    

In [5]:
"""
Obtain the proper action during the training process.
It implements epsilon-greedy exploration-exploitation, annealing epsilon linearly.
"""
def train_action(sess, estimator, num_actions, state, epsilon, t, epsilon_step, load):
    """Choose an epsilon-greedy training action and anneal epsilon.

    Args:
        sess: TensorFlow session handed to ``estimator.run``.
        estimator: Object whose ``run(sess, batch)`` returns a list with the
            per-action values as first item.
        num_actions: Number of available actions.
        state: Current stacked state (a batch axis is added before evaluation).
        epsilon: Current exploration probability.
        t: Global step counter.
        epsilon_step: Amount subtracted from epsilon per annealing step.
        load: True when parameters were restored from a checkpoint; the
            random warm-up phase is skipped in that case.

    Returns:
        ``(action, epsilon)`` with the possibly decreased epsilon.
    """
    draw = random.random()

    # Without restored parameters, every action is random until
    # ``replay_start_size`` steps have been taken.
    warming_up = (not load) and t < replay_start_size

    if epsilon >= draw or warming_up:
        action = random.randrange(num_actions)
    else:
        q_val = estimator.run(sess, np.expand_dims(state, 0))[0]
        action = np.argmax(q_val)

    # Epsilon only decreases after the warm-up and while above its floor.
    should_anneal = epsilon > final_e and t >= replay_start_size
    next_epsilon = epsilon - epsilon_step if should_anneal else epsilon
    return action, next_epsilon

In [6]:
"""
Copy the q-network parameters to the target network every 10000 actions.
Reference: https://github.com/dennybritz/reinforcement-learning
"""

def copy_network_params(sess, network1, network2):
    """Copy the trainable variables of ``network1`` into ``network2``.

    Variables are selected by variable-scope prefix, sorted by name and paired
    in that order, so both networks must define the same layers.

    Args:
        sess: TensorFlow session used to run the assign ops.
        network1: Source network; only its ``scope`` attribute is read.
        network2: Destination network; only its ``scope`` attribute is read.

    Raises:
        ValueError: If the two scopes hold a different number of trainable
            variables (``zip`` would otherwise silently drop the extras).
    """
    def _scope_variables(scope):
        # Match on "<scope>/" so a scope name that is a prefix of another
        # scope name (e.g. "q" and "q2") cannot pick up foreign variables.
        prefix = scope.rstrip("/") + "/"
        found = [v for v in tf.trainable_variables() if v.name.startswith(prefix)]
        return sorted(found, key=lambda v: v.name)

    paramsN1 = _scope_variables(network1.scope)
    # The original referenced an undefined name here instead of ``network2``.
    paramsN2 = _scope_variables(network2.scope)

    if len(paramsN1) != len(paramsN2):
        raise ValueError(
            "Cannot copy parameters: scope %r has %d trainable variables, scope %r has %d"
            % (network1.scope, len(paramsN1), network2.scope, len(paramsN2)))

    update_params = [dst.assign(src) for src, dst in zip(paramsN1, paramsN2)]

    sess.run(update_params)

In [7]:
def dqn(sess, q_network, target_network, replay_memory, t, reward_hist,total_reward, duration, total_loss, saver, episode, epsilon, state, action, reward, done, observation):
    """Process one environment transition: store it, train, sync and log.

    Args:
        sess: TensorFlow session.
        q_network: Network being trained; its ``update`` method is called.
        target_network: Network used to compute the targets; its ``run``
            method is called.
        replay_memory: ``deque`` of ``(state, action, reward, next_state, done)``
            tuples; mutated in place.
        t: Global step counter.
        reward_hist: List of finished-episode rewards; appended to in place
            when ``done`` is true.
        total_reward: Clipped reward accumulated so far in this episode.
        duration: Number of steps taken so far in this episode.
        total_loss: Training loss accumulated so far in this episode.
        saver: ``tf.train.Saver`` used for checkpoints.
        episode: Index of the current episode.
        epsilon: Current exploration probability (not used here).
        state: Stacked state the action was taken in.
        action: Action that was taken.
        reward: Raw reward returned by the environment.
        done: Whether the episode ended with this transition.
        observation: Preprocessed new frame, appended to the state along axis 2.

    Returns:
        ``(next_state, t + 1, total_reward, duration, total_loss, reward_hist)``.
    """
    discount = 0.99

    # Drop the oldest frame of the stack and append the newest one.
    next_state = np.append(state[:, :, 1:], observation, axis=2)

    # Clip rewards between -1 and 1
    reward = np.clip(reward, -1, 1)

    # Store in replay memory, evicting the oldest transition when full
    replay_memory.append((state, action, reward, next_state, done))
    if len(replay_memory) > replay_memory_size:
        replay_memory.popleft()

    # Learning starts once the random warm-up of ``replay_start_size`` steps is
    # over -- the same threshold ``train_action`` uses. The original compared
    # against ``replay_memory_size`` (the buffer capacity) instead.
    learning_started = t >= replay_start_size

    if learning_started:

        # Train network every 4 frames, once a full minibatch can be sampled
        if t % 4 == 0 and len(replay_memory) >= batch_size:
            # Sample random minibatch from the replay memory buffer
            minibatch = random.sample(replay_memory, batch_size)
            state_batch = [transition[0] for transition in minibatch]
            action_batch = [transition[1] for transition in minibatch]
            reward_batch = np.array([transition[2] for transition in minibatch])
            next_state_batch = [transition[3] for transition in minibatch]
            # Adding 0 turns the booleans into 0/1 so terminal transitions
            # drop the bootstrapped term below.
            done_batch = np.array([transition[4] for transition in minibatch]) + 0

            target_q_values_batch = target_network.run(sess, next_state_batch)[0]
            yi_batch = reward_batch + (1 - done_batch) * discount * np.max(target_q_values_batch, axis=1)

            loss = q_network.update(sess, state_batch, action_batch, yi_batch)
            total_loss += loss

        # Update target network with q-network parameters
        if t % target_network_update == 0:
            # The original called an undefined ``copy_model_parameters``.
            copy_network_params(sess, q_network, target_network)
            print("\nCopy q network params to the target network.")

    total_reward += reward
    duration += 1

    if done:
        print("Episode:", episode + 1)
        print("Final Reward:", total_reward)
        print("Episode steps:", duration)
        print("----------------")

        # Append episode reward to the history list
        reward_hist.append(total_reward)

        # Save network and bookkeeping every 200 episodes once learning has started
        if episode % 200 == 0 and learning_started:
            saver.save(sess, 'DQN-Breakout/Breakout-DQN.ckpt', global_step=episode)
            # Context managers make sure the files are flushed and closed.
            with open('DQN-Breakout/last-episode.p','wb') as episode_file:
                pickle.dump(episode, episode_file)
            with open('DQN-Breakout/all_reward.p','wb') as reward_file:
                pickle.dump(reward_hist, reward_file)
            print('Successfully saved: ')

    t += 1
    return next_state, t, total_reward, duration, total_loss, reward_hist

In [8]:
"""
Get the proper action during the testing process.
"""

def test_action(sess, estimator, num_actions, state, t):
    if random.random() <= 0.05:
        action = random.randrange(num_actions)
    else:
        q_val = estimator.run(sess, np.expand_dims(state, 0))[0]
        action = np.argmax(q_val)
    t += 1

    return action, t

In [9]:

# Run configuration read as module-level globals by the functions in this notebook.
# To test the saved parameters set "train = False"
# To train the network set "train = True"

# Exclusive upper bound of the episode range iterated by main() when training.
final_episode_num = 55801
# Size the frames are resized to during preprocessing.
image_width = 84  
image_height = 84 
# Number of frames stacked into one state.
agent_history_length = 4 
# Number of steps over which epsilon is annealed from initial_e to final_e.
exploration_frames = 1000000  
# Starting value and floor of the exploration probability epsilon.
initial_e = 1.0  
final_e = 0.1  
# train_action picks only random actions below this step count (unless
# parameters were loaded) and starts annealing epsilon once it is reached.
replay_start_size = 50000  
# Maximum number of transitions kept in the replay memory.
replay_memory_size = 1000000  
# Number of transitions sampled from the replay memory per training step.
batch_size = 32  
# Interval, in steps, between copies of the q-network into the target network.
target_network_update = 10000    
# True: restore the checkpoint and reward history from DQN-Breakout/ in main().
load_param = True
# True: run the training loop; False: run the recorded evaluation episodes.
train = True


In [10]:
def _reset_with_noops(env):
    """Reset ``env``, take 1-30 steps with action 0 and build the first state.

    Returns:
        ``(observation, state)``: the last raw observation and the stacked
        state built from the last two observations.
    """
    observation = env.reset()
    for _noop in range(random.randint(1, 30)):
        last_observation = observation
        observation, _, _, _ = env.step(0)
    return observation, first_state(observation, last_observation)


def _plot_reward_history(reward_hist, interval=50):
    """Save episode rewards plus blockwise mean and variance to 'Breakout_results.png'."""
    running_mean = []
    running_var = []
    for i in range(len(reward_hist) // interval):
        # Each block covers exactly ``interval`` episodes; the original slice
        # took one extra element, so consecutive blocks overlapped.
        window = reward_hist[i * interval:(i + 1) * interval]
        running_mean.extend([np.mean(window)] * interval)
        running_var.extend([np.var(window)] * interval)

    fig, axe = plt.subplots(3, 1, figsize=(18, 15))
    axe[0].plot(np.array(reward_hist))
    axe[0].set_title('Cummulative reward')
    axe[1].plot(np.array(running_mean))
    axe[1].set_title('Running mean')
    axe[2].plot(np.array(running_var))
    axe[2].set_title('Running variance')
    fig.savefig('Breakout_results.png')
    plt.close(fig)


def main():
    """Train the DQN agent on Breakout or replay it, depending on ``train``.

    Builds the q-network and the target network, optionally restores the last
    checkpoint (``load_param``), then either runs the training loop and saves
    the reward plots, or plays 20 recorded evaluation episodes.
    """
    env = gym.make('Breakout-v0')

    nA = env.action_space.n
    g = tf.Graph()
    config = tf.ConfigProto(allow_soft_placement = True)

    try:
        with g.as_default(), tf.Session(config=config) as sess, tf.device('/gpu:0'):

            # Create the q-network and the target network
            q_network = Network(nA, scope="q")
            target_network = Network(nA, scope="target_q")

            sess.run(tf.global_variables_initializer())

            # Annealing step
            epsilon_step = (initial_e - final_e) / exploration_frames
            t = 0
            load = False

            # Create replay memory
            replay_memory = deque()
            reward_hist = []

            saver = tf.train.Saver()
            if load_param:
                print("Import saved model")
                epsilon = final_e
                load = True
                # NOTE: pickle.load executes arbitrary code from the file; only
                # load files written by this notebook.
                with open('DQN-Breakout/last-episode.p','rb') as episode_file:
                    episode_number = pickle.load(episode_file)
                saver.restore(sess, 'DQN-Breakout/Breakout-DQN.ckpt-'+str(episode_number))
                with open('DQN-Breakout/all_reward.p','rb') as reward_file:
                    reward_hist = pickle.load(reward_file)
                print('Continue from last saved episode')
            else:
                epsilon = initial_e
                episode_number = 0

            if train:
                for episode in range(episode_number, final_episode_num):
                    print("episode", episode)
                    done = False
                    total_reward = 0
                    duration = 0
                    total_loss = 0
                    observation, state = _reset_with_noops(env)
                    while not done:
                        last_observation = observation
                        action, epsilon = train_action(sess, q_network, nA, state, epsilon, t, epsilon_step, load)
                        observation, reward, done, _ = env.step(action)
                        processed_observation = preprocess_input(observation, last_observation)
                        state, t, total_reward, duration, total_loss, reward_hist = dqn(sess, q_network, target_network, replay_memory, t, reward_hist, total_reward, duration, total_loss, saver, episode, epsilon, state, action, reward, done, processed_observation)

                # Plotting
                _plot_reward_history(reward_hist)

            else:
                # Record every evaluation episode.
                env = Monitor(env, directory="./monitor", video_callable=lambda count: count % 1 == 0, resume=True)
                for _episode in range(20):
                    done = False
                    observation, state = _reset_with_noops(env)
                    while not done:
                        last_observation = observation
                        action, t = test_action(sess, q_network, nA, state, t)
                        observation, _, done, _ = env.step(action)
                        env.render()
                        processed_observation = preprocess_input(observation, last_observation)
                        state = np.append(state[:, :, 1:], processed_observation, axis=2)
    finally:
        # Release the emulator (and let the Monitor wrapper finish its
        # recordings) even when training is interrupted.
        env.close()
                    

main()


[2017-05-04 21:42:48,829] Making new env: Breakout-v0
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Import saved model
Continue from last saved episode
episode 55800


  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("The default mode, 'constant', will be changed to 'reflec

Episode: 55801
Final Reward: 29.0
Episode steps: 1345
----------------
