In [169]:
from __future__ import division

import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import scipy.misc
import os
%matplotlib inline

In [170]:
# Create the Breakout environment from the Gym registry. `env` is a module-level
# object: the training cell below calls env.reset(), env.step() and
# env.action_space.sample() on it.
env = gym.make('Breakout-v0')

[2017-05-20 21:03:46,642] Making new env: Breakout-v0


In [171]:
size=80 #preprocess image size 80*80
def processState(obs):
    """Convert a raw game frame into a flattened binary image.

    The frame is cropped to rows 35..194, downsampled by taking every second
    row and column of channel 0, and binarised: pixels whose value is 0, 144
    or 109 become 0.0, every other pixel becomes 1.0.

    Args:
        obs: array-like frame indexed as [row, column, channel].
            Assumes a 210x160x3 Atari frame (TODO confirm against the env);
            the final reshape requires the cropped, downsampled frame to
            contain exactly size*size pixels.

    Returns:
        A float64 numpy array of shape (1, size*size).

    The input frame is never modified: the selected pixels are copied before
    any processing, so the caller's observation stays intact.
    """
    #Crop + downsample in one slice, then copy so we do not write through a view of `obs`.
    frame = np.array(np.asarray(obs)[35:195:2, ::2, 0])
    #144 and 109 are zeroed like 0 (presumably background colours - verify against the game palette).
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    #np.float64 replaces the removed np.float alias; the resulting dtype is unchanged.
    return foreground.astype(np.float64).reshape(1, size*size)

In [172]:
class Qnetwork():
    """Dueling Q-network: four convolutional layers followed by separate
    advantage and value streams that are recombined into Q-values.

    Reads the module-level `size` for the side length of the square input image.

    Args:
        h_size: number of output channels of the final convolutional layer.
            It is split in half between the advantage and the value stream,
            so it must be even.
        num_actions: number of discrete actions, i.e. the width of the
            Q-value output. Defaults to 4 so that it matches the 4 actions of
            Breakout-v0; the training loop stores actions drawn from
            env.action_space, and an action index >= num_actions would get an
            all-zero one-hot row and could never be selected greedily.
        learning_rate: step size passed to the Adam optimizer.

    Attributes of interest:
        scalarInput: float32 placeholder [None, size**2] for flattened frames.
        Qout: Q-values, shape [None, num_actions].
        predict: argmax of Qout along axis 1 (greedy action).
        targetQ / actions: placeholders fed during training.
        loss: mean squared difference between targetQ and the Q-value of the
            taken action.
        updateModel: the op that applies one optimizer step on `loss`.

    Raises:
        ValueError: if h_size is odd.
    """
    def __init__(self,h_size,num_actions=4,learning_rate=0.0001):
        if h_size % 2 != 0:
            raise ValueError("h_size must be even so it can be split into two streams")
        self.num_actions = num_actions

        #The network receives a frame from the game, flattened into an array.
        #It then resizes it and processes it through four convolutional layers.
        self.scalarInput =  tf.placeholder(shape=[None,size**2],dtype=tf.float32)
        self.imageIn = tf.reshape(self.scalarInput,shape=[-1,size,size,1])
        self.conv1 = slim.conv2d( \
            inputs=self.imageIn,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', biases_initializer=None)
        self.conv2 = slim.conv2d( \
            inputs=self.conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', biases_initializer=None)
        self.conv3 = slim.conv2d( \
            inputs=self.conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', biases_initializer=None)
        self.conv4 = slim.conv2d( \
            inputs=self.conv3,num_outputs=h_size,kernel_size=[6,6],stride=[1,1],padding='VALID', biases_initializer=None)
        
        
        #We take the output from the final convolutional layer and split it (along the channel axis)
        #into separate advantage and value streams.
        self.streamAC,self.streamVC = tf.split(self.conv4,2,3)
        self.streamA = slim.flatten(self.streamAC)
        self.streamV = slim.flatten(self.streamVC)
        xavier_init = tf.contrib.layers.xavier_initializer()
        self.AW = tf.Variable(xavier_init([h_size//2,num_actions]))
        self.VW = tf.Variable(xavier_init([h_size//2,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)
        
        #Then combine them together to get our final Q-values:
        #Q = V + (A - mean(A)), the mean being taken over the actions.
        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keep_dims=True))
        self.predict = tf.argmax(self.Qout,1)
        
        #Below we obtain the loss by taking the mean of the squared difference between the target and prediction Q values.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,num_actions,dtype=tf.float32)
        
        #Select the Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)
        
        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.updateModel = self.trainer.minimize(self.loss)

In [173]:
# NOTE(review): `h_size` is only assigned in a later cell (the hyperparameter cell),
# so on a fresh kernel this cell raises NameError. The training cell also calls
# tf.reset_default_graph() and builds `mainQN` again, so the network created here
# is superseded there.
mainQN = Qnetwork(h_size)

In [174]:
class experience_buffer():
    """Bounded first-in-first-out store of experiences with random sampling.

    Args:
        buffer_size: maximum number of experiences kept; once exceeded, the
            oldest entries are discarded first.
    """
    def __init__(self, buffer_size = 1000000):
        self.buffer = []
        self.buffer_size = buffer_size
    
    def add(self,experience):
        """Append every item of the iterable `experience`, then drop the oldest
        entries so that at most `buffer_size` items remain.

        Trimming after the extend keeps the capacity bound even when a single
        call adds more than `buffer_size` items (only the newest are kept).
        """
        self.buffer.extend(experience)
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]
            
    def sample(self,size):
        """Return `size` distinct stored experiences, drawn at random, as an
        array of shape [size, 5] (each experience must hold 5 fields).

        Raises:
            ValueError: from random.sample if `size` exceeds the number of
                stored experiences.
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

In [175]:
def updateTargetGraph(tfVars,tau):
    """Build the assignment ops that blend the first half of `tfVars` into the second half.

    The variable at position k of the first half is paired with the variable at
    position k of the second half; the paired variable is assigned
    tau*first + (1-tau)*second.

    Args:
        tfVars: sequence of variables exposing .value() and .assign().
        tau: blending rate applied to the first-half variable.

    Returns:
        A list with one assign op per pair, in order.
    """
    half = len(tfVars)//2
    sources = tfVars[0:half]
    return [
        tfVars[pos+half].assign((src.value()*tau) + ((1-tau)*tfVars[pos+half].value()))
        for pos,src in enumerate(sources)
    ]

def updateTarget(op_holder,sess):
    """Run each op in `op_holder` on `sess`, one sess.run call per op, in order."""
    for assign_op in iter(op_holder):
        sess.run(assign_op)

In [176]:
#Hyperparameters and run configuration read by the training cell below.
batch_size = 32 #How many experiences to use for each training step.
update_freq = 4 #Perform a training step every `update_freq` environment steps.
y = .99 #Discount factor on the target Q-values
startE = 1 #Starting chance of random action
endE = 0.1 #Final chance of random action
anneling_steps = 1000000. #Number of steps over which the random-action chance is reduced from startE to endE.
num_episodes = 10000 #How many episodes of game environment to train network with.
pre_train_steps = 50000 #How many steps of random actions before training begins.
max_epLength = 50000000000 #The max allowed length of our episode, in environment steps.
load_model = False #Whether to load a saved model.
path = "./dqn" #The path to save our model to.
h_size = 256 #The size of the final convolutional layer before splitting it into Advantage and Value streams.
tau = 0.1 #Rate to update target network toward primary network

In [None]:
#Training cell: builds the primary and target networks in a fresh graph, then runs
#epsilon-greedy episodes on `env`, storing transitions in a replay buffer and applying
#Double-DQN updates to the primary network.
tf.reset_default_graph()
mainQN = Qnetwork(h_size)
targetQN = Qnetwork(h_size)

init = tf.global_variables_initializer()

saver = tf.train.Saver()

trainables = tf.trainable_variables()

#mainQN is built first, so its variables form the first half of `trainables`
#and targetQN's the second half, as updateTargetGraph expects.
targetOps = updateTargetGraph(trainables,tau)

myBuffer = experience_buffer()

#Set the rate of random action decrease. 
e = startE
stepDrop = (startE - endE)/anneling_steps

#create lists to contain total rewards and steps per episode
jList = []
rList = []
total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

with tf.Session() as sess:
    sess.run(init)
    if load_model:
        #Honour the load_model flag: resume from the most recent checkpoint in `path`, if any.
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt and ckpt.model_checkpoint_path:
            print('Loading Model...')
            saver.restore(sess,ckpt.model_checkpoint_path)
    updateTarget(targetOps,sess) #Move the target network toward the primary network by a factor of tau.
    for i in range(num_episodes):
        episodeBuffer = experience_buffer()
        #Reset environment and get first new observation
        s = env.reset()
        s = processState(s)
        d = False
        rAll = 0
        j = 0
        #The Q-Network
        while j < max_epLength: #Step until the episode is done or max_epLength steps have been taken.
            j+=1
            #Choose an action greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = env.action_space.sample()
            else:
                #predict returns a length-1 array for the single input frame; unwrap it so that
                #every stored action is a plain integer, like the randomly sampled ones.
                a = int(sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:s})[0])
            s1,r,d,_ = env.step(a)
            s1 = processState(s1)
            total_steps += 1
            #Save the experience to our episode buffer. The fields have different shapes,
            #so the row must be an object array.
            episodeBuffer.add(np.reshape(np.array([s,a,r,s1,d],dtype=object),[1,5]))
            
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop
                
                #Only train once the replay buffer can supply a full batch.
                if total_steps % (update_freq) == 0 and len(myBuffer.buffer) >= batch_size:
                    trainBatch = myBuffer.sample(batch_size) #Get a random batch of experiences.
                    next_states = np.vstack(trainBatch[:,3])
                    #Below we perform the Double-DQN update to the target Q-values:
                    #the primary network picks the next action, the target network evaluates it.
                    Q1 = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:next_states})
                    Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.scalarInput:next_states})
                    #1 for non-terminal transitions, 0 for terminal ones (no bootstrapping past the end).
                    end_multiplier = 1.0 - trainBatch[:,4].astype(np.float32)
                    doubleQ = Q2[range(batch_size),Q1]
                    targetQ = trainBatch[:,2].astype(np.float32) + (y*doubleQ * end_multiplier)
                    #Update the network with our target values.
                    _ = sess.run(mainQN.updateModel, \
                        feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ, \
                                   mainQN.actions:trainBatch[:,1].astype(np.int32)})
                    
                    updateTarget(targetOps,sess) #Move the target network toward the primary network by a factor of tau.
            rAll += r
            s = s1
            
            if d == True:

                break
        
        #Transitions only become available for sampling once their episode has finished.
        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        #Periodically save the model. 
        if i % 1000 == 0:
            saver.save(sess,path+'/model-'+str(i)+'.cptk')
            print("Saved Model")
        #Episode index, mean reward over the last 10 episodes, current random-action chance.
        print(i,np.mean(rList[-10:]), e)
    saver.save(sess,path+'/model-'+str(i)+'.cptk')
#rList holds total reward per episode, so this is an average reward, not a success percentage.
print("Mean reward per episode: " + str(sum(rList)/num_episodes))

Saved Model
0 2.0 1
1 2.5 1
2 1.66666666667 1
3 1.75 1
4 1.4 1
5 1.83333333333 1
6 1.71428571429 1
7 1.625 1
8 1.66666666667 1
9 1.6 1
10 1.4 1
11 1.2 1
12 1.4 1
13 1.4 1
14 1.5 1
15 1.3 1
16 1.3 1
17 1.4 1
18 1.5 1
19 1.6 1
20 1.7 1
21 1.9 1
22 1.7 1
23 1.6 1
24 1.6 1
25 1.6 1
26 1.8 1
27 1.8 1
28 1.6 1
29 1.5 1
30 1.4 1
31 1.1 1
32 1.1 1
33 1.0 1
34 0.9 1
35 0.8 1
36 0.6 1
37 0.4 1
38 0.5 1
39 0.6 1
40 0.9 1
41 1.0 1
42 1.0 1
43 1.4 1
44 1.4 1
45 1.5 1
46 1.5 1
47 1.6 1
48 1.4 1
49 1.3 1
50 1.2 1
51 1.2 1
52 1.3 1
53 1.1 1
54 1.1 1
55 0.9 1
56 1.0 1
57 0.9 1
58 1.0 1
59 0.9 1
60 0.7 1
61 0.7 1
62 0.6 1
63 0.4 1
64 0.5 1
65 0.7 1
66 0.6 1
67 0.8 1
68 0.8 1
69 0.9 1
70 0.9 1
71 0.8 1
72 1.3 1
73 1.4 1
74 1.5 1
75 1.3 1
76 1.4 1
77 1.6 1
78 1.7 1
79 1.6 1
80 1.8 1
81 2.0 1
82 1.5 1
83 1.6 1
84 1.4 1
85 1.6 1
86 1.7 1
87 1.4 1
88 1.4 1
89 1.5 1
90 1.5 1
91 1.5 1
92 1.5 1
93 1.3 1
94 1.7 1
95 1.6 1
96 1.5 1
97 1.5 1
98 1.8 1
99 1.8 1
100 1.6 1
101 1.6 1
102 1.9 1
103 1.9 1
104 1.6 1
105 1