In [39]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime

In [40]:
# Grayscale value (channel mean) of the RGB triple (210, 164, 74).
# NOTE(review): presumably the Ms. Pac-Man sprite colour — confirm.
color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):
    """Crop, downsample, grayscale and rescale one raw frame.

    Args:
        obs: image array indexed as (row, column, channel). The slicing below
            keeps every second row from 1 to 175 and every second column, and
            the final reshape requires that to yield 88 x 80 pixels.

    Returns:
        Float array of shape (88, 80, 1).
    """
    # Odd rows 1..175 and even columns, then average the colour channels.
    gray = obs[1:176:2, ::2].mean(axis=2)

    # Pixels whose grayscale value equals `color` are forced to 0.
    gray = np.where(gray == color, 0, gray)

    # NOTE(review): for 0..255 input this maps to roughly [-2, 0); if [-1, 1]
    # was intended the trailing "- 1" is superfluous — confirm before changing.
    scaled = (gray - 128) / 128 - 1
    return scaled.reshape(88, 80, 1)

In [41]:
# Instantiate the "MsPacman-v0" environment from gym's registry.
env = gym.make("MsPacman-v0")
# Size of the discrete action space; used as the Q-network output width and as
# the upper bound for random exploratory actions.
n_outputs = env.action_space.n

In [42]:
# Define Q network
tf.reset_default_graph()

def q_network(X, name_scope):
    """Build a convolutional Q-network under its own variable scope.

    The network is three stacked convolutions (each one consuming the output
    of the previous one), followed by a 128-unit dense layer and a linear
    output layer with one unit per action (`n_outputs`).

    Args:
        X: input tensor of frames; the notebook feeds a placeholder of shape
            (None, 88, 80, 1).
        name_scope: name of the variable scope that owns every variable
            created here (e.g. 'mainQ' or 'targetQ').

    Returns:
        (trainable_vars, output) where `trainable_vars` maps each trainable
        variable's name with the scope prefix stripped to the variable itself
        (so two networks built by this function share the same keys), and
        `output` is the tensor of per-action Q-values.
    """
    initializer = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope:
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram('layer_1', layer_1)

        # Each convolution must consume the previous layer's activations.
        # Feeding X again would leave the earlier layers as dead branches and
        # hand a full-resolution feature map to the dense layer.
        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram('layer_2', layer_2)

        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding="SAME", weights_initializer=initializer)
        tf.summary.histogram('layer_3', layer_3)

        flat = flatten(layer_3)

        fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram('fc', fc)

        # Linear head: raw Q-values, no activation.
        output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('output', output)

        # Key variables by their scope-relative name so that the variables of
        # two networks can be paired up by key.
        scope_vars = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        trainable_vars = {v.name[len(scope.name):]: v for v in scope_vars}
        return trainable_vars, output

In [43]:
# Exploration schedule: epsilon decays linearly from eps_max to eps_min over
# eps_decay_steps environment steps and then stays at eps_min.
eps_min = 0.05
eps_max = 0.5
eps_decay_steps = 5000000

def epsilon_greedy(action, step):
    """Return `action`, or a uniformly random action with probability epsilon.

    Args:
        action: the greedy action proposed by the Q-network.
        step: number of environment steps taken so far; drives the linear
            decay of epsilon.

    Returns:
        `action` unchanged, or a random integer in [0, n_outputs).
    """
    # Use the `step` argument, not a notebook-level global, so the schedule
    # follows the caller's step counter and works on a fresh kernel.
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    return action

In [44]:
# Preview the linear epsilon schedule by printing its value every 10000 steps
# (1000 lines of output).
# NOTE(review): this loop runs at notebook level, so `steps` (final value
# 9990001) and `epsilon` stay defined afterwards; any later code that reads a
# global named `steps` will silently pick up that stale value.
for steps in range(1, 10000000, 10000):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * steps/eps_decay_steps)
    print(epsilon)

0.49999991
0.49909991
0.49819991
0.49729991
0.49639991
0.49549991
0.49459991
0.49369991
0.49279991
0.49189991
0.49099991
0.49009990999999997
0.48919991
0.48829991
0.48739991
0.48649991
0.48559991
0.48469991
0.48379991
0.48289991
0.48199991
0.48109991
0.48019991
0.47929991
0.47839991
0.47749991
0.47659991
0.47569991
0.47479991
0.47389991
0.47299991
0.47209991
0.47119991
0.47029991
0.46939991
0.46849991
0.46759991
0.46669991
0.46579991
0.46489990999999997
0.46399991
0.46309991
0.46219991
0.46129991
0.46039991
0.45949991
0.45859991
0.45769991
0.45679991
0.45589991
0.45499991
0.45409991
0.45319991
0.45229991
0.45139991
0.45049991
0.44959991
0.44869991
0.44779990999999997
0.44689991
0.44599991
0.44509991
0.44419991
0.44329991
0.44239991
0.44149991
0.44059991
0.43969990999999997
0.43879991
0.43789991
0.43699991
0.43609991
0.43519991
0.43429991
0.43339991
0.43249991
0.43159991
0.43069990999999996
0.42979991
0.42889991
0.42799991
0.42709991
0.42619991
0.42529991
0.42439991
0.42349991
0.4225999

0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05


In [45]:
# Experience replay buffer: a bounded deque, so once `buffer_len` transitions
# are stored each append discards the oldest one.
buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)

In [46]:
def sample_memories(batch_size):
    """Draw a random batch of distinct transitions from `exp_buffer`.

    Only the sampled rows are gathered. Converting the whole buffer with
    `np.array(exp_buffer)` on every call costs time proportional to the buffer
    size, and is rejected by recent NumPy releases because each row mixes
    image arrays with scalars (a ragged nested sequence).

    Args:
        batch_size: maximum number of transitions to return; fewer are
            returned when the buffer holds fewer entries.

    Returns:
        Five 1-D object arrays, one per column of the stored rows. The
        training loop appends rows as
        [obs, action, next_obs, reward, done], so the columns come back in
        that order.
    """
    picked = np.random.permutation(len(exp_buffer))[:batch_size]

    # Explicit object array: each cell holds one field of one transition
    # (an image array or a scalar) without NumPy trying to broadcast it.
    mem = np.empty((len(picked), 5), dtype=object)
    for row, idx in enumerate(picked):
        for col, field in enumerate(exp_buffer[idx]):
            mem[row, col] = field

    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]

In [47]:
# Training hyperparameters.
# Number of episodes played by the training loop.
num_episodes = 800
# Number of transitions sampled from the replay buffer per training step.
batch_size = 48
# NOTE(review): `input_shape` duplicates `X_shape` and is not referenced by
# the code visible in this notebook — confirm before removing.
input_shape = (None, 88, 80, 1)
# Step size for the Adam optimizer.
learning_rate = 0.001
# Shape of the frame placeholder: (batch, 88, 80, 1).
X_shape = (None, 88, 80, 1)
# Discount applied to the bootstrapped next-state value.
discount_factor = 0.97

# Running count of environment steps across all episodes; the training loop
# increments it in place.
global_step = 0
# Run the weight-copy op every `copy_steps` environment steps.
copy_steps = 100
# Run one training step every `steps_train` environment steps.
steps_train = 4
# No training or weight copying happens until `global_step` exceeds this.
start_steps = 2000

In [48]:
# Directory the TensorBoard summaries are written to.
logdir = 'ch8_logs'
# Start from an empty default graph so re-running the cells below does not
# pile duplicate ops and variables onto an old graph.
tf.reset_default_graph()

# Batch of preprocessed frames, shape (None, 88, 80, 1).
X = tf.placeholder(tf.float32, shape=X_shape)
# Boolean flag fed by the training loop.
# NOTE(review): no op visible in this notebook consumes it — confirm whether
# it is still needed.
in_training_mode = tf.placeholder(tf.bool)

In [49]:
# Build two structurally identical networks on the same input placeholder,
# each under its own variable scope. For each one, q_network returns a dict of
# scope-relative variable names -> variables, plus the Q-value output tensor.
mainQ, mainQ_outputs = q_network(X, 'mainQ')

targetQ, targetQ_outputs = q_network(X, 'targetQ')

In [50]:
# Indices of the actions that were taken, one per batch row.
X_action = tf.placeholder(tf.int32, shape=(None,))
# Q-value of the taken action: the one-hot mask zeroes every other action and
# the sum collapses the action axis, keeping shape (batch, 1).
# NOTE(review): this selects from `targetQ_outputs`, so a loss built on
# Q_action sends gradients to the 'targetQ' variables rather than 'mainQ'.
# The naming is the reverse of the usual DQN convention — confirm this is
# intended before renaming anything.
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs),axis=-1,keep_dims=True)

In [51]:
# One assign op per variable: each 'mainQ' variable is overwritten with the
# value of the 'targetQ' variable that has the same scope-relative name.
# (`main_name` is the mainQ variable object itself, not a string.)
copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
# Single op that runs all of the assignments together.
copy_target_to_main = tf.group(*copy_op)

In [52]:
# Regression targets for the taken actions, shape (batch, 1).
y = tf.placeholder(tf.float32, shape=(None,1))

# Mean squared error between the targets and the selected Q-values.
loss = tf.reduce_mean(tf.square(y-Q_action))

optimizer = tf.train.AdamOptimizer(learning_rate)
# No var_list is given, so gradients reach whichever trainable variables
# `loss` depends on.
training_op = optimizer.minimize(loss)

# Created after the optimizer so that the optimizer's own variables are
# covered by the initializer as well.
init = tf.global_variables_initializer()

loss_summary = tf.summary.scalar('LOSS', loss)
# Merges every summary registered in the graph so far: the histograms added
# inside q_network for both networks plus the LOSS scalar.
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
# Main training loop: play `num_episodes` episodes, store every transition in
# the replay buffer, and periodically train on sampled batches.
with tf.Session() as sess:
    init.run()

    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        while not done:
            # `obs` holds a raw frame at this point (from reset() or from the
            # previous step); preprocess it once per iteration.
            obs = preprocess_observation(obs)

            # Compute the Q-values for the current state.
            actions = mainQ_outputs.eval(feed_dict={X: [obs], in_training_mode: False})

            # argmax over a batch of one yields a shape-(1,) array; tally it
            # as-is, then unwrap it so the environment and the replay buffer
            # receive a plain integer action.
            greedy = np.argmax(actions, axis=-1)
            actions_counter[str(greedy)] += 1
            action = epsilon_greedy(int(greedy[0]), global_step)
            next_obs, reward, done, _ = env.step(action)

            # Store current state, action, next state, reward and done flag
            # in the replay buffer.
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])

            if global_step % steps_train == 0 and global_step > start_steps:
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                o_obs = list(o_obs)
                o_next_obs = list(o_next_obs)
                next_act = mainQ_outputs.eval(feed_dict={X: o_next_obs, in_training_mode: False})

                # Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
                # (1 - done) removes the bootstrap term for terminal transitions.
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done)
                y_target = np.expand_dims(y_batch, axis=-1)

                mrg_summary = merge_summary.eval(feed_dict={X: o_obs, y: y_target, X_action: o_act, in_training_mode: False})
                file_writer.add_summary(mrg_summary, global_step)

                # The placeholder is named X_action; feeding any other name
                # raises NameError on the first training step.
                train_loss, _ = sess.run([loss, training_op], feed_dict={X: o_obs, y: y_target, X_action: o_act, in_training_mode: True})
                episodic_loss.append(train_loss)

            # Synchronise the two networks at fixed intervals by running the
            # weight-copy op.
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward

        print('Epoch', epoch, 'Reward', episodic_reward,)
                