In [1]:
import gym
import numpy as np
import os
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Create the environment registered in gym under the id "MsPacman-v0".
env = gym.make("MsPacman-v0")

In [3]:
# Reset the environment; the value returned by reset() is kept as the first observation.
obs = env.reset()

In [4]:
obs.shape # (height, width, channels)

(210, 160, 3)

In [5]:
env.action_space # the set of actions the agent can choose from

Discrete(9)

In [6]:
# Grayscale level (channel mean) of the RGB triple below. Pixels whose channel
# mean equals this value exactly are zeroed in preprocess_observation.
# NOTE(review): presumably this is meant to be the Ms. Pac-Man sprite colour;
# confirm the triple against real frames, because the exact-equality mask below
# only fires when the channel mean matches to the last bit.
mspacman_color = np.array([210,163,74]).mean()

def preprocess_observation(obs):
    """Crop, downsample, grayscale and rescale one raw frame.

    Parameters
    ----------
    obs : ndarray
        Frame indexed as obs[row, column, channel]. The notebook feeds
        frames of shape (210, 160, 3).

    Returns
    -------
    ndarray
        Float array of shape (88, 80, 1). For 8-bit input (0..255) the values
        lie in [-1, 1): 0 maps to -1.0 and 255 maps to 127/128.
    """
    # Rows 1, 3, ..., 175 (88 rows) and every second column (80 columns);
    # the channel axis is not sliced, so the result is (88, 80, 3).
    img = obs[1:176:2, ::2]
    # Averaging over the channel axis yields a new float grayscale array,
    # so the in-place masking below does not touch the caller's frame.
    img = img.mean(axis=2)
    img[img == mspacman_color] = 0  # improve contrast
    # Fix: the original subtracted an extra 1 here, which shifted the range to
    # about [-2, 0) instead of the intended [-1, 1).
    img = (img - 128) / 128
    return img.reshape(88, 80, 1)

In [7]:
# Network structure (one entry per layer, consumed by q_network):
# conv layer 1: 32 feature maps, 8x8 kernel, stride 4
# conv layer 2: 64 feature maps, 4x4 kernel, stride 2
# conv layer 3: 64 feature maps, 3x3 kernel, stride 1
# fully connected: 512 units
# fully connected: one output unit per action (env.action_space.n)

# Shape of one preprocessed frame, matching the (88, 80, 1) array returned
# by preprocess_observation.
input_height = 88
input_width = 80
input_channels = 1
# The four conv_* lists below are zipped together layer by layer.
conv_n_maps = [32,64,64]
conv_kernel_size = [(8,8),(4,4),(3,3)]
conv_strides = [4,2,1]
conv_paddings = ['SAME']*3 
# -> ['SAME', 'SAME', 'SAME']
conv_activation = [tf.nn.relu]*3 
# -> the same tf.nn.relu function repeated three times:
# [<function tensorflow.python.ops.gen_nn_ops.relu>,
#  <function tensorflow.python.ops.gen_nn_ops.relu>,
#  <function tensorflow.python.ops.gen_nn_ops.relu>]
# Flattened size of the last conv layer: with 'SAME' padding the strides
# 4, 2, 1 shrink 88x80 to 22x20, then 11x10, then 11x10, with 64 feature maps.
n_hidden_in = 64*11*10 # 64 feature maps of size 11x10
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n # one Q-value per discrete action
# Weight initializer shared by every layer built in q_network.
initializer = tf.contrib.layers.variance_scaling_initializer()

In [8]:
from tensorflow.contrib.layers import convolution2d,fully_connected

# We will have 2 DQNs, one as actor, and another as critic
# Build q_network() to create DQNs
def q_network(X_state,scope):
    """Build one Q-network under the variable scope `scope`.

    The network stacks the convolutional layers described by the module-level
    conv_* lists, flattens the last feature map to n_hidden_in values, and
    adds a hidden dense layer (n_hidden units) plus a linear output layer
    (n_outputs units).

    Parameters
    ----------
    X_state : tensor fed to the first convolutional layer.
    scope : name of the variable scope that will own the network's variables.

    Returns
    -------
    (outputs, trainable_vars_by_name)
        outputs is the tensor of the final linear layer (the Q-values);
        trainable_vars_by_name maps each trainable variable's name, with the
        scope name stripped from the front, to the variable itself.
    """
    layer_specs = zip(conv_n_maps,
                      conv_kernel_size,
                      conv_strides,
                      conv_paddings,
                      conv_activation)
    feature_maps = []
    net = X_state
    with tf.variable_scope(scope) as var_scope:
        # Convolutional stack: each layer consumes the previous layer's output.
        for maps, ksize, step, pad, act in layer_specs:
            net = convolution2d(net,
                                num_outputs=maps,
                                kernel_size=ksize,
                                stride=step,
                                padding=pad,
                                activation_fn=act,
                                weights_initializer=initializer)
            feature_maps.append(net)

        # Flatten to (batch, n_hidden_in) before the dense layers.
        flat = tf.reshape(net, shape=[-1, n_hidden_in])
        hidden = fully_connected(flat,
                                 n_hidden,
                                 activation_fn=hidden_activation,
                                 weights_initializer=initializer)
        # No activation on the last layer: raw Q-value estimates.
        outputs = fully_connected(hidden,
                                  n_outputs,
                                  activation_fn=None,
                                  weights_initializer=initializer)

    # Index this network's trainable variables by their scope-relative name so
    # that two networks built under different scopes share the same keys.
    prefix_len = len(var_scope.name)
    trainable_vars_by_name = {}
    for variable in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=var_scope.name):
        trainable_vars_by_name[variable.name[prefix_len:]] = variable
    return outputs, trainable_vars_by_name

### DQN cost function for the critic: $J(\theta_{critic})$
#### $J(\theta_{critic}) = \frac{1}{m} \sum_{i=1}^{m} \left( y^{(i)} - Q(s^{(i)}, a^{(i)}, \theta_{critic}) \right)^2$
### Training target: $y$
#### $y^{(i)} = r^{(i)} + \gamma \cdot \max_{a'} Q(s'^{(i)}, a', \theta_{actor})$, where $\gamma$ is the discount rate

In [9]:
# Create the state placeholder, build the actor and critic networks on it,
# and define the op that copies the critic's variables into the actor.

# Batch of preprocessed frames: (batch, input_height, input_width, input_channels).
X_state = tf.placeholder(tf.float32, 
                         shape=[None,input_height,input_width,input_channels])
# NOTE(review): the two scope prefixes differ ("q_networks/" vs "q_network/").
# The copy below still works because q_network keys variables by their
# scope-relative name, but the mismatch looks like a typo. Renaming a scope
# changes variable names, so check existing checkpoints before changing it.
actor_q_values, actor_vars = q_network(X_state,scope="q_networks/actor")
critic_q_values , critic_vars = q_network(X_state,scope="q_network/critic")

# One assign op per actor variable, taking the value of the critic variable
# stored under the same key; grouped so a single run() copies everything.
copy_ops = [actor_var.assign(critic_vars[var_name]) for var_name, actor_var in actor_vars.items()]
copy_critic_to_actor = tf.group(*copy_ops)

In [10]:
# Placeholder for the actions taken, used when training the critic.
# Multiplying the critic's Q-values by a one-hot encoding of the action and
# summing over axis 1 keeps only the Q-value of the chosen action;
# keepdims=True keeps that axis with length 1.
X_action = tf.placeholder(tf.int32,shape=[None])
q_value = tf.reduce_sum(critic_q_values*tf.one_hot(X_action,n_outputs),axis=1,keepdims=True)

In [11]:
# Training ops for the critic.

learning_rate = 1e-3

# Target Q-values are fed through this placeholder, one per sample.
y = tf.placeholder(tf.float32,shape=[None,1])

# Cost is the mean squared error between the targets and the critic's
# Q-value for the action taken.
cost = tf.reduce_mean(tf.square(y-q_value))

# Non-trainable counter passed to minimize() below as its global_step.
global_step = tf.Variable(0,trainable=False,name='global_step')

# Adam optimizer minimizing the cost.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(cost,global_step = global_step)
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [12]:
# Tool needed: a deque with maxlen, which drops its oldest entries once the
# replay memory is full.
# Function needed: random sampling from the replay memory.

from collections import deque
replay_memory_size = 10000
replay_memory = deque([],maxlen = replay_memory_size)

def sample_memories(batch_size, memory=None):
    """Draw a random batch of experiences and return it column by column.

    Parameters
    ----------
    batch_size : int
        Maximum number of experiences to draw (without replacement). Fewer
        are returned when the memory holds fewer entries.
    memory : indexable sequence of 5-tuples, optional
        Source of experiences, each laid out as
        (state, action, reward, next_state, continue). Defaults to the
        module-level replay_memory.

    Returns
    -------
    tuple of 5 ndarrays
        (states, actions, rewards, next_states, continues); rewards and
        continues are reshaped to column vectors of shape (n, 1).

    Raises
    ------
    ValueError
        If a sampled experience does not have exactly 5 fields.
    """
    if memory is None:
        memory = replay_memory
    # Fix: the original used an undefined `rnd` alias; np.random is in scope.
    indices = np.random.permutation(len(memory))[:batch_size]
    # Fix: five columns are needed (the original created four, so cols[4]
    # below raised IndexError and the fifth field was silently dropped).
    cols = [[], [], [], [], []]  # state, action, reward, next_state, continue
    for idx in indices:
        experience = memory[idx]
        if len(experience) != len(cols):
            raise ValueError(
                "expected %d fields per experience, got %d"
                % (len(cols), len(experience)))
        for col, value in zip(cols, experience):
            col.append(value)  # fix: was misspelled `apppend`
    cols = [np.array(col) for col in cols]
    return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1))

In [13]:
# The actor explores the game with an epsilon-greedy policy.
# Epsilon decays linearly from eps_max to eps_min over the first
# eps_decay_steps training steps and stays at eps_min afterwards.

eps_min = 0.05
eps_max = 1
eps_decay_steps = 5000

def epsilon_greedy(q_value,step):
    """Pick an action with an epsilon-greedy policy.

    Parameters
    ----------
    q_value : array-like of Q-value estimates, one per action.
    step : current training step, used to compute the decayed epsilon.

    Returns
    -------
    int
        With probability epsilon, a uniformly random action index in
        [0, n_outputs); otherwise the index of the largest Q-value.
    """
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    # Fix: the original used an undefined `rnd` alias; np.random is in scope.
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)  # random action (explore)
    else:
        return np.argmax(q_value)  # best-known action (exploit)

### TRAINING START... LET'S GOOOO

In [15]:
n_steps = 10000 # total number of training steps
training_start = 1000 # start training after 1000 game iterations
training_interval = 3 # after that, train once every 3 game iterations
save_steps = 50 # save the model every 50 training steps
copy_steps = 25 # every 25 training steps, copy the critic to the actor
discount_rate = 0.95
skip_start = 90 # number of initial steps to skip at the start of each game
batch_size = 50
iteration = 0
checkpoint_path = ("./dqn_pacman.ckpt")
done = True # when True, env.reset() is called to start a new game

#### Start training inside a `tf.Session`

In [None]:
# Main DQN training loop. Each pass through the `while` plays one game step
# with the actor, stores the transition in the replay memory, and (once
# enough iterations have elapsed) trains the critic on a sampled batch.
#
# Fixes relative to the original cell:
#   * the play/learn/copy/save code is indented back inside the `while` loop
#     (it sat after the loop, so global_step never advanced);
#   * `eed_dict` -> `feed_dict`;
#   * the reward is stored in the replay memory (it was left out, shifting
#     every later field);
#   * the training_start / training_interval guard is restored;
#   * the checkpoint test looks for the ".index" file the Saver writes;
#   * `state` is computed after the skip loop, so it exists when skip_start is 0;
#   * per-step debug prints are removed.
with tf.Session() as sess:
    # The Saver normally writes "<path>.index", "<path>.data-*" and
    # "<path>.meta", not a file named exactly checkpoint_path; the bare path
    # is still accepted for older single-file checkpoints.
    if os.path.isfile(checkpoint_path + ".index") or os.path.isfile(checkpoint_path):
        saver.restore(sess, checkpoint_path)
    else:
        init.run()
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break
        iteration += 1
        if done:  # game over: start a new game
            obs = env.reset()
            for skip in range(skip_start):  # skip the start of the game
                obs, reward, done, info = env.step(0)
            state = preprocess_observation(obs)

        # Actor evaluates the current state and picks an action.
        q_values = actor_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Actor plays one step of the game.
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Record the transition as (state, action, reward, next_state, continue);
        # continue is 0.0 when the game ended on this step, 1.0 otherwise.
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Only train after the warm-up period, and then at regular intervals.
        if iteration < training_start or iteration % training_interval != 0:
            continue

        # Critic learns from a random batch of stored transitions.
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))
        next_q_values = actor_q_values.eval(feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        # Target: reward plus the discounted best next Q-value; the bootstrap
        # term is zeroed for transitions that ended the game.
        y_val = rewards + continues * discount_rate * max_next_q_values
        training_op.run(feed_dict={X_state: X_state_val,
                                   X_action: X_action_val,
                                   y: y_val})

        # Periodically copy the critic's variables to the actor.
        if step % copy_steps == 0:
            copy_critic_to_actor.run()

        # Periodically save the model.
        if step % save_steps == 0:
            saver.save(sess, checkpoint_path)