In [None]:
import sys
sys.path.append("game/")
from paperio2_wrapper import *

In [None]:
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
import tensorflow as tf
import random
from collections import deque
import warnings
from pprint import pprint
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU. The device name is queried once and reused for both the
# test and the message; an empty name is treated as "no GPU".
gpu_device_name = tf.test.gpu_device_name()
if not gpu_device_name:
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(gpu_device_name))

In [None]:
# Hyper-parameters of the DQN training loop.
ACTIONS = 4 # number of valid actions (also the size of the network's output layer)
GAMMA = 0.99 # discount applied to the next state's max Q-value in the training target
OBSERVE = 10000. # timesteps to collect transitions before training and epsilon annealing start
EXPLORE = 20000. # frames over which epsilon is annealed from INITIAL_EPSILON to FINAL_EPSILON
FINAL_EPSILON = 0.08 # final value of epsilon
INITIAL_EPSILON = 0.2 # starting value of epsilon
REPLAY_MEMORY = 50000 # maximum number of transitions kept in the replay memory
BATCH = 64 # number of transitions sampled for each training step
FRAME_PER_ACTION = 1 # NOTE(review): not referenced in the cells visible here

In [None]:
def conv2d(x_tensor, conv_num_outputs, conv_ksize=3, conv_stride = 1):
    """Apply a square 2-D convolution, add a bias and pass the result through ReLU.

    Args:
        x_tensor: 4-D input tensor; its last dimension is taken as the number
            of input channels.
        conv_num_outputs: number of filters, i.e. output channels.
        conv_ksize: side length of the square kernel.
        conv_stride: stride used along both middle dimensions.

    Returns:
        The activated feature map, computed with ``'SAME'`` padding.
    """
    # Unpacking into exactly four names keeps the requirement that the input is 4-D.
    _, _, _, in_channels = x_tensor.get_shape().as_list()

    # The kernel variable is created before the bias variable; neither is
    # given an explicit name.
    kernel_shape = [conv_ksize, conv_ksize, in_channels, conv_num_outputs]
    kernel = tf.Variable(tf.truncated_normal(kernel_shape, mean=0.0, stddev=0.05, dtype=tf.float32))
    bias = tf.Variable(tf.zeros(conv_num_outputs), dtype=tf.float32)

    stride_spec = [1, conv_stride, conv_stride, 1]
    feature_map = tf.nn.conv2d(input=x_tensor, filter=kernel, strides=stride_spec, padding='SAME')
    return tf.nn.relu(tf.nn.bias_add(feature_map, bias))

def maxpool(x_tensor):
    """Max-pool ``x_tensor`` with a 2x2 window, stride 2 and ``'SAME'`` padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    pooled = tf.nn.max_pool(x_tensor, ksize=window, strides=step, padding='SAME')
    return pooled

def flatten(x_tensor):
    """Flatten ``x_tensor`` by delegating to ``tf.contrib.layers.flatten``."""
    flat = tf.contrib.layers.flatten(x_tensor)
    return flat

def fully_conn(x_tensor, num_outputs):
    """Add a hidden dense layer with ``num_outputs`` units.

    The layer is built by ``tf.contrib.layers.fully_connected`` with its
    default activation function.
    """
    dense = tf.contrib.layers.fully_connected(x_tensor, num_outputs)
    return dense

def output(x_tensor, num_outputs):
    """Add the final dense layer with ``num_outputs`` units and no activation."""
    linear = tf.contrib.layers.fully_connected(x_tensor, num_outputs, activation_fn=None)
    return linear

In [None]:
def createNetwork():
    """Build the Q-network graph.

    Returns:
        A tuple ``(input placeholder, Q-value output, last hidden layer)``.
        The placeholder accepts batches of 50x50 inputs with 8 channels and
        the output has ``ACTIONS`` values per input.
    """
    # input layer
    frames = tf.placeholder("float", [None, 50, 50, 8])

    # convolutional stack
    hidden = maxpool(conv2d(frames, 64, 5, 2))
    hidden = conv2d(hidden, 128, 3, 1)
    # NOTE(review): tf.layers.dropout is called without `training=True`; per
    # the TensorFlow docs its default is training=False, so this layer is
    # presumably a pass-through as written — confirm whether dropout is wanted.
    hidden = tf.layers.dropout(hidden)
    hidden = conv2d(hidden, 128, 3, 1)

    # dense head
    hidden = flatten(hidden)
    for units in (256, 64):
        hidden = fully_conn(hidden, units)

    q_values = output(hidden, ACTIONS)
    return frames, q_values, hidden

In [None]:
from game.player import UP, DOWN, LEFT, RIGHT, get_opposite_dir

# Debug switch. When True, trainNetwork shows the board every 10 timesteps
# and prints its progress line on every timestep instead of every 2000.
PRINT = False

def set_action(direction):
    """Return a length-``ACTIONS`` vector of zeros with a 1 at index ``direction``."""
    one_hot = np.zeros([ACTIONS])
    one_hot[direction] = 1
    return one_hot

def totalReward(step, area, ter):
    """Return the sum of the three reward components ``step``, ``area`` and ``ter``."""
    subtotal = step + area
    return subtotal + ter

def isTermianl(terRew, ter):
    """Tell whether ``terRew`` equals ``ter`` or its negation.

    Args:
        terRew: termination reward reported for a player on this step.
        ter: magnitude of the terminal reward configured for the game.
    """
    equals_reward = terRew == ter
    return equals_reward or (ter + terRew == 0)

def initial_state(areas, paths):
    """Build the first state by repeating the ``(areas, paths)`` pair four times.

    The eight planes are stacked along axis 2 in the order
    areas, paths, areas, paths, ...
    """
    planes = [areas, paths] * 4
    return np.stack(planes, axis=2)

def update_state(areas, paths, state):
    """Return a new state with ``(areas, paths)`` in front of the first six planes of ``state``.

    The planes at index 6 and above of ``state`` (along axis 2) are dropped.
    """
    newest = np.stack((areas, paths), axis=2)
    carried_over = state[:, :, :6]
    return np.concatenate((newest, carried_over), axis=2)

def to_action_dict(action_t):
    """Map every key of ``action_t`` to the argmax index of its action vector."""
    return {player: np.argmax(vector) for player, vector in action_t.items()}

def trainNetwork(s, readout, h_fc1, sess):
    """Train the Q-network with self-play on a two-player PaperIO2 game.

    One network drives both players: player 1 is fed the board returned by
    ``game.frame_step`` and player 2 the board from ``game.get_player2_view``.
    Every frame produces one transition per player, which is stored in the
    replay memory. Once ``t`` exceeds ``OBSERVE``, a minibatch is sampled and
    one optimiser step is taken after each stored transition.

    ``t`` counts stored transitions, so it advances twice per game frame.
    The loop never returns. On start the latest checkpoint found in
    ``saved_networks`` is restored, and a new one is written to
    ``saved_networks/dqn`` every 5000 timesteps.

    Args:
        s: input placeholder of the network.
        readout: output tensor of the network, one Q-value per action.
        h_fc1: last hidden layer of the network; not used by this function.
        sess: session used to initialise, restore and save the variables. The
            ``.eval()`` / ``.run()`` calls below pass no session, so ``sess``
            must also be the default session.
    """
    # Loss: squared error between the target `y` and the Q-value of the action
    # that was taken (selected by the one-hot mask `a`).
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    terminal_reward = 50
    step_penalty = -0.7
    game = PaperIO2Wrapper(50, step_penalty, terminal_reward)
    PLAYER1 = game.p1.id
    PLAYER2 = game.p2.id
    players = [PLAYER1, PLAYER2]
    # Replay memory; `maxlen` makes the deque discard its oldest transition
    # once REPLAY_MEMORY entries are stored.
    D = deque(maxlen=REPLAY_MEMORY)

    action_t = {PLAYER1: set_action(DOWN), PLAYER2: set_action(UP)}

    # Play one frame to obtain the boards the initial states are built from.
    areas, paths, step_penalty, delta_area, termination_rewards = game.frame_step(to_action_dict(action_t))
    revArea, revPath = game.get_player2_view()
    state_t = {
        PLAYER1: initial_state(areas, paths),
        PLAYER2: initial_state(revArea, revPath)
    }

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # Epsilon-greedy action selection, each player on its own state.
        for player in players:
            readout_t = readout.eval(feed_dict={s : [state_t[player]]})[0]
            action_index = np.argmax(readout_t) if random.random() > epsilon else random.randrange(ACTIONS)
            if player == PLAYER2:
                # Player 2's choice is converted with get_opposite_dir before
                # it is sent to the game.
                # NOTE(review): the converted index is also what gets stored
                # in the replay memory, i.e. not the index the network
                # selected on player 2's view — confirm this is intended.
                action_index = get_opposite_dir(action_index)
            action_t[player] = set_action(action_index)

        # Anneal epsilon once per frame after the observation phase.
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        areas, paths, step_penalty, delta_area, termination_rewards = game.frame_step(to_action_dict(action_t))
        revArea, revPath = game.get_player2_view()

        state_t1 = {
            PLAYER1: update_state(areas, paths, state_t[PLAYER1]),
            PLAYER2: update_state(revArea, revPath, state_t[PLAYER2])
        }

        for player in players:
            reward_t = totalReward(step_penalty, delta_area[player], termination_rewards[player])
            D.append((state_t[player],
                      action_t[player],
                      reward_t,
                      state_t1[player],
                      isTermianl(termination_rewards[player], terminal_reward)))

            # only train if done observing (and a full minibatch can be drawn)
            if t > OBSERVE and len(D) >= BATCH:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # get the batch variables
                s_j_batch = [d[0] for d in minibatch]
                a_batch = [d[1] for d in minibatch]
                s_j1_batch = [d[3] for d in minibatch]

                # Target: the reward alone for terminal transitions, otherwise
                # the reward plus the discounted best Q-value of the next state.
                readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
                y_batch = [
                    d[2] if d[4] else d[2] + GAMMA * np.max(next_q)
                    for d, next_q in zip(minibatch, readout_j1_batch)
                ]

                # perform gradient step
                train_step.run(feed_dict = {
                    y : y_batch,
                    a : a_batch,
                    s : s_j_batch}
                )

            t += 1

            # save progress every 5000 iterations
            if t % 5000 == 0:
                saver.save(sess, 'saved_networks/dqn', global_step = t)

            if PRINT and t % 10 == 0:
                game.show_board()

            if PRINT or t % 2000 == 0:
                # print info
                if t <= OBSERVE:
                    state = "observe"
                elif t <= OBSERVE + EXPLORE:
                    state = "explore"
                else:
                    state = "train"

                print("TIMESTEP", t, "/ STATE", state, \
                    "/ EPSILON", epsilon, "/ ACTION", np.argmax(action_t[player]), "/ REWARD", reward_t, \
                    "/ Q_MAX %e" % np.max(readout_t))

        # Advance the state only after BOTH transitions are stored. Doing it
        # inside the per-player loop would make the second player's transition
        # start from the new state, i.e. be stored with state == next state.
        state_t = state_t1



In [None]:
# The session-less .eval()/.run() calls inside trainNetwork rely on a default
# session; per the TensorFlow docs an InteractiveSession installs itself as one.
sess = tf.InteractiveSession()
s, readout, h_fc1 = createNetwork()
# trainNetwork loops forever (`while True` with no exit); interrupt the kernel to stop it.
trainNetwork(s, readout, h_fc1, sess)