In [None]:
import sys

# Extend the module search path before the imports below run.
# presumably `player` and the paperio_* wrappers live in game/ — confirm.
sys.path.append("game/")
# from paperio2_wrapper import *
import player
from paperio_single_wrapper import PaperIOSingleWrapper
from paperio_two_wrapper import PaperIOTwoWrapper
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
import random
from collections import deque
import warnings
from layer_utils import *
# NOTE(review): `tf` is never imported explicitly in this notebook; presumably it is
# re-exported by the star import from layer_utils above (as are conv2d, maxpool,
# flatten, fully_conn and output used further down) — confirm.
print('TensorFlow Version: {}'.format(tf.__version__))

In [None]:
# Print the GPU device name TensorFlow reports, or warn when it reports none.
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')

In [None]:
# --- DQN hyper-parameters (read by train_network and the helper functions) ---

# Size of the one-hot action vector / number of Q-values the network outputs.
ACTIONS = 4
# Discount factor applied to the best next-state Q-value in the training target.
GAMMA = 0.99
# Time steps that must pass before gradient steps and epsilon annealing begin.
OBSERVE = 10000.0
# Number of steps over which epsilon is lowered linearly towards FINAL_EPSILON.
EXPLORE = 20000.0
# Lower bound checked before each epsilon decrement.
FINAL_EPSILON = 0.08
# Epsilon used at the start of a run when random exploration is enabled.
INITIAL_EPSILON = 0.2
# Maximum number of transitions kept in the replay buffer.
REPLAY_MEMORY = 50000
# Number of transitions sampled from the replay buffer per gradient step.
BATCH = 64
# NOTE(review): not referenced anywhere in the visible cells.
FRAME_PER_ACTION = 1

In [None]:
def create_network_single():
    """Assemble the Q-network graph and return its endpoints.

    The input placeholder takes batches of 15x15 views with 8 channels, which
    matches the states built by ``initial_state`` / ``update_state`` (four
    stacked area/path pairs).

    Returns:
        tuple: ``(s, out, x)`` where ``s`` is the input placeholder, ``out`` is
        the result of ``output(..., ACTIONS)`` and ``x`` is the activation of
        the last ``fully_conn`` layer (64 units requested).
    """
    # 15 : view size
    s = tf.placeholder("float", [None, 15, 15, 8])

    # Layer helpers come from outside this notebook; the positional arguments are
    # presumably (input, filters, kernel size, stride) — confirm in layer_utils.
    conv_a = conv2d(s, 32, 3, 1)
    pooled = maxpool(conv_a)
    conv_b = conv2d(pooled, 64, 3, 1)
    # NOTE(review): no `training=` argument is passed here; per the TensorFlow docs
    # tf.layers.dropout then defaults to inference mode and acts as a pass-through —
    # confirm whether dropout was meant to be active while training.
    dropped = tf.layers.dropout(conv_b)
    conv_c = conv2d(dropped, 128, 3, 1)

    flat = flatten(conv_c)

    dense_a = fully_conn(flat, 256)
    dense_b = fully_conn(dense_a, 64)
    q_out = output(dense_b, ACTIONS)
    return s, q_out, dense_b

In [None]:
def set_action(direction):
    """Return a length-``ACTIONS`` float vector that is 1 at ``direction`` and 0 elsewhere."""
    one_hot = np.zeros(ACTIONS)
    one_hot[direction] = 1
    return one_hot


def initial_state(areas, paths):
    """Build a state by repeating the (areas, paths) pair four times along axis 2.

    The result has 8 channels ordered areas, paths, areas, paths, ...
    """
    repeated_pair = [areas, paths] * 4
    return np.stack(repeated_pair, axis=2)


def update_state(areas, paths, state):
    """Push the newest (areas, paths) pair onto the front of ``state``.

    The new pair becomes channels 0-1 and the first six channels of ``state``
    follow, so anything beyond channel 5 of the old state is dropped.
    """
    newest = np.stack((areas, paths), axis=2)
    kept = state[:, :, :6]
    return np.concatenate((newest, kept), axis=2)


def to_action_dict(action_t):
    """Map each key of ``action_t`` to the argmax index of its action vector."""
    return {key: np.argmax(vector) for key, vector in action_t.items()}

def get_heads(N, mode="random"):
    """Choose the starting head positions (h1, h2) for the two players.

    Args:
        N: board side length.
        mode: placement strategy.
            * ``"random"`` (default): return ``(None, None)``; per the original
              author's note, passing None heads gives random heads.
            * ``"center"``: both heads at the board centre (the original note
              says this was used for single player).
            * ``"quarter"``: heads at the 1/4 and 3/4 points of the main diagonal.
            * ``"random_corner"``: pick the main-diagonal or the anti-diagonal
              quarter points with equal probability.

    Returns:
        tuple: ``(h1, h2)``, each either None or an ``(int, int)`` pair.

    Raises:
        ValueError: if ``mode`` is not one of the strategies above.
    """
    if mode == "random":
        return None, None

    if mode == "center":
        center = (N // 2, N // 2)
        return center, center

    low = N // 4
    high = low * 3
    if mode == "quarter":
        return (low, low), (high, high)

    if mode == "random_corner":
        if random.random() > 0.5:
            return (low, high), (high, low)
        return (low, low), (high, high)

    raise ValueError("unknown head placement mode: {!r}".format(mode))

In [None]:
def train_network(s, readout, sess, N=50, MAX_ITER=None,
                  is_two_player=True, is_print=False, has_random=True, only_observe=False):
    """Run the Paper.io DQN loop: act epsilon-greedily, store transitions, train from replay.

    Args:
        s: input placeholder of the Q-network; fed batches of stacked-view states.
        readout: Q-value tensor of the network, one value per action.
        sess: TensorFlow session used for every evaluation, gradient step and checkpoint.
        N: board side length handed to ``PaperIOTwoWrapper``; also scales the death
            penalty (``-N * N``) and the per-episode step budget (``N * N / 4``).
        MAX_ITER: stop after this many time steps; ``None`` means no step limit.
        is_two_player: control players 1 and 2 when True, only player 1 otherwise.
        is_print: show the board every 10 steps and log every step (instead of every
            1000 steps).
        has_random: start epsilon at ``INITIAL_EPSILON`` when True, at 0 otherwise.
        only_observe: dry run. Nothing is written to ``saved_networks_2`` and the loop
            ends once ``t`` exceeds ``OBSERVE``, i.e. before any gradient step is taken.

    Returns:
        None. Side effects: restores from / saves to the ``saved_networks_2`` checkpoint
        directory (saving only when ``only_observe`` is False), prints progress and
        shows the board.
    """
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # `a` is one-hot, so the product keeps only the Q-value of the action taken.
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost, global_step=global_step)

    DEAD_REWARD = -N * N
    P1 = 1
    P2 = 2
    PLAYERS = {P1, P2} if is_two_player else {P1}

    h1, h2 = get_heads(N)

    game_wrapper = PaperIOTwoWrapper(N=N,
                                     id1=P1,
                                     id2=P2,
                                     h1=h1,
                                     h2=h2,
                                     init_pad=3,
                                     is_two_player=is_two_player)
    MAX_STEP = N * N / 4
    prev_reset = 0
    action_t = {}
    init_states = {p: np.stack([np.zeros((15, 15))] * 8, axis=2) for p in PLAYERS}
    # Bounded replay buffer: the oldest transition is dropped automatically when full.
    D = deque(maxlen=REPLAY_MEMORY)

    # Warm-up: take a few random moves so the stacked state holds real views.
    for _ in range(5):
        action_t = {p: set_action(player.get_rand_dir()) for p in PLAYERS}
        game_wrapper.step(to_action_dict(action_t))
        for p in game_wrapper.players:
            area, path = game_wrapper.get_views(p)
            init_states[p] = update_state(area, path, init_states[p])

    state_t = init_states

    #     saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks_2")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    epsilon = INITIAL_EPSILON if has_random else 0
    epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    t = 0
    while MAX_ITER is None or t < MAX_ITER:
        # A dry run ends when the observation phase is over, so the weights are
        # never updated (gradient steps only start once t > OBSERVE).
        if only_observe and t > OBSERVE:
            break

        # Choose one action per player; keep each player's Q-values for logging.
        q_values = {}
        for p in PLAYERS:
            q_values[p] = sess.run(readout, feed_dict={s: [state_t[p]]})[0]
            # Strict `<` guarantees that epsilon == 0 never yields a random move.
            if random.random() < epsilon:
                action_index = random.randrange(ACTIONS)
            else:
                action_index = np.argmax(q_values[p])
            action_t[p] = set_action(action_index)

        if epsilon > FINAL_EPSILON and t > OBSERVE:
            # Clamp so the last decrement cannot undershoot the floor.
            epsilon = max(FINAL_EPSILON, epsilon - epsilon_step)

        game_wrapper.step(to_action_dict(action_t))

        state_t1 = {}
        for p in PLAYERS:
            area, path = game_wrapper.get_views(p)
            state_t1[p] = update_state(area, path, state_t[p])

        # The board is shared by all players, so display it once per step.
        if is_print and t % 10 == 0:
            game_wrapper.game.show_board()

        need_reset = False

        for p in PLAYERS:
            is_dead = game_wrapper.is_dead(p)
            is_win = game_wrapper.game.board.current_areas.get(p, 0) / (N * N) > 0.7
            r0 = r1 = 0

            if is_dead:
                r0 = DEAD_REWARD
            elif is_two_player and game_wrapper.is_dead(3 - p) and (3 - p) not in game_wrapper.game.board.suicide:
                # 3 - p is the other player's id (ids are 1 and 2).
                r0 = game_wrapper.prev_area[3 - p]
            elif game_wrapper.is_in_area(p):
                r0 = game_wrapper.get_delta_area(p)
                r1 = 2 * game_wrapper.delta_dis(p)
            else:
                r0 = -0.7
                r1 = -2 * game_wrapper.delta_dis(p)
            reward_t = r0 + r1

            if is_dead or is_win or t - prev_reset >= MAX_STEP:
                need_reset = True

            D.append((state_t[p],
                      action_t[p],
                      reward_t,
                      state_t1[p],
                      is_dead or is_win))

            if t > OBSERVE and len(D) >= BATCH:
                # sample a minibatch to train on
                minibatch = random.sample(D, BATCH)

                # get the batch variables
                s_j_batch = [d[0] for d in minibatch]
                a_batch = [d[1] for d in minibatch]
                s_j1_batch = [d[3] for d in minibatch]

                readout_j1_batch = sess.run(readout, feed_dict={s: s_j1_batch})
                # Terminal transitions use the bare reward; the others add the
                # discounted best Q-value of the next state.
                y_batch = [
                    d[2] if d[4] else d[2] + GAMMA * np.max(q_next)
                    for d, q_next in zip(minibatch, readout_j1_batch)
                ]

                # perform gradient step
                sess.run(train_step, feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch}
                )

            if is_print or t % 1000 == 0:
                # print info
                if t <= OBSERVE:
                    state = "observe"
                elif OBSERVE < t <= OBSERVE + EXPLORE:
                    state = "explore"
                else:
                    state = "train"

                print("T:", t, "/ S:", state, "/ EPS:", epsilon, "/ P:", p, "/ ACT", player.DIR_STR[np.argmax(action_t[p])],
                      "/ R0", r0, "/ R1", r1, "/ Q_MAX %e" % np.max(q_values[p]))

        # Checkpoint once per step (not once per player), after this step's updates.
        if t % 5000 == 0 and not only_observe:
            saver.save(sess, 'saved_networks_2/dqn', global_step=global_step)
            game_wrapper.game.show_board()

        if need_reset:
            # We want to reset it with different initial area
            h1, h2 = get_heads(N)
            game_wrapper.reset(h1=h1, h2=h2,
                               init_pad=np.random.randint(7 - 3, 7))
            prev_reset = t
            # NOTE(review): state_t1 still holds views from the episode that just
            # ended, so the first states of the new episode mix both episodes.
        state_t = state_t1
        t += 1

    if not only_observe:
        saver.save(sess, 'saved_networks_2/dqn', global_step=global_step)
    game_wrapper.game.show_board()


In [None]:
# Start from a fresh default graph, then open the session the later cells reuse.
tf.reset_default_graph()
# NOTE(review): InteractiveSession presumably registers itself as the default session,
# which the .eval() / .run() calls in train_network would rely on — confirm.
sess = tf.InteractiveSession()
# create_network_single returns (input placeholder, Q-value output, last hidden layer).
s, readout, h_fc1 = create_network_single()
# Board side length; passed as N to train_network below.
SIZE = 50

In [None]:
# Print a marker before the training / observation cells run.
print("Let's start")

In [None]:
# This is for taking a look without changing anything.
# NOTE: train_single_network is not defined in the visible cells of this notebook;
# the live cell below uses train_network(..., is_two_player=False) for single-player runs.

# train_single_network(s, 
#                      readout, 
#                      sess, 
#                      N=SIZE, 
#                      is_print=True,  # if True, print the image at every time step; otherwise no image
#                      has_random=False,  # if False, epsilon is always zero and only the model decides the next move
#                      only_observe=True  # if True, the model is not saved to file and the program stops after OBSERVE
#                     )

# ===== This is for actual training =====

# train_single_network(s, readout, sess, N=SIZE, is_print=False, has_random=True, only_observe=False)

In [None]:
# Single-player run that logs every step and chooses actions greedily.
train_network(s, 
              readout, 
              sess, 
              N=SIZE, 
              is_two_player=False, # if False, only player 1 is controlled (single-player mode)
              is_print=True,# if True, show the board every 10 steps and print the status line every step (otherwise every 1000 steps)
              has_random=False, # if False, epsilon starts at zero and is never raised, so the model's argmax decides each move
              only_observe=True # if True, periodic checkpoints are not written; intended to stop after OBSERVE steps — NOTE(review): confirm train_network enforces the stop
             )