# Policy Prediction

Gym environment uses pachi-py as the AI.

* Can we predict the AI's move from pachi?
* If so, can we use this to narrow down our MCTS search?

Assume:
* The game grid is 9x9
* There are only 9^2+1 moves. Resigning terminates the episode.

In [1]:
import gym
import numpy as np
import pachi_py
import time
import six

import tensorflow as tf

from gym.envs.board_game.go import _coord_to_action, GoState, _action_to_coord

# Gym environment id for 9x9 Go (not referenced by the code visible in this notebook).
ENV_ID = 'Go9x9-v0'
# Board dimensions; both sides are assigned the same value.
WIDTH = HEIGH = 9
# Shape of a single board-sized plane (used by get_color_channel).
SHAPE = (WIDTH, HEIGH)
# Action 82 is resign

In [2]:
def roll_axis(ob):
    """Move the leading (channel) axis to position 2: CxHxW -> HxWxC."""
    # Equivalent to swapping axes 0<->1 and then 1<->2.
    return np.moveaxis(ob, 0, 2)

In [3]:
def get_legal_actions(board, color):
    """Return the action ids of all legal coordinates for ``color`` on ``board``."""
    actions = []
    for coord in board.get_legal_coords(color):
        # Convert each board coordinate into the environment's action index.
        actions.append(_coord_to_action(board, coord))
    return actions

In [4]:
def get_color_channel(color):
    """Return a board-shaped int8 plane: all zeros for black, all ones for white."""
    assert color in (pachi_py.BLACK, pachi_py.WHITE)
    make_plane = np.zeros if color == pachi_py.BLACK else np.ones
    return make_plane(SHAPE, dtype=np.int8)

In [5]:
def make_pachi_policy(board, engine_type='uct', threads=1, pachi_timestr=''):
    """Build a move-selection closure backed by a Pachi engine.

    Args:
        board: Board handed to ``pachi_py.PyPachiEngine``.
        engine_type: Engine name passed through to the engine constructor.
        threads: Formatted into the ``threads=%d`` engine argument string.
        pachi_timestr: Passed to ``engine.genmove`` on every call.

    Returns:
        A function ``pachi_policy(curr_state, prev_state, prev_action)`` that
        returns the action the engine generates for ``curr_state.color``.
    """
    engine = pachi_py.PyPachiEngine(board, engine_type, six.b('threads=%d' % threads))

    def pachi_policy(curr_state, prev_state, prev_action):
        """Feed the previous move (if any) to the engine and return its reply.

        ``prev_state`` / ``prev_action`` describe the move played since the
        last call; pass ``prev_state=None`` when there is no such move.
        """
        if prev_state is not None:
            # The engine's own board must match the position the previous move
            # was played from, otherwise the engine is out of sync.
            assert engine.curr_board == prev_state.board, 'Engine internal board is inconsistent with provided board. The Pachi engine must be called consistently as the game progresses.'
            prev_coord = _action_to_coord(prev_state.board, prev_action)
            # Inform the engine of the move, then mirror it on the engine's board.
            engine.notify(prev_coord, prev_state.color)
            engine.curr_board.play_inplace(prev_coord, prev_state.color)
        out_coord = engine.genmove(curr_state.color, pachi_timestr)
        out_action = _coord_to_action(curr_state.board, out_coord)
        # Keep the engine's board in step with the move it just generated.
        engine.curr_board.play_inplace(out_coord, curr_state.color)
        return out_action

    return pachi_policy

In [6]:
def sim_game(player):
    """Play one 9x9 game and record the Pachi-driven player's decisions.

    ``player`` (black or white) is driven by a Pachi 'uct' policy; the other
    colour plays an action drawn with ``np.random.choice`` from its legal
    actions. Black always moves first.

    Returns:
        A tuple ``(observations, actions, labels)`` with one entry per move
        made by ``player``: the HxWxC board encoding before the move, the
        chosen action, and ``player - 1`` repeated for every move.
    """
    assert player in (pachi_py.BLACK, pachi_py.WHITE)
    enemy = pachi_py.stone_other(player)

    observations, actions = [], []

    state = GoState(pachi_py.CreateBoard(9), pachi_py.BLACK)
    # The engine gets its own copy of the board so it can track the game itself.
    player_policy = make_pachi_policy(
        board=state.board.clone(),
        engine_type=six.b('uct'),
        pachi_timestr=six.b('_2400'),
    )

    # Most recent opponent move, replayed to the engine on the player's turn.
    enemy_action = None
    state_before_enemy = None

    while not state.board.is_terminal:
        to_move = state.color
        if to_move == player:
            observations.append(roll_axis(state.board.encode()))
            chosen = player_policy(state, state_before_enemy, enemy_action)
            actions.append(chosen)
            state = state.act(chosen)
            assert state.color != player
        elif to_move == enemy:
            candidates = get_legal_actions(state.board, enemy)
            enemy_action = np.random.choice(candidates)
            state_before_enemy = state
            state = state.act(enemy_action)
        else:
            raise NotImplementedError

    return observations, actions, [player - 1] * len(actions)

# Simulate 3 Games

In [8]:
# Accumulators for observations, actions and player labels across all games.
ob_batch, a_batch, p_batch = [], [], []

tic = time.time()
for i in range(3):
    # Pick which colour the Pachi-driven player takes in this game.
    p = np.random.choice((pachi_py.BLACK, pachi_py.WHITE))
    # Note: `p` is rebound here to the per-move label list returned by sim_game.
    ob, a, p = sim_game(p)
    ob_batch.extend(ob)
    a_batch.extend(a)
    p_batch.extend(p)
toc = time.time() - tic
# Elapsed seconds for all simulated games.
print(toc)


10.596940994262695


In [9]:
# Turn the per-move Python lists into arrays with a leading sample axis.
ob_batch = np.stack(ob_batch)
a_batch = np.stack(a_batch)
# Player labels become a column vector, one row per sample.
p_batch = np.stack(p_batch).reshape(-1, 1)

In [12]:
# Report the shape of each batch array, one per line.
for batch in (ob_batch, a_batch, p_batch):
    print(batch.shape)

(105, 9, 9, 3)
(105,)
(105, 1)


# TensorFlow

In [62]:
N_CHANNELS = 3

tf.reset_default_graph()

# Inputs: board observations (channels last), a per-sample player flag,
# and the integer action label to predict.
ob_ph = tf.placeholder(tf.float32, [None, WIDTH, HEIGH, N_CHANNELS])
p_ph = tf.placeholder(tf.float32, [None, 1])
a_ph = tf.placeholder(tf.int32, [None])

# Broadcast the per-sample player flag into a constant board-sized plane of
# shape [batch, WIDTH, HEIGH, 1] so it can be appended as an extra channel.
player_channel = tf.reshape(p_ph, [-1, 1, 1, 1]) * tf.ones(
    [1, WIDTH, HEIGH, 1], dtype=tf.float32)

obp = tf.concat((ob_ph, player_channel), 3)

conv_1 = tf.layers.conv2d(obp, filters=256, kernel_size=3, activation=tf.nn.relu)
conv_2 = tf.layers.conv2d(conv_1, filters=2, kernel_size=1, activation=tf.nn.relu)
conv_2_flat = tf.layers.flatten(conv_2)
# Logits must stay linear: the softmax cross-entropy below expects unscaled
# scores, and a ReLU here would clamp them to >= 0 and block gradients for
# any action whose pre-activation is negative.
# NOTE(review): there are 9*9+1 = 82 logits, so valid labels are 0..81; the
# setup cell notes "Action 82 is resign", which would be out of range if it
# ever appears in the action labels -- confirm.
pi = tf.layers.dense(conv_2_flat, 9*9+1, activation=None)

# Mean cross-entropy between the predicted move distribution and the label.
obj = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a_ph)
pg_loss = tf.reduce_mean(obj)

optimiser = tf.train.AdamOptimizer(1e-4)
train_op = optimiser.minimize(pg_loss)

init_op = tf.global_variables_initializer()

In [63]:
with tf.Session() as sess:
    # Initialise all variables, then run a single training step on the batch.
    sess.run(init_op)
    sess.run(
        train_op,
        feed_dict={ob_ph: ob_batch, p_ph: p_batch, a_ph: a_batch},
    )

In [39]:
# Show channel 3 of the first sample of `c`.
# NOTE(review): `c` is not defined in the visible cells -- presumably an
# evaluation of the concatenated input `obp`; confirm.
c[0, :, :, 3]

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]], dtype=float32)

In [32]:
# Show channel 0 of the first recorded observation.
ob_batch[0, :, :, 0]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [40]:
# Show the player label stored for the first sample.
p_batch[0]

array([1])

In [99]:
# Sanity check: each sample's plane in `c` should hold a single value equal to
# that sample's player label.
# NOTE(review): `players` and `c` are not defined in the visible cells --
# presumably the player labels and the evaluated player channel; confirm.
for i, v in enumerate(players):
    p = v[0]
    # Collapse the plane to its distinct values.
    cp = np.unique(c[i])
    assert c[i].shape == (9, 9)
    assert p==cp
    # Only the first sample is checked.
    break

In [53]:
# NOTE(review): the recorded output below, (316,), does not correspond to any
# `p` assigned in the visible cells -- presumably left over from an earlier
# execution; confirm.
p.shape

(316,)