# Policy Prediction

Gym environment uses pachi-py as the AI.

* Can we predict the the AI move from pachi?
* If so, can we used this to narrow down our MCTS search?

Assume:
* The game grid is 9x9
* There are only 9^2+1 moves. Resign terminals the episode. 

In [1]:
import gym
import numpy as np
import pachi_py
import time
import six
import os
import threading

import tensorflow as tf

from gym.envs.board_game.go import _coord_to_action, GoState, _action_to_coord

ENV_ID = 'Go9x9-v0'
WIDTH = HEIGH = 9
SHAPE = (WIDTH, HEIGH)
# Action 82 is resign

In [2]:
def roll_axis(ob):
    """ Change observations from CxHxW to HxWxC"""
    ob = np.swapaxes(ob, 0, 1)
    ob = np.swapaxes(ob, 1, 2)
    return ob

In [3]:
def get_legal_actions(board, color):
    """ Get the all black legal moves """
    cs = board.get_legal_coords(color)
    a = [_coord_to_action(board, c) for c in cs]
    return a

In [4]:
def make_pachi_policy(board, engine_type='uct', threads=1, pachi_timestr=''):
    engine = pachi_py.PyPachiEngine(board, engine_type, six.b('threads=%d' % threads))

    def pachi_policy(curr_state, prev_state, prev_action):
        if prev_state is not None:
            assert engine.curr_board == prev_state.board, 'Engine internal board is inconsistent with provided board. The Pachi engine must be called consistently as the game progresses.'
            prev_coord = _action_to_coord(prev_state.board, prev_action)
            engine.notify(prev_coord, prev_state.color)
            engine.curr_board.play_inplace(prev_coord, prev_state.color)
        out_coord = engine.genmove(curr_state.color, pachi_timestr)
        out_action = _coord_to_action(curr_state.board, out_coord)
        engine.curr_board.play_inplace(out_coord, curr_state.color)
        return out_action

    return pachi_policy

In [6]:
def sim_game(player):
    assert player in (pachi_py.BLACK, pachi_py.WHITE)
    enemy = pachi_py.stone_other(player)

    ob = []
    a = []

    state = GoState(pachi_py.CreateBoard(9), pachi_py.BLACK)
    player_policy = make_pachi_policy(board=state.board.clone(
    ), engine_type=six.b('uct'), pachi_timestr=six.b('_2400'))

    last_enemy_action = None
    last_state = None

    while not state.board.is_terminal:
        if state.color == player:
            ob.append(roll_axis(state.board.encode()))
            player_action = player_policy(state, last_state, last_enemy_action)
            a.append(player_action)
            state = state.act(player_action)
            assert state.color != player
            continue
        elif state.color == enemy:
            legal_actions = get_legal_actions(state.board, enemy)
            last_enemy_action = np.random.choice(legal_actions)
            last_state = state
            state = state.act(last_enemy_action)
            continue
        else:
            raise NotImplementedError

    return ob, a, [player-1 for _ in a]

In [7]:
def sim_batch(n):
    """ Randomly Simulate n number of games """
    ob_batch = []
    a_batch = []
    p_batch = []
    for i in range(n):
        p = np.random.choice((pachi_py.BLACK, pachi_py.WHITE))
        ob, a, p = sim_game(p)
        ob_batch += ob
        a_batch += a
        p_batch += p
    ob_batch = np.stack(ob_batch)
    a_batch = np.stack(a_batch)
    p_batch = np.stack(p_batch).reshape((-1, 1))
    
    return ob_batch, a_batch, p_batch

# Simulate a few games

In [8]:
tic = time.time()
ob_batch, a_batch, p_batch = sim_batch(20)
toc = time.time() - tic
s = ob_batch.shape[0]
print(toc)
print('Total: {:.2f}s {:.2f}ms per step'.format(toc, 100*toc/s))
ob_test, a_test, p_test = sim_batch(4)

67.13176774978638
Total: 67.13s 10.11ms per step


In [10]:
tic = time.time()
ob_batch, a_batch, p_batch = sim_batch(2)
toc = time.time() - tic
s = ob_batch.shape[0]
print(toc)
print('Total: {:.2f}s {:.2f}ms per step'.format(toc, 100*toc/s))
# for 1000 training loops this will take 100 minutes
# might have to generate data first to save on training time

6.456079006195068
Total: 6.46s 10.09ms per step


In [9]:
print(ob_batch.shape)
print(a_batch.shape)
print(p_batch.shape)

(664, 9, 9, 3)
(664,)
(664, 1)


# TensorFlow

In [9]:
N_CHANNELS = 3

tf.reset_default_graph()

global_step = tf.Variable(0, trainable=False)

ob_ph = tf.placeholder(tf.float32, [None, WIDTH, HEIGH, N_CHANNELS])
p_ph = tf.placeholder(tf.float32, [None, 1])
a_ph = tf.placeholder(tf.int64, [None])

player_channel = p_ph * tf.ones([1, HEIGH], dtype=tf.float32)
player_channel = tf.expand_dims(player_channel, -1)
player_channel = player_channel * tf.ones([1, HEIGH, WIDTH], dtype=tf.float32)
player_channel = tf.expand_dims(player_channel, -1)

obp = tf.concat((ob_ph, player_channel), 3)

conv_1 = tf.layers.conv2d(obp, filters=256, kernel_size=3, activation=tf.nn.relu)
conv_2 = tf.layers.conv2d(conv_1, filters=2, kernel_size=1, activation=tf.nn.relu)
conv_2_flat = tf.layers.flatten(conv_2)
pi = tf.layers.dense(conv_2_flat, 9*9+1, activation=tf.nn.relu)


a_pred = tf.argmax(pi, axis=1)
metric_accuracy, accuracy_op = tf.metrics.accuracy(a_ph, a_pred)
tf.summary.scalar('metric_accuracy', accuracy_op)

correct_pred = tf.equal(tf.argmax(pi,1), a_ph)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
tf.summary.scalar('classification_accuracy', accuracy)

obj = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=a_ph)
pg_loss = tf.reduce_mean(obj)

tf.summary.scalar('policy_loss', pg_loss)
tf.summary.scalar('global_step', global_step)

optimiser = tf.train.AdamOptimizer(1e-4)
train_op = optimiser.minimize(pg_loss, global_step=global_step)

merged_summary = tf.summary.merge_all()
global_init_op = tf.global_variables_initializer()
local_init_op = tf.local_variables_initializer()

In [10]:
tb_path = 'tb'
session_id = time.strftime('%m-%dT%H-%M-%S')
train_tb = os.path.join(tb_path,session_id, 'TRAIN')
train_writer = tf.summary.FileWriter(train_tb)

test_tb = os.path.join(tb_path,session_id, 'TEST')
test_writer = tf.summary.FileWriter(test_tb)

default_dict = {ob_ph:ob_batch, p_ph: p_batch, a_ph:a_batch}
test_dict = {ob_ph:ob_test, p_ph: p_test, a_ph:a_test}

def record_tb():
    _, train_summary, gs = sess.run([accuracy_op, merged_summary, global_step], feed_dict=default_dict)
    train_writer.add_summary(train_summary, gs)
            
    _, test_summary, gs = sess.run([accuracy_op, merged_summary, global_step], feed_dict=test_dict)
    test_writer.add_summary(test_summary, gs)


sess = tf.InteractiveSession()
train_writer.add_graph(sess.graph)
global_init_op.run()
local_init_op.run()

record_tb()

for i in range(20):
    for j in range(50):
        train_op.run(feed_dict=default_dict)
    
    record_tb()
        
train_writer.close()       
test_writer.close()

# Test Accuracy

In [11]:
test_pi = pi.eval(feed_dict=test_dict)
test_pred = np.argmax(test_pi, axis=1)
correct = np.equal(test_pred, a_test).astype(np.int32)
accuracy = np.mean(correct)
print('Test Accuracy: {:.2f}%'.format(100*accuracy))

Test Accuracy: 1.30%


# Others

Can we make this process faster? How about multi-threading?

This does not improve the performance since pachi is already multi-threaded

``` python
class GameRunner(threading.Thread):
    def __init__(self, n):
        threading.Thread.__init__(self)
        self.n = n
        self.reset()
        
    def reset(self):
        self.done = False
        self.result = None
        
    def run(self):
        self.ans = sim_batch(self.n)
        
runner = GameRunner(3)
tic = time.time()

runners = [GameRunner(2) for _ in range(3)]
for r in runners:
    r.start()
    
for i in runners:
    r.join()
    
s = 0
for r in runners:
    s += r.ans[0].shape[0]
    
toc = time.time() - tic
print('Total: {:.2f}s {:.2f}ms per step'.format(toc, 100*toc/s))
```