In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
import gym
# 9x9 Go environment; the training loop further down drives it one move at a
# time through env.reset() / env.step(move).
env = gym.make('Go9x9-v0')

[2017-10-04 23:58:51,949] Making new env: Go9x9-v0


In [3]:
import importlib

In [4]:
def opponent(position1st, position2nd):
    """Return the opponent's move together with the position they saw.

    The Go9x9 gym environment makes White's moves automatically, so it is
    not as easy to capture White's state and move for training as it is for
    Black's. This function reconstructs both from two consecutive
    observations handed to Black.

    Plane 1 of an observation is treated as White's stones and plane 2 as the
    open points; plane 0 is passed through untouched.
    NOTE(review): that plane layout is taken from how this function has always
    indexed the observation - confirm against the environment documentation.

    A point counts as White's move when a White stone is present in
    ``position2nd`` but not in ``position1st``. White stones that merely
    *disappeared* between the two observations are ignored: they are neither
    mistaken for a move nor put back on the reconstructed board.

    NOTE(review): plane 0 is copied from ``position2nd`` as-is, so any Black
    stones that White's move removed are not restored.

    :param position1st: np.array    3xNxN observation before Black's move
    :param position2nd: np.array    3xNxN observation after White's reply
    :return: (np.array, int)        (3xNxN float observation with White's new
                                    stone removed, flat row-major index of
                                    White's move, or N*N (81 on a 9x9 board)
                                    when White placed no stone)
    """
    white_diff = (position1st - position2nd)[1, :, :]
    # Negative entries are points where a White stone appeared.
    placed = white_diff < 0

    if placed.any():
        # Flat (row-major) index of the first newly occupied point.
        opp_move = np.argmax(placed)
    else:
        # No new White stone: report the "pass" index, one past the last point.
        opp_move = white_diff.size

    # Undo only White's new stone: clear it from the White plane and mark the
    # point as open again. Built as a float array so the result is float, as
    # it was with the original zero-padded mask.
    undo = np.zeros(position2nd.shape)
    undo[1] = -placed.astype(float)
    undo[2] = placed.astype(float)
    opp_position = position2nd + undo

    return opp_position, opp_move

In [5]:
# NOTE(review): `paths` is not referenced by name anywhere in this notebook;
# presumably it is imported for a side effect such as extending sys.path so
# that `nn` resolves - confirm.
import paths
from nn import policy9x9
# Re-execute the module so edits to nn/policy9x9.py are picked up without
# restarting the kernel.
importlib.reload(policy9x9)

<module 'nn.policy9x9' from '/Users/andrew.brown/projects/GoFamiliar/src/nn/policy9x9.py'>

In [6]:
# Build a fresh policy network; the layer summary shown in this cell's output
# is printed during construction.
net = policy9x9.PolicyNet()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 2, 9, 9)       0                                            
____________________________________________________________________________________________________
zero_padding2d_1 (ZeroPadding2D) (None, 4, 11, 9)      0           input_1[0][0]                    
____________________________________________________________________________________________________
conv2d_1 (Conv2D)                (None, 2, 9, 32)      2624        zero_padding2d_1[0][0]           
____________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D) (None, 4, 11, 32)     0           conv2d_1[0][0]                   
___________________________________________________________________________________________

In [9]:
# Self-play training schedule.
N_EPOCHS = 500           # outer iterations: play a batch of games, then fit
GAMES_PER_EPOCH = 50     # games collected before each pair of fits
FIT_EPOCHS = 5           # `epochs` value forwarded to net.train
N_ACTIONS = len(policy9x9.ACTION_SPACE)
PASS_MOVE = 81           # value `opponent` returns when White placed no stone

for epoch in range(N_EPOCHS):
    # Positions seen by Black (the network) and by White (the built-in
    # opponent), one entry per move, accumulated over the whole batch.
    observations = []
    opp_observations = []

    # Reward-weighted one-hot targets, one block per game. The blocks are
    # joined once after the batch instead of re-copying a growing array after
    # every game. The leading empty array keeps the joined result 2-D and of
    # the same dtype as the incremental concatenation it replaces.
    actionreward_blocks = [np.empty(shape=(0, N_ACTIONS))]
    opp_actionreward_blocks = [np.empty(shape=(0, N_ACTIONS))]

    moves_made_per_game = []
    total_reward = 0

    for game in range(GAMES_PER_EPOCH):
        moves = []
        opp_moves = []

        obser = env.reset()
        done = False

        while not done:
            observations.append(obser)
            prev_obser = obser
            move = net.move(position=prev_obser)
            obser, reward, done, info = env.step(move)

            moves.append(move)

            # Reconstruct White's reply from the before/after observations;
            # turns where White placed no stone are not used for training.
            opp_obser, opp_move = opponent(prev_obser, obser)
            if opp_move < PASS_MOVE:
                opp_observations.append(opp_obser)
                opp_moves.append(opp_move)
        moves_made_per_game.append(len(moves))
        total_reward += reward

        # Uses the final reward as the reward for all moves of the game.
        moves_arr = keras.utils.to_categorical(np.array(moves), num_classes=N_ACTIONS)
        actionreward_blocks.append(moves_arr * reward)

        # White's moves get the negated reward.
        opp_moves_arr = keras.utils.to_categorical(np.array(opp_moves), num_classes=N_ACTIONS)
        opp_actionreward_blocks.append(opp_moves_arr * (-reward))
        print('*', end='')

    observations = np.array(observations)
    opp_observations = np.array(opp_observations)
    actionrewards = np.concatenate(actionreward_blocks, axis=0)
    opp_actionrewards = np.concatenate(opp_actionreward_blocks, axis=0)

    net.train(observations=observations, actionrewards=actionrewards, verbose=0, epochs=FIT_EPOCHS)
    net.train(observations=opp_observations, actionrewards=opp_actionrewards, verbose=0, epochs=FIT_EPOCHS)

    # Per-epoch progress line: median game length and summed final rewards.
    print(' ', np.median(moves_made_per_game), total_reward)

**************************************************  26.5 -50.0
**************************************************  29.5 -50.0
**************************************************  24.0 -50.0
**************************************************  28.0 -50.0
**************************************************  28.0 -50.0
**************************************************  27.0 -50.0
**************************************************  24.5 -50.0
**************************************************  26.5 -50.0
**************************************************  27.0 -50.0
**************************************************  29.0 -50.0
**************************************************  27.0 -50.0
**************************************************  27.0 -50.0
**************************************************  28.0 -50.0
**************************************************  30.0 -50.0
**************************************************  25.0 -50.0
**************************************************  26.

KeyboardInterrupt: 

## Original

In [10]:
np.median(moves_made_per_game), total_reward

(4.0, -89.0)

In [12]:
np.median(moves_made_per_game), total_reward

(4.0, -91.0)

In [14]:
np.median(moves_made_per_game), total_reward

(4.0, -93.0)

## Removed dropout

In [8]:
np.median(moves_made_per_game), total_reward

(6.0, -84.0)

In [10]:
np.median(moves_made_per_game), total_reward

(4.0, -55.0)

In [12]:
np.median(moves_made_per_game), total_reward

(3.0, -70.0)

## Added connection from inputs to end
Also reduced the action space, so passing was no longer allowed.

In [11]:
np.median(moves_made_per_game), total_reward

(8.5, -100.0)

In [13]:
np.median(moves_made_per_game), total_reward

(6.5, -100.0)

## Split stones from Open board part of position

In [32]:
np.median(moves_made_per_game), total_reward

(20.5, -100.0)

In [39]:
np.median(moves_made_per_game), total_reward

(20.0, -100.0)

In [41]:
np.median(moves_made_per_game), total_reward

(20.0, -100.0)

In [55]:
moves[-1].reshape(9,9), info

(array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]),
 {'state': To play: black
  Move:  28  Komi: 0.0  Handicap: 0  Captures B: 0 W: 1
        A B C D E F G H J  
      +-------------------+
    9 | . . . X . . X . . |
    8 | . . . X . O O X . |
    7 | . . O O O . O O . |
    6 | . . O X X O). . . |
    5 | . . O . . . . . . |
    4 | . . . O X . . O . |
    3 | . . . . . . . O X |
    2 | X X . . O . . . . |
    1 | . . . . X X . X . |
      +-------------------+})

## Started training in batches of 10

In [31]:
np.median(moves_made_per_game), total_reward

(26.5, -10.0)

In [42]:
np.median(moves_made_per_game), total_reward

(25.0, -10.0)

In [39]:
np.round(net.probailities(observations[-1]), decimals=3)

array([ 0.001     ,  0.        ,  0.        ,  0.99900001,  0.        ,
        0.        ,  0.006     ,  0.001     ,  0.002     ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.001     ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.003     ,
        0.        ,  0.001     ,  0.001     ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.995     ,
        0.015     ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.20100001,  0.001     ,  0.        ,  0.        ,  0.        ,
        0.96200001,  0.        ,  0.        ,  0.28299999,  0.029     ,
        0.        ,  0.        ,  0.        ,  0.001     ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.001     ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.131     ,  0.  

In [41]:
len(moves)

28