In [1]:
import numpy as np
import gym
import gym_tictactoe as t3
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [2]:
# Network dimensions. The board has 3 * 3 * 3 = 27 cells; every cell is
# one-hot encoded over 3 possible values, which yields 81 input features.
n_output = 3 * 3 * 3      # one logit per board cell
n_input = n_output * 3    # 27 cells x 3-way one-hot encoding
n_hidden = 1000           # units in the single hidden layer

# Training hyper-parameters and artefacts.
learning_rate = 0.01
checkpoint_path = 'my_tictactoe.ckpt'

# Weight initializer shared by every fully connected layer built below.
initializer = tf.contrib.layers.variance_scaling_initializer()

In [3]:
from sklearn.preprocessing import LabelBinarizer
# One-hot encoder over the three integer values (0, 1, 2) a board cell can hold.
# `fit` returns the fitted estimator itself, so construction and fitting chain.
encoder = LabelBinarizer().fit(np.array([[0], [1], [2]]))

def create_network(X_state, n_hidden, n_output, scope):
    """Build a network with one hidden ReLU layer and return its raw logits.

    Args:
        X_state: input tensor fed to the hidden layer.
        n_hidden: number of units in the hidden layer.
        n_output: number of output logits.
        scope: variable-scope name under which the layers are created.

    Returns:
        The output tensor of the final layer, with no activation applied.
    """
    with tf.variable_scope(scope):
        hidden_layer = fully_connected(
            X_state,
            n_hidden,
            activation_fn=tf.nn.relu,
            weights_initializer=initializer,
        )
        return fully_connected(
            hidden_layer,
            n_output,
            activation_fn=None,
            weights_initializer=initializer,
        )

def convert_game_world(obs):
    """One-hot encode a board observation into a flat feature vector.

    Args:
        obs: array-like board observation whose entries are the values the
            module-level `encoder` was fitted on.

    Returns:
        A 1-D array holding the concatenated one-hot code of every cell, in
        flattened board order.
    """
    # gym_tictactoe now supports int-encoded world
    cells = np.array(obs, dtype=np.float32).reshape(-1, 1)  # one cell per row
    one_hot = encoder.transform(cells)
    return one_hot.flatten()

def convert_action_to_step(action, player):
    """Translate a flat action index into the step string used by the env.

    The action index is written in base 3, left-padded with zeros to at least
    three digits, and prefixed with the player id. For example action 5 for
    player 2 becomes '2012'.

    Args:
        action: non-negative integer action index (0..26 covers the board).
        player: player identifier; its `str()` becomes the first character.

    Returns:
        The step string `str(player)` followed by the base-3 digits.

    Raises:
        ValueError: if `action` is negative. Floor division never brings a
            negative number to zero, so the digit loop would not terminate.
    """
    if action < 0:
        raise ValueError('action must be non-negative, got {}'.format(action))

    digits = []
    while action:
        action, digit = divmod(action, 3)
        digits.append(str(digit))

    # Digits were collected least-significant first; reverse before padding.
    return str(player) + ''.join(reversed(digits)).zfill(3)

In [6]:
# Build the policy-gradient graph: two independent policy networks that share
# one input placeholder. For each network we keep its logits, its action
# probabilities, a one-hot target for a sampled action, the cross-entropy
# against that target, and the gradients of that cross-entropy.
optimizer = tf.train.AdamOptimizer(learning_rate)

networks = []
outputs = []
ys = []
cross_entropies = []
gradients = []

X_state = tf.placeholder(shape=(None, n_input), dtype=tf.float32, name='X')
network_scopes = ['network_1', 'network_2']
logits_1 = create_network(X_state, n_hidden, n_output, network_scopes[0])
logits_2 = create_network(X_state, n_hidden, n_output, network_scopes[1])
networks.append(logits_1)
networks.append(logits_2)

for scope_name, network in zip(network_scopes, networks):
    # Probability of each of the n_output actions.
    output = tf.contrib.layers.softmax(network)
    outputs.append(output)

    # Sample ONE action per row from the policy. tf.multinomial takes
    # unnormalised log-probabilities, so the logits can be passed directly.
    # The sampled index is one-hot encoded so it is a valid target
    # distribution; raw class indices (0..26) are not valid labels for a
    # cross-entropy loss.
    action = tf.multinomial(network, num_samples=1)
    y = tf.one_hot(tf.reshape(action, [-1]), depth=n_output)
    ys.append(y)

    # The actions are mutually exclusive, hence softmax (not sigmoid)
    # cross-entropy.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=network)
    cross_entropies.append(cross_entropy)

    # Restrict the gradients to this network's own variables; otherwise the
    # list would also contain (None, var) pairs for the other network.
    network_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=scope_name + '/')
    grad = optimizer.compute_gradients(cross_entropy, var_list=network_vars)
    gradients.append(grad)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [7]:
# Create the tic-tac-toe environment and grab the observation of a freshly
# reset game; `obs` is used below to check the state encoding.
env = gym.make('tictactoe-v0')
obs = env.reset()

In [8]:
# Sanity check: print the step string generated for every action index
# (3 * 3 * 3 of them) when the move is made by player 2.
for i in range(3 * 3 * 3):
    s = convert_action_to_step(i, player=2)
    print(i, '==>', s)

0 ==> 2000
1 ==> 2001
2 ==> 2002
3 ==> 2010
4 ==> 2011
5 ==> 2012
6 ==> 2020
7 ==> 2021
8 ==> 2022
9 ==> 2100
10 ==> 2101
11 ==> 2102
12 ==> 2110
13 ==> 2111
14 ==> 2112
15 ==> 2120
16 ==> 2121
17 ==> 2122
18 ==> 2200
19 ==> 2201
20 ==> 2202
21 ==> 2210
22 ==> 2211
23 ==> 2212
24 ==> 2220
25 ==> 2221
26 ==> 2222


In [9]:
# Sanity check: one-hot encode the observation of the freshly reset game and
# display the resulting flat vector as the cell output.
convert_game_world(obs)

array([1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0])

In [17]:
# Smoke test of the environment + network plumbing: the two players alternate
# and simply try the board cells in index order until the env reports `done`.
# The network forward pass is evaluated on every new state but its result is
# not used to choose moves yet.
n_games = 1
with tf.Session() as sess:
    init.run()
    for game in range(n_games):
        env = gym.make('tictactoe-v0')
        obs = env.reset()
        turn = 0
        done = False
        game_rewards = 0
        while not done:
            # NOTE(review): if a full sweep over the board never yields
            # `done`, this sweep restarts on already-visited cells - confirm
            # the env always terminates.
            for action in range(n_output):
                player = turn % 2 + 1  # players are numbered 1 and 2
                step = convert_action_to_step(action, player)
                obs, reward, done, info = env.step(step)
                world = convert_game_world(obs)
                # Forward pass through the first network; reads the `outputs`
                # list rather than a loop variable leaked from the graph cell.
                outputs[0].eval(feed_dict={X_state: [world]})
                game_rewards += reward
                turn += 1
                if done:
                    break
        print('Game rewards:', game_rewards)

Game rewards: 1


In [18]:
# Show the final board of the last game played in the previous cell.
env.render()

x o x    - - -    - - -    
o x -    - - -    - - -    
x o -    - - -    - - -    
