In [1]:
import numpy as np
import gym
import gym_tictactoe as t3
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from os import path

In [9]:
# When True, play_game and the training cell print every move and each game's rewards.
DEBUG = False

# Network dimensions.
# n_input: 27 values x 3 one-hot classes (convert_game_to_x_state binarizes every
# observed cell over the values 0, 1, 2). Presumably the 27 values are the cells of
# a 3x3x3 board -- confirm against gym_tictactoe.
n_input = (3 * 3 * 3) * 3
n_hidden = 1000
# n_output: one logit per action index; convert_action_to_step turns an index into
# three base-3 digits.
n_output = 3 * 3 * 3

# Checkpoint prefix used by saver.save / saver.restore in the training and play cells.
checkpoint_path = './my_tictactoe.ckpt'
# Weight initializer passed to every fully_connected layer of the policy networks.
initializer = tf.contrib.layers.variance_scaling_initializer()

# Learning rate handed to each player's AdamOptimizer.
learning_rate = 0.01

# Shared environment instance; later cells pass it to play_game / play_with_human.
env = gym.make('tictactoe-v0')

In [46]:
from sklearn.preprocessing import LabelBinarizer
# One-hot encoder fitted on the three cell values 0, 1 and 2.
# convert_game_to_x_state uses it to binarize every cell of an observation.
encoder = LabelBinarizer()
encoder.fit(np.array([[0], [1], [2]]))

def convert_game_to_x_state(obs):
    """Turn a board observation into a flat one-hot feature vector.

    The observation is cast to float32 and flattened; each cell is then
    binarized by the module-level ``encoder`` and the per-cell rows are
    concatenated into a single 1-D array.
    """
    # gym_tictactoe now supports int-encoded world
    cells = np.array(obs, dtype=np.float32).flatten()
    # The encoder expects one sample per row, so wrap each cell in its own list.
    column = [[cell] for cell in cells]
    one_hot = encoder.transform(column)
    return one_hot.flatten()

def convert_action_to_step(action, player):
    """Encode a flat action index as the step string handed to ``env.step``.

    The index is written in base 3, left-padded with zeros to at least three
    digits, and prefixed with the player number; e.g. action 5 for player 1
    becomes ``'1012'``.

    Args:
        action: Non-negative action index (anything ``int()`` accepts).
        player: Player number placed in front of the digits.

    Returns:
        ``str(player)`` followed by the base-3 digits of ``action``.

    Raises:
        ValueError: If ``action`` is negative. Floor division of a negative
            integer by 3 never reaches 0, so the digit loop would never end.
    """
    action = int(action)
    if action < 0:
        raise ValueError('action must be non-negative, got {}'.format(action))

    digits = ''
    while action:
        # Peel off the least significant base-3 digit and prepend it.
        action, digit = divmod(action, 3)
        digits = str(digit) + digits

    return str(player) + digits.zfill(3)

def discount_rewards(rewards, discount_rate):
    """Return the discounted cumulative reward for every step of one game.

    Walking backwards from the last step, each entry is the step's reward plus
    a fixed bonus of 0.1, plus ``discount_rate`` times the entry that follows it.

    Args:
        rewards: Sequence of per-step rewards.
        discount_rate: Factor applied to the running total at each step back.

    Returns:
        A float NumPy array with the same length as ``rewards``.
    """
    reward_epsilon = 0.1
    n_steps = len(rewards)
    discounted = np.empty(n_steps)
    running_total = 0
    # Iterate from the final step to the first so later rewards feed earlier ones.
    for idx in range(n_steps - 1, -1, -1):
        running_total = running_total * discount_rate + (rewards[idx] + reward_epsilon)
        discounted[idx] = running_total
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount each game's rewards, then standardize them across all games.

    Every reward list is passed through ``discount_rewards``; the mean and
    standard deviation are computed over all steps of all games, and each
    game's array is returned as ``(values - mean) / std``.

    Args:
        all_rewards: List of per-game reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per game, in the same order as
        ``all_rewards``. When there are no steps at all the discounted arrays
        are returned as they are (all empty); when every step has the same
        discounted value the result is all zeros rather than NaN.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate=discount_rate)
                              for rewards in all_rewards]
    # np.concatenate rejects an empty list, and mean/std of nothing is NaN.
    if not all_discounted_rewards:
        return []
    flat_rewards = np.concatenate(all_discounted_rewards)
    if flat_rewards.size == 0:
        return all_discounted_rewards

    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    # A zero spread would turn the division into NaN/inf, which would then be
    # multiplied into the gradients; dividing by 1 leaves the centred zeros.
    if np.isclose(reward_std, 0.0):
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [4]:
# Build one policy network per player. Each network shares the input placeholder
# X_state and gets its own variables, sampling op, loss, optimizer, gradient
# tensors and "apply fed gradients" training op. Everything is collected in
# parallel lists indexed like player_scopes.
player_scopes = ['player_1', 'player_2']
all_logits = []
outputs = []
ys = []
cross_entropies = []
training_ops = []
all_gradient_placeholders = []
all_gradients = []
optimizers = []

X_state = tf.placeholder(shape=(None, n_input), dtype=tf.float32, name='X')
global_step = tf.Variable(0, trainable=False, name='global_step')

for scope in player_scopes:
    with tf.variable_scope(scope):
        hidden = fully_connected(X_state, n_hidden, activation_fn=tf.nn.relu, weights_initializer=initializer)
        logits = fully_connected(hidden, n_output, activation_fn=None, weights_initializer=initializer)
        all_logits.append(logits)

        output = tf.contrib.layers.softmax(logits)
        outputs.append(output)

        # Sample exactly one action per input row. tf.multinomial takes
        # unnormalized log-probabilities, so the logits can be fed directly
        # (no log(softmax) round trip, which can hit log(0)).
        sampled_action = tf.multinomial(logits, num_samples=1)
        # ys[i] keeps its role for the other cells: element [0][0] is the
        # sampled action index, as a float.
        y = tf.to_float(sampled_action)
        ys.append(y)

        # The cross-entropy target must be a distribution over the actions:
        # use the one-hot of the action that was actually sampled, not the raw
        # sampled indices.
        action_one_hot = tf.one_hot(sampled_action[:, 0], depth=n_output)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=action_one_hot, logits=logits)
        cross_entropies.append(cross_entropy)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        optimizers.append(optimizer)

        # Gradients are fetched per move while playing, then fed back
        # (reward-weighted and averaged) through placeholders at training time.
        trainable_variables = tf.trainable_variables(scope=scope)
        grads_and_vars = optimizer.compute_gradients(cross_entropy, trainable_variables)
        gradients = [grad for grad, variable in grads_and_vars]
        all_gradients.append(gradients)
        gradient_placeholders = []
        grads_and_vars_feed = []

        for grad, variable in grads_and_vars:
            gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
            gradient_placeholders.append(gradient_placeholder)
            grads_and_vars_feed.append((gradient_placeholder, variable))
        training_op = optimizer.apply_gradients(grads_and_vars_feed, global_step=global_step)
        training_ops.append(training_op)
        all_gradient_placeholders.append(gradient_placeholders)


init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Dump the graph once so it can be inspected in TensorBoard.
file_writer = tf.summary.FileWriter('logs', tf.get_default_graph())
file_writer.close()

In [5]:
from timeit import Timer

# Smoke test: let freshly initialized networks play a few games against themselves
# and print the sampled action indices and the total reward of every game.
# Timer().timer is timeit's clock function; it is only used to time each round.
timer = Timer()

n_games = 10
n_rounds = 2
with tf.Session() as sess:
    # Fresh (untrained) weights: nothing is restored from the checkpoint here.
    init.run()
    for r in range(n_rounds):
        # Network used for this round; it picks the moves of BOTH players below.
        gen = r % len(player_scopes)
        # NOTE(review): adv is assigned but never used in this cell.
        adv = (r+1) % 2
        round_rewards = []
        round_gradients = []
        start = timer.timer()
        for i in range(n_games):
            # Rebinds the global env to a new environment for every game.
            env = gym.make('tictactoe-v0')
            obs = env.reset()
            turn = 0
            done = False
            game_rewards = 0
            while not done:
                # Players alternate 1, 2, 1, ...
                player = (turn%2) + 1
                world = convert_game_to_x_state(obs)            
                # NOTE(review): the variable_scope has no influence on sess.run;
                # the fetched tensors are already selected by index.
                with tf.variable_scope(player_scopes[gen]):
                    action, grad_result = sess.run([ys[gen], all_gradients[gen]], feed_dict={X_state: [world]})
                # Only the first sampled value is used as the move.
                action_taken = int(action[0][0])
                step = convert_action_to_step(action_taken, player)
                obs, reward, done, info = env.step(step)
                print(action_taken, end=',')
                game_rewards += reward
                turn += 1
                # Redundant with the loop condition, kept as in the original.
                if done:
                    break
            print(' =', game_rewards)
#             game_gradients = gradients[gen].eval(feed_dict=None)
            round_rewards.append(game_rewards)
            # NOTE(review): grad_result holds only the gradients of the game's last move.
            round_gradients.append(grad_result)
        # NOTE(review): prints the elapsed whole seconds and the reward of the LAST
        # game of the round (game_rewards), not the collected round_rewards.
        print('Game rewards ({}):{}'.format((timer.timer() - start)//1, game_rewards))

23,15,2,20,7,7, = -1
13,26,21,25,6,9,8,24, = 1
20,17,24,3,18,11,3, = -1
26,2,13,13, = -1
24,14,5,16,17,17, = -1
17,0,3,24,14,17, = -1
16,25,26,25, = -1
8,21,26,18,18, = -1
23,21,14,7,23, = -1
17,5,11,25,19,17, = -1
Game rewards (0.0):-1
18,7,2,2, = -1
21,9,10,1,13,10, = -1
12,3,12, = -1
1,21,7,21, = -1
21,20,20, = -1
0,1,4,8,11,11, = -1
21,7,22,20,2,3,25,7, = -1
3,23,23, = -1
7,1,11,8,15,25,23,18,12,12, = -1
3,10,7,18,20,12,9,20, = -1
Game rewards (0.0):-1


In [6]:
# Draw the current board of whatever environment the global env refers to.
env.render()

- x -    x o -    o - -    
- - x    o - -    - - -    
- - -    - - -    x - -    


In [13]:
def _take_turn(env, obs, net, player, label, want_gradients):
    """Sample one move from network ``net`` for ``player`` and apply it to ``env``.

    Uses the notebook-level ``sess``, ``ys``, ``all_gradients`` and ``X_state``.

    Returns:
        ``(obs, reward, done, gradients)`` where the first three come from
        ``env.step`` and ``gradients`` is the list fetched from
        ``all_gradients[net]`` for this move, or ``None`` when
        ``want_gradients`` is false.
    """
    world = convert_game_to_x_state(obs)
    feed = {X_state: [world]}
    if want_gradients:
        # Fetch action and gradients in ONE run so they belong to the same sample.
        action, gradients = sess.run([ys[net], all_gradients[net]], feed_dict=feed)
    else:
        action, gradients = sess.run(ys[net], feed_dict=feed), None
    step = convert_action_to_step(int(action[0][0]), player)
    obs, reward, done, info = env.step(step)
    if DEBUG:
        print('{}({})'.format(label, step), end=' ')
    return obs, reward, done, gradients


def play_game(env, gen, adv, gen_starts=True):
    """Play one game between the learning network and its adversary.

    The two networks alternate moves until the environment reports ``done``.
    Requires an open session bound to the notebook-level name ``sess``.

    Args:
        env: Environment to play in; it is reset first.
        gen: Index (into ``ys`` / ``all_gradients``) of the network being trained.
        adv: Index of the opposing network; its moves are not recorded.
        gen_starts: If true ``gen`` moves first as player 1, otherwise ``adv``
            moves first and ``gen`` is player 2.

    Returns:
        ``(action_rewards, action_gradients)``: for every move made by ``gen``,
        the reward returned by ``env.step`` and the gradients fetched for it.
    """
    gen_player = 1 if gen_starts else 2
    adv_player = 2 if gen_starts else 1
    obs = env.reset()
    done = False
    action_rewards = []
    action_gradients = []

    gen_to_move = gen_starts
    while not done:
        if gen_to_move:
            obs, gen_reward, done, gen_grad_result = _take_turn(
                env, obs, gen, gen_player, 'G', True)
            action_rewards.append(gen_reward)
            action_gradients.append(gen_grad_result)
        else:
            # The adversary's reward and gradients are deliberately ignored.
            obs, _, done, _ = _take_turn(env, obs, adv, adv_player, 'A', False)
        gen_to_move = not gen_to_move

    return action_rewards, action_gradients

In [15]:
# Policy-gradient training of network `gen` against network `adv`.
# Each iteration plays 2 * n_games_per_iteration games (half with the adversary
# moving first, half with `gen` moving first), weights every recorded gradient by
# its normalized discounted reward, averages them and applies one optimizer step.
n_games_per_iteration = 50
n_iterations = 100
discount_rate = 0.95

gen = 0
adv = 1

with tf.Session() as sess:
    # Resume from the checkpoint when one exists, otherwise start from scratch.
    if path.exists(checkpoint_path + '.meta'):
        saver.restore(sess, checkpoint_path)
    else:
        init.run()

    for it in range(n_iterations):
        print('Global step:', global_step.eval())

        # Start every iteration with empty buffers: only games played with the
        # current weights may contribute, and rewards must be discounted once.
        iteration_rewards = []
        iteration_gradients = []

        for gen_starts in (False, True):
            for g in range(n_games_per_iteration):
                game_rewards, game_gradients = play_game(env, gen, adv, gen_starts)
                if DEBUG:
                    print('rewards:', game_rewards)

                iteration_rewards.append(game_rewards)
                iteration_gradients.append(game_gradients)
                print('.', end='')

        # Keep the raw rewards untouched; the normalized copy drives the update.
        normalized_rewards = discount_and_normalize_rewards(iteration_rewards, discount_rate)

        feed_dict = {}
        for var_index, grad_placeholder in enumerate(all_gradient_placeholders[gen]):
            # Mean over every move of every game of (move's reward * move's gradient).
            mean_gradients = np.mean(
                [reward * iteration_gradients[game_index][step][var_index]
                 for game_index, rewards in enumerate(normalized_rewards)
                 for step, reward in enumerate(rewards)],
                axis=0)
            feed_dict[grad_placeholder] = mean_gradients

        sess.run(training_ops[gen], feed_dict=feed_dict)
        saver.save(sess, checkpoint_path)

INFO:tensorflow:Restoring parameters from ./my_tictactoe.ckpt
Global step: 82
....................................................................................................Global step: 83
....................................................................................................Global step: 84
....................................................................................................Global step: 85
....................................................................................................Global step: 86
....................................................................................................Global step: 87
....................................................................................................Global step: 88
....................................................................................................Global step: 89
....................................................................................................Global ste

....................................................................................................Global step: 152
....................................................................................................Global step: 153
....................................................................................................Global step: 154
....................................................................................................Global step: 155
....................................................................................................Global step: 156
....................................................................................................Global step: 157
....................................................................................................Global step: 158
....................................................................................................Global step: 159
................................................................

In [77]:
def play_with_human(env):
    """Play an interactive game: the human moves first, the first network answers.

    Restores the weights from ``checkpoint_path`` into a new session. On each
    round the board is rendered, a move is read from standard input and sent to
    ``env.step`` prefixed with the human's player number (1); unless that ends
    the game, network 0 samples a reply as player 2. The final board is
    rendered once the environment reports ``done``.

    Args:
        env: Environment to play in; it is reset first.

    Note:
        The typed move is forwarded to the environment unchanged apart from
        stripping surrounding whitespace. Presumably it should be the three
        digits produced by ``convert_action_to_step`` (e.g. ``011``) -- confirm
        against gym_tictactoe.
    """
    obs = env.reset()
    done = False
    player = 0
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        while not done:
            env.render()
            # Human turn.
            step = input().strip()
            obs, reward, done, info = env.step('{}{}'.format(player % 2 + 1, step))

            if done:
                break

            player += 1
            # Network turn: ys[0] is the sampling op of the first network.
            world = convert_game_to_x_state(obs)
            action_result = sess.run(ys[0], feed_dict={X_state: [world]})
            action = int(action_result[0][0])
            step = convert_action_to_step(action, player % 2 + 1)
            obs, reward, done, info = env.step(step)

            player += 1
    env.render()

In [78]:
# Start an interactive game; moves are typed into the input prompt.
play_with_human(env)

INFO:tensorflow:Restoring parameters from ./my_tictactoe.ckpt
- - -    - - -    - - -    
- - -    - - -    - - -    
- - -    - - -    - - -    
000
x - -    - - -    - - -    
- - -    - - -    - o -    
- - -    - - -    - - -    
011
x - -    - - -    - - -    
- x -    - - -    - o -    
- - -    - - -    - - -    
