# TIC-TAC-TOE Dynamic Programming 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%run tic_tac_toe_aux_funcs.py
%run ../libs/dynamic_programming.py

In [2]:
# Reload the lookup tables and the model matrices P and R.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code. Only load .dat files that were written
# locally by a trusted script.
# id_state_lkt is indexed by board id (column 4 is read as a state index);
# state_id_lkt is indexed by state index (column 2 is read as a board id).
id_state_lkt = np.load("ttt_id2s.dat", allow_pickle=True)
state_id_lkt = np.load("ttt_s2id.dat", allow_pickle=True)
P = np.load("ttt_P.dat", allow_pickle=True)
R = np.load("ttt_R.dat", allow_pickle=True)

# Number of states and actions.
S = P.shape[0]
A = R.shape[1]
# The last N_TERMINAL_STATES state indices receive no initial action and an
# initial value of zero. Presumably they are the absorbing end-of-game
# states -- TODO confirm against the script that builds P and R.
N_TERMINAL_STATES = 3

# Model discount factor.
gamma = 1.0
# Seed the (legacy, global) NumPy random generator so the initial policy and
# values below are reproducible.
np.random.seed(42)

# Initial policy: for every non-trailing state draw one of the actions that
# get_actions() reports for that state's board id, instead of a uniform action
# index in [0, A) that would ignore get_actions(). The trailing states keep
# action 0.
p_init = np.zeros(S, np.int32)
for s in range(S - N_TERMINAL_STATES):
    p_init[s] = np.random.choice(get_actions(state_id_lkt[s, 2]))

# Initial values: standard-normal noise, with the trailing states pinned to 0.
v_init = np.random.normal(size=S)
v_init[S - N_TERMINAL_STATES:] = 0.0

## Policy Iteration

In [3]:
# Solve the MDP (P, R, gamma) with policy_iteration from
# ../libs/dynamic_programming.py, seeded with the initial values v_init and
# the initial policy p_init.
# p_star_pi is indexed by state; the cells below read p_star_pi[s] as the
# board position (0-8) to play in state s.
# v_star_pi is presumably the matching state-value vector -- TODO confirm
# against the library.
p_star_pi, v_star_pi = policy_iteration(P, R, gamma, v_init, p_init)

## Value Iteration

In [4]:
# Solve the same MDP with value_iteration from
# ../libs/dynamic_programming.py, passing v_init as the starting values and
# tol=1e-8 as the tolerance (the exact stopping rule lives in the library).
# The call prints progress lines; the two numbers on each line look like the
# largest value change and a count of changed entries for one sweep --
# NOTE(review): confirm against the implementation.
# p_star_vi / v_star_vi are not used by the cells shown in this notebook.
p_star_vi, v_star_vi = value_iteration(P, R, gamma, v_init, tol=1e-8)

63.874054541413163664 2423
16.93256269648531556 1094
4.6081251610375240195 648
1.2390529031882260608 239
0.27301951417574756972 50
0.03810389645839104924 1
0.0 0


In [5]:
# Follow the policy-iteration policy from a mid-game position (state 340).
#
# A board id is read here as a base-3 number in which cell k (0 = top-left,
# row-major) has weight 3 ** (8 - k): adding one weight puts an X on cell k,
# adding two weights puts an O there (the printed boards show this).
#
# The variable is called board_id rather than id so that the builtin id() is
# not shadowed for the rest of the notebook session.
s = 340
board_id = state_id_lkt[s, 2]

# Opponent (O) reply applied before each X move; None means "no reply yet".
for o_move in (None, 0):
    if o_move is not None:
        # Let O answer on cell o_move, then look the resulting board up again
        # to get its state index and that state's stored board id.
        board_id = board_id + 2 * (3 ** (8 - o_move))
        s = id_state_lkt[board_id, 4]
        board_id = state_id_lkt[s, 2]

    # Show the actions get_actions() reports and the current position.
    print(get_actions(board_id))
    board = id_to_board(board_id)
    print_board(board)

    # Show the move chosen by the policy, play it as X, and show the result.
    print(p_star_pi[s])
    board_id = board_id + 3 ** (8 - p_star_pi[s])
    board = id_to_board(board_id)
    print_board(board)

[0 1 4 7 8]
###########
   |   | O 
---+---+---
 O |   | X 
---+---+---
 X |   |   
###########
7
###########
   |   | O 
---+---+---
 O |   | X 
---+---+---
 X | X |   
###########
[1 4 8]
###########
 O |   | O 
---+---+---
 O |   | X 
---+---+---
 X | X |   
###########
1
###########
 O | X | O 
---+---+---
 O |   | X 
---+---+---
 X | X |   
###########


In [6]:
# Follow the policy-iteration policy for a whole game starting from state 0
# (printed as the empty board), against a fixed sequence of O replies.
#
# A board id is read here as a base-3 number in which cell k (0 = top-left,
# row-major) has weight 3 ** (8 - k): adding one weight puts an X on cell k,
# adding two weights puts an O there (the printed boards show this).
#
# The variable is called board_id rather than id so that the builtin id() is
# not shadowed for the rest of the notebook session.
s = 0
board_id = state_id_lkt[s, 2]

# Opponent (O) reply applied before each X move; None means "no reply yet".
for o_move in (None, 1, 2, 5):
    if o_move is not None:
        # Let O answer on cell o_move, then look the resulting board up again
        # to get its state index and that state's stored board id.
        board_id = board_id + 2 * (3 ** (8 - o_move))
        s = id_state_lkt[board_id, 4]
        board_id = state_id_lkt[s, 2]

    # Show the actions get_actions() reports and the current position.
    print(get_actions(board_id))
    board = id_to_board(board_id)
    print_board(board)

    # Show the move chosen by the policy, play it as X, and show the result.
    print(p_star_pi[s])
    board_id = board_id + 3 ** (8 - p_star_pi[s])
    board = id_to_board(board_id)
    print_board(board)

[0 1 2 3 4 5 6 7 8]
###########
   |   |   
---+---+---
   |   |   
---+---+---
   |   |   
###########
0
###########
 X |   |   
---+---+---
   |   |   
---+---+---
   |   |   
###########
[2 3 4 5 6 7 8]
###########
 X | O |   
---+---+---
   |   |   
---+---+---
   |   |   
###########
3
###########
 X | O |   
---+---+---
 X |   |   
---+---+---
   |   |   
###########
[4 5 6 7 8]
###########
 X | O | O 
---+---+---
 X |   |   
---+---+---
   |   |   
###########
4
###########
 X | O | O 
---+---+---
 X | X |   
---+---+---
   |   |   
###########
[6 7 8]
###########
 X | O | O 
---+---+---
 X | X | O 
---+---+---
   |   |   
###########
6
###########
 X | O | O 
---+---+---
 X | X | O 
---+---+---
 X |   |   
###########
