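# Tic-Tac-Toe: Dynamic Programming

Loads the precomputed tic-tac-toe transition (`P`) and reward (`R`) matrices, solves the game with policy iteration and value iteration, and inspects the resulting policies on a sample state.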

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%run tic_tac_toe_aux_funcs.py
%run ../libs/dynamic_programming.py

In [2]:
# Reload the lookup tables and the P and R matrices previously saved to disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code on load. Only open .dat files that were generated locally.
id_state_lkt = np.load("ttt_id2s.dat", allow_pickle=True)
state_id_lkt = np.load("ttt_s2id.dat", allow_pickle=True)
P = np.load("ttt_P.dat", allow_pickle=True)
R = np.load("ttt_R.dat", allow_pickle=True)

# Number of states and actions, read off the matrix shapes.
S = P.shape[0]
A = R.shape[1]
# The last N_TERMINAL state indices receive no random initial action and a zero
# initial value (presumably the terminal states of the game -- TODO confirm
# against the script that built P and R). Named once so the two uses below
# cannot drift apart.
N_TERMINAL = 3
# Model discount factor.
gamma = 1.0
# Seed the (legacy, global) NumPy random generator for reproducibility.
np.random.seed(42)

# Initial policy: for every non-trailing state pick uniformly among the actions
# returned by get_actions for that state's lookup entry (column 2 of its row).
# The trailing N_TERMINAL entries stay at action 0.
# (A simpler alternative that ignores get_actions: np.random.randint(A, size=S).)
p_init = np.zeros(S, dtype=np.int32)
for s in range(S - N_TERMINAL):
    info = state_id_lkt[s]
    p_init[s] = np.random.choice(get_actions(info[2]))

# Initial values: standard-normal noise, with the trailing N_TERMINAL states
# pinned to zero. (An all-zeros start, np.zeros(S), is the other obvious choice.)
v_init = np.random.normal(size=S)
v_init[-N_TERMINAL:] = 0.0

## Policy Iteration

In [3]:
# Run policy iteration starting from the random initial values and policy built
# above; the two results are kept as p_star_pi (indexed by state below to read
# off an action) and v_star_pi.
# policy_iteration is presumably defined in ../libs/dynamic_programming.py,
# which is pulled in with %run in the first cell -- verify.
# NOTE(review): if policy_iteration updates v_init or p_init in place, later
# cells that reuse v_init start from modified values -- confirm, and pass
# copies if so.
p_star_pi, v_star_pi = policy_iteration(P, R, gamma, v_init, p_init)

## Value Iteration

In [4]:
# Run value iteration from the same initial values, with tolerance 1e-8; the
# two results are kept as p_star_vi (indexed by state below to read off an
# action) and v_star_vi.
# value_iteration is presumably defined in ../libs/dynamic_programming.py,
# which is pulled in with %run in the first cell -- verify.
# NOTE(review): the per-iteration lines printed below come from inside
# value_iteration; their exact meaning is not visible from this notebook.
p_star_vi, v_star_vi = value_iteration(P, R, gamma, v_init, tol=1e-8)

1438.0845641233392748 338
800.1266680249536291 119
139.00629099475484159 40
16.327946327194622057 6
0.0037220509341977653373 1
0.0 0


In [5]:
# Inspect what the policy-iteration result plays from one sample state.
s = 74
# Column 2 of the lookup row is the integer that get_actions and id_to_board
# accept. Named board_id so the builtin id() is not shadowed.
board_id = state_id_lkt[s, 2]
print(get_actions(board_id))
board = id_to_board(board_id)
print_board(board)
# Action chosen for this state by policy iteration.
action = p_star_pi[s]
print(action)
# Apply the move directly on the integer encoding: adding 3 ** (8 - action)
# puts an X in the chosen cell in the recorded output (presumably a base-3
# board encoding with cell 0 as the most significant digit -- TODO confirm).
new_board_id = board_id + 3 ** (8 - action)
new_board = id_to_board(new_board_id)
print_board(new_board)

[0 1 3 4 8]
###########
   |   | X 
---+---+---
   |   | O 
---+---+---
 X | O |   
###########
0
###########
 X |   | X 
---+---+---
   |   | O 
---+---+---
 X | O |   
###########


In [6]:
# Inspect what the value-iteration result plays from one sample state.
s = 74
# Column 2 of the lookup row is the integer that get_actions and id_to_board
# accept. Named board_id so the builtin id() is not shadowed.
board_id = state_id_lkt[s, 2]
print(get_actions(board_id))
board = id_to_board(board_id)
print_board(board)
# Action chosen for this state by value iteration.
action = p_star_vi[s]
print(action)
# Apply the move directly on the integer encoding: adding 3 ** (8 - action)
# puts an X in the chosen cell in the recorded output (presumably a base-3
# board encoding with cell 0 as the most significant digit -- TODO confirm).
new_board_id = board_id + 3 ** (8 - action)
new_board = id_to_board(new_board_id)
print_board(new_board)

[0 1 3 4 8]
###########
   |   | X 
---+---+---
   |   | O 
---+---+---
 X | O |   
###########
0
###########
 X |   | X 
---+---+---
   |   | O 
---+---+---
 X | O |   
###########


In [7]:
# Check by hand what happens when a specific action is played from state s:
# show the board before and after, the lookup row of the resulting board, the
# transition probabilities into the resulting state, and the rewards for s.
s = 74
action = 4
# Column 2 of the lookup row is the integer that id_to_board accepts. Named
# board_id so the builtin id() is not shadowed.
board_id = state_id_lkt[s, 2]
board = id_to_board(board_id)
print_board(board)
# Apply the move on the integer encoding without mutating board_id, so the
# cell gives the same result every time it is re-run.
new_board_id = board_id + 3 ** (8 - action)
new_board = id_to_board(new_board_id)
print_board(new_board)
new_info = id_state_lkt[new_board_id]
print(new_info)
# The last entry of the lookup row is taken to be the index of the resulting
# state. For s = 74 and action = 4 the recorded row is [1 1 0 0 339], i.e. the
# value 339 that was previously typed in by hand -- TODO confirm the column's
# meaning against the script that built the lookup table.
next_s = new_info[-1]
# Probability, per action, of moving from s to next_s.
print(P[s, next_s, :])
# Reward, per action, in state s.
print(R[s, :])

###########
   |   | X 
---+---+---
   |   | O 
---+---+---
 X | O |   
###########
###########
   |   | X 
---+---+---
   | X | O 
---+---+---
 X | O |   
###########
[  1   1   0   0 339]
[0. 0. 0. 0. 1. 0. 0. 0. 0.]
[  0.   0.  -1.   0. 100.  -1.  -1.  -1.   0.]
