# TIC-TAC-TOE Markov Decision Process

## Auxiliary functions to build the model

In [1]:
%run tic_tac_toe_aux_funcs.py
%run ../libs/dynamic_programming.py

## States
- A board is represented with a 3x3 matrix and each cell can be filled with None, X or O, numerically represented with 0, 1 and 2. Hence, the number of possible boards is 3^9 = 19683.
- Each board is uniquely identified by an ID, obtained by reading the flattened board as a base-3 number and converting it to base 10.
- Many boards aren't valid (e.g. the number of Xs minus the number of Os is less than 0 or greater than 1) and many are equivalent to one another (e.g. under flips and rotations).
- Only valid and unique boards are considered states of the MDP. Hence, the number of states is 765.
- To map between IDs and states quickly, two lookup tables are created:
    - **id_to_state** -> each ID is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise. 
        2. If it's terminal the winner, otherwise the next player to play.
        3. Whether a flip is necessary.
        4. Whether rotations are necessary, and how many.
        5. The associated state.
    - **state_to_id** -> each state is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise. 
        2. If it's terminal the winner or the next player to play.
        3. The smallest associated ID. 

In [2]:
# Number of IDs to check: every way of filling 9 cells with 3 symbols.
IDs = 3 ** 9
# id -> state lookup table, one row per ID:
# [board type, winner / next player, flip, rotations, state].
id_state_lkt = np.zeros((IDs, 5), dtype=np.int32)
# Rows of the state -> id lookup table are collected in a plain list and
# converted once after the loop: calling np.append per state copies the whole
# array each time and may silently promote it away from the intended int32.
state_rows = []
# Counter of the states discovered so far; also the index of the next state.
state = 0
for i in range(IDs):
    # Convert the ID to the corresponding board.
    board = id_to_board(i)
    # IDs of the board's flips and rotations; each row is used below as
    # (id, flip, rotations) and the first row carries the representative ID.
    ids = board_to_ids(board)
    # Skip if already evaluated: the whole equivalence class was filled in
    # when its representative (smaller) ID was visited.
    if ids[0, 0] < i:
        continue
    # Get board info: info[0] is the board type, info[1] the winner/next player.
    info = board_info(board)
    if info[0] == -1:
        # Invalid board: flag every equivalent ID as invalid.
        for variant in ids:
            id_state_lkt[variant[0], :] = [-1, -1, -1, -1, -1]
    else:
        # Valid board: every equivalent ID points to the same new state.
        for variant in ids:
            id_state_lkt[variant[0], :] = [info[0], info[1], variant[1], variant[2], state]
        # Record the state together with its representative ID.
        state_rows.append([info[0], info[1], ids[0, 0]])
        # Increase the state counter.
        state += 1
# state -> id lookup table, one row per state:
# [board type, winner / next player, representative ID].
# reshape keeps the (0, 3) shape even if no state was found.
state_id_lkt = np.array(state_rows, dtype=np.int32).reshape(-1, 3)
        
# Number of states
S = state_id_lkt.shape[0]
# Number of actions (one per board cell)
A = 9

# Consistency check between the two lookup tables: starting from the
# representative ID of each state, every equivalent ID must map back to
# that same state in the id -> state table.
for s in range(S):
    variants = board_to_ids(id_to_board(state_id_lkt[s, 2]))
    # The loop variable is deliberately not called `id`, which would shadow
    # the builtin for the rest of the notebook session.
    for variant_id in variants[:, 0]:
        if id_state_lkt[variant_id, 4] != s:
            print("Error in lookup tables")

## Transitions matrix
- The transitions matrix P has dimensions *S x S x A*.
- Use the *Law of Total Probability* to create P.
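- ***P_X(s,s',a)***: the transition from s to s' is deterministically defined by the action a taken by X.
- ***P_O(s,s')***: the transition from s to s' is random; in particular, the opponent's policy is uniform over its available moves.
- ***P = P_X \* P_O***, i.e. for each action a, P(:, :, a) is the matrix product of P_X(:, :, a) and P_O.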

In [3]:
# Extended-precision float type used for all the probability matrices.
# np.longdouble is the portable spelling: np.float128 is only defined on
# platforms whose long double is 128 bits wide, where it is the same type.
prec = np.longdouble
# Create P_X matrix: P_X[s, s', a] = 1 when X's action a moves s to s'.
P_X = np.zeros((S, S, A), dtype=prec)
# Loop for each state.
for s in range(S):
    # Lookup-table entry: board type, winner / next player, representative ID.
    board_type, player, board_id = state_id_lkt[s, :]
    # Terminal states and states where it is not X's turn are left unchanged
    # by every action (self-loop).
    if board_type == 1 or player != 1:
        P_X[s, s, :] = 1.0
        continue
    # Actions available on the representative board of this state.
    legal = get_actions(board_id)
    for a in range(A):
        if a in legal:
            # Playing X (digit 1) in cell a adds 3 ** (8 - a) to the board ID;
            # the id -> state table then gives the resulting state.
            next_state = id_state_lkt[board_id + 3 ** (8 - a), 4]
            P_X[s, next_state, a] = 1.0
        else:
            # Unavailable action: the state does not change.
            P_X[s, s, a] = 1.0

# Create P_O matrix: P_O[s, s'] is the probability that the opponent (O),
# playing uniformly at random, moves the game from s to s'.
P_O = np.zeros((S, S), dtype=prec)
# Loop for each state.
for s in range(S):
    # Lookup-table entry: board type, winner / next player, representative ID.
    board_type, player, board_id = state_id_lkt[s, :]
    # Terminal states and states where it is not O's turn stay where they are.
    if board_type == 1 or player != 2:
        P_O[s, s] = 1.0
        continue
    # Actions available on the representative board of this state.
    legal = get_actions(board_id)
    # Uniform probability of each move, computed directly in `prec`:
    # a plain `1.0 / len(...)` would be rounded to double precision first and
    # the extra precision of the matrix would be lost.
    move_prob = prec(1.0) / len(legal)
    for a in legal:
        # Playing O (digit 2) in cell a adds 2 * 3 ** (8 - a) to the board ID.
        next_state = id_state_lkt[board_id + 2 * (3 ** (8 - a)), 4]
        # Accumulate: different moves can lead to the same state.
        P_O[s, next_state] += move_prob

# Create transitions matrix P: for every action, X's deterministic move
# (a slice of P_X) is followed by the opponent's random reply (P_O).
P = np.zeros((S, S, A), dtype=prec)
for a, x_step in enumerate(np.moveaxis(P_X, 2, 0)):
    P[..., a] = np.matmul(x_step, P_O, dtype=prec)

## Row-stochasticity checks

In [4]:
# Row sums of P_X, one column per action; every row must sum to exactly 1.
sto_X = np.zeros((S, A))
for a in range(A):
    sto_X[:, a] = [np.sum(row) for row in P_X[:, :, a]]
# Number of (state, action) rows that are not stochastic.
print(np.count_nonzero(sto_X != 1.0))

# Row sums of P_O, stored in double precision.
sto_O = np.zeros(S)
sto_O[:] = [np.sum(row) for row in P_O]
# Report every row whose sum is not exactly 1, then how many there are.
for i in np.flatnonzero(sto_O != 1.0).tolist():
    print("i = {}, {}".format(i, sto_O[i]))
print(np.count_nonzero(sto_O != 1.0))

#state = 7
#tot = 0
#for j in P_O[state, :]:
#    if j != 0.0:
#        tot += j
#        print(tot)
#        print(j)
#id = state_id_lkt[state, 2]
#print_board(id_to_board(id))
    
# Row sums of the full transitions matrix P, one column per action.
sto = np.zeros((S, A))
for a in range(A):
    sto[:, a] = [np.sum(row) for row in P[:, :, a]]
# Number of (state, action) rows that are not stochastic.
print(np.count_nonzero(sto != 1.0))

0
0
0


## Rewards matrix
- +1 for an action that leads to a winning board.
- -1 for an action that leads to a losing board.
- 0 for any other actions.
- The expected rewards matrix R has dimensions *S x A*.
- Create a reward vector of dimension S that assigns 1, 0 or -1 to each state according to its type.
- For each state-action pair, take the transition vector P(s, :, a).
- Compute the expected reward R(s, a) as the inner product of these two vectors.


In [5]:
# Extended-precision float type. np.longdouble is the portable spelling:
# np.float128 is only defined on platforms whose long double is 128 bits
# wide, where it is the same type.
prec = np.longdouble
# Compute the rewards vector: +1 for states flagged as won by player 1 (X),
# -1 for states flagged as won by player 2 (O), 0 for every other state.
board_type = state_id_lkt[:, 0]
winner = state_id_lkt[:, 1]
# Board type 0 means the game is still running, so column 1 holds the next
# player rather than a winner and the reward stays 0.
finished = board_type != 0
reward_vector = np.zeros(S, dtype=prec)
reward_vector[finished & (winner == 1)] = 1.0
reward_vector[finished & (winner == 2)] = -1.0

# Create rewards matrix: R[s, a] is the expected reward of the state reached
# from s with action a, i.e. the inner product of P[s, :, a] with the rewards
# vector. One matrix-vector product per action computes a whole column.
# NOTE(review): terminal states are self-loops in P, so R[s, a] equals the
# terminal reward for every action taken in a terminal state s -- confirm the
# solver consuming R does not keep collecting it at every step.
R = np.zeros((S, A), dtype=prec)
for a in range(A):
    R[:, a] = np.matmul(P[:, :, a], reward_vector, dtype=prec)

In [6]:
# Dump the lookup tables and the MDP matrices to files (ndarray.dump pickles
# the array to the given path).
for table, filename in (
    (id_state_lkt, "ttt_id2s.dat"),
    (state_id_lkt, "ttt_s2id.dat"),
    (P, "ttt_P.dat"),
    (R, "ttt_R.dat"),
):
    table.dump(filename)