# TIC-TAC-TOE Markov Decision Process

## Auxiliary functions to build the model

In [1]:
%run tic_tac_toe_aux_funcs.py
%run ../libs/dynamic_programming.py

## States
- A board is represented with a 3x3 matrix and each cell can be filled with None, X or O, numerically represented with 0, 1 and 2. Hence, the number of possible boards is 3^9 = 19683.
- Each board is uniquely identified by an ID, obtained by reading the flattened board as a base-3 number and converting it to base 10.
- Many boards aren't valid (e.g. the number of Xs minus the number of Os is less than 0 or greater than 1) and many are equivalent to each other (e.g. up to flips and rotations).
- Only valid and unique boards are considered states of the MDP. Hence, the number of states is 765.
- Two lookup tables are created to map between IDs and states and to speed up these lookups:
    - **id_to_state** -> each ID is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise. 
        2. If it's terminal the winner, otherwise the next player to play.
        3. If it's necessary to make flip.
        4. If it's necessary to make rotations and how many.
        5. The associated state.
    - **state_to_id** -> each state is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise. 
        2. If it's terminal the winner or the next player to play.
        3. The smallest associated ID. 

In [2]:
# Build the two lookup tables that relate board IDs to MDP states.
#
# id_state_lkt: one row per board ID with columns
#   [board type, winner / next player, flip flag, rotation count, state index]
# state_id_lkt: one row per state with columns
#   [board type, winner / next player, smallest associated ID]

# Number of IDs to check (every base-3 encoding of a 3x3 board).
IDs = 3 ** 9
id_state_lkt = np.zeros((IDs, 5), dtype=np.int32)
# Rows of state_to_id are collected in a plain list and converted once at the
# end; appending to the array inside the loop reallocates it for every state.
state_rows = []

# Temporary markers stored in the state column of terminal boards. They are
# replaced by the real terminal state indices once the number of
# non-terminal states is known.
DRAW_TAG, WIN_TAG, LOSS_TAG = -2, -3, -4

# Running count of non-terminal states; also the index of the next new state.
state = 0
for i in range(IDs):
    # Convert the ID to the corresponding board.
    board = id_to_board(i)
    # IDs of all rotations and symmetries of the board.
    ids = board_to_ids(board)
    # Skip boards whose equivalence class was already processed through a
    # smaller ID.
    if ids[0, 0] < i:
        continue
    info = board_info(board)

    if info[0] == -1:
        # Invalid board: flag every equivalent ID as invalid.
        for sym in ids:
            id_state_lkt[sym[0], :] = [-1, -1, -1, -1, -1]
        continue

    if info[0] == 0:
        # Non-terminal board.
        if info[1] == 1:
            # X's turn: this board is a state of the MDP.
            tag = state
            state_rows.append([info[0], info[1], ids[0, 0]])
            state += 1
        else:
            # O's turn: not a state.
            tag = -1
    elif info[1] == 0:
        # Terminal board, draw.
        tag = DRAW_TAG
    elif info[1] == 1:
        # Terminal board, win.
        tag = WIN_TAG
    else:
        # Terminal board, loss.
        tag = LOSS_TAG

    # Every equivalent ID shares the board info and the state, but keeps its
    # own flip / rotation entries.
    for sym in ids:
        id_state_lkt[sym[0], :] = [info[0], info[1], sym[1], sym[2], tag]

# Add the three terminal states after all non-terminal ones.
state_rows.append([1, 0, -1])  # index: state      (draw)
state_rows.append([1, 1, -2])  # index: state + 1  (win)
state_rows.append([1, 2, -3])  # index: state + 2  (loss)
# A single append onto the empty int32 table, so the resulting dtype follows
# the same promotion as appending the rows one at a time.
state_id_lkt = np.append(np.empty([0, 3], dtype=np.int32), state_rows, axis=0)

# Replace the temporary markers with the terminal state indices.
id_state_lkt[(id_state_lkt[:, -1] == DRAW_TAG), -1] = state
id_state_lkt[(id_state_lkt[:, -1] == WIN_TAG), -1] = state + 1
id_state_lkt[(id_state_lkt[:, -1] == LOSS_TAG), -1] = state + 2

# Number of states.
S = state_id_lkt.shape[0]
# Number of actions.
A = 9

## Transitions matrix
- The transitions matrix P has dimensions *S x S x A*.
- Use the *Law of Total Probability* to create P.
- ***P_X(s,s',a)***: the transition from s to s' is deterministically defined by the action a taken.
- ***P_O(s,s')***: the transition from s to s' is random; in particular, the opponent's policy is uniform.
- ***P = P_X \* P_O***

In [3]:
# Extended-precision float type. np.longdouble is the portable spelling:
# np.float128 is only defined on platforms where long double has that size.
prec = np.longdouble

# Create transitions matrix P, indexed as P[s, s', a].
P = np.zeros((S, S, A), dtype=prec)
for s in range(S):
    info = state_id_lkt[s]
    if info[0] == 1:
        # Terminal state: all actions lead here.
        P[s, s, :] = 1.0
        continue

    # Not a terminal board: which cells can X play?
    actions_X = get_actions(info[2])
    # SANITY CHECK
    if len(actions_X) == 0:
        raise RuntimeError("ERROR IN P GENERATION: NO POSSIBLE ACTIONS FOR X.")

    for a_X in range(A):
        if a_X not in actions_X:
            # Impossible action: stay here.
            P[s, s, a_X] = 1.0
            continue

        # ID of the board after X plays a_X (X is the base-3 digit 1).
        id_X = info[2] + (3 ** (8 - a_X))

        if id_state_lkt[id_X, 0] == 1:
            # X's move ended the game.
            outcome = id_state_lkt[id_X, 1]
            if outcome == 0:
                # Draw.
                P[s, S - 3, a_X] = 1.0
            elif outcome == 1:
                # Win.
                P[s, S - 2, a_X] = 1.0
            else:
                raise RuntimeError("ERROR IN P GENERATION: TERMINAL STATE INVALID.")
            continue

        # The game goes on: O answers uniformly at random.
        actions_O = get_actions(id_X)
        if len(actions_O) == 0:
            raise RuntimeError("ERROR IN P GENERATION: EMPTY O ACTIONS.")
        p_O = 1.0 / float(len(actions_O))
        for a_O in actions_O:
            # ID of the board after O plays a_O (O is the base-3 digit 2).
            id_O = id_X + 2 * (3 ** (8 - a_O))
            sp = id_state_lkt[id_O, 4]
            # SANITY CHECK
            if sp < 0:
                raise RuntimeError(
                    "ERROR IN P GENERATION: NEXT STATE {} INVALID.".format([id_O, sp])
                )
            # Different replies by O can reach the same state, so accumulate.
            P[s, sp, a_X] += p_O

## Row-stochasticity checks

In [4]:
# For every (state, action) pair the probabilities over next states must sum
# to one. sto[s, a] holds that sum.
sto = P.sum(axis=1)
# Count the pairs that violate row-stochasticity. The comparison uses a
# tolerance because the entries are sums of floating-point fractions.
print(np.count_nonzero(~np.isclose(sto, 1.0)))

0


## Rewards matrix
- +1 for an action that leads to a winning board.
- -1 for an action that leads to a losing board.
- 0 for any other actions.
- The expected rewards matrix R has dimensions *S x A*.
- Create a reward vector of dimension S that, for each state, holds 1, 0 or -1 depending on its type.
- For each state-action pair, take the transitions vector P(s, :, a).
- Compute the expected reward for the state s as the inner product of the two former vectors.


In [5]:
# Extended-precision float type. np.longdouble is the portable spelling:
# np.float128 is only defined on platforms where long double has that size.
prec = np.longdouble

# Reward received on entering each state.
# Standard reward: 0 everywhere except the win / loss terminal states.
reward_vector = np.zeros(S, dtype=prec)
# Alternative that favours the least possible number of moves:
# reward_vector = np.full(S, -1.0, dtype=prec)
# The last two states are the terminal win and loss states.
reward_vector[-2] = 1.0
reward_vector[-1] = -1.0

# Expected rewards matrix, indexed as R[s, a].
R = np.zeros((S, A), dtype=prec)
for s in range(S):
    info = state_id_lkt[s]
    if info[0] == 1:
        # Terminal states keep a reward of 0 for every action.
        continue
    # Actions that are valid in this state.
    actions = get_actions(info[2])
    for a in range(A):
        if a in actions:
            # Expected reward: inner product of P(s, :, a) with the rewards.
            R[s, a] = np.matmul(np.copy(P[s, :, a]), reward_vector, dtype=prec)
        else:
            # Soft constraint: an invalid move is penalised like a loss.
            R[s, a] = -1.0

In [6]:
# Persist the lookup tables and the MDP matrices, one file per array.
for array, path in (
    (id_state_lkt, "ttt_id2s.dat"),
    (state_id_lkt, "ttt_s2id.dat"),
    (P, "ttt_P.dat"),
    (R, "ttt_R.dat"),
):
    array.dump(path)