# TIC-TAC-TOE Markov Decision Process

## Auxiliary functions to build the model

In [2]:
import numpy as np

def board_to_id(board):
    """Takes a board matrix and returns a base-10 board ID.

    The flattened board is read as a 9-digit base-3 number whose most
    significant digit is the first cell (row 0, column 0).

    Args:
        board: NumPy array with 9 cells (3x3) holding the values 0, 1 or 2.

    Returns:
        The board ID as a Python int.
    """
    board_id = 0
    for pos, cell in enumerate(board.flatten()):
        # Convert the cell to a Python int BEFORE multiplying: a NumPy scalar
        # of a narrow dtype (e.g. the int8 boards built by id_to_board) cannot
        # represent 3 ** 8 = 6561, so arithmetic done on the NumPy scalar
        # itself can fail with an overflow error.
        board_id += int(cell) * 3 ** (8 - pos)
    return board_id

def board_to_ids(board):
    """Returns the IDs of every rotation and mirror image of a board.

    Args:
        board: 3x3 board matrix.

    Returns:
        A 2-D integer array with one row ``[id, flip, rot]`` per distinct ID,
        sorted by ID. ``flip`` is 1 when the board was mirrored left-right
        before rotating and 0 otherwise; ``rot`` is the number of 90-degree
        rotations (``np.rot90``) applied. When several transformations give
        the same ID, only the first one found is kept (unflipped before
        flipped, fewer rotations first).
    """
    seen = set()
    rows = []
    # Variant 0 is the board itself, variant 1 its left-right mirror image.
    for flip, variant in enumerate((board, np.fliplr(board))):
        for rot in range(4):
            board_id = board_to_id(np.rot90(variant, rot))
            # Keep only the first transformation that yields a given ID.
            if board_id in seen:
                continue
            seen.add(board_id)
            rows.append([board_id, flip, rot])
    # IDs are unique at this point, so ordering by ID alone is unambiguous.
    rows.sort(key=lambda row: row[0])
    return np.array(rows)

def id_to_board(id):
    """Decodes a base-10 board ID into a board matrix.

    Args:
        id: board ID, i.e. the base-10 value of the 9 base-3 digits of the
            flattened board.

    Returns:
        A 3x3 ``np.int8`` array whose cells are the base-3 digits of ``id``,
        most significant digit first.
    """
    # Base-3 representation, left-padded with zeros to exactly 9 digits.
    ternary = np.base_repr(id, base=3)
    padded = ternary.zfill(9)
    cells = np.array([int(digit) for digit in padded], dtype=np.int8)
    return cells.reshape(3, 3)

def find_win(board, marker):
    """Tells whether `marker` fills a whole row, column or diagonal.

    Args:
        board: 3x3 board matrix (NumPy array).
        marker: cell value to look for.

    Returns:
        True if at least one row, column, the main diagonal or the
        anti-diagonal consists only of `marker`; False otherwise.
    """
    # Gather every line to inspect: the rows, the columns (rows of the
    # transpose), the main diagonal and the anti-diagonal (the main diagonal
    # of the left-right mirrored board).
    lines = [
        *board,
        *board.T,
        np.diagonal(board),
        np.diagonal(np.fliplr(board)),
    ]
    return any(np.all(line == marker) for line in lines)

def board_info(board):
    """Classifies a board and reports its winner or the next player.

    Args:
        board: 3x3 board matrix with cells 0 (empty), 1 (X) or 2 (O).

    Returns:
        A pair ``(type, player)``:
        - ``(-1, -1)``: invalid board (the X/O count difference is not 0 or 1,
          or the player who did not move last has three in a row);
        - ``(1, p)``: terminal board won by player ``p`` (1 or 2);
        - ``(1, 0)``: terminal board with five Xs and no winner;
        - ``(0, p)``: non-terminal board, ``p`` is the next player to move.
    """
    x_count = np.count_nonzero(board == 1)
    o_count = np.count_nonzero(board == 2)
    lead = x_count - o_count
    # X has either one marker more than O (X moved last) or the same number
    # (O moved last); any other difference is an invalid board.
    if lead not in (0, 1):
        return -1, -1
    last_mover, next_mover = (1, 2) if lead == 1 else (2, 1)
    # A line completed by the player who did NOT move last makes it invalid.
    if find_win(board, next_mover):
        return -1, -1
    if find_win(board, last_mover):
        return 1, last_mover
    # Five Xs with X having moved last means the board is full.
    if lead == 1 and x_count == 5:
        return 1, 0
    return 0, next_mover

def map_action(action, flip, rot):
    """Maps an action through a flip and a number of rotations.

    Args:
        action: flat index (0-8) of a cell on the 3x3 board.
        flip: 1 to mirror the board left-right first; any other value skips
            the mirroring.
        rot: number of 90-degree rotations applied with ``np.rot90``.

    Returns:
        A pair ``(new_action, new_indices)``: the flat index of the cell after
        the transformation and the corresponding ``(row, column)`` indices.
    """
    # Mark the chosen cell with a single 1 on an otherwise empty board.
    marker = np.zeros(9, dtype=np.int32)
    marker[action] = 1
    grid = marker.reshape(3, 3)
    if flip == 1:
        grid = np.fliplr(grid)
    # After rotating, the only non-zero cell is where the action ended up.
    new_action = np.rot90(grid, rot).argmax()
    return new_action, np.unravel_index(new_action, (3, 3))

def get_actions(id):
    """Takes a board ID and returns the possible actions to be taken.

    Args:
        id: base-10 board ID.

    Returns:
        1-D array with the flat indices (0-8) of the empty cells of the
        board, in increasing order. The array is empty for a full board.
    """
    flat_board = id_to_board(id).flatten()
    # Compare against 0 (the empty-cell value) rather than the board minimum:
    # on a full board the minimum is 1, which would wrongly report the cells
    # already holding a marker as playable.
    return np.flatnonzero(flat_board == 0)

def print_board(B):
    """Prints a 3x3 board to standard output as an ASCII grid.

    Cells equal to 1 are drawn as X, cells equal to 2 as O and anything else
    as blank.

    Args:
        B: board matrix indexable as ``B[i, j]`` for i, j in 0..2.
    """
    rows = [
        "|".join(
            " X " if B[i, j] == 1 else " O " if B[i, j] == 2 else "   "
            for j in range(3)
        )
        for i in range(3)
    ]
    # A separator goes between rows only, never after the last one.
    print("\n---+---+---\n".join(rows))

In [3]:
# Sample board: 1s on the main diagonal, 0s elsewhere.
B = np.eye(3, dtype=np.int32)

## States
- A board is represented with a 3x3 matrix and each cell can be filled with None, X or O, numerically represented with 0, 1 and 2. Hence, the number of possible boards is 3^9 = 19683.
- Each board is uniquely identified with an ID, obtained by the conversion in base-10 of the flattened board.
- Many boards aren't valid (e.g. the number of Xs minus number of Os is less than 0 or greater than 1) and many are equal (e.g. flip and rotation).
- Only valid and unique boards are considered states of the MDP. Hence, the number of states is 765.
- To map IDs to states quickly, two lookup tables are created:
    - **id_to_state** -> each ID is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise.
        2. If it's terminal, the winner; otherwise, the next player to play.
        3. Whether a flip is necessary.
        4. Whether rotations are necessary and, if so, how many.
        5. The associated state.
    - **state_to_id** -> each state is associated with the following information:
        1. The type of the board: -1 not valid, 1 terminal, 0 otherwise.
        2. If it's terminal, the winner; otherwise, the next player to play.
        3. The smallest associated ID.

In [1]:
# Number of IDs to check: 9 cells with 3 possible values each.
IDs = 3 ** 9
# id_to_state lookup table: one row per ID holding
# [board type, winner / next player, flip, rotations, state].
id_state_lkt = np.zeros((IDs, 5), dtype=np.int32)
# Rows of the state_to_id lookup table, [board type, winner / next player,
# smallest ID]. They are collected in a list and converted once at the end,
# which avoids copying the whole table on every np.append call and keeps the
# declared int32 dtype.
state_rows = []
# Loop for each ID.
state = 0
for i in range(IDs):
    # Convert the ID to the relative board.
    board = id_to_board(i)
    # Compute the board rotations and symmetries (sorted by ID).
    ids = board_to_ids(board)
    # Skip if already evaluated: a smaller equivalent ID was visited earlier
    # and filled the rows of its whole equivalence class.
    if ids[0, 0] < i:
        continue
    # Get board info.
    board_type, player = board_info(board)
    if board_type == -1:
        # Invalid board: mark every equivalent ID as invalid.
        id_state_lkt[ids[:, 0]] = -1
    else:
        # Populate the id_to_state lookup table. The loop variables are named
        # explicitly so that the builtin `id` is not shadowed at module level.
        for sym_id, flip, rot in ids:
            id_state_lkt[sym_id] = [board_type, player, flip, rot, state]
        # Populate the state_to_id lookup table.
        state_rows.append([board_type, player, ids[0, 0]])
        # Increase the state counter.
        state += 1

# reshape keeps the (n, 3) shape even if no state was collected.
state_id_lkt = np.array(state_rows, dtype=np.int32).reshape(-1, 3)

# Number of states
S = state_id_lkt.shape[0]
# Number of actions
A = 9

NameError: name 'np' is not defined

## Transitions matrix
- The transitions matrix P has dimensions *S x S x A*.
- Use the *Law of Total Probability* to create P.
- ***P_X(s,s',a)***: the transition from s to s' is deterministically defined by the action a taken.
- ***P_O(s,s')***: the transition from s to s' is random; in particular, the opponent's policy is uniform.
- ***P = P_X \* P_O***

In [5]:
# P_X[s, s', a]: transition produced by the action a.
# The steps below are not implemented in this cell, so P_X stays all zeros:
#   for each state:
#     get the board from the state;
#     for each action:
#       compute the new board;
#       get the next state from the new board;
#       set P_X(s, s', a) = 1.
P_X = np.zeros((S, S, A))

# P_O[s, s']: transition produced by the opponent's move.
# The steps below are not implemented in this cell, so P_O stays all zeros:
#   for each state:
#     get the board from the state;
#     get the possible actions and their number (size);
#     for each action:
#       compute the new board;
#       get the next state from the new board;
#       set P_O(s, s') = 1 / size.
P_O = np.zeros((S, S))

# Transitions matrix P: for every action, chain the two steps with a matrix
# product, P[:, :, a] = P_X[:, :, a] . P_O.
P = np.zeros_like(P_X)
for a in range(A):
    P[..., a] = P_X[..., a] @ P_O

## Rewards matrix
- +1 for an action that leads to a winning board.
- -1 for an action that leads to a losing board.
- 0 for any other actions.
- The expected rewards matrix R has dimensions *S x A*.
- Create a reward vector of dimension S whose entry for each state is 1, 0 or -1 according to its type.
- For each state-action pair, take the transition vector P(s, :, a).
- Compute the expected reward R(s, a) as the inner product of these two vectors.


In [6]:
# Rewards vector with one entry per state.
# The steps below are not implemented in this cell, so it stays all zeros:
#   for each terminal state:
#     set 1 if X is the winner, -1 if X is the loser.
reword_vector = np.zeros(S)

# Expected rewards matrix with one entry per (state, action) pair.
# The steps below are not implemented in this cell, so it stays all zeros:
#   for each state:
#     for each action:
#       R(s,a) = np.matmul(P[s, :, a], reword_vector, dtype=np.float64)
R = np.zeros((S, A))

In [None]:
# Dump the data to files.
# P.dump("tic_tac_toe_P.dat")
# R.dump("tic_tac_toe_R.dat")