#### Game Setup

In [204]:
import numpy as np
import random

def print_board(board):
    mapping = {0:'.',1:'o',2:'x'}
    x = list(map(lambda x: mapping[x] if x in mapping else x, board))
    print('\n')
    print(x[0],'|',x[1],'|',x[2])
    print('---------')
    print(x[3],'|',x[4],'|',x[5])
    print('---------')
    print(x[6],'|',x[7],'|',x[8])

def generate_random_board():
    game_history, status = play_random_game()
    boards = [events[-1] for events in game_history]
    return random.choice(boards)

def permissible_actions(x):
    return np.where(x == 0)[0]

def random_action(x):
    return np.random.choice(permissible_actions(x))

def update_board(board,action_idx,player_idx):
    board_new = board.copy()
    if player_idx == 0:
        board_new[action_idx] = 1
    else:
        board_new[action_idx] = 2
    return board_new
    
def check_status(board):
    # 0 if player 1 wins, 1 if player 2 wins, 2 if draw, and 3 if incomplete
    status = 3
    board_string = ''.join(map(str, board))
    if '0' not in board_string:
        status = 2
    numbers = ['123','456','789','147','258','369','159','357']
    for number in numbers:
        indices = [int(number[0])-1,int(number[1])-1,int(number[2])-1]
        subset_board = ''.join(board_string[i] for i in indices)
        if subset_board  in ['111']:
            status = 0
        elif subset_board in ['222']:
            status = 1
    return status

def visualize_game(list_x):
    for x in list_x:
        print_board(x[-1])
        
def play_random_game(verbose = 0):
    board = np.array([0,0,0,0,0,0,0,0,0])
    game_history = []
    turn_count = 0
    player_idx = 0
    status = 3
    while True: # incomplete
            
        # check status
        status = check_status(board)
        if status != 3:
            status_final = status
            break
        turn_count += 1
            
        # take random action and update board
        action_idx = random_action(board)
        board_new = update_board(board,action_idx,player_idx)
        game_history.append([board, status, turn_count, player_idx, action_idx, board_new])
        board = board_new.copy()
        
        # switch player
        if player_idx == 0:
            player_idx = 1
        else:
            player_idx = 0
        
    if verbose == 1:
        visualize_game(game_history)
        
    return game_history, status

In [205]:
generate_random_board()

[array([0, 0, 1, 0, 0, 0, 0, 0, 0]), array([0, 0, 1, 0, 0, 0, 0, 2, 0]), array([1, 0, 1, 0, 0, 0, 0, 2, 0]), array([1, 0, 1, 0, 0, 0, 2, 2, 0]), array([1, 0, 1, 0, 0, 0, 2, 2, 1]), array([1, 2, 1, 0, 0, 0, 2, 2, 1]), array([1, 2, 1, 1, 0, 0, 2, 2, 1]), array([1, 2, 1, 1, 0, 2, 2, 2, 1]), array([1, 2, 1, 1, 1, 2, 2, 2, 1])]


array([0, 0, 1, 0, 0, 0, 0, 2, 0])

In [198]:
game_history, status = play_random_game()
print(game_history)

[[array([0, 0, 0, 0, 0, 0, 0, 0, 0]), 3, 1, 0, 5, array([0, 0, 0, 0, 0, 1, 0, 0, 0])], [array([0, 0, 0, 0, 0, 1, 0, 0, 0]), 3, 2, 1, 1, array([0, 2, 0, 0, 0, 1, 0, 0, 0])], [array([0, 2, 0, 0, 0, 1, 0, 0, 0]), 3, 3, 0, 0, array([1, 2, 0, 0, 0, 1, 0, 0, 0])], [array([1, 2, 0, 0, 0, 1, 0, 0, 0]), 3, 4, 1, 2, array([1, 2, 2, 0, 0, 1, 0, 0, 0])], [array([1, 2, 2, 0, 0, 1, 0, 0, 0]), 3, 5, 0, 8, array([1, 2, 2, 0, 0, 1, 0, 0, 1])], [array([1, 2, 2, 0, 0, 1, 0, 0, 1]), 3, 6, 1, 6, array([1, 2, 2, 0, 0, 1, 2, 0, 1])], [array([1, 2, 2, 0, 0, 1, 2, 0, 1]), 3, 7, 0, 3, array([1, 2, 2, 1, 0, 1, 2, 0, 1])], [array([1, 2, 2, 1, 0, 1, 2, 0, 1]), 3, 8, 1, 7, array([1, 2, 2, 1, 0, 1, 2, 2, 1])], [array([1, 2, 2, 1, 0, 1, 2, 2, 1]), 3, 9, 0, 4, array([1, 2, 2, 1, 1, 1, 2, 2, 1])]]


In [335]:
#### Random Play

In [64]:
def simulate_games(N):
    data = [] # board, action, win/loss
    data2 = []
    for i in range(N):
        game_history, status_final = play_random_game()
        for event in game_history:
            board, status, turn_count, player_idx, action_idx, board_new = event
            if status_final == 2:
                score = 0
            elif status_final == player_idx:
                score = +1
            else:
                score = -1
            data.append([i, ''.join(map(str, board)), turn_count, player_idx, action_idx, score, status_final])
        data2.append(status_final)
    return data, data2
    
data, data2  = simulate_games(10000)

In [65]:
import pandas as pd
pd.DataFrame(data2).value_counts()

0    5792
1    2923
2    1285
dtype: int64

In [66]:
import pandas as pd
df = pd.DataFrame(data, columns = ['Game','Board', 'Turn','Player','Action', 'Score', 'Result'])
df.head()

Unnamed: 0,Game,Board,Turn,Player,Action,Score,Result
0,0,0,1,0,3,1,0
1,0,100000,2,1,8,-1,0
2,0,100002,3,0,6,1,0
3,0,100102,4,1,7,-1,0
4,0,100122,5,0,2,1,0


In [67]:
df.nunique()

Game      10000
Board      4514
Turn          9
Player        2
Action        9
Score         3
Result        3
dtype: int64

#### Minmax Algorithm

- maximize: find the best value of an action given child states
- minimize: find the lowest value of an action given child states
- terminal node: returns the value of an action

In [68]:
def generate_child(board, player_idx):
    possible_actions = permissible_actions(board)
    childs = []
    for action_idx in possible_actions:
        childs.append(update_board(board, action_idx, player_idx))
        #print_board(update_board(board, action_idx, player_idx))
    return childs
    
def compute_max_value(board, player_idx, index = False):
    
    if player_idx == 0:
        other_player_idx = 1
    else:
        other_player_idx = 0
    status = check_status(board)
    if status == player_idx:
        return +1
    elif status == other_player_idx:
        return -1
    elif status == 2:
        return 0
    else:
        # find child boards
        childs = generate_child(board, player_idx)

        # find max value for opponent
        max_value = np.max([compute_max_value(child, other_player_idx) for child in childs])    
        max_index = np.argmax([compute_max_value(child, other_player_idx) for child in childs])
        
        # what does that mean for you
        if index == False:
            return -max_value
        else:
            return -max_value, permissible_actions(board)[max_index]

In [70]:
test = np.array([2,1,2, 0, 0, 1, 0, 1, 2])
print_board(test) # baseline
compute_max_value(test, 1, True)



x | o | x
---------
. | . | o
---------
. | o | x


(1, 3)

In [71]:
test = generate_random_board()


In [72]:
print_board(test)
compute_max_value(test, 1, True)



o | o | o
---------
x | o | o
---------
. | . | .


-1

In [73]:
compute_max_value(test, 0, True)

1

#### Q-learning against Random Opponent

In [334]:
# Hyperparameters
verbose = 0
num_actions = 9
num_episodes = 1000000
alpha = 0.1  # Learning rate
gamma = 0.9 # No discount for immediate rewards
initial_epsilon = 1.00
epsilon_decay = 0.999995  # Decay factor for epsilon
min_epsilon = 0.01
epsilon = initial_epsilon
player_idx = 0
opponent_idx = 1

# Q-table initialization
num_states = 3
num_actions = 9
q_table = np.zeros((num_states, num_states, num_states, num_states, num_states, num_states, num_states, num_states, num_states, num_actions))

def greedy_action(x):
    subset = q_table[tuple(x)][permissible_actions(x)]
    argmax_index = np.argmax(subset)
    original_argmax_index = permissible_actions(x)[argmax_index]
    return original_argmax_index

def get_reward(status,player_idx,opponent_idx):
    if status==player_idx:
        return +1
    elif status==opponent_idx:
        return -1
    elif status == 3:
        return 0
    elif status == 2:
        return 0
    
result_history = []
for episode in range(num_episodes):
    if verbose ==1:
        print('\n--- Reset ---')
        print_board(board)
        print(status, turn)
        print(q_table[tuple(board)])
    # Reset Game
    board = np.array([0,0,0,0,0,0,0,0,0])
    status = check_status(board)
    turn = 0
    
    # Play Game
    while status == 3:
                
        # Player 1: action
        if np.random.uniform(0, 1) < epsilon:
            action = random_action(board)  
        else:
            action = greedy_action(board) 
        board_interim = update_board(board,action,player_idx)
        status = check_status(board_interim)

        if verbose == 1:
            print(action)
            print_board(board_interim)
            print(status, get_reward(status,player_idx,opponent_idx))
            print(q_table[tuple(board)][action])
        
        if status != 3:
            # Game ends with players move, terminal state reached
            result_history.append(status) # winner
            reward = get_reward(status,player_idx,opponent_idx)            
            q_table[tuple(board)][action] += alpha * (reward - q_table[tuple(board)][action])
            break
            
        # Opponent action (Random)
        action_opponent = random_action(board_interim)
        board_new = update_board(board_interim,action_opponent,opponent_idx)
        status = check_status(board_new) 
        
        if verbose == 1:
            print_board(board_new)
            print(status, get_reward(status,player_idx,opponent_idx))
        
        if status != 3:
            # Game ends with opponents move, terminal state reached
            result_history.append(status)
            reward = get_reward(status,player_idx,opponent_idx)
            q_table[tuple(board)][action] += alpha * (reward - q_table[tuple(board)][action])
            break
            
        # Game continues on, Update Q-table
        q_table[tuple(board)][action] += alpha * (reward + gamma * np.max(q_table[tuple(board_new)]) - q_table[tuple(board)][action]) 
        board = board_new
    
    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Print relevant information for each episode
    if episode % (num_episodes / 10) == 0:
        print("Player Win Rate:", np.round(np.mean(np.where(np.array(result_history[-1000:])==player_idx, 1, 0)),2), "Opponent Win Rate:", np.round(np.mean(np.where(np.array(result_history[-1000:])==opponent_idx, 1, 0)),2),"Epsilon:", round(epsilon, 2))


Player Win Rate: 0.0 Opponent Win Rate: 1.0 Epsilon: 1.0
Player Win Rate: 0.69 Opponent Win Rate: 0.17 Epsilon: 0.61
Player Win Rate: 0.75 Opponent Win Rate: 0.09 Epsilon: 0.37
Player Win Rate: 0.77 Opponent Win Rate: 0.06 Epsilon: 0.22
Player Win Rate: 0.81 Opponent Win Rate: 0.03 Epsilon: 0.14
Player Win Rate: 0.85 Opponent Win Rate: 0.02 Epsilon: 0.08
Player Win Rate: 0.82 Opponent Win Rate: 0.01 Epsilon: 0.05
Player Win Rate: 0.82 Opponent Win Rate: 0.01 Epsilon: 0.03
Player Win Rate: 0.82 Opponent Win Rate: 0.01 Epsilon: 0.02
Player Win Rate: 0.85 Opponent Win Rate: 0.01 Epsilon: 0.01


In [345]:
test = generate_random_board()
print_board(test)
print(q_table[tuple(test)])

[array([0, 0, 0, 0, 0, 1, 0, 0, 0]), array([0, 2, 0, 0, 0, 1, 0, 0, 0]), array([0, 2, 0, 0, 0, 1, 0, 0, 1]), array([0, 2, 0, 0, 0, 1, 0, 2, 1]), array([0, 2, 0, 0, 1, 1, 0, 2, 1]), array([0, 2, 2, 0, 1, 1, 0, 2, 1]), array([0, 2, 2, 0, 1, 1, 1, 2, 1]), array([0, 2, 2, 2, 1, 1, 1, 2, 1]), array([1, 2, 2, 2, 1, 1, 1, 2, 1])]


. | x | .
---------
. | . | o
---------
. | x | o
[0.74457758 0.         0.99980337 0.52865885 2.15347762 0.
 0.62176854 0.         0.        ]


In [343]:
q_table[tuple(test)]

array([0.        , 2.10151871, 2.00917492, 0.        , 2.07056195,
       2.32712626, 0.        , 0.        , 1.56715459])

In [102]:
np.where(np.array(result_history)==player_idx, 1, 0)

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,