### Q-learning
- Q-learner (X) against random actor (O)
- Q-learner (O) against random actor (X)
- Q-learner (X) against minimax actor (O)
- Q-learner (O) against minimax actor (X)
- Q-learner against another Q-learner
- Q-learner against a Q-learner trained on the random actor
- Q-learner against a Q-learner trained on the minimax actor

In [11]:
import numpy as np
import pandas as pd
import random
from numba import jit
import warnings
warnings.filterwarnings('ignore')

def print_board(board, depth=0):
    """Print a 9-cell board to stdout as a 3x3 grid.

    Cell values 0, 1 and 2 are drawn as 'x', 'o' and '.'; any other value is
    printed unchanged. Every grid line is prefixed with ``depth`` tab
    characters so that nested output can be indented.
    """
    symbols = {0: 'x', 1: 'o', 2: '.'}
    cells = [symbols.get(cell, cell) for cell in board]
    indent = '\t' * depth

    # Emits two newline characters (the literal one plus print's own).
    print('\n')
    for row_start in (0, 3, 6):
        if row_start:
            # Separator goes between rows, not above the first one.
            print(indent, '---------')
        print(indent, cells[row_start], '|', cells[row_start + 1], '|', cells[row_start + 2])
    
@jit
def permissible_actions(x):
    """Return the indices of the cells of board ``x`` whose value is 2 (empty)."""
    is_empty = x == 2
    return np.nonzero(is_empty)[0]

@jit
def random_action(board):
    """Return the index of one empty cell of ``board``, drawn at random."""
    candidates = permissible_actions(board)
    return np.random.choice(candidates)

@jit
def check_status(board):
    """Classify a board position.

    Returns the value shared by the three cells of the first completed row,
    column or diagonal whose cells are not empty (2). If there is no such
    line, returns 3 when no cell holds 2 any more (draw) and 2 otherwise
    (game still open).
    """
    lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    for first, second, third in lines:
        mark = board[first]
        if mark == 2:
            # An empty cell can never be part of a winning line.
            continue
        if mark == board[second] and board[second] == board[third]:
            return mark

    # No winner: the game is open as long as an empty cell remains.
    if 2 in board:
        return 2
    return 3

    
def visualize_game(list_x):
    """Print the board stored at the end of every record in ``list_x``.

    Args:
        list_x: Iterable of per-move records; the last element of each record
            is the board to print (the layout built by ``play_random_game``).

    Not decorated with ``@jit``: the body only calls the pure-Python
    ``print_board``, which Numba cannot compile in nopython mode. Bare
    ``@jit`` therefore gives no speed-up and raises on Numba releases that
    no longer fall back to object mode.
    """
    for record in list_x:
        print_board(record[-1])

def update_board(board, action_idx):
    """Return a copy of ``board`` with the side to move placed at ``action_idx``.

    The side to move is taken from ``who_to_move``; the input board is left
    untouched.
    """
    mover, _ = who_to_move(board)
    successor = board.copy()
    successor[action_idx] = 0 if mover == 0 else 1
    return successor


@jit
def who_to_move(board):
    """Return ``(player_idx, opponent_idx)`` for the side that moves next.

    Mark 0 moves whenever it does not have more cells on the board than
    mark 1; otherwise mark 1 moves.
    """
    zeros_placed = np.count_nonzero(board == 0)
    ones_placed = np.count_nonzero(board == 1)
    if zeros_placed <= ones_placed:
        return 0, 1
    return 1, 0

def play_random_game(verbose = 0):
    """Play one game in which both sides pick random legal moves.

    Args:
        verbose: When equal to 1, print the board after every move once the
            game has finished.

    Returns:
        A ``(game_history, status)`` tuple. ``status`` is the terminal value
        reported by ``check_status`` (the winning mark, or 3 for a draw).
        ``game_history`` holds one list per move:
        ``[board, status, turn_count, player_idx, opponent_idx, action_idx,
        board_new]``. The ``status`` stored in an entry is always 2, because
        moves are only made from open positions.

    Not decorated with ``@jit``: the loop calls the pure-Python
    ``update_board`` and builds lists of mixed-type records, neither of which
    Numba can compile in nopython mode. Bare ``@jit`` therefore raises on
    Numba releases that no longer fall back to object mode.
    """
    board = np.array([2,2,2,2,2,2,2,2,2])
    game_history = []
    turn_count = 0
    while True:
        status = check_status(board)
        if status != 2:
            break
        # Only look up the side to move once we know a move will be made.
        player_idx, opponent_idx = who_to_move(board)
        turn_count += 1
        action_idx = random_action(board)
        board_new = update_board(board, action_idx)
        game_history.append([board, status, turn_count, player_idx, opponent_idx, action_idx, board_new])
        # Copy so the next entry's ``board`` is not the same object as this
        # entry's ``board_new``.
        board = board_new.copy()
    if verbose == 1:
        visualize_game(game_history)
    return game_history, status

def generate_random_board():
    """Return a board picked at random from the positions of one random game.

    The candidates are the boards that result from each move of a fresh
    ``play_random_game`` run, so the empty starting board is never returned
    while the final (terminal) board can be.

    Not decorated with ``@jit``: it relies on the standard-library
    ``random.choice`` over a Python list of arrays, which Numba cannot
    compile in nopython mode. Bare ``@jit`` therefore raises on Numba
    releases that no longer fall back to object mode.
    """
    game_history, _ = play_random_game()
    boards = [event[-1] for event in game_history]
    return random.choice(boards)

def simulate_games(N):
    """Play ``N`` random games and flatten them into one row per move.

    Args:
        N: Number of games to simulate.

    Returns:
        A list of rows ``[game_index, board_string, turn_count, player_idx,
        opponent_idx, action_idx, score, status_final]``. ``board_string`` is
        the pre-move board with its cell values concatenated. ``score`` is
        judged from the mover's side using the final result of that game:
        0 for a draw, +1 if the mover won, -1 otherwise.

    Not decorated with ``@jit``: string joins and mixed-type rows cannot be
    compiled by Numba in nopython mode. Bare ``@jit`` therefore raises on
    Numba releases that no longer fall back to object mode.
    """
    data = []  # one row per move, across all games
    for game_idx in range(N):
        game_history, status_final = play_random_game()
        for board, _, turn_count, player_idx, opponent_idx, action_idx, _ in game_history:
            if status_final == 3:
                score = 0
            elif status_final == player_idx:
                score = +1
            else:
                score = -1
            data.append([game_idx, ''.join(map(str, board)), turn_count, player_idx, opponent_idx, action_idx, score, status_final])
    return data

#### Q-learner (X) against random actor (O)

In [51]:
# --- Training configuration ---
verbose = 0                 # set to 1 to print boards, rewards and Q-values
num_episodes = 1000000
alpha = 0.1                 # learning rate
gamma = 0.9                 # discount factor for the bootstrapped next-state value
initial_epsilon = 1.00      # exploration probability at the start of training
epsilon_decay = 0.999995    # epsilon is multiplied by this after each episode
min_epsilon = 0.01          # lower bound for epsilon
epsilon = initial_epsilon
player_idx, opponent_idx = 0, 1

# --- Q-table ---
# One axis of size 3 per board cell (a cell holds 0, 1 or 2), followed by one
# axis with an entry per action, so a board tuple indexes its row directly.
num_states = 3
num_actions = 9
q_table = np.zeros((num_states,) * 9 + (num_actions,))

def greedy_action(x):
    """Return the empty cell of board ``x`` with the highest Q-value.

    Only cells reported by ``permissible_actions`` are considered; ties go to
    the lowest cell index because ``np.argmax`` returns the first maximum.
    """
    legal = permissible_actions(x)
    best_position = np.argmax(q_table[tuple(x)][legal])
    return legal[best_position]

def get_reward(status, player_idx, opponent_idx):
    """Translate a game status into a reward from the player's point of view.

    Returns +1 when ``status`` equals ``player_idx``, -1 when it equals
    ``opponent_idx``, 0 for a draw (3) or an unfinished game (2), and
    ``None`` for any other status value.
    """
    if status == player_idx:
        return +1
    if status == opponent_idx:
        return -1
    if status == 3 or status == 2:
        return 0
    return None
    
# Tabular Q-learning: the learner (player_idx) moves first in every episode
# and the opponent answers with a random legal move. One Q-update is made per
# learner move, after the opponent's reply (or at once if the learner's own
# move ends the game).
result_history = []  # terminal status of every finished episode
for episode in range(num_episodes):
    # Reset Game
    board = np.array([2,2,2,2,2,2,2,2,2])
    status = check_status(board)
    turn = 0

    # The reset dump has to come after the reset itself: before it, these
    # names are undefined on the first episode and stale on later ones.
    if verbose ==1:
        print('\n--- Reset ---')
        print_board(board)
        print(status, turn)
        print(q_table[tuple(board)])

    # Play Game
    while status == 2:

        # Player 1: epsilon-greedy action
        if np.random.uniform(0, 1) < epsilon:
            action = random_action(board)
        else:
            action = greedy_action(board)
        board_interim = update_board(board,action)
        status = check_status(board_interim)

        if verbose == 1:
            print(action)
            print_board(board_interim)
            print(status, get_reward(status,player_idx,opponent_idx))
            print(q_table[tuple(board)][action])

        if status == 2:
            # Opponent action (Random), only if the player's move did not
            # already end the game.
            action_opponent = random_action(board_interim)
            board_new = update_board(board_interim,action_opponent)
            status = check_status(board_new)

            if verbose == 1:
                print_board(board_new)
                print(status, get_reward(status,player_idx,opponent_idx))

        # View into the Q-table row of the state the player acted from;
        # writing to it updates q_table in place.
        q_row = q_table[tuple(board)]

        if status != 2:
            # Terminal state reached (by either side's move): there is no
            # successor state, so the target is the reward alone.
            result_history.append(status)
            reward = get_reward(status,player_idx,opponent_idx)
            q_row[action] += alpha * (reward - q_row[action])
            break

        # Game continues on: bootstrap from the best *legal* action in the
        # next state. Illegal actions keep their initial Q-value of 0 forever,
        # so including them would floor the next-state value at 0.
        legal_next = permissible_actions(board_new)
        best_next = np.max(q_table[tuple(board_new)][legal_next])
        q_row[action] += alpha * (0 + gamma * best_next - q_row[action])
        board = board_new

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    # Report win rates over the most recent (up to 1000) episodes
    if episode % (num_episodes / 10) == 0:
        recent = np.array(result_history[-1000:])
        print("Player Win Rate:", np.round(np.mean(recent == player_idx),2), "Opponent Win Rate:", np.round(np.mean(recent == opponent_idx),2),"Epsilon:", round(epsilon, 2))


Player Win Rate: 0.0 Opponent Win Rate: 0.0 Epsilon: 1.0
Player Win Rate: 0.74 Opponent Win Rate: 0.17 Epsilon: 0.61
Player Win Rate: 0.89 Opponent Win Rate: 0.06 Epsilon: 0.37
Player Win Rate: 0.94 Opponent Win Rate: 0.03 Epsilon: 0.22
Player Win Rate: 0.97 Opponent Win Rate: 0.02 Epsilon: 0.14
Player Win Rate: 0.97 Opponent Win Rate: 0.02 Epsilon: 0.08
Player Win Rate: 0.98 Opponent Win Rate: 0.01 Epsilon: 0.05
Player Win Rate: 0.98 Opponent Win Rate: 0.0 Epsilon: 0.03
Player Win Rate: 0.99 Opponent Win Rate: 0.0 Epsilon: 0.02
Player Win Rate: 0.99 Opponent Win Rate: 0.0 Epsilon: 0.01
