Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

In [30]:
from itertools import combinations, chain
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy


from typing import Literal
from tqdm.auto import tqdm
import numpy as np

In [31]:
State = namedtuple('State', ['x', 'o'])

In [32]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))
        self.state = State(set(), set())
        self.magic = np.array([[2, 7, 6], [9, 5, 1], [4, 3, 8]])
        self.marks = {-1: 'O', 0: '.', 1: 'X'}
        self.winner = None
        self.game_over = False
    
    def print_board(self):
        """
        Print intuitive table
        """
        board = np.chararray(self.board.shape, itemsize=1, unicode=True)
        for i in range(board.shape[0]):
            for j in range(board.shape[1]):
                board[(i, j)] = self.marks[self.board[(i, j)]]
        print(board)

    def get_state(self):
        """ 
        get hashable state
        """
        return State(frozenset(self.state.x), frozenset(self.state.o))
    

    def win(self, elements):
        """
        Checks if elements is winning
        """
        return any(sum(c) == 15 for c in combinations(self.magic[elements], 3))
    
    
    def check_winner(self):
        """
        Check if there's a winner
        """

        p1 = self.board == 1
        p2 = self.board == -1

        if self.win(p1):
            self.winner = 'X'
            self.game_over = True
            return 1
        elif self.win(p2):
            self.winner = 'O'
            self.game_over = True
            return -1
        if 0 not in self.board:
            self.game_over = True
            return 0
        else:
            return 0


    def is_valid_move(self, action):
        return self.board[action] == '.'
    

    def make_move(self, action, id):
        self.board[action] = id
        self.check_winner()

        if id == 1:
            self.state.x.add(action)
        elif id == -1:
            self.state.o.add(action)

    

## Q-Learning Player Class

In [33]:
class QLPlayer:

    def __init__(
        self,
        epsilon = float(.7),
        alpha = float(.09),
        gamma = float(.9),
    ):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q_table = {}
    
    def get_q_val(self, state, action):
        """
        Get a value from the quality table, given a state-action pair
        """
        return self.q_table.get((state, action), 0.0)
    
    def update_q_table(self, state, action, reward, next_state):
        """
        Update values of the quality table
        """        
        if state not in self.q_table:
            self.q_table[state] = np.zeros((9))
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros((9))

        # math
        q_val = reward + self.gamma * np.max(self.q_table[next_state])
        self.q_table[state][action] = (1 - self.alpha) * self.get_q_val(state, action) + self.alpha * q_val


    def make_move(self, state, available_moves, training: bool):
        """
        Make a move, choosing between exploration and exploitation based on epsilon
        """
        # exploration
        if np.random.rand() < self.epsilon and not training:
            return np.random.choice(available_moves)
        
        # exploitation
        else:
            q_val = [self.get_q_val(state, action) for action in available_moves]
            return available_moves[np.argmax(q_val)]




## Game

In [34]:
def ql_game(game: TicTacToe, agent):
    available_moves = [i for i in range(9)]
    id = 1
    final_reward = 0
    state = game.get_state()

    while not game.game_over:
        
        action = agent.make_move(state, available_moves, True)
        available_moves.remove(action)
        game.make_move((action // 3, action % 3), id)

        
        next_state = game.get_state()
        reward = 0

        if game.game_over:
            if game.winner == 1:
                reward = 1
            elif game.winner == -1:
                reward = -1
            else:
                reward = 0
        
        agent.update_q_table(state, action, reward, next_state)
        state = next_state
        final_reward += reward
        id = -id
    
    return final_reward
        


## Training

In [35]:
agent = QLPlayer()
final_reward = 0

for steps in tqdm(range(50_000)):
    game = TicTacToe()
    final_reward += ql_game(game, agent)


  0%|          | 0/50000 [00:00<?, ?it/s]

## Test against Random Player

In [38]:

total_wins = 0
total_losses = 0

for _ in range(7):
    game = TicTacToe()
    state = game.get_state()
    available_moves = [i for i in range(9)]
    id = 1

    while not game.game_over:
        # player 1, q-learner
        if id == 1:
            action = agent.make_move(state, available_moves, False)
            available_moves.remove(action)
            game.make_move((action // 3, action % 3), id)
            state = game.get_state()


        # player 2: random player
        elif id == -1:
            action = choice(list(available_moves))
            available_moves.remove(action)
            game.make_move((action // 3, action % 3), id)
            state = game.get_state()
        
        id = -id

    print(f"Game Over. Winner: {game.winner}")
    if game.winner == 'X':
        total_wins += 1
    elif game.winner == 'O':
        total_losses += 1
    game.print_board()
    print("\n   *****   \n")

print(f"Total wins: {total_wins}/10")
print(f"Total losses: {total_losses}/10")

Game Over. Winner: O
[['X' '.' 'O']
 ['O' 'O' 'X']
 ['O' 'X' 'X']]

   *****   

Game Over. Winner: X
[['X' 'X' 'X']
 ['O' '.' 'O']
 ['X' 'O' '.']]

   *****   

Game Over. Winner: X
[['X' 'O' 'O']
 ['X' 'X' '.']
 ['O' '.' 'X']]

   *****   

Game Over. Winner: None
[['X' 'O' 'X']
 ['X' 'X' 'O']
 ['O' 'X' 'O']]

   *****   

Game Over. Winner: X
[['.' 'O' 'X']
 ['.' 'O' 'X']
 ['O' 'X' 'X']]

   *****   

Game Over. Winner: O
[['X' 'X' 'O']
 ['O' 'O' 'O']
 ['.' 'X' 'X']]

   *****   

Game Over. Winner: X
[['X' 'X' 'X']
 ['X' 'O' 'O']
 ['.' 'O' '.']]

   *****   

Total wins: 4/10
Total losses: 2/10
