Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [310]:
from copy import deepcopy
import random
from numpy import linspace

class TicTacToe:
    def __init__(self):
        self.board = [[0 for _ in range(3)] for _ in range(3)]
        self.current_player = 1

    def print_board(self):
        elements = ['X', 'O']
        for row in self.board:
            for e in row:
                print(f"{elements[e-1]} ", end="")
            print()

    def make_move(self, row, col):
        if self.is_valid_move(row, col):
            self.board[row][col] = self.current_player
            self.switch_player()
        else:
            print("Invalid move.")

    def is_valid_move(self, row, col):
        return 0 <= row < 3 and 0 <= col < 3 and self.board[row][col] == 0

    def switch_player(self):
        self.current_player = 2 if self.current_player == 1 else 1

    def make_random_move(self):
        empty_cells = [(i, j) for i in range(3) for j in range(3) if self.board[i][j] == 0]
        if empty_cells:
            row, col = random.choice(empty_cells)
            self.make_move(row, col)

    def check_winner(self):
        # Check rows, columns, and diagonals for a winner
        for i in range(3):
            if self.board[i][0] == self.board[i][1] == self.board[i][2] != 0 or \
               self.board[0][i] == self.board[1][i] == self.board[2][i] != 0:
                return self.board[i][0]

        if self.board[0][0] == self.board[1][1] == self.board[2][2] != 0 or \
           self.board[0][2] == self.board[1][1] == self.board[2][0] != 0:
            return self.board[1][1]

        return None

    def is_board_full(self):
        return all(self.board[i][j] != 0 for i in range(3) for j in range(3))

    def play_with_agents(self, agents):
        agent = agents[self.current_player-1]
        while True:
            if self.current_player == 1:
                action = agent.choose_action(self.board)
                self.make_move(action[0], action[1])   
            else:
                self.make_random_move()

            winner = self.check_winner()
            if winner:
                if winner == 1:
                    return 1
                else:
                    return 0

            if self.is_board_full():
                return 0


    def play_with_agent(self, agent, start):
        self.current_player = start
        while True:
            if self.current_player == 1:
                action = agent.choose_action(self.board)
                self.make_move(action[0], action[1])   
            else:
                self.make_random_move()

            winner = self.check_winner()
            if winner:
                if winner == 1:
                    return 1
                else:
                    return 0

            if self.is_board_full():
                return 0

class TicTacToeRND:
    def get_valid_actions(self, state):
        # Return a list of valid actions for the given state
        a =  [(i, j) for i in range(3) for j in range(3) if state[i][j] == 0]
        return a

    def choose_action(self, state):
        return random.choice(self.get_valid_actions(state))

class TicTacToeRL:
    def __init__(self, learning_rate=0.5, discount_factor=0.7, exploration_prob=0.1):
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob

        # Q-table to store Q-values for state-action pairs
        self.q_table = {}

    def get_state_key(self, state):
        return str(state)

    def get_valid_actions(self, state):
        # Return a list of valid actions for the given state
        a =  [(i, j) for i in range(3) for j in range(3) if state[i][j] == 0]
        return a

    def get_q_value(self, state, action):
        state_key = self.get_state_key(state)
        return self.q_table.get((state_key, action), 0.0)

    def update_q_value(self, state, action, new_value):
        state_key = self.get_state_key(state)
        self.q_table[(state_key, action)] = new_value

    def choose_action(self, state):
        # Epsilon-greedy strategy for action selection
        if random.uniform(0, 1) <= self.exploration_prob:
            return random.choice(self.get_valid_actions(state))
        else:
            # Choose the action with the highest Q-value
            q_values = [self.get_q_value(state, action) for action in self.get_valid_actions(state)]
            max_q_value = max(q_values, default=0.0)
            best_actions = [action for action, q_value in zip(self.get_valid_actions(state), q_values) if q_value == max_q_value]
            return random.choice(best_actions)

    def train(self, state, action, reward, next_state):
        current_q_value = self.get_q_value(state, action)
        max_q_value_next = max([self.get_q_value(next_state, next_action) for next_action in self.get_valid_actions(next_state)], default=0.0)

        new_q_value = current_q_value + self.learning_rate * (reward + self.discount_factor * max_q_value_next - current_q_value)
        self.update_q_value(state, action, new_q_value)

def tic_tac_toe_rl_train():
    # Initialize the RL model
    agents = (TicTacToeRL(0.5, 0.7, 0.1), TicTacToeRL(0.5, 0.7, 0.1))
    # Training loop
    num_episodes = 100000
    for ep in range(num_episodes):
        if ep < num_episodes-1:
            print(f"TRAINING: {ep+1}/{num_episodes}", end="\r")
        else:
            print(f"TRAINING COMPLETED\n")
        
        history = ([],[])
        start = random.choice([0,1])
        turn = start
        cntTurn = 0
        game = TicTacToe()
        state = [[0,0,0],[0,0,0],[0,0,0]]
        while True:
            # Player 1 (RL agent) move
            action = agents[turn].choose_action(state)
            game.make_move(action[0], action[1])
            history[turn].append((state, action, game.board))

            cntTurn+=1

            if cntTurn>=5:

                # Check for a winner or a tie
                winner = game.check_winner()

                if winner:
                    loser = 1 if winner == 2 else 2
                    n=len(history[winner-1])

                    for i, h in enumerate(history[winner-1]):
                        scores = linspace(0.1,1,num=n)**2
                        agents[winner-1].train(h[0], h[1], scores[i], h[2])

                    n=len(history[loser-1])
                    for i,h in enumerate(history[loser-1]):
                        scores = linspace(-.1,-.1,num=n)**2
                        agents[loser-1].train(h[0], h[1], scores[i], h[2])

                    break

                if game.is_board_full():
                    n=len(history[start])
                    for i, h in enumerate(history[start]):
                        scores = linspace(-.1,-.5,num=n)**2
                        agents[start].train(h[0], h[1], scores[i], h[2])

                    n=len(history[1-start])
                    for i, h in enumerate(history[1-start]):
                        scores = linspace(-.01,-3.,num=n)**2
                        agents[1-start].train(h[0], h[1], scores[i], h[2])

                    break

            state = deepcopy(game.board)

            turn = 1 - turn

    return agents[0]

def tic_tac_toe_rl_train_random(x):
    # Initialize the RL model
    agents = (TicTacToeRL(*x), TicTacToeRND())

    # Training loop
    num_episodes = 300000
    for ep in range(num_episodes):
        if ep <num_episodes-1:
            print(f"TRAINING: {round((ep+1)/num_episodes*100, 2)}%", end="\r")
        else:
            print(f"TRAINING COMPLETED!\n")

        game = TicTacToe()
        state = [[0,0,0],[0,0,0],[0,0,0]]
        history = ([],[])
        start = random.choice([0,1])
        turn = start
        cntTurn = 0
        while True:
            # Player 1 (RL agent) move
            action = agents[turn].choose_action(state)
            game.make_move(action[0], action[1])
            history[turn].append((state, action, game.board))

            cntTurn+=1

            if cntTurn>=5:

                # Check for a winner or a tie
                winner = game.check_winner()

                if winner == 1:
                    n=len(history[0])
                    for i, h in enumerate(history[0]):
                        scores = linspace(.1,1,num=n)**2
                        agents[0].train(h[0], h[1], scores[i], h[2])

                    break

                if winner == 2:
                    n=len(history[0])
                    for i, h in enumerate(history[0]):
                        scores = linspace(-.01,-.1,num=n)**2
                        agents[0].train(h[0], h[1], scores[i], h[2])

                    break

                if game.is_board_full():
                    n=5
                    scoreMAX = 0.1 if start == 0 else 0.3
                    scores = linspace(.01,scoreMAX,num=n)**2
                    for i, h in enumerate(history[0]):
                        agents[0].train(h[0], h[1], scores[i], h[2])

                    break

            state = deepcopy(game.board)

            turn = 1 - turn

    return agents[0]



In [311]:
#agent = tic_tac_toe_rl_train_random()

In [312]:
def test(agent):
    num_episodes=10000
    winsF = 0
    winsS = 0

    for _ in range(num_episodes):
        game = TicTacToe()
        winsF += game.play_with_agent(agent, 1)
        winsS += game.play_with_agent(agent, 2)


    print(f"wins F:{round(winsF/num_episodes*100, 2)}%")
    print(f"wins S:{round(winsS/num_episodes*100, 2)}%")

    # return winsF+winsS

import numpy

def validate():
    a = linspace(0, 1,num=11)[1:]
    b = deepcopy(a)
    c = deepcopy(a)
    i=1

    mesh = numpy.array(numpy.meshgrid(a, b, c)).T.reshape(-1, 3)
    finals = []

    for x in mesh:
        a = tic_tac_toe_rl_train_random(x)
        score = test(a)
        finals.append((x, a, score))
        i+=1
        print(f"{numpy.round(i/mesh.shape[0]*100,2)}%", end="\r")

    finals.sort(key=lambda f:f[2], reverse=True)

    for f in finals:
        print(f"{str(f[0])} -> {f[2]}")


In [None]:
agent2=tic_tac_toe_rl_train_random([.2,1,.1])

In [318]:
test(agent2)

wins F:94.51%
wins S:94.0%
