In [3]:
import numpy as np
import random
from collections import defaultdict

# a. Setting up the environment
class TicTacToeEnv:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)  # 0 for empty, 1 for 'X', -1 for 'O'
        self.current_player = 1
        return self.board

    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def take_action(self, action):
        x, y = action
        self.board[x, y] = self.current_player
        reward = self.check_winner()
        self.current_player *= -1  # Switch player
        done = reward != 0 or len(self.available_actions()) == 0
        return self.board, reward, done

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3 or abs(sum(self.board[:, i])) == 3:
                return 1 if self.current_player == 1 else -1
        if abs(sum([self.board[i, i] for i in range(3)])) == 3 or abs(sum([self.board[i, 2 - i] for i in range(3)])) == 3:
            return 1 if self.current_player == 1 else -1
        return 0

# b. Defining the Tic-Tac-Toe game
class TicTacToeGame:
    def __init__(self):
        self.env = TicTacToeEnv()
        self.q_table = defaultdict(float)

    def get_state(self):
        return tuple(self.env.board.flatten())

    def choose_action(self, epsilon=0.1):
        state = self.get_state()
        if random.uniform(0, 1) < epsilon:
            return random.choice(self.env.available_actions())
        q_values = {action: self.q_table[(state, action)] for action in self.env.available_actions()}
        return max(q_values, key=q_values.get)

    # c. Building the reinforcement learning model
    def train(self, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1):
        for episode in range(episodes):
            self.env.reset()
            state = self.get_state()
            done = False
            while not done:
                action = self.choose_action(epsilon)
                next_state, reward, done = self.env.take_action(action)
                next_state = tuple(next_state.flatten())
                if self.env.available_actions():  # Check if there are available actions
                    best_next_action = max(self.q_table[(next_state, a)] for a in self.env.available_actions())
                else:
                    best_next_action = 0
                self.q_table[(state, action)] += alpha * (reward + gamma * best_next_action - self.q_table[(state, action)])
                state = next_state

    # d. Training the model
    def test(self, episodes=10):
        for episode in range(episodes):
            self.env.reset()
            done = False
            print(f"Game {episode + 1}")
            while not done:
                action = self.choose_action(epsilon=0)  # No exploration during testing
                _, reward, done = self.env.take_action(action)
                print(self.env.board)
                print()
                if reward == 1:
                    print("X wins!")
                elif reward == -1:
                    print("O wins!")
                else:
                    print("It's a draw!")

# e. Testing the model
if __name__ == "__main__":
    game = TicTacToeGame()
    game.train(episodes=1000)
    game.test(episodes=3)


Game 1
[[1 0 0]
 [0 0 0]
 [0 0 0]]

It's a draw!
[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1 -1]
 [ 1  0  0]]

X wins!
Game 2
[[1 0 0]
 [0 0 0]
 [0 0 0]]

It's a draw!
[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1 -1]
 [ 1  0  0]]

X wins!
Game 3
[[1 0 0]
 [0 0 0]
 [0 0 0]]

It's a draw!
[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [ 0  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  0  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1  0]
 [ 0  0  0]]

It's a draw!
[[ 1 -1  1]
 [-1  1 -1]
 [ 0  0  0]]

It's a dr