# In [1]:
import numpy as np
import random

class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment for two players marked 1 and -1.

    Empty cells hold 0. Player 1 moves first and the turn flips after every
    successful call to :meth:`step`.
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)  # 0 = empty, 1 / -1 = marks
        self.current_player = 1  # mark of the player who moves next

    def reset(self):
        """Clear the board and hand the move back to player 1.

        Returns:
            A copy of the empty 3x3 board. A copy is returned so that later
            moves do not silently rewrite arrays the caller still holds.
        """
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.board.copy()

    def available_actions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def step(self, action):
        """Place the current player's mark at ``action`` and pass the turn.

        Args:
            action: ``(row, col)`` tuple addressing an empty cell.

        Returns:
            A ``(board, reward, done)`` tuple. ``board`` is a copy of the
            position after the move, ``reward`` is the mark of the winner
            (1 or -1) or 0 when nobody has won, and ``done`` is True once the
            game is won or the board is full.

        Raises:
            ValueError: If the addressed cell is already occupied.
        """
        if self.board[action] != 0:
            raise ValueError("Invalid Action")
        self.board[action] = self.current_player
        reward, done = self.check_winner()
        self.current_player = -self.current_player
        # Return a snapshot: the internal array is mutated in place by the
        # next move, which would otherwise change "previous" states as well.
        return self.board.copy(), reward, done

    def check_winner(self):
        """Report the outcome of the current position.

        Returns:
            ``(winner, done)`` with plain Python values: ``(1, True)`` or
            ``(-1, True)`` when a row, column or diagonal is filled by one
            player, ``(0, True)`` for a full board without a winner, and
            ``(0, False)`` while the game is still open.
        """
        board = self.board

        # Rows and columns are interleaved (row 0, column 0, row 1, ...),
        # followed by the main diagonal and the anti-diagonal.
        totals = []
        for i in range(3):
            totals.append(board[i, :].sum())
            totals.append(board[:, i].sum())
        totals.append(sum(board[i, i] for i in range(3)))
        totals.append(sum(board[i, 2 - i] for i in range(3)))

        for total in totals:
            # Three equal marks sum to +3 or -3; the sign names the winner.
            if abs(total) == 3:
                return (1 if total > 0 else -1, True)

        if not self.available_actions():
            return (0, True)  # Draw
        return (0, False)  # No winner yet

    def render(self):
        """Print the raw board array to standard output."""
        print(self.board)


class Agent:
    """Tabular TD(0) learner that scores tic-tac-toe positions.

    Values are kept in a dictionary keyed by a string form of the board;
    positions that were never updated count as 0.
    """

    def __init__(self, player=1, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.player = player  # mark written into the board during lookahead
        self.epsilon = epsilon  # probability of picking a random move
        self.alpha = alpha  # learning rate of the TD update
        self.gamma = gamma  # discount applied to the next state's value
        self.value_function = {}  # state hash -> estimated value

    def get_state_hash(self, board):
        """Return the dictionary key for ``board`` (string of its 9 cells)."""
        return str(board.reshape(9))

    def update_value(self, state, reward, next_state, done):
        """Apply one TD(0) backup to the value of ``state``.

        Args:
            state: Board before the transition.
            reward: Reward observed for the transition. It is used as given;
                no sign adjustment for ``self.player`` is applied here.
            next_state: Board after the transition.
            done: True when the transition ended the game, in which case the
                value of ``next_state`` is not bootstrapped from.
        """
        values = self.value_function
        key = self.get_state_hash(state)
        next_key = self.get_state_hash(next_state)
        # Both positions get an entry (initialised to 0.0) on first sight.
        values.setdefault(key, 0.0)
        values.setdefault(next_key, 0.0)

        target = reward if done else reward + self.gamma * values[next_key]
        values[key] += self.alpha * (target - values[key])

    def choose_action(self, board, available_actions):
        """Pick a move epsilon-greedily by one-step lookahead.

        With probability ``epsilon`` a uniformly random action is returned.
        Otherwise every action is tried on a copy of ``board`` and the one
        leading to the highest stored value wins. Ties are broken uniformly
        at random: with a strict first-wins rule an untrained table would
        always select the first listed action and never try the others
        outside of exploration.

        Args:
            board: Current 3x3 board array; it is not modified.
            available_actions: Sequence of ``(row, col)`` tuples to pick from.

        Returns:
            One element of ``available_actions``.

        Raises:
            IndexError: If ``available_actions`` is empty.
        """
        if np.random.rand() < self.epsilon:
            return random.choice(available_actions)

        best_value = -float('inf')
        best_actions = []
        for action in available_actions:
            lookahead = board.copy()
            lookahead[action] = self.player
            value = self.value_function.get(self.get_state_hash(lookahead), 0)
            if value > best_value:
                best_value = value
                best_actions = [action]
            elif value == best_value:
                best_actions.append(action)

        # best_actions stays empty only when no value compared greater than
        # -inf (e.g. NaN entries); fall back to any available action then.
        return random.choice(best_actions or available_actions)


def train(episodes=10000):
    """Train a player-1 agent with TD(0) against a uniformly random opponent.

    Every move of the game (the agent's and the opponent's) is backed up into
    the agent's value table, so the table covers both the positions the agent
    moves from and the positions its lookahead evaluates.

    Args:
        episodes: Number of complete games to play.

    Returns:
        The trained ``Agent`` instance.
    """
    env = TicTacToe()
    agent = Agent()

    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # Snapshot the position before moving so the pre-move state stays
            # distinct from next_state even if the environment hands out its
            # live board array, which step() mutates in place.
            before = state.copy()
            available_actions = env.available_actions()

            if env.current_player == agent.player:
                action = agent.choose_action(before, available_actions)
            else:
                # The agent's lookahead always writes its own mark, so it
                # cannot evaluate moves for the other side; the opponent
                # therefore plays a random legal move.
                action = random.choice(available_actions)

            next_state, reward, done = env.step(action)
            # The environment reports the winner's mark; multiplying by the
            # agent's mark expresses the outcome from the agent's side.
            agent.update_value(before, reward * agent.player, next_state, done)
            state = next_state

    return agent


if __name__ == '__main__':
    # Script entry point: train with the default number of episodes.
    agent = train()
    print("Training completed!")
    # Possible extension: add a loop here that lets a human (or another
    # program) play against the trained agent.


# Output: Training completed!
