In [1]:
import numpy as np
import random

class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1

    def reset(self):
        self.board.fill(0)
        self.current_player = 1
        return self.board

    def is_winner(self, player):
        return any(np.all(line == player) for line in np.vstack((self.board, self.board.T, [np.diag(self.board), np.diag(np.fliplr(self.board))])))

    def is_draw(self):
        return not np.any(self.board == 0)

    def available_actions(self):
        return list(zip(*np.where(self.board == 0)))

    def step(self, action):
        if self.board[action] != 0: raise ValueError("Invalid action!")
        self.board[action] = self.current_player
        if self.is_winner(self.current_player): return self.board, 1
        if self.is_draw(): return self.board, 0
        self.current_player *= -1
        return self.board, None

    def render(self):
        symbols = {1: 'X', -1: 'O', 0: ' '}
        for row in self.board: print(" | ".join(symbols[cell] for cell in row)); print("-" * 9)

class QLearningAgent:
    def __init__(self, lr=0.1, discount=0.9, explore=1.0, decay=0.99):
        self.q_table = {}
        self.lr, self.discount, self.explore, self.decay = lr, discount, explore, decay

    def get_q(self, state, action):
        return self.q_table.get((state.tobytes(), action), 0)

    def update_q(self, state, action, reward, next_state):
        current_q = self.get_q(state, action)
        max_next_q = max([self.get_q(next_state, a) for a in TicTacToe().available_actions()] or [0])
        self.q_table[(state.tobytes(), action)] = current_q + self.lr * (reward + self.discount * max_next_q - current_q)

    def select_action(self, state, actions):
        if random.random() < self.explore: return random.choice(actions)
        q_values = [self.get_q(state, a) for a in actions]
        return random.choice([a for a, q in zip(actions, q_values) if q == max(q_values)])

    def decay_explore(self):
        self.explore *= self.decay

def train(agent, episodes=10000):
    game = TicTacToe()
    for _ in range(episodes):
        state, done = game.reset(), False
        while not done:
            action = agent.select_action(state, game.available_actions())
            next_state, reward = game.step(action)
            agent.update_q(state, action, reward or 0, next_state)
            state, done = next_state, reward is not None
        agent.decay_explore()

def test(agent):
    game, state, done = TicTacToe(), TicTacToe().reset(), False
    while not done:
        game.render()
        action = agent.select_action(state, game.available_actions())
        state, reward = game.step(action)
        done = reward is not None
        if reward == 1: print("Agent wins!")
        elif reward == 0: print("It's a draw!")

if __name__ == "__main__":
    agent = QLearningAgent()
    train(agent, episodes=10000)
    test(agent)


  |   |  
---------
  |   |  
---------
  |   |  
---------
  |   |  
---------
  |   |  
---------
  | X |  
---------
  |   | O
---------
  |   |  
---------
  | X |  
---------
  |   | O
---------
  | X |  
---------
  | X |  
---------
  |   | O
---------
O | X |  
---------
  | X |  
---------
  |   | O
---------
O | X | X
---------
  | X |  
---------
  | O | O
---------
O | X | X
---------
  | X |  
---------
  | O | O
---------
O | X | X
---------
  | X | X
---------
  | O | O
---------
O | X | X
---------
O | X | X
---------
Agent wins!
