## Set Up the Environment

In [6]:
import ast
import random

import numpy as np


def initialize_board():
    """Return a fresh 3x3 integer board with every cell set to 0 (empty)."""
    empty_board = np.full((3, 3), 0, dtype=int)
    return empty_board

def print_board(board):
    """Print the board to stdout, one row per line.

    Cells equal to 1 are shown as "X", cells equal to -1 as "O", and any
    other value as a blank. A dashed separator line is printed after every
    row, including the last one.
    """
    separator = "-" * 9
    for row in board:
        marks = []
        for cell in row:
            if cell == 1:
                marks.append("X")
            elif cell == -1:
                marks.append("O")
            else:
                marks.append(" ")
        print(" | ".join(marks))
        print(separator)


## Define Game Mechanics

In [7]:
def check_winner(board):
    """Report the outcome of a 3x3 board.

    Returns the mark (1 or -1) found in the first cell of a completed line
    when some row, column or diagonal sums to +/-3, 0 when there is no such
    line and no empty (0) cell remains, and None when the game is still open.
    """
    # Candidate lines in the order they are examined: row 0, column 0,
    # row 1, column 1, row 2, column 2, main diagonal, anti-diagonal.
    candidates = []
    for idx in range(3):
        candidates.append(board[idx, :])
        candidates.append(board[:, idx])
    candidates.append(board.diagonal())
    candidates.append(np.fliplr(board).diagonal())

    for line in candidates:
        # Three identical non-zero marks are the only way to reach |sum| == 3.
        if abs(line.sum()) == 3:
            return line[0]

    # No completed line: the game continues while any cell is still empty.
    if (board == 0).any():
        return None
    return 0

def get_possible_moves(board):
    """Return the (row, col) index pairs of all empty cells, in row-major order."""
    free_cells = []
    for row in range(3):
        for col in range(3):
            if board[row, col] == 0:
                free_cells.append((row, col))
    return free_cells


## Build the Q-Learning Model


In [8]:
# Q-table: maps a state key (the string form of the board's nested list) to a
# dict of {(row, col): estimated value} holding one entry per empty cell.
Q_table = {}


def initialize_q_values(state):
    """Create a zero-valued Q-table entry for ``state`` if it has none yet.

    Args:
        state: Board encoded as ``str(board.tolist())``, i.e. the text of a
            nested list literal.

    States that are already present in ``Q_table`` are left untouched.
    """
    if state not in Q_table:
        # The key is a plain list literal, so parse it with literal_eval
        # instead of eval: same result, but no arbitrary code execution.
        board = np.array(ast.literal_eval(state))
        Q_table[state] = {move: 0 for move in get_possible_moves(board)}
        
def update_q_values(state, action, reward, next_state, alpha=0.1, gamma=0.9):
    """Apply one Q-learning update to the entry ``Q_table[state][action]``.

    Args:
        state: Key of the state the action was taken in; must already be in
            ``Q_table`` and contain ``action``.
        action: The move whose value is being updated.
        reward: Reward observed for the transition.
        next_state: Key of the resulting state. If it is missing from
            ``Q_table`` or has no moves, its best future value counts as 0.
        alpha: Learning rate.
        gamma: Discount factor applied to the best future value.
    """
    successor_values = Q_table.get(next_state, {}).values()
    best_next = max(successor_values, default=0)

    action_values = Q_table[state]
    old_estimate = action_values[action]
    td_error = reward + gamma * best_next - old_estimate
    action_values[action] = old_estimate + alpha * td_error


## Training the Model

In [9]:
def train(num_episodes=10000, epsilon=0.1, alpha=0.1, gamma=0.9):
    """Train the Q-table by playing the agent (1) against a random opponent (-1).

    Each episode starts from an empty board with the agent moving first. The
    agent picks a move epsilon-greedily, the opponent answers with a uniformly
    random legal move, and the agent's move is then updated with the reward
    observed after that reply.

    Args:
        num_episodes: Number of games to play.
        epsilon: Probability of choosing a random move instead of the
            currently best-valued one.
        alpha: Learning rate forwarded to ``update_q_values``.
        gamma: Discount factor forwarded to ``update_q_values``.

    Rewards: 1 for an agent win, 0.5 for a draw, -1 for a loss, 0 while the
    game is still running.
    """
    for _ in range(num_episodes):
        board = initialize_board()
        state = str(board.tolist())
        done = False
        while not done:
            initialize_q_values(state)

            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = random.choice(get_possible_moves(board))
            else:
                action = max(Q_table[state], key=Q_table[state].get)

            board[action] = 1
            winner = check_winner(board)

            if winner is None:
                opponent_move = random.choice(get_possible_moves(board))
                board[opponent_move] = -1
                # Re-evaluate after the reply: a loss can only show up here,
                # so the reward must be derived from this result. Checking
                # only after the agent's own move never penalises a loss.
                winner = check_winner(board)

            if winner is None:
                reward = 0
            elif winner == 1:
                reward = 1
            elif winner == -1:
                reward = -1
            else:
                reward = 0.5  # draw
            done = winner is not None

            next_state = str(board.tolist())
            initialize_q_values(next_state)
            update_q_values(state, action, reward, next_state, alpha, gamma)
            state = next_state


## Testing the Model

In [13]:
def test():
    """Play and print one game: greedy agent (X) versus a random opponent (O).

    The board is printed before each agent move and once more when the game
    ends. The agent always takes the highest-valued move stored in
    ``Q_table`` for the current state; the opponent picks uniformly among the
    empty cells. The result is announced as soon as it is known.
    """
    board = initialize_board()
    while True:
        print_board(board)

        # Make sure the current state has an entry (no-op when it already does).
        state = str(board.tolist())
        initialize_q_values(state)

        # Agent plays the best-known move for this state.
        move_values = Q_table[state]
        action = max(move_values, key=move_values.get)
        board[action] = 1

        outcome = check_winner(board)
        if outcome == 1:
            print("Agent wins!")
            break
        if outcome == 0:
            print("It's a draw!")
            break

        # Opponent answers with a random legal move.
        reply = random.choice(get_possible_moves(board))
        board[reply] = -1
        if check_winner(board) == -1:
            print("Opponent wins!")
            break

    print_board(board)

# Play one demonstration game: the greedy agent (X) against a random opponent
# (O). NOTE(review): train() should be called before this; with an empty
# Q_table every move value is 0 and the agent simply takes the first free cell.
test()


  |   |  
---------
  |   |  
---------
  |   |  
---------
X |   |  
---------
  |   |  
---------
  |   | O
---------
X | X |  
---------
O |   |  
---------
  |   | O
---------
Agent wins!
X | X | X
---------
O |   |  
---------
  |   | O
---------
