In [1]:
import numpy as np
import random

# Define the Tic-Tac-Toe board
EMPTY = 0
PLAYER_X = 1
PLAYER_O = 2

# Initialize Q-value table
num_states = 3 ** 9  # Total number of possible board configurations
num_actions = 9      # Number of possible moves
Q = np.zeros((num_states, num_actions))

# Hyperparameters
learning_rate = 0.1
discount_factor = 0.9
exploration_prob = 0.1
num_episodes = 10000

# Convert board state to a unique index
def state_to_index(state):
    index = 0
    for i, mark in enumerate(state):
        index += mark * (3 ** i)
    return index

# Choose an action based on epsilon-greedy policy
def choose_action(state):
    if random.random() < exploration_prob:
        return random.choice([action for action, mark in enumerate(state) if mark == EMPTY])
    else:
        return np.argmax(Q[state_to_index(state)])

# Update Q-values using Q-learning update rule
def update_Q(state, action, reward, next_state):
    Q[state_to_index(state)][action] += learning_rate * (reward + discount_factor * np.max(Q[state_to_index(next_state)]) - Q[state_to_index(state)][action])

# Play a game using Q-learning
def play_game():
    state = [EMPTY] * 9
    player_x_turn = True
    while EMPTY in state:
        if player_x_turn:
            player = PLAYER_X
        else:
            player = PLAYER_O
            
        if player == PLAYER_X:
            action = choose_action(state)
        else:
            action = random.choice([action for action, mark in enumerate(state) if mark == EMPTY])
        
        state[action] = player
        
        if check_winner(state, player):
            return player
        
        player_x_turn = not player_x_turn
    
    return 0  # Draw

# Check if a player has won
def check_winner(state, player):
    winning_combinations = [
        [0, 1, 2], [3, 4, 5], [6, 7, 8],  # Rows
        [0, 3, 6], [1, 4, 7], [2, 5, 8],  # Columns
        [0, 4, 8], [2, 4, 6]             # Diagonals
    ]
    
    for combo in winning_combinations:
        if all(state[i] == player for i in combo):
            return True
    
    return False

# Train the Q-learning agent
for episode in range(num_episodes):
    winner = play_game()
    
    # Update Q-values based on game outcome
    if winner == PLAYER_X:
        reward_x = 1
        reward_o = -1
    elif winner == PLAYER_O:
        reward_x = -1
        reward_o = 1
    else:
        reward_x = 0
        reward_o = 0
    
    for i, mark in enumerate(state):
        if mark == PLAYER_X:
            update_Q(state, i, reward_x, state)
        elif mark == PLAYER_O:
            update_Q(state, i, reward_o, state)

print("Q-learning training complete!")

# Play a game using the learned Q-values
def play_with_q_learning():
    state = [EMPTY] * 9
    player_x_turn = True
    while EMPTY in state:
        if player_x_turn:
            player = PLAYER_X
        else:
            player = PLAYER_O
            
        if player == PLAYER_X:
            action = choose_action(state)
        else:
            action = random.choice([action for action, mark in enumerate(state) if mark == EMPTY])
        
        state[action] = player
        print_board(state)
        
        if check_winner(state, player):
            if player == PLAYER_X:
                print("Player X wins!")
            else:
                print("Player O wins!")
            return
        
        player_x_turn = not player_x_turn
    
    print("It's a draw!")

# Helper function to print the board
def print_board(state):
    mapping = {EMPTY: " ", PLAYER_X: "X", PLAYER_O: "O"}
    board = [mapping[mark] for mark in state]
    print(f"{board[0]}|{board[1]}|{board[2]}\n-+-+-\n{board[3]}|{board[4]}|{board[5]}\n-+-+-\n{board[6]}|{board[7]}|{board[8]}")

# Play a game using Q-learning agent
play_with_q_learning()


NameError: name 'state' is not defined