In [30]:
import numpy as np
import random

# 1. Set up the environment
def display_board(board):
    """Print the board as a text grid.

    Cells holding 1 are drawn as "X", cells holding -1 as "O", and every
    other value as a blank. A nine-dash rule is printed above each row and
    once more after the last row.

    Args:
        board: Iterable of rows, each an iterable of cell values.
    """
    symbols = {1: "X", -1: "O"}
    rule = "-" * 9
    for cells in board:
        print(rule)
        print(" | ".join(symbols.get(value, " ") for value in cells))
    print(rule)

def is_valid_move(board, row, col):
    """Report whether the cell at (row, col) is empty, i.e. holds 0.

    Args:
        board: Two-level indexable grid of cell values.
        row: Row index of the cell.
        col: Column index of the cell.

    Returns:
        The result of comparing the cell's value with 0.
    """
    occupant = board[row][col]
    return occupant == 0

def check_win(board, player):
    """Return True if `player` fully occupies any row, column, or diagonal.

    The board is normalised with ``np.asarray`` so nested lists work as well
    as NumPy arrays; indexing ``.T`` directly on a list would raise
    AttributeError.

    Args:
        board: Square 2-D grid (NumPy array or nested sequences) of marks.
        player: The mark to look for (the game uses 1 for "X", -1 for "O").

    Returns:
        bool: True when at least one complete line belongs to `player`.
    """
    grid = np.asarray(board)
    owned = grid == player  # boolean mask of the cells held by `player`

    # A row/column wins when every cell along that axis is owned.
    if owned.all(axis=1).any() or owned.all(axis=0).any():
        return True

    # Main diagonal, then the anti-diagonal (main diagonal of the mirror image).
    return bool(owned.diagonal().all() or np.fliplr(owned).diagonal().all())

def check_draw(board):
    """Return whether the board is completely filled (no cell equals 0).

    This only tests fullness; callers are expected to rule out a win first.
    The board is normalised with ``np.asarray`` because comparing a nested
    list with ``!= 0`` yields a single ``True`` rather than an element-wise
    mask, which would report every list board as full.

    Args:
        board: 2-D grid (NumPy array or nested sequences) of cell values.

    Returns:
        Truthy when every cell is non-zero, falsy otherwise.
    """
    return np.all(np.asarray(board) != 0)

# 2. Define the Q-learning agent

class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent for a 3x3 board.

    States are boards converted to nested tuples; actions are cell indices
    0-8 in row-major order (``row, col = divmod(action, 3)``). Q-values live
    in ``q_table[state][action]`` and any missing entry counts as 0.
    """

    def __init__(self, epsilon, alpha, gamma):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            epsilon: Probability of choosing a random move in `get_action`.
            alpha: Step size applied to each temporal-difference update.
            gamma: Weight of the best next-state value in `learn`.
        """
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = {}  # Q-value table

    @staticmethod
    def _state_key(state):
        """Convert a 2-D board into a hashable tuple-of-tuples key."""
        return tuple(map(tuple, state))

    @staticmethod
    def _empty_cells(state):
        """List the indices (0-8) of the cells that still hold 0."""
        return [i for i in range(9) if state[i // 3][i % 3] == 0]

    def get_action(self, state):
        """Choose a move for `state` with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random empty cell is returned.
        Otherwise the empty cell with the highest Q-value is returned, and
        ties (including a state that has never been seen) are broken
        uniformly at random instead of always favouring the lowest index.

        Args:
            state: 3x3 board; a cell equal to 0 is empty.

        Returns:
            int: Index 0-8 of the chosen empty cell.

        Raises:
            IndexError: If the board has no empty cell.
        """
        valid_moves = self._empty_cells(state)
        if not valid_moves:
            # Same exception type that random.choice([]) raised before,
            # but with a message that says what actually went wrong.
            raise IndexError("no valid moves: the board is full")

        if np.random.rand() < self.epsilon:
            return random.choice(valid_moves)

        q_values = self.q_table.get(self._state_key(state), {})
        best_value = max(q_values.get(move, 0) for move in valid_moves)
        best_moves = [
            move for move in valid_moves if q_values.get(move, 0) == best_value
        ]
        return random.choice(best_moves)

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update to ``Q(state, action)``.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))

        The maximum ranges over the empty cells of `next_state`; when there
        are none the future term is 0 and only the reward is used.

        Args:
            state: Board the action was taken from.
            action: Key of the action taken (normally a cell index 0-8).
            reward: Immediate reward for the transition.
            next_state: Board observed after the transition.
        """
        q_state = self.q_table.setdefault(self._state_key(state), {})
        # The next state is registered too, matching the table's existing
        # layout where every visited board has an entry.
        q_next = self.q_table.setdefault(self._state_key(next_state), {})

        current = q_state.setdefault(action, 0)
        best_future = max(
            (q_next.get(move, 0) for move in self._empty_cells(next_state)),
            default=0,
        )
        q_state[action] = current + self.alpha * (
            reward + self.gamma * best_future - current
        )

# 3. Train the reinforcement learning model
def play_game(agent1, agent2, board):
    """Play one game between two agents, updating both as it goes.

    `agent1` moves first and marks cells with 1; `agent2` marks cells with
    -1. Every Q update is keyed on the position the agent actually moved
    from, because that is the position `get_action` looks up later.

    Each agent's move is scored with reward 0 once the opponent has replied
    (the reply position is the agent's next state), and with the final
    outcome when the game ends: +1 for the winner, -1 for the loser, 0 for
    both on a draw.

    Args:
        agent1: Agent that moves first; must provide `get_action` and `learn`.
        agent2: Agent that moves second; same interface.
        board: Starting 3x3 NumPy board. It is copied, not modified.

    Returns:
        int: 1 if `agent1` wins, -1 if `agent2` wins, 0 for a draw.
    """
    state = board.copy()
    seats = ((agent1, 1), (agent2, -1))
    # Most recent (board_before_move, action) per mark, still awaiting its
    # follow-up position or the final result.
    last_move = {1: None, -1: None}
    turn = 0

    while True:
        agent, mark = seats[turn % 2]
        rival, rival_mark = seats[(turn + 1) % 2]

        # The rival has replied and the game is still on, so the mover's
        # previous action can be scored against the position it now faces.
        if last_move[mark] is not None:
            prev_state, prev_action = last_move[mark]
            agent.learn(prev_state, prev_action, 0, state)

        before = state.copy()
        action = agent.get_action(state)
        row, col = divmod(action, 3)
        state[row][col] = mark
        last_move[mark] = (before, action)

        won = check_win(state, mark)
        if won or check_draw(state):
            # Final positions are never moved from, so as long as no update
            # is keyed on them their Q entries stay empty and the
            # bootstrapped future value used by `learn` is 0.
            mover_reward = 1 if won else 0
            agent.learn(before, action, mover_reward, state)
            if last_move[rival_mark] is not None:
                rival_state, rival_action = last_move[rival_mark]
                rival.learn(rival_state, rival_action, -mover_reward, state)
            return mark if won else 0

        turn += 1

def train_q_learning_agents(agent1, agent2, num_episodes):
    """Run `num_episodes` self-play games, alternating who moves first.

    Even-numbered episodes are played with `agent1` moving first, odd ones
    with `agent2` moving first, each on a fresh empty 3x3 board.

    All Q-value updates are made inside `play_game`. No extra `learn` call
    is issued here: an update needs the (state, action) pair that was really
    played, which this function does not have. Updating the empty board with
    a `None` action only writes a Q-table entry that nothing ever reads.

    Args:
        agent1: First agent to train.
        agent2: Second agent to train.
        num_episodes: Number of games to play; zero or less plays none.
    """
    for episode in range(num_episodes):
        board = np.zeros((3, 3), dtype=int)
        if episode % 2 == 0:
            play_game(agent1, agent2, board)
        else:
            play_game(agent2, agent1, board)

# 4. Test the model
if __name__ == "__main__":
    agent1 = QLearningAgent(epsilon=0.2, alpha=0.1, gamma=0.9)
    agent2 = QLearningAgent(epsilon=0.2, alpha=0.1, gamma=0.9)

    # Train the agents
    train_q_learning_agents(agent1, agent2, num_episodes=10000)

    # Exploration is only useful while training. Switch it off so the
    # demonstration game shows the learned policy instead of making a random
    # move one time in five.
    agent1.epsilon = 0.0
    agent2.epsilon = 0.0

    # Play a game between the trained agents: agent1 marks 1 ("X") and moves
    # first, agent2 marks -1 ("O"). The board is shown before every move and
    # once more when the game ends.
    board = np.zeros((3, 3), dtype=int)
    players = (
        (agent1, 1, "Agent 1 wins!"),
        (agent2, -1, "Agent 2 wins!"),
    )
    turn = 0
    while True:
        agent, mark, win_message = players[turn % 2]
        display_board(board)
        action = agent.get_action(board)
        row, col = divmod(action, 3)
        board[row][col] = mark
        if check_win(board, mark):
            display_board(board)
            print(win_message)
            break
        if check_draw(board):
            display_board(board)
            print("It's a draw!")
            break
        turn += 1


---------
  |   |  
---------
  |   |  
---------
  |   |  
---------
---------
X |   |  
---------
  |   |  
---------
  |   |  
---------
---------
X | O |  
---------
  |   |  
---------
  |   |  
---------
---------
X | O | X
---------
  |   |  
---------
  |   |  
---------
---------
X | O | X
---------
O |   |  
---------
  |   |  
---------
---------
X | O | X
---------
O | X |  
---------
  |   |  
---------
---------
X | O | X
---------
O | X | O
---------
  |   |  
---------
---------
X | O | X
---------
O | X | O
---------
X |   |  
---------
Agent 1 wins!


Sure! Let me walk you through the code step by step.

### Imports
```python
import numpy as np
import random
```
- `numpy` is imported as `np` for handling arrays (especially for the board).
- `random` is imported for making random decisions (used in the exploration step of Q-learning).

### Board Display Function
```python
def display_board(board):
    for row in board:
        print("-" * 9)
        print(" | ".join(["X" if cell == 1 else "O" if cell == -1 else " " for cell in row]))
    print("-" * 9)
```
- **`display_board(board)`** prints the Tic-Tac-Toe board in a human-readable format.
  - It loops through each row of the board (a 2D array).
  - It prints a separator line `"-" * 9` for visual clarity.
  - The row is printed with `"X"` for 1, `"O"` for -1, and `" "` for empty cells (`0`).
  - Finally, another separator line is printed after the last row.

### Valid Move Checker
```python
def is_valid_move(board, row, col):
    return board[row][col] == 0
```
- **`is_valid_move(board, row, col)`** checks whether a move is valid by ensuring the selected cell is empty (`0`).

### Win Checker
```python
def check_win(board, player):
    # Check rows, columns, and diagonals for a win
    for row in board:
        if all(cell == player for cell in row):
            return True
    for col in board.T:
        if all(cell == player for cell in col):
            return True
    if all(board[i][i] == player for i in range(3)) or all(board[i][2 - i] == player for i in range(3)):
        return True
    return False
```
- **`check_win(board, player)`** checks if the given player (1 for "X" and -1 for "O") has won.
  - It checks each row, each column (using the transpose of the board), and both diagonals.
  - If any row, column, or diagonal is fully occupied by the player’s symbol (`1` or `-1`), it returns `True`, meaning the player has won.

### Draw Checker
```python
def check_draw(board):
    return np.all(board != 0)
```
- **`check_draw(board)`** returns `True` if the board is full (i.e., no cells contain `0`), meaning the game has ended in a draw.

### Q-Learning Agent Class
```python
class QLearningAgent:
    def __init__(self, epsilon, alpha, gamma):
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = {}  # Q-value table
```
- **`QLearningAgent`** is a class that represents a Q-learning agent.
  - `epsilon`: The exploration rate, determining the likelihood of choosing a random move over a learned move.
  - `alpha`: The learning rate, which determines how much new experiences affect the agent's learning.
  - `gamma`: The discount factor, which determines the importance of future rewards in the agent’s decision-making.
  - `q_table`: A dictionary to store the Q-values (the quality of each action in a given state).

### Action Selection (Exploration vs Exploitation)
```python
def get_action(self, state):
    state_tuple = tuple(map(tuple, state))
    if np.random.rand() < self.epsilon:
        valid_moves = [i for i in range(9) if state[i // 3][i % 3] == 0]
        return random.choice(valid_moves)
    else:
        return max(
            (i for i in range(9) if state[i // 3][i % 3] == 0),
            key=lambda i: self.q_table.get(state_tuple, {}).get(i, 0),
            default=random.choice([i for i in range(9) if state[i // 3][i % 3] == 0])
        )
```
- **`get_action(self, state)`** determines which action (move) the agent should take.
  - It first converts the board (state) to a tuple to be used as a key in the `q_table`.
  - If a random number is less than `epsilon`, it explores by choosing a random valid move.
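  - Otherwise, it exploits its learned knowledge and selects the empty cell with the highest Q-value from the `q_table`. Missing Q-values count as `0`, so when several cells tie (for example in a state the agent has never seen) `max` returns the first, lowest-numbered empty cell. The `default=` argument would only be used if there were no empty cells at all, and because it is evaluated before `max` runs, `random.choice` raises `IndexError` on a full board.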

### Learning from Experience
```python
def learn(self, state, action, reward, next_state):
    state_tuple = tuple(map(tuple, state))
    next_state_tuple = tuple(map(tuple, next_state))

    if state_tuple not in self.q_table:
        self.q_table[state_tuple] = {}
    if next_state_tuple not in self.q_table:
        self.q_table[next_state_tuple] = {}

    if action not in self.q_table[state_tuple]:
        self.q_table[state_tuple][action] = 0

    best_next_action = max(
        (i for i in range(9) if next_state[i // 3][i % 3] == 0),
        key=lambda i: self.q_table.get(next_state_tuple, {}).get(i, 0),
        default=None
    )

    if best_next_action is not None:
        self.q_table[state_tuple][action] += self.alpha * (
            reward + self.gamma * self.q_table.get(next_state_tuple, {}).get(best_next_action, 0)
            - self.q_table[state_tuple].get(action, 0)
        )
    else:
        self.q_table[state_tuple][action] += self.alpha * (reward - self.q_table[state_tuple].get(action, 0))
```
- **`learn(self, state, action, reward, next_state)`** updates the Q-values based on the agent’s experience.
  - It ensures both `state_tuple` and `next_state_tuple` exist in the `q_table`.
  - If the `action` has no Q-value, it initializes it to `0`.
  - The agent calculates the best next action's Q-value and updates the Q-value for the current action using the Q-learning formula:
    \[
    Q(s, a) \leftarrow Q(s, a) + \alpha \times \left[ R + \gamma \times \max_{a'}Q(s', a') - Q(s, a) \right]
    \]
  - If no valid next action exists, it simply updates the Q-value based on the reward.

### Game Play Function
```python
def play_game(agent1, agent2, board):
    state = board.copy()
    while True:
        action1 = agent1.get_action(state)
        row, col = divmod(action1, 3)
        state[row][col] = 1
        if check_win(state, 1):
            agent1.learn(state, action1, 1, state)
            return 1  # Agent 1 wins
        if check_draw(state):
            return 0  # Draw
        action2 = agent2.get_action(state)
        row, col = divmod(action2, 3)
        state[row][col] = -1
        if check_win(state, -1):
            agent1.learn(state, action1, -1, state)
            return -1  # Agent 2 wins
```
- **`play_game(agent1, agent2, board)`** simulates a game between two Q-learning agents (`agent1` and `agent2`).
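  - The two agents alternate moves on a copy of the board: the first argument plays `1` ("X") and the second plays `-1` ("O").
  - The function returns `1` if the first agent wins, `-1` if the second agent wins, and `0` if the board fills up with no winner.
  - `learn` is called only when the game ends with a winner, and only for the first agent: with reward `+1` if it won and `-1` if it lost.
  - No `learn` call is made on a draw or after intermediate moves, and the second agent is not updated inside this function.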

### Training the Agents
```python
def train_q_learning_agents(agent1, agent2, num_episodes):
    for episode in range(num_episodes):
        board = np.zeros((3, 3), dtype=int)
        if episode % 2 == 0:
            result = play_game(agent1, agent2, board)
            if result == 1:
                agent1.learn(board, None, 1, board)
                agent2.learn(board, None, -1, board)
            elif result == -1:
                agent1.learn(board, None, -1, board)
                agent2.learn(board, None, 1, board)
            else:
                agent1.learn(board, None, 0, board)
                agent2.learn(board, None, 0, board)
        else:
            result = play_game(agent2, agent1, board)
            if result == 1:
                agent1.learn(board, None, -1, board)
                agent2.learn(board, None, 1, board)
            elif result == -1:
                agent1.learn(board, None, 1, board)
                agent2.learn(board, None, -1, board)
            else:
                agent1.learn(board, None, 0, board)
                agent2.learn(board, None, 0, board)
```
- **`train_q_learning_agents(agent1, agent2, num_episodes)`** trains the agents over multiple episodes.
  - In each episode, the agents play a game and learn from the outcome.
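  - Depending on whether `episode % 2 == 0`, the order of agents is swapped, so each agent alternates between moving first and moving second.
  - After each game, both agents call `learn` with the result as the reward (+1 for a win, -1 for a loss, 0 for a draw). Note that these calls pass the empty starting board as both `state` and `next_state`, and `None` as the action.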

### Main Loop
```python
if __name__ == "__main__":
    epsilon = 0.1
    alpha = 0.5
    gamma = 0.9
    agent1 = QLearningAgent(epsilon, alpha, gamma)
    agent2 = QLearningAgent(epsilon, alpha, gamma)
    num_episodes = 10000
    train_q_learning_agents(agent1, agent2, num_episodes)
```
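- **`__main__`** initializes two agents (`agent1` and `agent2`).
  - It sets the Q-learning parameters: `epsilon`, `alpha`, and `gamma`. (This snippet uses `epsilon = 0.1` and `alpha = 0.5`; the code cell at the top uses `epsilon=0.2` and `alpha=0.1`.)
  - It trains the agents for 10,000 episodes using `train_q_learning_agents`.
  - The code cell at the top additionally plays one demonstration game between the trained agents, printing the board before every move.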

This implementation sets up and trains two Q-learning agents to play Tic-Tac-Toe. The agents learn optimal strategies through repeated games and experience. The learning process updates the Q-values and adjusts the agents' decision-making based on the outcome of each game.