In [None]:
import gym
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box
import random
from tqdm.auto import tqdm

In [None]:
class TicTacToeEnv(Env):
    """Tic-Tac-Toe where the agent plays 'X' against a uniformly random 'O' opponent.

    The board is a flat list of 9 cells indexed row-major (0-2 top row, 3-5
    middle row, 6-8 bottom row). Observations are length-9 integer arrays with
    0 = empty, 1 = 'X' (agent), 2 = 'O' (opponent).

    Rewards returned by ``step``:
        +1  the agent completes a line
        -1  the opponent completes a line
         0  draw, or a legal move that does not end the game
        ``invalid_move_reward``  the agent picks an occupied cell

    Args:
        invalid_move_reward: reward handed back when the chosen cell is already
            occupied. Defaults to -1 so that an illegal move is distinguishable
            from a legal, non-terminal one; pass 0 to make illegal moves a
            silent no-op.
    """

    # Index triples (rows, columns, diagonals) that make a winning line.
    WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    # Board token -> integer code used in observations.
    TOKEN_CODES = {' ': 0, 'X': 1, 'O': 2}

    def __init__(self, invalid_move_reward=-1):
        self.state = [' '] * 9
        self.action_space = Discrete(9)
        self.observation_space = Box(low=0, high=2, shape=(9,), dtype=int)
        self.done = False
        self.invalid_move_reward = invalid_move_reward

    def reset(self):
        """Clear the board, mark the game as running and return the empty observation."""
        self.state = [' '] * 9
        self.done = False
        return self._get_obs()

    def step(self, action):
        """Play 'X' at ``action``, then let the random opponent answer with 'O'.

        Args:
            action: board index in the range 0-8.

        Returns:
            ``(observation, reward, done, info)``. ``info`` is empty except for
            an illegal move, where it is ``{'invalid_move': True}``.

        Raises:
            IndexError: if ``action`` is outside 0-8. Negative indices are
                rejected explicitly because Python indexing would otherwise
                wrap them around to the end of the board.
        """
        if not 0 <= action < len(self.state):
            raise IndexError(f"action {action} is outside the board (expected 0-8)")

        # The game is already over: report the terminal state again rather
        # than letting further moves alter a finished board.
        if self.done:
            return self._get_obs(), 0, True, {}

        # Occupied cell: the board is left untouched and the episode goes on,
        # but the move is penalised and flagged.
        if self.state[action] != ' ':
            return self._get_obs(), self.invalid_move_reward, False, {'invalid_move': True}

        self.state[action] = 'X'
        if self._check_win('X'):
            return self._finish(1)
        if ' ' not in self.state:
            return self._finish(0)

        # Opponent reply: uniformly random over the remaining empty cells.
        opponent_action = random.choice([i for i, x in enumerate(self.state) if x == ' '])
        self.state[opponent_action] = 'O'
        if self._check_win('O'):
            return self._finish(-1)
        if ' ' not in self.state:
            return self._finish(0)

        return self._get_obs(), 0, False, {}

    def _finish(self, reward):
        """Record that the game is over and build the terminal step result."""
        self.done = True
        return self._get_obs(), reward, True, {}

    def _get_obs(self):
        """Return the board as an integer array (0 = empty, 1 = 'X', 2 = 'O')."""
        return np.array([self.TOKEN_CODES[cell] for cell in self.state])

    def _check_win(self, player):
        """Return True if ``player`` ('X' or 'O') occupies all three cells of any winning line."""
        return any(
            all(self.state[i] == player for i in line)
            for line in self.WIN_LINES
        )

In [4]:
# `tqdm` is already imported (from `tqdm.auto`) in the imports cell at the top
# of the notebook; re-importing the plain variant here would shadow it.
env = TicTacToeEnv()

# Each of the 9 cells takes one of 3 values, and encode_state maps a board to
# a base-3 integer, so there are 3**9 possible row indices.
N_STATES = 3 ** 9
q_table = np.zeros((N_STATES, env.action_space.n))

def encode_state(state):
    """Map a 9-cell board observation to a single integer.

    The cells are read as base-3 digits, with cell 0 as the least significant
    digit, so every distinct board gets a distinct index.
    """
    code = 0
    place_value = 1
    for cell_index in range(9):
        code += state[cell_index] * place_value
        place_value *= 3
    return code

# Q-learning hyper-parameters: learning rate, discount factor and the initial
# exploration probability, which is multiplied by epsilon_decay after each episode.
alpha, gamma, epsilon = 0.1, 0.9, 1.0
epsilon_decay = 0.995
n_episodes = 1000

# Both the exploration below and the environment's opponent draw from the
# global `random` module, so seeding it makes a training run repeatable.
seed = 42
random.seed(seed)

for episode in tqdm(range(n_episodes), desc="Training Progress"):
    state = env.reset()
    done = False
    while not done:
        state_encoded = encode_state(state)

        # Only empty cells (observation value 0) are playable. Choosing among
        # them avoids spending steps on moves that leave the board unchanged.
        legal_actions = np.flatnonzero(state == 0)
        if random.uniform(0, 1) < epsilon:
            action = random.choice(legal_actions.tolist())
        else:
            action = int(legal_actions[np.argmax(q_table[state_encoded][legal_actions])])

        next_state, reward, done, _ = env.step(action)

        if done:
            # Nothing follows a terminal state, so there is no future value.
            future_value = 0.0
        else:
            next_state_encoded = encode_state(next_state)
            next_legal_actions = np.flatnonzero(next_state == 0)
            future_value = np.max(q_table[next_state_encoded][next_legal_actions])

        td_target = reward + gamma * future_value
        q_table[state_encoded][action] += alpha * (td_target - q_table[state_encoded][action])
        state = next_state

    epsilon *= epsilon_decay

Training Progress: 100%|██████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 458.81it/s]


In [7]:
def display_state(state):
    """Print a 9-cell board observation as a 3x3 grid.

    Cells equal to 1 are shown as 'X', cells equal to 2 as 'O', and anything
    else as a blank. Rows are separated by a dashed line and the grid is
    followed by blank lines.
    """
    cells = []
    for value in state:
        if value == 1:
            cells.append('X')
        elif value == 2:
            cells.append('O')
        else:
            cells.append(' ')

    for row_start in (0, 3, 6):
        if row_start > 0:
            print("---------")
        left, middle, right = cells[row_start], cells[row_start + 1], cells[row_start + 2]
        print(f"{left} | {middle} | {right}")
    print("\n")

# Play one game with the learned greedy policy and print every position.
state = env.reset()
done = False
print("Testing the agent's gameplay for a single instance:")

while not done:
    state_encoded = encode_state(state)
    # Choose the best-valued action among the empty cells only (observation
    # value 0). An unrestricted argmax could keep selecting an occupied cell,
    # which leaves the board unchanged and would make this loop spin forever.
    legal_actions = np.flatnonzero(state == 0)
    action = int(legal_actions[np.argmax(q_table[state_encoded][legal_actions])])
    state, reward, done, _ = env.step(action)
    print(f"Agent's Action: {action}")
    display_state(state)
    print(f"Reward: {reward}\n")

print("Testing completed.")


Testing the agent's gameplay for a single instance:
Agent's Action: 1
  | X |  
---------
  |   | O
---------
  |   |  


Reward: 0

Agent's Action: 0
X | X |  
---------
  |   | O
---------
O |   |  


Reward: 0

Agent's Action: 4
X | X |  
---------
  | X | O
---------
O |   | O


Reward: 0

Agent's Action: 7
X | X |  
---------
  | X | O
---------
O | X | O


Reward: 1

Testing completed.
