In [1]:
class TicTacToe:
    def __init__(self):
        self.board = [' '] * 9
        self.current_player = 'X'

    def print_board(self):
        print('-------------')
        for i in range(3):
            print('|', self.board[i * 3], '|', self.board[i * 3 + 1], '|', self.board[i * 3 + 2], '|')
            print('-------------')

    def get_state(self):
        return [1 if x == 'X' else -1 if x == 'O' else 0 for x in self.board]

    def available_moves(self):
        return [i for i, x in enumerate(self.board) if x == ' ']

    def make_move(self, position):
        self.board[position] = self.current_player
        self.current_player = 'O' if self.current_player == 'X' else 'X'

    def check_winner(self):
        winning_combinations = [
            [0, 1, 2], [3, 4, 5], [6, 7, 8],
            [0, 3, 6], [1, 4, 7], [2, 5, 8],
            [0, 4, 8], [2, 4, 6]
        ]
        for combo in winning_combinations:
            if self.board[combo[0]] == self.board[combo[1]] == self.board[combo[2]] != ' ':
                return self.board[combo[0]]
        if ' ' not in self.board:
            return 'Draw'
        return None

    def reset(self):
        self.board = [' '] * 9
        self.current_player = 'X'
        return self.get_state()

    def step(self, action):
        if action not in self.available_moves():
            raise ValueError("Invalid action")

        self.make_move(action)
        winner = self.check_winner()
        if winner == 'X':
            return self.get_state(), 1, True  # X wins
        elif winner == 'O':
            return self.get_state(), -1, True  # O wins
        elif winner == 'Draw':
            return self.get_state(), 0, True  # Draw
        else:
            return self.get_state(), 0, False  # Continue playing


In [3]:
# import random

# class QLearningAgent:
#     def __init__(self, epsilon=0.1, alpha=0.5, gamma=1):
#         self.epsilon = epsilon  # Exploration rate
#         self.alpha = alpha  # Learning rate
#         self.gamma = gamma  # Discount factor
#         self.q_values = {}  # Dictionary to store Q-values

#     def get_q_value(self, state, action):
#         return self.q_values.get((state, action), 0.0)

#     def choose_action(self, state, available_actions):
#         if random.random() < self.epsilon:  # Exploration
#             return random.choice(available_actions)
#         else:  # Exploitation
#             best_actions = []
#             best_q_value = float('-inf')
#             for action in available_actions:
#                 q_value = self.get_q_value(state, action)
#                 if q_value > best_q_value:
#                     best_actions = [action]
#                     best_q_value = q_value
#                 elif q_value == best_q_value:
#                     best_actions.append(action)
#             return random.choice(best_actions)

#     def update_q_value(self, state, action, reward, next_state):
#         max_next_q_value = max([self.get_q_value(next_state, next_action) for next_action in next_state.available_moves()])
#         new_q_value = (1 - self.alpha) * self.get_q_value(state, action) + self.alpha * (reward + self.gamma * max_next_q_value)
#         self.q_values[(state, action)] = new_q_value


In [4]:
# def train_q_learning_agent(num_episodes):
#     agent = QLearningAgent()
#     env = TicTacToe()

#     for _ in range(num_episodes):
#         env.reset()
#         state = tuple(env.board)
#         done = False

#         while not done:
#             available_actions = env.available_moves()
#             action = agent.choose_action(state, available_actions)
#             env.make_move(action)
#             next_state = tuple(env.board)
#             reward = 0
            
#             winner = env.check_winner()
#             if winner == 'X':
#                 reward = 1
#                 done = True
#             elif winner == 'O':
#                 reward = -1
#                 done = True
#             elif winner == 'Draw':
#                 done = True
                
#             agent.update_q_value(state, action, reward, TicTacToe())
#             state = next_state

#     return agent

# trained_agent = train_q_learning_agent(num_episodes=10000)


In [6]:
# def self_play_train_q_learning_agent(num_episodes):
#     agent = QLearningAgent()
#     env = TicTacToe()

#     for _ in range(num_episodes):
#         env.reset()
#         states_actions_rewards = []

#         while True:
#             state = tuple(env.board)
#             available_actions = env.available_moves()
#             action = agent.choose_action(state, available_actions)
#             env.make_move(action)
#             next_state = tuple(env.board)
#             reward = 0
            
#             winner = env.check_winner()
#             if winner == 'X':
#                 reward = 1
#                 break
#             elif winner == 'O':
#                 reward = -1
#                 break
#             elif winner == 'Draw':
#                 break

#             states_actions_rewards.append((state, action, reward))
#             env.make_move(random.choice(env.available_moves()))

#         # Update Q-values using collected experience
#         for state, action, reward in states_actions_rewards:
#             agent.update_q_value(state, action, reward, TicTacToe())

#     return agent

# trained_agent = self_play_train_q_learning_agent(num_episodes=10000)


In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

class QNetwork(nn.Module):
    def __init__(self, input_size, output_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

import random

class NeuralQAgent:
    def __init__(self, input_size, output_size, epsilon=0.1, gamma=0.99):
        self.input_size = input_size
        self.output_size = output_size
        self.epsilon = epsilon  # Exploration rate
        self.gamma = gamma  # Discount factor
        self.q_network = QNetwork(input_size, output_size)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.000001)
        self.loss_function = nn.MSELoss()
        #self.loss_function = nn.CrossEntropyLoss()

    def choose_action(self, state, available_moves, testing=False):
        if not testing and random.random() < self.epsilon:
            return random.choice(available_moves)  # Exploration
        else:
            q_values = self.q_network(torch.tensor(state, dtype=torch.float32))

            # Only select valid moves
            while True:
                if torch.argmax(q_values).item() in available_moves:
                    return torch.argmax(q_values).item()  # Exploitation
                else:
                    q_values[torch.argmax(q_values).item()] = -math.inf

    def update_q_values(self, states, actions, rewards, next_states, dones):
        self.optimizer.zero_grad()

        # Convert lists to tensors
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        dones = torch.tensor(dones, dtype=torch.float32)

        # Compute predicted Q-values
        predicted_q_values = self.q_network(states)
        predicted_q_values = torch.gather(predicted_q_values, 1, actions.unsqueeze(1)).squeeze(1)

        # Compute target Q-values
        target_q_values = rewards + (1 - dones) * self.gamma * torch.max(self.q_network(next_states), dim=1)[0]

        # Compute loss
        loss = self.loss_function(predicted_q_values, target_q_values.detach())
        print(loss)

        # Update network weights
        loss.backward()
        self.optimizer.step()




def train_neural_q_agent_against_random(agent, env, num_episodes):

    # Play the given number of games
    for episode in range(num_episodes):

        # At the start of the game make sure the board is reset
        state = env.reset()
        done = False
        states, actions, rewards, next_states, dones = [], [], [], [], []

        while not done:

            # The agent currently only plays as 'X'
            action = agent.choose_action(state, env.available_moves())
            next_state, reward, done = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)

            # Make random move as 'O':
            if not done:
                random_action = random.choice(env.available_moves())
                next_state, reward, done = env.step(random_action)

            state = next_state
        
        if episode % 10000 == 0:
            print(episode)

        # Go back and punish bad moves and reward good ones
        # if rewards[-1] == -1:
        #    for r in rewards:
        #        r -= 0.01
        # if rewards[-1] == 1:
        #     for r in rewards:
        #         r += 0.1

        agent.update_q_values(states, actions, rewards, next_states, dones)
        

In [44]:
# Low epsilon (0.05) led to more losses 
# I think making it train against random has improved draw performance since the code didn't look absolutely correct
# Increasing the size of the network by adding an additional layer increased the number of draws (1 time it did at least)
# However this has increased the length of time it takes to train

# Nevermind, the loss is blowing up for some reason. So maybe switch to cross-entropy loss.
# This switch does not actually make sense

# Lowering the learning rate stopped the loss from blowing up and gave an agent with (600 wins)
# Yes, decreasing the learning rate was the problem. After lowering again got an agent with (700 wins) with 10000 episodes
# Increasing the number of episodes does not improve (but actually hurts). But this is a good point to save and then 
# try other things

# Punishing past moves does not seem to make much of a difference, probabably need to consider stopping rules
# We could also try different loss, larger network, playing against better opponent

trained_agent = NeuralQAgent(9, 9, epsilon=0.1)
env = TicTacToe()

train_neural_q_agent_against_random(trained_agent, env, 10000)

0
tensor(0.2959, grad_fn=<MseLossBackward0>)
tensor(0.2304, grad_fn=<MseLossBackward0>)
tensor(0.1752, grad_fn=<MseLossBackward0>)
tensor(0.2432, grad_fn=<MseLossBackward0>)
tensor(0.0042, grad_fn=<MseLossBackward0>)
tensor(0.2300, grad_fn=<MseLossBackward0>)
tensor(0.0002, grad_fn=<MseLossBackward0>)
tensor(0.0020, grad_fn=<MseLossBackward0>)
tensor(0.0011, grad_fn=<MseLossBackward0>)
tensor(0.1961, grad_fn=<MseLossBackward0>)
tensor(0.2139, grad_fn=<MseLossBackward0>)
tensor(0.0012, grad_fn=<MseLossBackward0>)
tensor(0.2655, grad_fn=<MseLossBackward0>)
tensor(0.2357, grad_fn=<MseLossBackward0>)
tensor(0.0036, grad_fn=<MseLossBackward0>)
tensor(0.1755, grad_fn=<MseLossBackward0>)
tensor(0.0287, grad_fn=<MseLossBackward0>)
tensor(0.2292, grad_fn=<MseLossBackward0>)
tensor(0.1978, grad_fn=<MseLossBackward0>)
tensor(0.1795, grad_fn=<MseLossBackward0>)
tensor(0.2002, grad_fn=<MseLossBackward0>)
tensor(0.2445, grad_fn=<MseLossBackward0>)
tensor(0.0060, grad_fn=<MseLossBackward0>)
tensor(0.

In [126]:
def play_against_agent(agent):
    env = TicTacToe()

    while True:
        env.print_board()
        print("Your turn (Enter position 0-8): ")
        user_input = input()
        try:
            user_action = int(user_input)
            if user_action not in env.available_moves():
                print("Invalid move. Please try again.")
                continue
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        env.make_move(user_action)
        winner = env.check_winner()
        if winner == 'X':
            env.print_board()
            print("You win!")
            break
        elif winner == 'Draw':
            env.print_board()
            print("It's a draw!")
            break
        elif winner == 'O':
            env.print_board()
            print("You lost!")
            break

        # Agent's turn
        #state = tuple(env.board)
        state = env.get_state()
        available_actions = env.available_moves()
        action = agent.choose_action(state, available_actions, testing=True)
        env.make_move(action)
        winner = env.check_winner()
        if winner == 'O':
            env.print_board()
            print("You lost!")
            break
        elif winner == 'Draw':
            env.print_board()
            print("It's a draw!")
            break

    print("Game Over.")

# Example usage:
play_against_agent(trained_agent)


-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Your turn (Enter position 0-8): 
-------------
| X |   | O |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Your turn (Enter position 0-8): 
-------------
| X |   | O |
-------------
| X |   |   |
-------------
| O |   |   |
-------------
Your turn (Enter position 0-8): 
-------------
| X |   | O |
-------------
| X | X |   |
-------------
| O |   | O |
-------------
Your turn (Enter position 0-8): 
-------------
| X |   | O |
-------------
| X | X | X |
-------------
| O |   | O |
-------------
You win!
Game Over.


In [16]:

"""Random Player with X will get about 50% wins, 10% draws, 40% losses"""
# def play_random_vs_random():
#     env = TicTacToe()
#     wins = 0
#     draws = 0 
#     loses = 0  
#     for game in range(100000):
#         while True:
            
#             env.make_move(random.choice(env.available_moves()))
#             winner = env.check_winner()
#             if winner == 'X':
#                 wins+=1
#                 env.reset()
#                 break
#             elif winner == 'Draw':
#                 env.reset()
#                 draws += 1
#                 break
#             elif winner == 'O':
#                 env.reset()
#                 loses += 1
#                 break

#             # Agent's turn
#             #state = tuple(env.board)
            
#             env.make_move(random.choice(env.available_moves()))
#             winner = env.check_winner()
#             if winner == 'O':
#                 loses += 1
#                 break
#             elif winner == 'Draw':

#                 draws += 1
#                 env.reset()
#                 break
            

#     return wins, draws, loses

# play_random_vs_random()

(51250, 9827, 38923)

In [47]:


def play_against_random(agent):
    env = TicTacToe()
    wins = 0
    draws = 0 
    loses = 0  
    for game in range(1000):
        while True:
            state = env.get_state()
            available_actions = env.available_moves()
            action = agent.choose_action(state, available_actions, testing=True)
            env.make_move(action)
            winner = env.check_winner()
            if winner == 'X':
                wins+=1
                env.reset()
                break
            elif winner == 'Draw':
                env.reset()
                draws += 1
                break
            elif winner == 'O':
                env.reset()
                loses += 1
                break

            # Agent's turn
            #state = tuple(env.board)
            
            env.make_move(random.choice(env.available_moves()))
            winner = env.check_winner()
            if winner == 'O':
                loses += 1
                break
            elif winner == 'Draw':

                draws += 1
                env.reset()
                break
            elif winner == 'X':
                wins+=1
                env.reset()
                break

    return wins, draws, loses
# Example usage:
play_against_random(trained_agent)

(336, 138, 526)