In [1]:
import random
import math
from copy import deepcopy
from datetime import datetime, timedelta

In [8]:
def monte_carlo_tree_search(root, policy_net, max_seconds=3, max_milliseconds=0):
    end_time = datetime.now() + timedelta(seconds=max_seconds, milliseconds=max_milliseconds)
    node = root
    while datetime.now() < end_time:
        while node and not node.untried_moves and node.children:
            current_available_moves = node.game_state.available_moves()
            node = node.best_child(current_available_moves)
        if node and node.untried_moves:
            node = node.expand()
        while node and not node.game_state.is_terminal():
            possible_moves = node.game_state.available_moves()
            chosen_move = random.choice(possible_moves)
            node.game_state.make_move(chosen_move)
            node.game_state.switch_player()
        result = 1 if node and node.game_state.is_winner(node.player_just_moved) else 0
        while node:
            node.update(result)
            node = node.parent
    if root.children:
        root_available_moves = root.game_state.available_moves()
        best_child = root.best_child(root_available_moves)
        return best_child.move if best_child else None
    return None


In [9]:
def compute_loss(action_probabilities, action, outcome):
    action_tensor = torch.tensor([action], dtype=torch.long)
    outcome_tensor = torch.tensor([outcome], dtype=torch.float32)
    loss = nn.CrossEntropyLoss()(action_probabilities, action_tensor) * outcome_tensor
    return loss

In [10]:
import numpy as np
import torch
from copy import deepcopy
from datetime import datetime, timedelta
import torch.nn as nn
import torch.optim as optim

import os

class ConnectFour:
    def __init__(self):
        self.board = [[0]*7 for _ in range(6)]
        self.current_player = 1  # Start with player 1

    def make_move(self, column):
        if self.board[0][column] != 0:
            return False  # Column is full
        for row in reversed(range(6)):
            if self.board[row][column] == 0:
                self.board[row][column] = self.current_player
                return True
        return False

    def available_moves(self):
        return [c for c in range(7) if self.board[0][c] == 0]

    def is_winner(self, player):
        for row in range(6):
            for col in range(4):
                if all(self.board[row][col+i] == player for i in range(4)):
                    return True
        for col in range(7):
            for row in range(3):
                if all(self.board[row+i][col] == player for i in range(4)):
                    return True
        for row in range(3):
            for col in range(4):
                if all(self.board[row+i][col+i] == player for i in range(4)):
                    return True
                if all(self.board[row+3-i][col+i] == player for i in range(4)):
                    return True
        return False

    def is_draw(self):
        return all(self.board[0][col] != 0 for col in range(7))

    def is_terminal(self):
        return self.is_winner(1) or self.is_winner(2) or self.is_draw()

    def switch_player(self):
        self.current_player = 2 if self.current_player == 1 else 1

class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.conv1 = nn.Conv2d(2, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(128 * 6 * 7, 128)
        self.fc2 = nn.Linear(128, 7)  # Output probabilities for each column

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(-1, 128 * 6 * 7)
        x = torch.relu(self.fc1(x))
        return torch.softmax(self.fc2(x), dim=1)

class Node:
    def __init__(self, game_state, policy_net, move=None, parent=None):
        self.game_state = deepcopy(game_state)
        self.policy_net = policy_net
        self.move = move
        self.parent = parent
        self.children = []
        self.wins = 0
        self.visits = 0
        self.action_probabilities = None
        self.player_just_moved = game_state.current_player
        self.untried_moves = game_state.available_moves()

    def state_to_tensor(self):
        # Initialize a tensor for two channels. Each channel corresponds to one player.
        state_tensor = torch.zeros((1, 2, 6, 7), dtype=torch.float32)  # Shape [batch_size, num_channels, height, width]
        for r in range(6):
            for c in range(7):
                player = self.game_state.board[r][c]
                if player == 1:
                    state_tensor[0, 0, r, c] = 1  # Set player 1's layer
                elif player == 2:
                    state_tensor[0, 1, r, c] = 1  # Set player 2's layer
        return state_tensor


    # Ensure this method is called before feeding data into your neural network
    def fetch_policy_probabilities(self):
        input_tensor = self.state_to_tensor()
        print(f"Input Tensor Shape: {input_tensor.shape}")  # Debugging line to verify tensor shape
        with torch.no_grad():
            output = self.policy_net(input_tensor)
            self.action_probabilities = output.squeeze().numpy()

    def select_probabilistic_move(self):
        move_probabilities = {move: self.action_probabilities[move] for move in self.untried_moves}
        total = sum(move_probabilities.values())
        probabilities = [move_probabilities[move] / total for move in self.untried_moves]
        move = np.random.choice(self.untried_moves, p=probabilities)
        self.untried_moves.remove(move)
        return move

    def expand(self):
        if self.action_probabilities is None:
            self.fetch_policy_probabilities()
        move = self.select_probabilistic_move()
        next_state = deepcopy(self.game_state)
        next_state.make_move(move)
        next_state.switch_player()
        child_node = Node(next_state, self.policy_net, move, self)
        self.children.append(child_node)
        return child_node

    def update(self, result):
        self.visits += 1
        if result == 1:
            self.wins += result

    def best_child(self, available_moves):
        legal_children = [child for child in self.children if child.move in available_moves]
        choices_weights = [(child.wins / child.visits if child.visits > 0 else 0) +
                           np.sqrt(2 * np.log(self.visits) / child.visits if child.visits > 0 else float('inf'))
                           for child in legal_children]
        return legal_children[choices_weights.index(max(choices_weights))] if choices_weights else None



class Trainer:
    def __init__(self, iterations=1):
        self.game = ConnectFour()
        self.iterations = iterations
        self.policy_net = PolicyNetwork()  # Instantiate the PolicyNetwork
        model_path = 'path_to_trained_policy_net.pth'
        if os.path.exists(model_path):
            self.policy_net.load_state_dict(torch.load(model_path))  # Load the trained model if it exists
            print("Loaded trained model.")
        else:
            print("No trained model found. Starting training from scratch.")
        self.policy_net.eval()  # Set to evaluation mode
    
    def train(self):
        optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=0.001)  # Setup optimizer
        self.policy_net.train()  # Set network to training mode
    
        for i in range(self.iterations):
            game = ConnectFour()
            root = Node(game, self.policy_net)  # Initialize MCTS root
    
            # Collect data for training
            game_data = []
            while not game.is_terminal():
                move = monte_carlo_tree_search(root, self.policy_net, max_seconds=0, max_milliseconds=300)  # MCTS to choose action
                game.make_move(move)
                game.switch_player()
                # Store state, action, and provisional outcome as a list to modify later
                game_data.append([deepcopy(game.board), move, None])  # Use list instead of tuple
    
            # Assign outcomes to game data after game concludes
            outcome = 1 if game.is_winner(self.game.current_player) else 0  # Simplified outcome
            for data in game_data:
                data[2] = outcome  # Update the provisional None outcome with actual outcome
    
            # Update policy network based on collected game data
            self.update_policy_network(game_data, optimizer)
    
    def update_policy_network(self, game_data, optimizer):
        for state, action, outcome in game_data:
            state_tensor = self.board_to_tensor(state)  # Convert board to tensor
            print(f"Input Tensor Shape: {state_tensor.shape}")  # Debugging line to check shape
            action_probabilities = self.policy_net(state_tensor) 
            loss = compute_loss(action_probabilities, action, outcome)  # Define loss function based on your needs
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    def board_to_tensor(self, board):
        # Convert board state to a 2-channel tensor
        state_tensor = torch.zeros((1, 2, 6, 7), dtype=torch.float32)
        for r in range(6):
            for c in range(7):
                player = board[r][c]
                if player == 1:
                    state_tensor[0, 0, r, c] = 1  # Player 1's channel
                elif player == 2:
                    state_tensor[0, 1, r, c] = 1  # Player 2's channel
        return state_tensor



from pettingzoo.classic import connect_four_v3

class PettingZooAgent:
    def __init__(self, train_iterations=1):
        self.trainer = Trainer(train_iterations)
        self.trainer.train()

    def update_game_state(self, observation):
        """Update the internal game state based on the observation from PettingZoo."""
        self.trainer.game.board = [[0] * 7 for _ in range(6)]  # Reset the board state
        player1_count = 0
        player2_count = 0
        # Assuming the observation includes a 'board' key with a 6x7x2 tensor
        board_state = observation['observation']
        
        for row in range(6):
            for col in range(7):
                if board_state[row, col, 0] == 1:
                    self.trainer.game.board[row][col] = 1
                    player1_count += 1
                elif board_state[row, col, 1] == 1:
                    self.trainer.game.board[row][col] = 2
                    player2_count += 1

        # Set the current player based on the count of the tokens
        if player1_count <= player2_count:
            self.trainer.game.current_player = 1
        else:
            self.trainer.game.current_player = 2

    def choose_action(self, observation, action_mask):
        self.update_game_state(observation)
        if action_mask is None:
            available_moves = [True for i in range(action_mask)]
        else:
            available_moves = [i is True for i in range(action_mask)]
        root_node = Node(self.trainer.game, self.trainer.policy_net)
        best_move = monte_carlo_tree_search(root_node, self.trainer.policy_net, max_seconds=1, max_milliseconds=100)
        return best_move

pz_agent = PettingZooAgent()


No trained model found. Starting training from scratch.
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])
Input Tensor Shape: torch.Size([1, 2, 6, 7])

In [12]:
import random
from pettingzoo.classic import connect_four_v3


rewards = []
# Run multiple games to evaluate performance
for nb_run in range(10):
    env = connect_four_v3.env()
    env.reset()
    observation, reward, termination, truncation, info = env.last()
    
    # Randomly assign the trained agent to be the first or second player
    if random.choice([True, False]):
        trained_agent = env.agents[0]  # Trained agent as the first player
        random_agent = env.agents[1]  # Random agent as the second player
    else:
        trained_agent = env.agents[1]  # Trained agent as the second player
        random_agent = env.agents[0]  # Random agent as the first player

    cum_rewards = {trained_agent: 0, random_agent: 0}
    
    for agent_name in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        action_mask = observation['action_mask']
        if termination or truncation:
            action = None
            cum_rewards[agent_name] += reward  # Update cumulative reward at game end
        elif agent_name == trained_agent:
            action = pz_agent.choose_action(observation, action_mask)
        else:  # This is for the random agent
            action = random.choice([i for i, available in enumerate(action_mask) if available])
        
        env.step(action)
        if not (termination or truncation):  # Update cumulative rewards if the game continues
            cum_rewards[agent_name] += reward

    # Store the results from this game
    rewards.append(cum_rewards)
    env.close()

# Print the results
print("Rewards from each game:")
for i, result in enumerate(rewards):
    print(f"Game {i + 1}: {result}")


TypeError: 'list' object cannot be interpreted as an integer