## RL

In [1]:
from hexagon import HexagonalGrid

pygame-ce 2.5.2 (SDL 2.30.8, Python 3.13.0)


In [6]:
# Build a HexagonalGrid with size argument 6 and fill it via randomize_grid().
# NOTE(review): 6 is presumably the side length in cells -- the 91 hexagons
# reported by len(grid.hexagons) further down match 3*6*5 + 1; confirm
# against the hexagon module.
grid = HexagonalGrid(6)
grid.randomize_grid()

In [7]:
# Display the grid as a NumPy array; the output below has 3 rows of 91 int8
# values. NOTE(review): every column in the printed output appears to contain
# exactly one 1, so this looks like a one-hot encoding over three cell
# states -- confirm against HexagonalGrid.to_numpy.
grid.to_numpy()

array([[0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
        1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int8)

In [8]:
# Number of hexagons held by the grid (91 in the output below), which is the
# value FLATTENED_BOARD_SIZE is set to in the model cell.
len(grid.hexagons)

91

In [None]:
import torch
import torch.nn as nn

CONV_HIDDEN_CHANNELS = 64
LINEAR_HIDDEN_SIZE = 128
FLATTENED_BOARD_SIZE = 91


class GameModel(nn.Module):
    """Policy network mapping a flattened board to one logit per cell.

    Two length-preserving 1-D convolutions (kernel 3, padding 1) are followed
    by two fully connected layers; the final layer emits ``board_size`` raw
    logits (no softmax is applied here).

    NOTE(review): ``Conv1d`` slides over the flattened cell index, so the
    kernel sees cells with adjacent indices, which are not necessarily
    adjacent hexagons -- confirm this is intended.

    Args:
        in_channels: Number of input planes per board (default 3).
        board_size: Number of cells in the flattened board.
        conv_channels: Channel count of both convolution layers.
        hidden_size: Width of the hidden fully connected layer.
    """

    def __init__(
        self,
        in_channels: int = 3,
        board_size: int = FLATTENED_BOARD_SIZE,
        conv_channels: int = CONV_HIDDEN_CHANNELS,
        hidden_size: int = LINEAR_HIDDEN_SIZE,
    ) -> None:
        super().__init__()
        # Layers are created in the same order and under the same attribute
        # names as before so existing state dicts and seeded inits still match.
        self.conv1 = nn.Conv1d(in_channels, conv_channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(conv_channels, conv_channels, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(conv_channels * board_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, board_size)  # Output logits for each cell

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-cell logits for ``x``.

        Args:
            x: Boards shaped ``(batch, in_channels, board_size)``, or a single
                board shaped ``(in_channels, board_size)``. Integer or boolean
                tensors (e.g. an int8 board array converted with
                ``torch.from_numpy``) are cast to the layer dtype.

        Returns:
            Logits shaped ``(batch, board_size)``, or ``(board_size,)`` when a
            single unbatched board was passed.
        """
        # A lone board would otherwise have its channel axis mistaken for the
        # batch axis when flattening, so give it a batch axis of size 1.
        unbatched = x.dim() == 2
        if unbatched:
            x = x.unsqueeze(0)
        # Convolution weights are floating point; integer boards must be cast.
        if not torch.is_floating_point(x):
            x = x.to(self.conv1.weight.dtype)

        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        logits = self.fc2(x)  # Logits for each position on the board
        return logits.squeeze(0) if unbatched else logits


In [None]:
# Imports this cell needs: `optim`, `deque` and `random` are used below but
# are not imported in any of the visible cells.
import random
from collections import deque

import torch.optim as optim

# NOTE(review): total_episodes, batch_size, initialize_game, choose_action,
# step and train_on_batch are not defined in the visible cells -- presumably
# they are provided elsewhere; confirm before running this cell.

# Instantiate player-specific models and optimizers
player1_model = GameModel()
player2_model = GameModel()
optimizer1 = optim.Adam(player1_model.parameters(), lr=0.001)
optimizer2 = optim.Adam(player2_model.parameters(), lr=0.001)

# Define memory buffers for each player (oldest transitions are dropped
# once 10000 entries are stored)
memory1 = deque(maxlen=10000)
memory2 = deque(maxlen=10000)

# Per-player (model, optimizer, replay memory), keyed by player number, so
# both players share a single code path below.
players = {
    1: (player1_model, optimizer1, memory1),
    2: (player2_model, optimizer2, memory2),
}

# Self-play training loop
for episode in range(total_episodes):
    game_state = initialize_game()
    done = False
    current_player = 1

    while not done:
        model, _, memory = players[current_player]
        action = choose_action(model, game_state)
        new_state, reward, done = step(game_state, action)
        # Player 1 stores the reward as returned by step(); player 2 stores
        # its negation. NOTE(review): this presumably means step() reports
        # the reward from player 1's point of view -- confirm.
        stored_reward = reward if current_player == 1 else -reward
        # NOTE(review): only the player who moved records the transition, so
        # the other player never stores the game-ending step -- confirm this
        # is intended.
        memory.append((game_state, action, stored_reward, new_state, done))
        game_state = new_state
        current_player = 2 if current_player == 1 else 1

        # One training step after every move, for the player whose turn is
        # next, once that player's buffer holds more than batch_size samples.
        model, optimizer, memory = players[current_player]
        if len(memory) > batch_size:
            batch = random.sample(memory, batch_size)
            train_on_batch(model, batch, optimizer)