In [13]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from game import Game
from heuristics import *

In [14]:
class DeepQNetworkConnect4(nn.Module):
    """Two-layer fully connected network mapping a flattened input to 7 values.

    Every sample is flattened to 42 features, passed through a 20-unit hidden
    layer with ReLU, and projected to 7 outputs (presumably one Q-value per
    Connect 4 column -- confirm against the Game action encoding).

    An earlier, disabled variant used a 126-feature input with a 40-unit
    layer; it is not part of the current architecture.
    """

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_outputs = 42, 20, 7
        # Layer order fixes the state_dict keys (network.1.*, network.3.*),
        # so it must not change if checkpoints are to stay loadable.
        layers = [
            nn.Flatten(start_dim=1),
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_outputs),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x`` (flattened from dim 1)."""
        outputs = self.network(x)
        return outputs

In [15]:
class ReplayBuffer:
    """Bounded first-in-first-out store of experience frames.

    Frames are kept in insertion order in ``self.buffer`` (a plain list).
    Once more than ``max_frames`` frames are held, the oldest are discarded.
    """

    def __init__(self, max_frames):
        """Create an empty buffer that retains at most ``max_frames`` frames."""
        self.max_frames = max_frames
        self.buffer = []

    def add(self, frame):
        """Append ``frame`` and evict the oldest frames beyond capacity."""
        self.buffer.append(frame)
        overflow = len(self.buffer) - self.max_frames
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, num_samples):
        """Return ``num_samples`` distinct frames chosen uniformly at random.

        Args:
            num_samples: number of frames to draw without replacement.

        Returns:
            A new list of frames in random order; empty when
            ``num_samples`` is zero or negative.

        Raises:
            ValueError: if more frames are requested than are stored. The
                previous rejection-sampling loop never terminated in that
                case because the index set could not reach the target size.
        """
        if num_samples <= 0:
            return []
        available = len(self.buffer)
        if num_samples > available:
            raise ValueError(
                f"cannot sample {num_samples} frames from a buffer holding {available}"
            )
        # random.sample draws without replacement and does not require the
        # frames to be hashable.
        return random.sample(self.buffer, num_samples)

In [16]:
def calculate_epsilon(step, epsilon_start, epsilon_finish, total_timesteps, exploration_fraction):
    """Linearly anneal epsilon from ``epsilon_start`` to ``epsilon_finish``.

    The anneal runs over the first ``total_timesteps * exploration_fraction``
    steps; from that step onwards ``epsilon_finish`` is returned unchanged.

    Args:
        step: current step index.
        epsilon_start: value returned at step 0.
        epsilon_finish: value returned once the anneal is complete.
        total_timesteps: total number of steps in the run.
        exploration_fraction: fraction of ``total_timesteps`` spent annealing.

    Returns:
        The epsilon value for ``step``.
    """
    finish_step = total_timesteps * exploration_fraction
    # A zero-length schedule has nothing to interpolate over; returning early
    # also avoids dividing by zero at step 0. Using >= (rather than >) gives
    # the same value at step == finish_step, where the interpolation is 0.
    if finish_step <= 0 or step >= finish_step:
        return epsilon_finish
    remaining_fraction = (finish_step - step) / finish_step
    return epsilon_finish + (remaining_fraction * (epsilon_start - epsilon_finish))

def QLoss(state_evals, next_state_evals):
    """Return 1000 times the mean squared difference of the two tensors.

    Args:
        state_evals: tensor of network outputs for the current states.
        next_state_evals: tensor of network outputs for the next states;
            must be broadcastable against ``state_evals``.

    Returns:
        A scalar tensor holding the scaled mean squared error.
    """
    residual = state_evals - next_state_evals
    squared_error = residual ** 2
    return 1000 * torch.mean(squared_error)

In [17]:
# HYPERPARAMETERS
# Module-level settings read as globals by train().
seed = 0  # seeds random, numpy and torch in train(); cudnn determinism is only enabled when seed > 0
buffer_size = 100000  # capacity passed to ReplayBuffer (maximum frames retained)
learning_rate = 2.5e-4  # learning rate passed to the Adam optimiser
pretrain_games = 1  # number of games played to fill the replay buffer before the update
ideal_batch_size = 10  # upper bound on frames sampled per update; fewer are used if the buffer is smaller
# NOTE(review): the four values below are not read by any code visible in this
# notebook. Their names match the parameters of calculate_epsilon, so they are
# presumably meant for an epsilon-greedy schedule -- confirm before relying on them.
total_timesteps = 10
epsilon_start = 0.9
epsilon_finish = 0
exploration_fraction = 0.8

In [18]:
def train():
    """Play the pre-training games and apply one gradient step to the Q-network.

    Reads the module-level hyperparameters ``seed``, ``buffer_size``,
    ``learning_rate``, ``pretrain_games`` and ``ideal_batch_size``.

    Steps:
        1. Seed ``random``, ``numpy`` and ``torch``.
        2. Build the online Q-network and a target network with copied weights.
        3. Play ``pretrain_games`` games and store their experiences in a
           replay buffer.
        4. Sample one batch (at most ``ideal_batch_size`` frames) and take a
           single Adam step on ``QLoss`` between the online network's outputs
           for the states and the target network's outputs for the next states.

    Raises:
        RuntimeError: if no experiences are available to form a batch.

    NOTE(review): the loss compares whole output vectors and uses no reward,
    action or discount, and the updated network is not returned or saved.
    Both look like work in progress -- confirm the intended training target.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # NOTE(review): with seed == 0 cudnn determinism stays off even though the
    # RNGs are seeded; kept as in the original -- confirm this is intended.
    torch.backends.cudnn.deterministic = seed > 0

    # Initialise replay memory D to capacity N
    buffer = ReplayBuffer(buffer_size)

    # Initialise action-value function Q and the target network
    q_network = DeepQNetworkConnect4().to(device)
    target_network = DeepQNetworkConnect4().to(device)
    target_network.load_state_dict(q_network.state_dict())
    optimiser = torch.optim.Adam(q_network.parameters(), lr=learning_rate)

    for _ in range(pretrain_games):
        # Generate a game and store its experiences. Only ``Game`` is imported
        # (``from game import Game``), so it is called directly rather than
        # through a ``game`` module name.
        g = Game()
        # TODO: Add options for more agent pairs
        g.playGame(agent1=1, agent2=3)
        for experience in g.experiences:
            buffer.add(experience)

    batch = buffer.sample(min(len(buffer.buffer), ideal_batch_size))
    if not batch:
        # torch.stack fails with an opaque message on an empty list.
        raise RuntimeError("no experiences available to train on")

    # States and next states must be floats (same dtype as the network output)
    # and must live on the same device as the networks.
    states = torch.stack(
        [torch.from_numpy(exp.state) for exp in batch]
    ).float().to(device)
    next_states = torch.stack(
        [torch.from_numpy(exp.next_state) for exp in batch]
    ).float().to(device)

    # Online network outputs for the current states (gradients tracked).
    state_q_values = q_network(states)
    # Bootstrap values come from the frozen target network and are not
    # differentiated through.
    with torch.no_grad():
        next_state_q_values = target_network(next_states)

    loss = QLoss(state_q_values, next_state_q_values)

    # Backpropagate and update the online network.
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

# Run the training routine when this cell executes.
train()