In [7]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from game import Game
from heuristics import *

In [8]:
class DeepQNetworkConnect4(nn.Module):
    # def __init__(self, env):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(42, 42),
            nn.ReLU(),
            nn.Linear(42, 20),
            nn.ReLU(),
            nn.Linear(20, 7),
        )

    def forward(self, x):
        return self.network(x)

In [9]:
class ReplayBuffer:

    def __init__(self, max_frames):
        self.max_frames = max_frames
        self.buffer = []

    def add(self, frame):
        self.buffer.append(frame)
        if len(self.buffer) > self.max_frames:
            del self.buffer[0:len(self.buffer)-self.max_frames]

    def sample(self, num_samples):
        # Ensure we don't pick the same frame twice
        # Record the random indices picked from elements in the buffer
        sample_nums = set()
        while len(sample_nums) < num_samples:
            sample_nums.add(random.randrange(len(self.buffer)))
        experiences = [self.buffer[i] for i in sample_nums]
        return experiences

In [10]:
def calculate_epsilon(step, epsilon_start, epsilon_finish, total_timesteps, exploration_fraction):
    finish_step = total_timesteps * exploration_fraction
    if step > finish_step:
        return epsilon_finish
    epsilon_range = epsilon_start - epsilon_finish
    return epsilon_finish + (((finish_step - step) / finish_step) * epsilon_range)

In [11]:
# HYPERPARAMETERS
seed = 0
buffer_size = 10000
learning_rate = 2.5e-2 # should be lower
ideal_batch_size = 1000
total_timesteps = 100
train_games = 20
epsilon_start = 0.9
epsilon_finish = 0
exploration_fraction = 0.8
gamma = 0.99

In [13]:
def QLoss(experiences, q_network, target_network):
    loss = 0

    for exp in experiences:
        state, action, reward, next_state = exp.state, exp.action, exp.reward, exp.next_state
        state = torch.from_numpy(state).float()
        state = torch.flatten(state, start_dim = 0, end_dim = 1) # double check dimensions

        if reward == None:
            next_state = torch.from_numpy(next_state).float()
            next_state = torch.flatten(next_state, start_dim = 0, end_dim = 1) # double check dimensions
            loss += (gamma * torch.max(target_network(next_state)) - q_network(state)[action]) ** 2
        else: # next_state is None
            loss += (reward - q_network(state)[action]) ** 2

    loss /= len(experiences)
    return loss

def train():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True if seed > 0 else False

    # Initialise replay memory D to capacity N
    buffer = ReplayBuffer(buffer_size)

    # Initialize action-value function Q and target network
    # TODO: Add extra agents to play against
    q_network = DeepQNetworkConnect4().to(device)
    target_network = DeepQNetworkConnect4().to(device)
    target_network.load_state_dict(q_network.state_dict())
    optimiser = torch.optim.Adam(q_network.parameters(), learning_rate)

    rewards = []
    for iter in range(int(total_timesteps)):
        for step in range(int(train_games)):
            epsilon = calculate_epsilon(iter, epsilon_start, epsilon_finish, total_timesteps, exploration_fraction)
            # Generate games and add experiences
            g = Game()
            # TODO: Why is the first player dominating? first_player is random, should be 50/50
            g.playGame(agent1 = q_network, agent2 = q_network, epsilon = epsilon, first_player = random.randint(1, 2))
            rewards.append(g.state)
            for experience in g.experiences:
                buffer.add(experience)

        print(sum(rewards[-20:-1]))

        batch = buffer.sample(min(len(buffer.buffer), ideal_batch_size))
        # states and next states should be floats (same as the OUTPUT)
        # brackets are required to turn generator into a list

        # get loss
        loss = QLoss(batch, q_network, target_network)
        if len(rewards) > 500:
            print(iter, loss, epsilon, sum(rewards[-500:])/500)
        else:
            print(iter, loss, epsilon, sum(rewards)/len(rewards))
            
        # backprop
        with torch.no_grad():
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

train()

1
0 tensor(0.1140, grad_fn=<DivBackward0>) 0.9 0.1
9
1 tensor(0.0876, grad_fn=<DivBackward0>) 0.88875 0.25
-9
2 tensor(0.0923, grad_fn=<DivBackward0>) 0.8775 0.03333333333333333
-1
3 tensor(0.0919, grad_fn=<DivBackward0>) 0.8662500000000001 0.0
1
4 tensor(0.0903, grad_fn=<DivBackward0>) 0.855 0.0
-1
5 tensor(0.0893, grad_fn=<DivBackward0>) 0.84375 -0.016666666666666666
-3
6 tensor(0.0931, grad_fn=<DivBackward0>) 0.8325 -0.02857142857142857
7
7 tensor(0.0902, grad_fn=<DivBackward0>) 0.82125 0.0125
1
8 tensor(0.0960, grad_fn=<DivBackward0>) 0.81 0.022222222222222223
1
9 tensor(0.0907, grad_fn=<DivBackward0>) 0.79875 0.02
3
10 tensor(0.0863, grad_fn=<DivBackward0>) 0.7875 0.02727272727272727
-3
11 tensor(0.0842, grad_fn=<DivBackward0>) 0.7762500000000001 0.008333333333333333
9
12 tensor(0.0914, grad_fn=<DivBackward0>) 0.765 0.038461538461538464
-3
13 tensor(0.0918, grad_fn=<DivBackward0>) 0.75375 0.02857142857142857
1
14 tensor(0.0817, grad_fn=<DivBackward0>) 0.7424999999999999 0.03333333