In [2]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from copy import deepcopy as dc
from game import Game
from heuristics import *

In [3]:
class DeepQNetworkConnect4(nn.Module):
    """Small fully connected Q-network.

    Maps a flat vector of 42 features to 7 outputs through two hidden
    layers (42 and 20 units) with ReLU activations between the linear
    layers. The output layer has no activation.
    """

    def __init__(self):
        super().__init__()
        # Layer widths from input to output; consecutive pairs define one Linear each.
        widths = (42, 42, 20, 7)
        final_index = len(widths) - 2
        stack = []
        for index, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
            stack.append(nn.Linear(n_in, n_out))
            # ReLU follows every linear layer except the output layer.
            if index != final_index:
                stack.append(nn.ReLU())
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the 7 network outputs for input ``x`` (last dimension 42)."""
        q_values = self.network(x)
        return q_values

In [4]:
class ReplayBuffer:
    """Bounded first-in-first-out store of experience frames.

    Frames are kept in insertion order in ``self.buffer`` (a plain list);
    once more than ``max_frames`` are held, the oldest are discarded.
    """

    def __init__(self, max_frames):
        """Create an empty buffer that retains at most ``max_frames`` frames."""
        self.max_frames = max_frames
        self.buffer = []

    def add(self, frame):
        """Append ``frame`` and drop the oldest frames beyond the capacity."""
        self.buffer.append(frame)
        overflow = len(self.buffer) - self.max_frames
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, num_samples):
        """Return ``num_samples`` frames drawn uniformly without replacement.

        Args:
            num_samples: number of distinct buffer positions to draw.
                Values <= 0 yield an empty list.

        Returns:
            A list of frames, in random order.

        Raises:
            ValueError: if more frames are requested than are stored.
                (Previously this case never returned.)
        """
        if num_samples <= 0:
            return []
        available = len(self.buffer)
        if num_samples > available:
            raise ValueError(
                f"cannot sample {num_samples} frames from a buffer holding {available}"
            )
        # random.sample picks distinct positions, so no frame slot is chosen twice,
        # and it avoids the retry loop that was slow when most of the buffer is requested.
        return random.sample(self.buffer, num_samples)

In [5]:
def calculate_epsilon(step, epsilon_start, epsilon_finish, total_timesteps, exploration_fraction):
    """Linearly anneal epsilon from ``epsilon_start`` to ``epsilon_finish``.

    The decay runs over the first ``total_timesteps * exploration_fraction``
    steps; after that point ``epsilon_finish`` is returned unchanged.

    Args:
        step: current step index.
        epsilon_start: epsilon at step 0.
        epsilon_finish: epsilon once the decay window has elapsed.
        total_timesteps: total number of steps in the run.
        exploration_fraction: fraction of ``total_timesteps`` spent decaying.

    Returns:
        The epsilon value for ``step``.
    """
    finish_step = total_timesteps * exploration_fraction
    # An empty decay window (zero steps or zero fraction) would divide by zero
    # below, so exploration is treated as already finished.
    if finish_step <= 0 or step > finish_step:
        return epsilon_finish
    remaining_fraction = (finish_step - step) / finish_step
    return epsilon_finish + (remaining_fraction * (epsilon_start - epsilon_finish))

In [6]:
# HYPERPARAMETERS
# These module-level names are read as globals by train() and QLoss().

# Reproducibility: train() seeds random, numpy and torch with this value.
seed = 0

# Replay memory and optimisation
buffer_size = 10_000       # capacity passed to ReplayBuffer
ideal_batch_size = 1_000   # upper bound on frames sampled per gradient step
learning_rate = 0.002      # should be lower
gamma = 0.99               # discount applied to the bootstrapped target in QLoss()

# Training schedule
total_timesteps = 200      # outer iterations per train() call (one gradient step each)
train_games = 20           # games played per outer iteration

# Epsilon-greedy exploration schedule (see calculate_epsilon)
epsilon_start = 0.9        # train() replaces this with its epsilon_initial_value argument
epsilon_finish = 0
exploration_fraction = 0.8 # fraction of total_timesteps over which epsilon decays

In [7]:
# Shared replay memory, capped at buffer_size frames; train() uses this global directly.
buffer = ReplayBuffer(max_frames = buffer_size)

# Network that train() optimises.
q_network = DeepQNetworkConnect4()
# Second, independently initialised network; the main training loop rebinds this
# name to a copy of q_network before using it as an opponent.
old_q_network = DeepQNetworkConnect4()

In [8]:
def QLoss(experiences, q_network, target_network, discount = None):
    """Mean squared temporal-difference error over a batch of experiences.

    For each experience the prediction is ``q_network(state)[action]``.
    The regression target depends on whether the experience carries a reward:

    * ``reward is None``: target is ``discount * max(target_network(next_state))``
    * otherwise (``next_state`` is None): target is ``reward``

    The target is computed without gradient tracking, so backpropagating the
    loss only produces gradients for ``q_network``.

    Args:
        experiences: sized collection of objects exposing ``state``, ``action``,
            ``reward`` and ``next_state``. States are numpy arrays whose first
            two dimensions are flattened before being fed to the networks
            (presumably the 6x7 board, giving 42 inputs - confirm against Game).
        q_network: callable producing per-action values for a flattened state.
        target_network: callable used only to evaluate the bootstrapped target.
        discount: discount factor; defaults to the module-level ``gamma``.

    Returns:
        Scalar tensor holding the mean squared error over ``experiences``.

    Raises:
        ValueError: if ``experiences`` is empty.
    """
    if len(experiences) == 0:
        raise ValueError("QLoss needs at least one experience")
    if discount is None:
        discount = gamma

    def _flatten(board):
        # numpy array -> float tensor with its first two dimensions merged
        tensor = torch.from_numpy(board).float()
        return torch.flatten(tensor, start_dim = 0, end_dim = 1)

    loss = 0
    for exp in experiences:
        predicted = q_network(_flatten(exp.state))[exp.action]

        if exp.reward is None:
            # The target is treated as a constant: no graph is built through target_network.
            with torch.no_grad():
                target = discount * torch.max(target_network(_flatten(exp.next_state)))
        else: # next_state is None
            target = exp.reward

        loss += (target - predicted) ** 2

    return loss / len(experiences)

def train(q_network, opponent, epsilon_initial_value = 0, target_update_interval = None):
    """Train ``q_network`` in place by playing games against ``opponent``.

    Each outer iteration plays ``train_games`` games with an epsilon-greedy
    policy, stores the resulting experiences in the module-level ``buffer``,
    samples up to ``ideal_batch_size`` of them and takes one Adam step on the
    loss returned by ``QLoss``. The loss history is plotted at the end.

    Reads the module-level hyperparameters ``seed``, ``learning_rate``,
    ``total_timesteps``, ``train_games``, ``epsilon_finish``,
    ``exploration_fraction`` and ``ideal_batch_size``. The Python, numpy and
    torch RNGs are re-seeded with ``seed`` on every call.

    Args:
        q_network: network to optimise; also passed to ``Game.playGame`` as ``agent1``.
        opponent: passed to ``Game.playGame`` as ``agent2``.
        epsilon_initial_value: epsilon at iteration 0 of the decay schedule.
        target_update_interval: if a positive integer, the target network is
            refreshed from ``q_network`` after every that many iterations.
            ``None`` (default) or 0 keeps the target fixed for the whole call.

    Returns:
        The same ``q_network`` object, after training.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = seed > 0

    # Initialize action-value function Q and target network
    # TODO: Add extra agents to play against
    target_network = dc(q_network)
    # The target network is never optimised, so stop autograd from tracking it.
    for parameter in target_network.parameters():
        parameter.requires_grad_(False)
    optimiser = torch.optim.Adam(q_network.parameters(), learning_rate)

    rewards = []
    losses = []
    for iteration in range(int(total_timesteps)):
        # Epsilon depends only on the outer iteration, so compute it once per iteration.
        epsilon = calculate_epsilon(iteration, epsilon_initial_value, epsilon_finish, total_timesteps, exploration_fraction)
        for _ in range(int(train_games)):
            # Generate games and add experiences
            g = Game()
            # TODO: Why is the first player dominating? first_player is random, should be 50/50
            g.playGame(agent1 = q_network, agent2 = opponent, epsilon = epsilon, first_player = random.randint(1, 2))
            # g.state after the game is used as that game's reward (presumably the outcome - confirm in Game).
            rewards.append(g.state)
            for experience in g.experiences:
                buffer.add(experience)

        batch = buffer.sample(min(len(buffer.buffer), ideal_batch_size))

        # get loss
        loss = QLoss(batch, q_network, target_network)
        losses.append(loss.item())
        if iteration % 20 == 0:
            print(iteration, loss, epsilon, sum(rewards)/len(rewards))

        # backprop (gradient tracking must be enabled here; no torch.no_grad() wrapper)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        if target_update_interval and (iteration + 1) % target_update_interval == 0:
            target_network.load_state_dict(q_network.state_dict())

    # graph losses
    plt.plot(losses)
    plt.show()
    return q_network

In [9]:
# Initial training run: opponent 1 (presumably a heuristic agent id understood by
# Game.playGame - confirm), starting epsilon 0.9.
q_networks = []  # history of trained networks, one entry per train() call
q_network = train(q_network, opponent = 1, epsilon_initial_value = 0.9)
q_networks.append(q_network)
# Checkpoint the whole module so the main training loop can reload it.
torch.save(q_network, 'q_network.pth')

0 tensor(0.1446, grad_fn=<DivBackward0>) 0.9 0.2


KeyboardInterrupt: 

In [None]:
# Main training loop
# Each round reloads the latest checkpoint, trains against a frozen copy of the
# current network, then trains against opponent 3 (presumably a heuristic agent id
# understood by Game.playGame - confirm). Reloading from disk lets the cell be
# re-run after a crash and continue from the last saved network.
#
# NOTE: the checkpoint is a fully pickled nn.Module, so it must be loaded with
# weights_only=False (the default became True in PyTorch 2.6 and rejects pickled
# modules). Unpickling can execute arbitrary code - only load checkpoints written
# by this notebook.
for i in range(7): # Fix if crash

    # load q_network
    q_network = torch.load('q_network.pth', weights_only=False)

    total_timesteps = 200  # read by train() as a module-level global
    old_q_network = dc(q_network)
    q_network = train(q_network, old_q_network, 0.9)
    q_networks.append(q_network)
    torch.save(q_network, 'q_network.pth')

    # load q_network
    q_network = torch.load('q_network.pth', weights_only=False)

    total_timesteps = 400  # read by train() as a module-level global
    q_network = train(q_network, 3, 0.9)
    q_networks.append(q_network)
    torch.save(q_network, 'q_network.pth')

In [None]:
# Play a single game with the trained network as agent1 against opponent 3,
# moving first, with pick_display enabled.
g = Game()
g.playGame(
    agent1 = q_network,
    agent2 = 3,
    epsilon = 0.9,
    first_player = 1,
    pick_display = 1,
)