In [7]:
# imports
import math
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn 
import numpy as np



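## `Gomoku` class

Board state and rules for a k-in-a-row game on a `dim × dim` grid. Empty cells are stored as `0`; each player's stones are stored as that player's id (`1` or `-1`).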

In [8]:
class Gomoku:
  """k-in-a-row game on a square board.

  The board is a ``dim x dim`` float array where 0 marks an empty cell and
  every stone is stored as the id of the player who placed it. Run detection
  in ``won`` relies on player ids being +1 and -1.

  Args:
    dim: side length of the square board.
    win: number of aligned stones of one player needed to win.

  Attributes:
    total_moves: number of legal moves played so far.
    winner: 0 while nobody has won, otherwise the id of the winning player.
  """

  def __init__(self, dim, win):
    self.dim = dim
    # Fix: the win length used to be overwritten with `dim`, so a run was only
    # recognised when it spanned the whole board.
    self.win = win
    self.total_moves = 0
    self.winner = 0
    self.board = np.zeros((self.dim, self.dim))

  def get_available(self):
    """Return the empty cells as a list of ``(row, col)`` tuples."""
    rows, cols = np.where(self.board == 0)
    return list(zip(rows, cols))

  def make_move(self, position, player):
    """Place ``player``'s stone on ``position`` and score the move.

    Args:
      position: ``(row, col)`` pair of integer indices.
      player: id written into the board cell (+1 or -1).

    Returns:
      -1000 if the move is illegal (malformed position, out of bounds, or an
      occupied cell; the game state is left untouched), 1000 if the move
      completes a winning run, and -10 for any other legal move.
    """
    from operator import index  # local import: exact integer conversion only

    try:
      row, col = (index(v) for v in position)
    except (TypeError, ValueError):
      # Not a pair of integer indices.
      return -1000

    # Direct cell check instead of rebuilding the list of all empty cells.
    in_bounds = 0 <= row < self.dim and 0 <= col < self.dim
    if not in_bounds or self.board[row, col] != 0:
      return -1000

    self.board[row, col] = player
    self.total_moves += 1

    if self.win >= 1 and self.won() == self.win:
      self.winner = player
      return 1000

    return -10

  def won(self):
    """Return the largest ``|sum|`` over all straight segments of length ``win``.

    Segments are taken along rows, columns, diagonals and anti-diagonals.
    With stones stored as +1 / -1 the value equals ``win`` exactly when one
    player fills an entire segment. Returns 0 when no segment of length
    ``win`` fits on the board.
    """
    if self.win < 1 or self.win > self.dim:
      # No window fits; the original code would call max() on an empty list.
      return 0

    steps = np.arange(self.win)
    starts = range(self.dim - self.win + 1)
    best = 0

    # Rows and columns: `line` fixes the row (or column), `start` slides the
    # window along it.
    for line in range(self.dim):
      for start in starts:
        window = start + steps
        best = max(best,
                   abs(self.board[line, window].sum()),
                   abs(self.board[window, line].sum()))

    # Diagonals and anti-diagonals of every win x win sub-square.
    for top in starts:
      rows = top + steps
      for left in starts:
        cols = left + steps
        best = max(best,
                   abs(self.board[rows, cols].sum()),
                   abs(self.board[rows[::-1], cols].sum()))

    return best

 

In [9]:
#Network for Q learning
class DQN(nn.Module):
  """Convolutional Q-network: one conv layer followed by two linear layers.

  Args:
    n_channels: number of input planes of the board tensor.
    n_actions: number of Q-values produced per input sample.
    board_dim: side length of the square input board. The default of 12
      reproduces the previously hard-coded 1024 flattened features.
    kernel_size: side length of the square convolution kernel (stride 1,
      no padding).

  Raises:
    ValueError: if the kernel does not fit on the board.
  """

  def __init__(self, n_channels, n_actions, board_dim=12, kernel_size=5):
    super().__init__()
    if board_dim < kernel_size:
      raise ValueError(
          f"board_dim ({board_dim}) must be >= kernel_size ({kernel_size})")

    conv_channels = 16
    self.conv = nn.Conv2d(in_channels=n_channels, out_channels=conv_channels,
                          kernel_size=kernel_size, stride=1)
    # An unpadded stride-1 convolution shrinks each side by kernel_size - 1.
    conv_side = board_dim - kernel_size + 1
    self.fc1 = nn.Linear(in_features=conv_channels * conv_side * conv_side,
                         out_features=128)
    self.fc2 = nn.Linear(in_features=128, out_features=n_actions)

  def forward(self, x):
    """Map a ``(batch, n_channels, board_dim, board_dim)`` tensor to Q-values.

    Returns:
      Tensor of shape ``(batch, n_actions)``.
    """
    x = F.relu(self.conv(x))
    # Keep the batch axis, flatten channels and spatial axes for the MLP head.
    x = torch.flatten(x, start_dim=1)
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return x

In [10]:
#Agent class as in week 11 tutorial.
class Agents:
  """Pair of Q-learning agents, one per player id.

  Player ``1`` uses ``policy1`` / ``p1_net`` / ``optimizer1``; any other
  player id uses the second set.
  """

  def __init__(self, policy1, policy2, p1_net, p2_net, optimizer1, optimizer2):
    self.policy1, self.policy2 = policy1, policy2
    self.q1, self.q2 = p1_net, p2_net
    self.optimizer1, self.optimizer2 = optimizer1, optimizer2

  def act(self, state, player):
    """Return the action the given player's policy picks for ``state``.

    The policy is called as ``policy(q_net, state)`` with gradient tracking
    disabled.
    """
    if player == 1:
      chooser, network = self.policy1, self.q1
    else:
      chooser, network = self.policy2, self.q2
    with torch.no_grad():
      return chooser(network, state)

  def train(self, state, action, player, reward, discount, next_state):
    """Take one gradient step on the one-step Q-learning loss.

    The target is ``reward + discount * max_a Q(next_state, a)``, computed
    with the same network and without gradients; the loss is the MSE between
    that target and ``Q(state, action)``.
    """
    if player == 1:
      network, stepper = self.q1, self.optimizer1
    else:
      network, stepper = self.q2, self.optimizer2

    # Q-value of the action that was actually taken (action indexes dim 1).
    predicted = network(state).gather(1, action)

    with torch.no_grad():
      best_next, _ = network(next_state).max(dim=1)
      target = reward + discount * best_next.view(-1, 1)

    td_loss = F.mse_loss(predicted, target)

    stepper.zero_grad()
    td_loss.backward()
    stepper.step()


In [11]:
def train_agents(agents, gamma, num_games, dim, num_win, device=None):
  """Let the two agents play ``num_games`` games against each other and learn.

  After every move the acting agent is trained on the transition
  ``(state, action, reward, next_state)``.

  Args:
    agents: object exposing ``act(state, player)`` and
      ``train(state, action, player, reward, discount, next_state)``.
    gamma: discount factor applied to non-terminal transitions.
    num_games: number of games to play.
    dim: side length of the board; actions are flat cell indices in
      ``[0, dim * dim)`` decoded as ``(index // dim, index % dim)``.
    num_win: run length needed to win, forwarded to ``Gomoku``.
    device: optional torch device on which the state tensors are created.

  Returns:
    A list with one ``{1: total_reward, -1: total_reward}`` dict per game.
  """

  def as_network_input(board):
    # torch.tensor copies the array, so the snapshot is not affected by the
    # in-place board update in make_move (the board itself is reused).
    return torch.tensor(board, dtype=torch.float32,
                        device=device).view(1, 1, dim, dim)

  history = []
  for _ in range(num_games):
    game = Gomoku(dim, num_win)
    curr_player = 1
    rewards = {1: 0, -1: 0}

    while game.winner == 0 and game.total_moves < dim**2:
      state = as_network_input(game.board)

      # Shape (1, 1) long tensor: the form `gather(1, action)` needs.
      action = torch.as_tensor(agents.act(state, curr_player))
      action = action.long().view(1, 1).to(state.device)
      position = divmod(int(action.item()), dim)

      moves_before = game.total_moves
      reward = game.make_move(position, curr_player)
      next_state = as_network_input(game.board)

      # A win or a full board ends the game: no bootstrapping from next_state.
      end_game = int(game.winner != 0 or game.total_moves >= dim**2)
      discount = gamma * (1 - end_game)

      agents.train(state, action, curr_player, reward, discount, next_state)
      rewards[curr_player] += reward

      # Only a legal move hands the turn over; after an illegal move the board
      # is unchanged and the same player has to choose again.
      if game.total_moves > moves_before:
        curr_player = -curr_player

    history.append(rewards)

  return history
      


      

In [13]:
def epsilon_greedy(n_actions, epsilon):
  """Build an epsilon-greedy policy over ``n_actions`` discrete actions.

  Args:
    n_actions: number of actions; random actions are drawn from
      ``[0, n_actions)``.
    epsilon: probability of picking a uniformly random action.

  Returns:
    ``policy_fn(q_net, state)`` returning a shape ``(1,)`` integer tensor:
    a random action with probability ``epsilon``, otherwise the argmax of
    ``q_net(state)``.
  """
  def policy_fn(q_net, state):
    if torch.rand(1) < epsilon:
      # Explore. Take the device from the state instead of a notebook-level
      # `device` global, which no visible cell defines; non-tensor states fall
      # back to torch's default device.
      target_device = state.device if isinstance(state, torch.Tensor) else None
      return torch.randint(n_actions, size=(1,), device=target_device)

    # Exploit: index of the largest Q-value (taken over the flattened output).
    with torch.no_grad():
      q_pred = q_net(state)
      return torch.argmax(q_pred).view(1,)
  return policy_fn

In [None]:
# Hyperparameters for the self-play training run.
num_games = 100
gamma = 0.99
epsilon = 0.1

dim = 3
num_win = 3
# One discrete action per board cell (was left unassigned, a syntax error).
num_actions = dim * dim

# DQN takes (n_channels, n_actions); it was previously given the board size
# and win length instead. The board is a single 2-D array, hence one channel.
# NOTE(review): built with just these two arguments, DQN's first linear layer
# expects 1024 features, i.e. a 12x12 input after the 5x5 convolution. A 3x3
# board does not fit that architecture; confirm the intended board size.
p1 = DQN(1, num_actions)
p2 = DQN(1, num_actions)


policy1 = epsilon_greedy(num_actions, epsilon)
policy2 = epsilon_greedy(num_actions, epsilon)

optimizer1 = torch.optim.Adam(p1.parameters(), lr=1e-3)
optimizer2 = torch.optim.Adam(p2.parameters(), lr=1e-3)

agents = Agents(policy1, policy2, p1, p2, optimizer1, optimizer2)
train_agents(agents, gamma, num_games, dim, num_win)









# Plot a per-episode return curve.
# NOTE(review): neither `plt` nor `eps_b_qn` is defined in any visible cell
# (there is no matplotlib import and nothing assigns `eps_b_qn`), and the
# title mentions "breakout" rather than Gomoku. This looks carried over from
# another notebook; confirm the data source and import before running.
plt.figure(figsize=(8, 4))
plt.plot(eps_b_qn[0])
plt.title('breakout reward curve')
plt.xlabel('episode')
plt.ylabel('return')
plt.show()