In [1]:
# imports
import math
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn 
import numpy as np


# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


**Gomoku** Class

In [2]:
class Gomoku:
  """State of a k-in-a-row game played on a square board.

  The board is a ``dim x dim`` NumPy array whose cells are 0 when empty and
  otherwise hold the value of the player who moved there. Win detection sums
  board windows, so it assumes the two players are encoded as +1 and -1.

  Attributes:
    dim: side length of the board.
    win: number of aligned stones needed to win.
    total_moves: count of legal moves played so far.
    winner: 0 while nobody has won, otherwise the winning player's value.
    board: the ``dim x dim`` array of cell values.
    log: every attempted move as ``(player, position)``, legal or not.
  """

  def __init__(self, dim, win):
    self.dim = dim
    # Bug fix: the required run length is the `win` argument. It used to be
    # overwritten with `dim`, so e.g. a 5x5 board always required 5 in a row.
    self.win = win
    self.total_moves = 0
    self.winner = 0
    self.board = np.zeros((self.dim, self.dim))
    self.log = []

  def get_available(self):
    """Return the empty cells as a list of ``(row, col)`` integer tuples."""
    rows, cols = np.where(self.board == 0)
    return list(zip(rows.tolist(), cols.tolist()))

  def make_move(self, position, player):
    """Try to place `player` at `position` and return the resulting reward.

    Args:
      position: ``(row, col)`` pair; each entry may be anything `int()`
        accepts (Python int, NumPy integer, one-element tensor).
      player: value written into the cell (expected to be +1 or -1).

    Returns:
      -100 if the move is illegal (malformed position, out of bounds or an
      occupied cell; the board is left untouched), 100 if the move completes
      a winning line (``self.winner`` is set), and -1 for any other legal move.
    """
    # Every attempt is logged, including illegal ones.
    self.log.append((player, position))

    try:
      row, col = (int(v) for v in position)
    except (TypeError, ValueError):
      return -100

    inside = 0 <= row < self.dim and 0 <= col < self.dim
    if not inside or self.board[row, col] != 0:
      return -100

    self.board[row, col] = player
    self.total_moves += 1

    if self.won() == self.win:
      self.winner = player
      return 100

    return -1

  def won(self):
    """Return the largest absolute sum over all length-`win` line windows.

    Horizontal, vertical, diagonal and anti-diagonal windows are examined.
    With players encoded as +1/-1 the result equals ``self.win`` exactly when
    some window is filled by a single player. Returns 0 when no window fits
    on the board (``win > dim``).
    """
    best = 0
    size, run = self.dim, self.win
    span = size - run + 1  # number of window start offsets along one axis
    if span <= 0:
      return best

    steps = np.arange(run)

    # Rows and columns: `line` is the fixed index, `start` slides the window.
    for line in range(size):
      for start in range(span):
        best = max(best, abs(self.board[line, start:start + run].sum()))
        best = max(best, abs(self.board[start:start + run, line].sum()))

    # Diagonals: (top, left) is the top-left corner of a run x run square.
    for top in range(span):
      for left in range(span):
        # Main diagonal goes down-right from the top-left corner.
        best = max(best, abs(self.board[top + steps, left + steps].sum()))
        # Anti-diagonal goes up-right from the bottom-left corner.
        best = max(best, abs(self.board[top + run - 1 - steps, left + steps].sum()))

    return best

 

In [3]:
# Smoke test: play a single move on a 3x3 board, then show the move log.
temp = Gomoku(dim=3, win=3)
temp.board

temp.make_move(position=(1, 2), player=1)

temp.log

[(1, (1, 2))]

In [4]:
#Network for Q learning
class DQN(nn.Module):
  """Fully connected Q-network mapping a board to one value per cell.

  Args:
    dim: board side length; the network has ``dim * dim`` inputs and outputs.
    num_win: accepted for signature compatibility; not used by the network.
  """

  def __init__(self, dim, num_win):
    super().__init__()
    self.fc1 = nn.Linear(dim*dim, 512)
    self.fc2 = nn.Linear(512, 512)
    self.fc3 = nn.Linear(512, 512)
    self.fc4 = nn.Linear(512, 512)
    self.fc5 = nn.Linear(512, dim*dim)

  def forward(self, x):
    """Return a 1-D tensor of ``dim * dim`` Q-values for a single board.

    The input is flattened completely, so it must contain exactly
    ``dim * dim`` elements (one board, no batch of several boards).
    """
    # Move the input to wherever the weights live rather than relying on a
    # notebook-global `device`; this keeps the module usable on its own and
    # avoids a device mismatch if the model was moved elsewhere.
    x = torch.flatten(x).to(self.fc1.weight.device)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = F.relu(self.fc4(x))
    x = self.fc5(x)
    return x

In [36]:
#Agent class as in week 11 tutorial.
class Agent:
  """Bundles a Q-network with the policy that acts on it and its optimizer.

  Args:
    policy: callable ``policy(qnet, state)`` returning the chosen action.
    qnet: callable mapping a state to a tensor of per-action values.
    optimizer: optimizer whose ``zero_grad``/``step`` update `qnet`.
  """

  def __init__(self, policy, qnet, optimizer):
    self.policy = policy
    self.qnet = qnet
    self.optimizer = optimizer

  def act(self, state):
    """Pick an action for `state` through the policy, without tracking gradients."""
    with torch.no_grad():
      chosen = self.policy(self.qnet, state)
    return chosen

  def train(self, state, action, reward, discount, next_state):
    """Run one gradient step on a single transition.

    The prediction is the network's value for `action` in `state`; the target
    is ``reward + discount * max(qnet(next_state))``, evaluated with the same
    network and excluded from the gradient. The loss is their MSE.
    """
    # Reshape to (1, n) / (1, k) so `gather` can pick the taken action's value.
    taken = action.view(1, -1)
    q_pred = self.qnet(state).view(1, -1).gather(1, taken)

    with torch.no_grad():
      next_values = self.qnet(next_state).view(1, -1)
      best_next = torch.max(next_values).view(-1, 1)
      q_target = reward + discount * best_next

    loss = F.mse_loss(q_pred, q_target)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()


In [44]:
def train_agents(p1, p2, gamma, num_games, dim, num_win):
  """Let two agents play `num_games` games against each other, learning online.

  Player 1 (`p1`) always moves first and player -1 (`p2`) second. After every
  attempted move the acting agent is trained on that single transition. An
  illegal move is penalised by the game and play passes to the other agent.

  Args:
    p1, p2: agents exposing ``act(state)`` and
      ``train(state, action, reward, discount, next_state)``.
    gamma: discount factor used for non-terminal transitions.
    num_games: number of games to play.
    dim: board side length; actions are flat indices in ``[0, dim * dim)``.
    num_win: run length needed to win, forwarded to `Gomoku`.
  """
  agent = {1: p1, -1: p2}
  max_moves = dim ** 2

  for i in range(num_games):
    game = Gomoku(dim, num_win)
    curr_player = 1

    while game.winner == 0 and game.total_moves < max_moves:
      state = torch.from_numpy(game.board).type(torch.float32).to(device)
      action = agent[curr_player].act(state)

      # Flat action index -> (row, col) as plain ints, so the game does not
      # have to index its NumPy board with tensors.
      position = divmod(int(action), dim)
      reward = game.make_move(position, curr_player)
      next_state = torch.from_numpy(game.board).type(torch.float32).to(device)

      # A win AND a full board (draw) are both terminal: the target must not
      # bootstrap from a state in which no further move exists.
      done = game.winner != 0 or game.total_moves >= max_moves
      discount = 0.0 if done else gamma

      agent[curr_player].train(state, action, reward, discount, next_state)
      curr_player = -curr_player

    if i % 100 == 0:
      print(f"game {i} completed")
      print(game.board, game.winner, game.total_moves)
      


      

In [45]:
def epsilon_greedy(n_actions, epsilon):
  """Build an epsilon-greedy policy over `n_actions` discrete actions.

  The returned ``policy_fn(q_net, state)`` draws one uniform random number;
  if it is below `epsilon` a uniformly random action index is returned,
  otherwise the index of the largest value in ``q_net(state)``. Either way
  the result is a one-element tensor.
  """
  def policy_fn(q_net, state):
    explore = bool(torch.rand(1) < epsilon)
    if not explore:
      with torch.no_grad():
        values = q_net(state)
        return torch.argmax(values).view(1,)
    return torch.randint(n_actions, size=(1,), device=device)
  return policy_fn

In [None]:
# Training hyperparameters.
num_games = 1000
gamma = 0.99    # discount factor for non-terminal transitions
epsilon = 0.1   # probability of taking a random action

# Board configuration: 3x3 board, 3 in a row wins, one action per cell.
dim = 3
num_win = 3
num_actions = dim**2

# Seed torch's RNG before the networks are initialised and before any
# exploration draws, so a Restart & Run All starts from the same weights and
# random choices. (GPU kernels may still introduce small nondeterminism.)
SEED = 0
torch.manual_seed(SEED)

# One independent Q-network per player.
p1 = DQN(dim, num_win).to(device)
p2 = DQN(dim, num_win).to(device)


policy1 = epsilon_greedy(num_actions, epsilon)
policy2 = epsilon_greedy(num_actions, epsilon)

optimizer1 = torch.optim.Adam(p1.parameters(), lr=1e-3)
optimizer2 = torch.optim.Adam(p2.parameters(), lr=1e-3)


agent1 = Agent(policy1, p1, optimizer1)
agent2 = Agent(policy2, p2, optimizer2)

# agent1 plays as player 1 (moves first), agent2 as player -1.
train_agents(agent1, agent2, gamma, num_games, dim, num_win)


game 0 completed
[[ 1. -1. -1.]
 [-1.  1.  1.]
 [ 1. -1.  1.]] 1 9
game 100 completed
[[ 0.  1.  0.]
 [-1. -1. -1.]
 [ 0.  0.  1.]] -1 5
game 200 completed
[[-1. -1.  1.]
 [-1.  1. -1.]
 [ 1.  0.  0.]] 1 7


In [40]:
# Sanity check: flat index 5 on a 3-wide board is (row, col) = (5 // 3, 5 % 3).
divmod(5, 3)

(1, 2)

In [None]:
# Flat action index -> board cell for dim = 3 (row = index // 3, col = index % 3):
# 0 1 2
# 3 4 5
# 6 7 8

In [101]:
# Nine sample values standing in for a Q-network output, plus a one-element
# index tensor, used below to try out `gather`.
hi = torch.tensor([
    0.0212, -0.0384, 0.0191,
    0.0014, 0.0157, -0.0082,
    -0.0513, -0.0141, 0.0241,
])
ii = torch.tensor([8])

In [103]:
# Display the current contents of `hi`.
hi

tensor([ 0.0212, -0.0384,  0.0191,  0.0014,  0.0157, -0.0082, -0.0513, -0.0141,
         0.0241])

In [114]:
# Give both tensors a leading axis of size 1 so `gather` can index along dim 1.
hi, ii = hi.view(1, -1), ii.view(1, -1)

In [116]:
# Pick, along dim 1, the entry of `hi` at the index stored in `ii`.
torch.gather(hi, 1, ii)

tensor([[0.0241]])

In [115]:
# Display both tensors side by side.
hi, ii

(tensor([[ 0.0212, -0.0384,  0.0191,  0.0014,  0.0157, -0.0082, -0.0513, -0.0141,
           0.0241]]), tensor([[8]]))