In [1]:
import numpy as np
import Gobang as go
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim

In [22]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(128, 100)
        self.fc2 = nn.Linear(100, 64)
    
    def forward(self, x):
        out = torch.relu(self.fc1(x))
        out = self.fc2(out)
        return out


# 初始化模型
eval_model = Net()
eval_optimizer = optim.RMSprop(eval_model.parameters(), lr=1e-3)
target_model = Net()
target_optimizer = optim.RMSprop(target_model.parameters(), lr=1e-3)

In [48]:
class DQN:
    def __init__(self):
        self.buffer = torch.zeros(500, 8*8*2*2 + 2)
        self.buffer_counter = 0
        self.batch_size = 64
        self.gamma = 1e-2
        self.epsilon = 1
        self.epsilon_increment = 1e-4
        self.epsilon_max = .9
        self.replace_target_iter = 50   # 一定的時候替換神經網路參數
        self.learn_step_counter = 0
        if os.path.isfile('eval_record.pt'):
            checkpoint = torch.load('eval_record.pt')
            eval_model.state_dict(checkpoint['model_state_dict'])
            eval_model.train()
        if os.path.isfile('target_record.pt'):
            checkpoint = torch.load('target_record.pt')
            target_model.state_dict(checkpoint['model_state_dict'])
            target_model.train()
        
    # 將 8 * 8 棋盤轉換成 1 * 128 的 tensor 形式
    def board_transform(self, Board):
        black_board = np.zeros((1, 64))
        white_board = np.zeros((1, 64))
        flatten_board = np.reshape(Board, [1, 64])
        b = np.where(flatten_board==1)[1]
        w = np.where(flatten_board==-1)[1]
        black_board[0, b] = 1
        white_board[0, w] = 1
        s = np.hstack((black_board, white_board))
        s = torch.FloatTensor(s)
        return s

    def play(self, Board, chessman):
        step = 0
        for eposide in range(10):
            board = Board.copy()
            chess = chessman
            while go.GetValid(board) != []:
                action = self.choose_action(board)
                observation = self.board_transform(board)
                board[int(action/8), int(action%8)] = chessman
                next_observation = self.board_transform(board)
                reward = self.reward_rule(board, [int(action/8), int(action%8)])
                self.store_transition(observation, action, reward, next_observation)
                chess = -chess
                if step > 200 and step % 100 == 0:
                    self.training()
                step += 1
                if go.IsContinuous(board, [int(action/8), int(action%8)]):
                    break

    
    def store_transition(self, state, action, reward, new_state):
        action = np.reshape(action, [1, 1])
        reward = np.reshape(reward, [1, 1])
        state = np.reshape(state, [1, 8*8*2])
        new_state = np.reshape(new_state, [1, 8*8*2])
        transition = np.hstack((state, action, reward, new_state))
        transition = torch.FloatTensor(transition)

        index = self.buffer_counter % 500   # buffer_size = 500
        self.buffer[index, :] = transition
        self.buffer_counter += 1
    
    def choose_action(self, Board):
        # 選擇最佳解
        if np.random.uniform() <= self.epsilon:
            observation = self.board_transform(Board)
            action_value = eval_model(observation)
            max_value = float('-inf')
            max_action = -1
            for x, y in go.GetValid(Board):
                if action_value[0, 8*x+y].item() > max_value:
                    max_value = action_value[0, 8*x+y].item()
                    max_action = 8 * x + y
            return max_action
        # 選擇隨機點
        else:
            x, y = random.choice(go.GetValid(Board))
            return 8 * x + y  

    def reward_rule(self, Board, action):
        return 1 if go.IsContinuous(Board, action) else 0

    def training(self):
        # 檢查是否替換 target_net 參數
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.replace_target_params()
        if self.buffer_counter > 500:    # buffer_size = 500
            sample_index = np.random.choice(500, self.batch_size)
        else:
            sample_index = np.random.choice(self.buffer_counter, self.batch_size)
        batch_buffer = self.buffer[sample_index, :]
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        print(self.buffer)
        state = batch_buffer[:, -128:]
        next_state = batch_buffer[:, :128]
        Q_eval = target_model(state)
        Q_next = eval_model(next_state)
        Q_target = torch.clone(Q_eval)
        print(batch_buffer[:, 128])
        eval_act_index = batch_buffer[:, 128]

        reward = batch_buffer[:, 128 + 1]
        Q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(Q_next, axis=1)
        loss = nn.MSELoss(Q_target, Q_eval)
        eval_optimizer.zero_grad()
        loss.backward()
        eval_optimizer.step()
    
    def replace_target_params(self):
        for p, q in zip(eval_model.parameters(), target_model.parameters()):
            q.data = p.data
        
    def move(self, Board, action, chessman):
        self.play(Board, chessman)
        action = self.choose_action(Board)
        torch.save({'model_state_dict': eval_model.state_dict()}, 'eval_record.pt')
        torch.save({'model_state_dict': target_model.state_dict()}, 'target_record.pt')
        return [int(action/8), int(action%8)]

    def choose(self, Board, action, chessman):
        return random.choice([-1, 0, 1])

In [4]:
Board = np.zeros((8,8))
Board[4] = [0., 0., 0., 1., 1., -1, 0. ,0.]
print(Board)

[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  1. -1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]]


In [49]:
a = DQN()
a.move(Board, [4,3], chessman=-1)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([30., 49., 12., 19., 59., 51.,  0., 12.,  6.,  9., 14., 33., 56.,  9.,
        50., 47., 50., 47.,  0., 54.,  7., 21.,  6., 43., 43., 23.,  5., 14.,
         9., 48., 48., 19., 46., 63.,  8., 11., 19., 22., 38., 33., 61.,  8.,
        29.,  6.,  1., 29., 45., 29., 13., 51., 46., 48., 61., 21., 27., 50.,
         3., 32.,  5., 16., 56.,  7., 56.,  7.])


AttributeError: 'Tensor' object has no attribute 'astype'

In [13]:
print(check)

7
1
