In [78]:
# -*- coding: utf-8 -*-
from __future__ import print_function
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
import numpy as np
import sys
import re
import random
import copy

In [79]:
SIZE = 4
NONE = 0
BLACK = 1
WHITE = 2
STONE = ['', '●', '◯']
ROWLABEL = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5, 'f':6, 'g':7, 'h':8}
N2L = ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
REWARD_WIN = 1
REWARD_LOSE = -1
DIR = ((-1,0), (-1,1), (0,1), (1,1), (1,0), (1, -1), (0,-1), (-1,-1))

In [80]:
class QFunction(chainer.Chain):
    def __init__(self, obs_size, n_actions, n_nodes):
        w = chainer.initializers.HeNormal(scale=1.0)
        super(QFunction, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(obs_size, n_nodes, initialW=w)
            self.l2 = L.Linear(n_nodes, n_nodes, initialW=w)
            self.l3 = L.Linear(n_nodes, n_nodes, initialW=w)
            self.l4 = L.Linear(n_nodes, n_actions, initialW=w)
    def __call__(self, x):
        h = F.relu(self.l1(x))
        h = F.relu(self.l2(h))
        h = F.relu(self.l3(h))
        return chainerrl.action_value.DiscreteActionValue(self.l4(h))

In [81]:
class Board():
    def __init__(self):
        self.board_reset()
        
    def board_reset(self):
        self.board = np.zeros((SIZE, SIZE), dtype=np.float32)
        mid = SIZE//2
        self.board[mid, mid] = WHITE
        self.board[mid-1, mid-1] = WHITE
        self.board[mid-1, mid] = BLACK
        self.board[mid, mid-1] = BLACK
        self.winner = NONE
        self.turn = BLACK
        self.game_end = False
        self.pss = 0
        self.nofb = 0
        self.nofw = 0
        self.available_pos = self.search_positions()
        
    def put_stone(self, pos):
        if self.is_available(pos):
            self.board[pos[0], pos[1]] = self.turn
            self.do_reverse(pos)
            return True
        else:
            return False
        
    def change_turn(self):
        self.turn = WHITE if self.turn==BLACK else BLACK
        self.available_pos = self.search_positions()
            
    def random_action(self):
        if len(self.available_pos) > 0:
            pos = random.choice(self.available_pos)
            pos = pos[0]*SIZE + pos[1]
            return pos
        return False
    
    def agent_action(self, pos):
        self.put_stone(pos)
        self.end_check()
    
    def do_reverse(self, pos):
        for di, dj in DIR:
            opp = BLACK if self.turn==WHITE else WHITE
            boardcopy = self.board.copy()
            i = pos[0]
            j = pos[1]
            flag = False
            while 0<=i<SIZE and 0<=j<SIZE:
                i+=di
                j+=dj
                if 0<=i<SIZE and 0<=j<SIZE and boardcopy[i,j]==opp:
                    flag = True
                    boardcopy[i,j] = self.turn
                elif not(0<=i<SIZE and 0<=j<SIZE) or (flag==False and boardcopy[i,j]!=opp):
                    break
                elif boardcopy[i,j]==self.turn and flag==True:
                    self.board = boardcopy.copy()
                    break

    def search_positions(self):
        pos = []
        emp = np.where(self.board==0)
        for i in range(emp[0].size):
            p = (emp[0][i], emp[1][i])
            if self.is_available(p):
                pos.append(p)
        return pos
    
    def is_available(self, pos):
        if self.board[pos[0], pos[1]] != NONE:
            return False
        opp = BLACK if self.turn == WHITE else WHITE
        for di, dj in DIR:
            i = pos[0]
            j = pos[1]
            flag = False
            while 0<=i<SIZE and 0<=j<SIZE:
                i += di
                j += dj
                if 0<=i<SIZE and 0<=j<SIZE and self.board[i,j]==opp:
                    flag = True
                elif not(0 <= i < SIZE and 0 <= j < SIZE) or (flag == False and self.board[i,j] != opp) or self.board[i,j] == NONE:
                    break
                elif self.board[i,j]==self.turn and flag==True:
                    return True
        return True
    
    def end_check(self):
        if np.count_nonzero(self.board)==SIZE*SIZE or self.pss==2:
            self.game_end = True
            self.nofb = len(np.where(self.board==BLACK)[0])
            self.nofw = len(np.where(self.board==WHITE)[0])
            self.winner = BLACK if len(np.where(self.board==BLACK)[0]) > len(np.where(self.board==WHITE)[0]) else WHITE

In [82]:
board = Board()

obs_size = SIZE*SIZE
n_actions = SIZE*SIZE
n_nodes = 256
q_func = QFunction(obs_size, n_actions, n_nodes)

optimizer = chainer.optimizers.Adam(eps=1e-2)
optimizer.setup(q_func)

gamma = 0.99

explorer = chainerrl.explorers.LinearDecayEpsilonGreedy(
    start_epsilon = 1.0,
    end_epsilon = 0.1,
    decay_steps = 50000,
    random_action_func = board.random_action
)
replay_buffer_b = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10**6)
replay_buffer_w = chainerrl.replay_buffer.PrioritizedReplayBuffer(capacity=10**6)

In [83]:
agent_black = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer_b, gamma, explorer,
    replay_start_size = 1000, 
    minibatch_size = 128,
    update_interval = 1,
    target_update_interval = 1000
)
agent_white = chainerrl.agents.DoubleDQN(
    q_func, optimizer, replay_buffer_w, gamma, explorer,
    replay_start_size=1000,
    minibatch_size=128,
    update_interval=1,
    target_update_interval=1000
)
agents = ['', agent_black, agent_white]

In [84]:
n_episodes = 20000
win = 0
lose = 0
draw = 0

In [85]:
for i in range(1, n_episodes+1):
    board.board_reset()
    rewards = [0, 0, 0]
    while not board.game_end:
        if not board.available_pos:
            board.pss += 1
            board.end_check()
        else:
            boardcopy = np.reshape(board.board.copy(), (-1,))
            while True:
                pos = agents[board.turn].act_and_train(boardcopy, rewards[board.turn])
                pos = divmod(pos, SIZE)
                if board.is_available(pos):
                    break
                else:
                    rewards[board.turn] = REWARD_LOSE
                    
            board.agent_action(pos)
            if board.pss == 1:
                board.pss = 0
                
        if board.game_end:
            if board.winner == BLACK:
                rewards[BLACK] = REWARD_WIN  # 黒の勝ち報酬
                rewards[WHITE] = REWARD_LOSE # 白の負け報酬
                win += 1
            elif board.winner == 0:
                draw += 1                    
            else:
                rewards[BLACK] = REWARD_LOSE
                rewards[WHITE] = REWARD_WIN
                lose += 1               
            boardcopy = np.reshape(board.board.copy(), (-1,)) 
            agents[board.turn].stop_episode_and_train(boardcopy, rewards[board.turn], True)
            board.change_turn()
            agents[board.turn].stop_episode_and_train(boardcopy, rewards[board.turn], True)
        else:
            board.change_turn()
            
    if i % 100 == 0:
        print('==== Episode {} : black win {}, black lose {}, draw {} ===='.format(i, win, lose, draw))
        print('<BLACK> statistics: {}, epsilon {}'.format(agent_black.get_statistics(), agent_black.explorer.epsilon))
        print('<WHITE> statistics: {}, epsilon {}'.format(agent_white.get_statistics(), agent_white.explorer.epsilon))
        win = 0
        lose = 0
        draw = 0
        
    if i % 100 == 0:
        agent_black.save('agent_black/agent_black_'+str(i))
        agent_white.save('agent_white/agent_white_'+str(i))

==== Episode 100 : black win 37, black lose 63, draw 0 ====
<BLACK> statistics: [('average_q', 1.2945974315497524), ('average_loss', 0), ('n_updates', 0)], epsilon 0.9892
<WHITE> statistics: [('average_q', 1.2354705560784176), ('average_loss', 0), ('n_updates', 0)], epsilon 0.9892
==== Episode 200 : black win 36, black lose 64, draw 0 ====
<BLACK> statistics: [('average_q', 1.9086178269120093), ('average_loss', 0.22070676960364435), ('n_updates', 411)], epsilon 0.978328
<WHITE> statistics: [('average_q', 1.8218509856511371), ('average_loss', 0.18888248428257604), ('n_updates', 411)], epsilon 0.978328
==== Episode 300 : black win 45, black lose 55, draw 0 ====
<BLACK> statistics: [('average_q', 1.851110775175919), ('average_loss', 0.2335156468761844), ('n_updates', 1633)], epsilon 0.96733
<WHITE> statistics: [('average_q', 1.7647288360244289), ('average_loss', 0.2050619664987062), ('n_updates', 1633)], epsilon 0.96733
==== Episode 400 : black win 41, black lose 59, draw 0 ====
<BLACK> s

==== Episode 2600 : black win 26, black lose 74, draw 0 ====
<BLACK> statistics: [('average_q', 1.0500643648220938), ('average_loss', 0.03407518805319377), ('n_updates', 29807)], epsilon 0.712558
<WHITE> statistics: [('average_q', 0.9013836343120137), ('average_loss', 0.0391121558625769), ('n_updates', 29807)], epsilon 0.712558
==== Episode 2700 : black win 36, black lose 64, draw 0 ====
<BLACK> statistics: [('average_q', 1.0553195455143882), ('average_loss', 0.03619690847366243), ('n_updates', 31017)], epsilon 0.701632
<WHITE> statistics: [('average_q', 0.9048747022752422), ('average_loss', 0.03936304494848761), ('n_updates', 31017)], epsilon 0.701632
==== Episode 2800 : black win 31, black lose 69, draw 0 ====
<BLACK> statistics: [('average_q', 1.0429671889145393), ('average_loss', 0.03436913215030413), ('n_updates', 32249)], epsilon 0.690364
<WHITE> statistics: [('average_q', 0.8853568812036915), ('average_loss', 0.040405347039760744), ('n_updates', 32249)], epsilon 0.690364
==== Ep

==== Episode 5100 : black win 34, black lose 66, draw 0 ====
<BLACK> statistics: [('average_q', 0.8075111071563832), ('average_loss', 0.03450547934377355), ('n_updates', 60737)], epsilon 0.43204600000000004
<WHITE> statistics: [('average_q', 0.6942077949804386), ('average_loss', 0.040444634428843565), ('n_updates', 60737)], epsilon 0.43204600000000004
==== Episode 5200 : black win 39, black lose 61, draw 0 ====
<BLACK> statistics: [('average_q', 0.8090954900715882), ('average_loss', 0.03439987562195419), ('n_updates', 61990)], epsilon 0.4204899999999999
<WHITE> statistics: [('average_q', 0.6783649699982417), ('average_loss', 0.03783804232727761), ('n_updates', 61990)], epsilon 0.4204899999999999
==== Episode 5300 : black win 34, black lose 66, draw 0 ====
<BLACK> statistics: [('average_q', 0.782147826354341), ('average_loss', 0.03427881262130395), ('n_updates', 63238)], epsilon 0.40934800000000005
<WHITE> statistics: [('average_q', 0.6666179969829064), ('average_loss', 0.03817485198467

==== Episode 7500 : black win 25, black lose 75, draw 0 ====
<BLACK> statistics: [('average_q', 0.6268889664057844), ('average_loss', 0.03190645845134477), ('n_updates', 90850)], epsilon 0.158068
<WHITE> statistics: [('average_q', 0.5840267188726181), ('average_loss', 0.03495367240297744), ('n_updates', 90850)], epsilon 0.158068
==== Episode 7600 : black win 22, black lose 78, draw 0 ====
<BLACK> statistics: [('average_q', 0.6475665364484715), ('average_loss', 0.03164995671878779), ('n_updates', 92132)], epsilon 0.14636799999999994
<WHITE> statistics: [('average_q', 0.6019443299755709), ('average_loss', 0.03556959432901408), ('n_updates', 92132)], epsilon 0.14636799999999994
==== Episode 7700 : black win 29, black lose 71, draw 0 ====
<BLACK> statistics: [('average_q', 0.6360543482931122), ('average_loss', 0.03276069477564543), ('n_updates', 93384)], epsilon 0.13522599999999996
<WHITE> statistics: [('average_q', 0.6014201281538113), ('average_loss', 0.03550286874655539), ('n_updates', 

==== Episode 10000 : black win 41, black lose 59, draw 0 ====
<BLACK> statistics: [('average_q', 0.5301789743935762), ('average_loss', 0.030307037112082904), ('n_updates', 123392)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4174306367432388), ('average_loss', 0.031301825669282864), ('n_updates', 123392)], epsilon 0.1
==== Episode 10100 : black win 27, black lose 73, draw 0 ====
<BLACK> statistics: [('average_q', 0.5047004781152518), ('average_loss', 0.028962591564187828), ('n_updates', 124665)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4823300403053112), ('average_loss', 0.030475044233047727), ('n_updates', 124665)], epsilon 0.1
==== Episode 10200 : black win 37, black lose 63, draw 0 ====
<BLACK> statistics: [('average_q', 0.5282448519801922), ('average_loss', 0.02880516814032535), ('n_updates', 125913)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.5000316406835715), ('average_loss', 0.030665396432441414), ('n_updates', 125913)], epsilon 0.1
==== Episode 10300 : bl

==== Episode 12600 : black win 31, black lose 69, draw 0 ====
<BLACK> statistics: [('average_q', 0.4930808775893174), ('average_loss', 0.026065909590304533), ('n_updates', 156906)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4975460263942039), ('average_loss', 0.027637003617802056), ('n_updates', 156906)], epsilon 0.1
==== Episode 12700 : black win 36, black lose 64, draw 0 ====
<BLACK> statistics: [('average_q', 0.48177252472333804), ('average_loss', 0.02591453701333867), ('n_updates', 158157)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4966660542179012), ('average_loss', 0.028555291267266383), ('n_updates', 158157)], epsilon 0.1
==== Episode 12800 : black win 25, black lose 75, draw 0 ====
<BLACK> statistics: [('average_q', 0.47340596992575584), ('average_loss', 0.02611449296860773), ('n_updates', 159441)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4776992173792052), ('average_loss', 0.02866523677847442), ('n_updates', 159441)], epsilon 0.1
==== Episode 12900 : bl

==== Episode 15200 : black win 32, black lose 68, draw 0 ====
<BLACK> statistics: [('average_q', 0.3812253986852147), ('average_loss', 0.022928102946832424), ('n_updates', 189979)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.43766312688569536), ('average_loss', 0.02533900157677844), ('n_updates', 189979)], epsilon 0.1
==== Episode 15300 : black win 50, black lose 50, draw 0 ====
<BLACK> statistics: [('average_q', 0.3710163841115112), ('average_loss', 0.022807219930674628), ('n_updates', 191307)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4142703562937362), ('average_loss', 0.02544571270239986), ('n_updates', 191307)], epsilon 0.1
==== Episode 15400 : black win 36, black lose 64, draw 0 ====
<BLACK> statistics: [('average_q', 0.3743312449161995), ('average_loss', 0.023049596840018404), ('n_updates', 192680)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4179851203565809), ('average_loss', 0.02492225336853675), ('n_updates', 192680)], epsilon 0.1
==== Episode 15500 : bla

==== Episode 17800 : black win 36, black lose 64, draw 0 ====
<BLACK> statistics: [('average_q', 0.40147333621475423), ('average_loss', 0.022270920545650434), ('n_updates', 223148)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.4243017082507781), ('average_loss', 0.022706207024280965), ('n_updates', 223148)], epsilon 0.1
==== Episode 17900 : black win 35, black lose 65, draw 0 ====
<BLACK> statistics: [('average_q', 0.40166505890947823), ('average_loss', 0.02263862204494127), ('n_updates', 224388)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.41894651986587106), ('average_loss', 0.022627458077286295), ('n_updates', 224388)], epsilon 0.1
==== Episode 18000 : black win 34, black lose 66, draw 0 ====
<BLACK> statistics: [('average_q', 0.3899390481691235), ('average_loss', 0.021393413310567787), ('n_updates', 225635)], epsilon 0.1
<WHITE> statistics: [('average_q', 0.43958607252430854), ('average_loss', 0.023022077674036), ('n_updates', 225635)], epsilon 0.1
==== Episode 18100 : b