In [1]:
import numpy as np
import pygame
import random
import sys

class Marubatsu:
    """Tic-tac-toe ("marubatsu") environment on a ``size`` x ``size`` board.

    Board cells hold 0 (empty), 1 (rendered as 'X') or -1 (rendered as 'O').
    ``reward`` is accumulated over a whole game: -0.01 per valid move,
    -0.1 per invalid move, +1 / -1 when player 1 / player -1 completes a line,
    and +0.3 on a draw.
    """

    CELL_SIZE = 100  # pixel size of one board cell when rendered

    def __init__(
            self,
            size=3,  # board side length
            current_player=None  # 1 or -1; None picks the first player at random
            ):
        """Create an empty board.

        Args:
            size: Number of rows/columns of the board.
            current_player: Player who moves first (1 or -1). When ``None``
                the first player is drawn at random for this instance.
                (A ``random.choice(...)`` default expression would be
                evaluated only once, when the class is defined.)
        """
        self.size = size
        self.board = np.zeros((size, size), dtype=int)
        if current_player is None:
            current_player = random.choice([1, -1])
        self.current_player = current_player
        self.winner = 0  # 1 / -1 for a win, 0 for a draw or unfinished game
        self.reward = 0  # cumulative reward of the current game

    def game_reset(self):
        """Clear the board and state, and draw a new random first player."""
        self.board = np.zeros((self.size, self.size), dtype=int)
        self.current_player = random.choice([1, -1])
        self.winner = 0
        self.reward = 0

    def _has_line(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        owned = self.board == player
        return bool(
            owned.all(axis=1).any()
            or owned.all(axis=0).any()
            or np.diag(owned).all()
            or np.diag(np.fliplr(owned)).all()
        )

    def done(self):
        """Check whether the game is over and update ``winner``/``reward``.

        Only the player who is *not* to move (i.e. the one who made the last
        valid move) is checked for a completed line.

        Returns:
            bool: True when a line was completed or the board is full.
        """
        player = -self.current_player

        # Win: a full row, column or diagonal.
        if self._has_line(player):
            self.winner = player
            self.reward += 1 if player == 1 else -1
            return True

        # Draw: no empty cell left.
        if not np.any(self.board == 0):
            self.winner = 0
            self.reward += 0.3
            return True

        return False

    def step(self, act):
        """Apply action ``act`` (flat cell index, row-major) for the current player.

        A valid move marks the cell, costs 0.01 reward and passes the turn.
        A move on an occupied cell, or an index outside the board, leaves the
        board and the turn unchanged and costs 0.1 reward.
        """
        # Guard the range explicitly: a negative index would otherwise wrap
        # around silently and mark the wrong cell.
        if 0 <= act < self.size * self.size:
            row, col = divmod(act, self.size)
            if self.board[row, col] == 0:
                self.board[row, col] = self.current_player
                self.reward -= 0.01
                self.current_player *= -1
                return

        # Invalid move: penalty only.
        self.reward -= 0.1

    def pygame_init(self):
        """Start pygame, open the game window and draw the current board."""
        pygame.init()
        side = self.size * self.CELL_SIZE
        self.screen = pygame.display.set_mode((side, side))
        self.font = pygame.font.Font(None, 100)
        pygame.display.set_caption("Marubatsu Game")
        self.pygame_render(self.board)

    def pygame_render(self, board):
        """Draw the grid and the marks of ``board`` on the pygame window."""
        WHITE = (255, 255, 255)
        BLACK = (0, 0, 0)
        cell = self.CELL_SIZE
        side = self.size * cell

        self.screen.fill(WHITE)

        # Grid lines between the cells.
        for k in range(1, self.size):
            pygame.draw.line(self.screen, BLACK, (k * cell, 0), (k * cell, side), 3)
            pygame.draw.line(self.screen, BLACK, (0, k * cell), (side, k * cell), 3)

        # Marks: 1 -> 'X', -1 -> 'O'; empty cells are left blank.
        symbols = {1: 'X', -1: 'O'}
        for i in range(self.size):
            for j in range(self.size):
                symbol = symbols.get(int(board[i, j]))
                if symbol is not None:
                    text = self.font.render(symbol, True, BLACK)
                    self.screen.blit(text, (j * cell + 25, i * cell + 15))

        pygame.display.flip()

class RandomAgent:
    """Agent that plays a uniformly random empty cell."""

    def __init__(self, size):
        # Board side length, needed to flatten (row, col) into one index.
        self.size = size

    def act(self, board):
        """Return the flat (row-major) index of a randomly chosen empty cell."""
        empty_cells = np.argwhere(board == 0)
        row, col = random.choice(empty_cells)
        return row * self.size + col

class RandomalfaAgent:
    """Random agent that first takes an immediately winning move if one exists."""

    def __init__(self, size):
        # Board side length, needed to flatten (row, col) into one index.
        self.size = size

    def act(self, board, player):
        """Choose a move for ``player`` on ``board``.

        Cells are scanned in row-major order; the first empty cell that would
        complete a line for ``player`` is returned. Each candidate is written
        to ``board`` for the check and then reset to 0. If no winning cell
        exists, a random empty cell is returned.
        """
        for row, col in np.ndindex(self.size, self.size):
            if board[row, col] != 0:
                continue
            board[row, col] = player
            wins = self.check_win(board, player)
            board[row, col] = 0  # undo the trial move in every case
            if wins:
                return row * self.size + col

        # No winning move available: fall back to a random empty cell.
        empty_cells = np.argwhere(board == 0)
        pick = random.choice(empty_cells)
        return pick[0] * self.size + pick[1]

    def check_win(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal of ``board``."""
        lines = [board[k, :] for k in range(self.size)]
        lines += [board[:, k] for k in range(self.size)]
        lines += [np.diag(board), np.diag(np.fliplr(board))]
        return any(np.all(line == player) for line in lines)

class PlayerAgent:
    """Human player: picks a cell by left-clicking in the pygame window."""

    def __init__(self, size):
        # Board side length, needed to flatten (row, col) into one index.
        self.size = size

    def act(self, board):
        """Block until the user left-clicks an empty cell and return its flat index.

        Closing the window shuts pygame down and exits via ``sys.exit()``.
        """
        while True:
            for event in pygame.event.get():  # drain pending pygame events
                if event.type == pygame.QUIT:
                    pygame.quit()
                    sys.exit()

                if event.type != pygame.MOUSEBUTTONDOWN or event.button != 1:
                    continue

                # Mouse position is (x, y) in pixels; each cell is 100 px wide,
                # so the row comes from y and the column from x.
                mouse_x, mouse_y = pygame.mouse.get_pos()
                row, col = mouse_y // 100, mouse_x // 100
                if board[row, col] == 0:
                    return row * self.size + col

pygame 2.5.2 (SDL 2.28.3, Python 3.10.14)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from collections import deque
import csv
import os

class Memory:
    """Experience-replay buffer that keeps the most recent experiences.

    Args:
        memory_size: Maximum number of experiences kept; when full, the oldest
            experience is dropped on each ``add``.
        batch_size: Accepted for backward compatibility only. The number of
            experiences to draw is passed to ``sample``.
    """

    def __init__(self, memory_size=100, batch_size=30):
        self.memory_size = memory_size
        # The capacity must be memory_size: bounding the deque by batch_size
        # would cap the buffer at a single batch.
        self.buffer = deque(maxlen=memory_size)

    def add(self, experience):
        """Append an experience; the deque evicts the oldest one when full."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn at random.

        Raises:
            ValueError: If fewer than ``batch_size`` experiences are stored
                (raised by ``np.random.choice`` with ``replace=False``).
        """
        idx = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)


class DQNAgent:
    """DQN agent with experience replay and a target network.

    Observations are ``size`` x ``size`` boards; actions are flat cell indices
    in ``range(size**2)``.
    """

    def __init__(self, size,
                 gamma = 0.9, # discount factor
                 epsilon = 0.1, # exploration rate
                 memory_size = 100, # number of experiences kept
                 batch_size = 30, # number of experiences used per update
                 target_interval = 30 # steps between target-network updates
                ):

        # Parameters
        self.input_size = (size, size)
        self.n_act = size**2
        self.gamma = gamma
        self.epsilon = epsilon
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.target_interval = target_interval
        self.model = self._build_Qnet()
        self.time = 0  # number of learn() calls so far

        # Training history (one entry per finished episode)
        self.hist_rwds = []
        self.hist_wnrs = []

        # Target network: starts as an exact copy of the online network.
        self.model_target = self._build_Qnet()
        self.model_target.set_weights(self.model.get_weights())

        # Replay memory
        self.memory = Memory(memory_size=self.memory_size, batch_size=batch_size)

    def _build_Qnet(self):
        """Build and compile the Q-network (Flatten -> 32 -> 32 -> n_act)."""
        model = Sequential()
        model.add(Flatten(input_shape=self.input_size))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.n_act, activation='linear'))

        # Loss and optimizer
        model.compile(loss='mse', optimizer='Adam')

        return model

    def act(self, obs):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the largest predicted Q-value. Occupied
        cells are not masked out, so the result may be an invalid move.
        """
        if np.random.rand() <= self.epsilon:
            act = random.randrange(self.n_act)
        else:
            Q = self.get_Q(obs)
            act = np.argmax(Q)
        return act

    def get_Q(self, obs, type='main'):
        """Return the Q-values of a single observation.

        Args:
            obs: Board array with ``input_size`` elements.
            type: ``'main'`` for the online network, ``'target'`` for the
                target network.

        Raises:
            ValueError: If ``type`` is neither ``'main'`` nor ``'target'``.
        """
        if type == 'main':
            net = self.model
        elif type == 'target':
            net = self.model_target
        else:
            raise ValueError(f"type must be 'main' or 'target', got {type!r}")

        return net.predict(obs.reshape((1,) + self.input_size), verbose=0)[0, :]

    def learn(self, obs, act, rwd, done, next_obs):
        """Store one transition, run a training step and maybe sync the target net.

        Does nothing when ``rwd`` is None.
        """
        if rwd is None:
            return

        self.memory.add((obs, act, rwd, done, next_obs))

        # Train on a replayed mini-batch
        self._fit()

        # Every target_interval steps copy the online weights to the target net
        if self.time % self.target_interval == 0 and self.time > 0:
            self.model_target.set_weights(self.model.get_weights())

        self.time += 1

    def _fit(self):
        """Run one gradient step on a mini-batch sampled from replay memory.

        The regression target equals the online network's current prediction,
        except for the taken action, which is set to ``rwd`` for terminal
        transitions and ``rwd + gamma * max_a Q_target(next_obs, a)`` otherwise.
        """
        # Not enough experiences yet
        if len(self.memory) < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        # Stack observations so each network is evaluated once per batch
        # instead of once per sample.
        obss = np.stack([exp[0] for exp in batch]).astype(int)
        next_obss = np.stack([exp[4] for exp in batch]).astype(int)

        # Current predictions are the baseline targets (zero error on the
        # actions that were not taken).
        targets = np.array(self.model.predict(obss, verbose=0), dtype=float)
        # Bootstrap values come from the target network, not the online one.
        next_Q = np.asarray(self.model_target.predict(next_obss, verbose=0))

        for i, (_, act, rwd, done, _) in enumerate(batch):
            if done:
                # Terminal transition: the reward alone is the target
                targets[i, act] = rwd
            else:
                targets[i, act] = rwd + self.gamma * np.max(next_Q[i])

        self.model.fit(obss, targets, verbose=0, epochs=1)

    def save_weights(self, filepath='agt_data/noname'):
        """Save the model to ``filepath + '.keras'`` and the history to ``filepath + '.csv'``."""
        # Create the output directory if needed so a long run is not lost
        # at save time.
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)

        self.model.save(filepath + '.keras', overwrite=True)

        # Per-episode reward and winner
        with open(filepath + '.csv', "w", newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['episode','reward','winner'])
            for i, (rwd, wnr) in enumerate(zip(self.hist_rwds, self.hist_wnrs)):
                writer.writerow([i+1, rwd, wnr])

    def load_weights(self, filepath='agt_data/noname'):
        """Load the model and append the saved history to this agent.

        NOTE(review): assumes the saved model has the same architecture as
        ``_build_Qnet()`` produces for this agent — confirm when changing
        ``size`` or the layers.
        """
        self.model = tf.keras.models.load_model(filepath + '.keras')
        # Keep the target network in step with the restored online network.
        self.model_target.set_weights(self.model.get_weights())

        # Per-episode reward and winner (first row is the header)
        with open(filepath + '.csv', "r") as csv_file:
            reader = csv.reader(csv_file)
            next(reader)
            for row in reader:
                self.hist_rwds.append(float(row[1]))
                self.hist_wnrs.append(float(row[2]))

In [5]:
def train(max_episodes=50000, filepath='agt_data/tictaktoe_DQN'):
    """Train a DQN agent (player 1) against RandomalfaAgent (player -1).

    Training resumes from ``filepath`` when a saved model exists, runs until
    ``max_episodes`` episodes have been played or the pygame window is
    closed, and then saves the model and history back to ``filepath``.

    Args:
        max_episodes: Episode number after which training stops.
        filepath: Path prefix (without extension) of the saved model/history.
    """
    env = Marubatsu()

    agent1 = DQNAgent(env.size)
    agent2 = RandomalfaAgent(env.size)

    episode = 1

    # Resume from a previous run if one was saved.
    if os.path.exists(filepath + '.keras'):
        agent1.load_weights(filepath)
        episode += len(agent1.hist_rwds)

    env.pygame_init()
    running = True

    while running:
        # Snapshot the board: env.step() mutates env.board in place, so
        # without a copy obs and next_obs would be the same array and every
        # stored transition would hold identical state / next state.
        obs = env.board.copy()
        env.pygame_render(env.board)

        if env.current_player == 1:
            act = agent1.act(obs)
        elif env.current_player == -1:
            act = agent2.act(obs, -1)

        # Advance the environment
        env.step(act)
        next_obs = env.board.copy()
        done = env.done()
        # NOTE(review): env.reward is cumulative over the episode, and the
        # opponent's moves are stored as transitions as well; both are kept
        # as in the original design — confirm this is intended.
        rwd = round(env.reward, 2)

        # Draw the updated board
        env.pygame_render(env.board)

        # Learn from this transition
        agent1.learn(obs, act, rwd, done, next_obs)

        if done:
            # Record the episode result
            agent1.hist_rwds.append(rwd)
            agent1.hist_wnrs.append(env.winner)

            # Win rate of player 1 in percent
            win = (agent1.hist_wnrs.count(1) / episode) * 100

            print(f"episode{episode}/Reward: {rwd}/Win %: {win} ")

            env.game_reset()
            episode += 1

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

        if episode > max_episodes:
            running = False

    pygame.quit()

    # Save the weights and the history
    agent1.save_weights(filepath)

In [7]:
# Start training only when this code runs as the main program
# (in a notebook kernel __name__ is "__main__", so this cell runs train()).
if __name__ == "__main__":
    train()

episode1/Reward: -1.05/Win %: 0.0 
episode2/Reward: 0.95/Win %: 50.0 
episode3/Reward: -1.69/Win %: 66.66666666666666 
episode4/Reward: -4.98/Win %: 50.0 
episode5/Reward: -1.05/Win %: 40.0 
episode6/Reward: -1.77/Win %: 50.0 
episode7/Reward: -2.26/Win %: 42.857142857142854 
episode8/Reward: -4.49/Win %: 50.0 
episode9/Reward: -1.25/Win %: 44.44444444444444 
episode10/Reward: -0.95/Win %: 50.0 
episode11/Reward: -3.18/Win %: 45.45454545454545 
episode12/Reward: -5.38/Win %: 41.66666666666667 
episode13/Reward: -2.27/Win %: 46.15384615384615 
episode14/Reward: -4.29/Win %: 50.0 
episode15/Reward: -5.65/Win %: 46.666666666666664 
episode16/Reward: -1.85/Win %: 43.75 
episode17/Reward: -1.35/Win %: 41.17647058823529 
episode18/Reward: -6.79/Win %: 38.88888888888889 
episode19/Reward: -5.46/Win %: 36.84210526315789 
episode20/Reward: -6.59/Win %: 35.0 
episode21/Reward: -2.16/Win %: 33.33333333333333 
episode22/Reward: -1.86/Win %: 31.818181818181817 
episode23/Reward: -1.25/Win %: 30.434

In [10]:
# Play against the trained DQN agent: the human is player 1 ('X'),
# the DQN agent is player -1 ('O').
if __name__ == "__main__":
    env = Marubatsu(3)

    agent1 = PlayerAgent(env.size)
    agent2 = DQNAgent(env.size, epsilon = 0.0)

    # Load the trained model; without it the opponent would play with a
    # freshly initialised network.
    filepath = 'agt_data/tictaktoe_DQN'
    if os.path.exists(filepath + '.keras'):
        agent2.load_weights(filepath)
    else:
        print(f"No trained model found at {filepath}.keras; the DQN agent is untrained.")

    env.pygame_init()  # start pygame and open the game window
    running = True

    while running:
        env.pygame_render(env.board)

        # Choose the action of the player to move
        if env.current_player == 1:
            act = agent1.act(env.board)
        elif env.current_player == -1:
            act = agent2.act(env.board)

        env.step(act)
        env.pygame_render(env.board)

        # End of game?
        if env.done():
            print(f"Reward: {env.reward}/Winner: {'X' if env.winner == 1 else 'O' if env.winner == -1 else 'Draw'}")
            env.game_reset()
        elif env.reward < -5:
            # With epsilon = 0 the agent repeats the same invalid move
            # forever; give up once the accumulated penalties exceed 5.
            print("time up!")
            env.game_reset()

        for event in pygame.event.get():  # handle pygame events
            if event.type == pygame.QUIT:  # window close button clicked
                running = False  # leave the main loop

    pygame.quit()  # shut pygame down

time up!
time up!


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
