In [4]:
import pygame
import random
import sys

class TicTacToe:
    """N x N tic-tac-toe environment with an optional pygame renderer.

    Board cells hold 0 (empty), 1 (rendered as 'O') or -1 (rendered as 'X').
    """

    def __init__(self, size=3, current_player=None):
        """Create an empty board.

        Args:
            size: Edge length of the square board.
            current_player: 1 or -1, the side that moves first. When None
                (the default) the starting side is drawn at random.
        """
        # A signature default of random.choice([1, -1]) is evaluated only once,
        # when the function is defined, so every default-constructed instance
        # would share the same starting player. Draw it per instance instead.
        if current_player is None:
            current_player = random.choice([1, -1])

        self.size = size
        self.board = [[0] * size for _ in range(size)]  # build the board from the size
        self.current_player = current_player
        self.winner = 0  # winning player (0 while undecided or on a draw)
        self.reward = 0  # reward accumulated since the last reset
        self.invalid_move = None  # (row, col) of the last rejected move, if any

    def game_reset(self):
        """Clear the board and per-game state and pick a random starting player."""
        self.board = [[0] * self.size for _ in range(self.size)]
        self.current_player = random.choice([1, -1])
        self.winner = 0
        self.reward = 0
        self.invalid_move = None

    def done(self):
        """Return True when the game is over, updating winner and reward.

        Lines are checked for ``-current_player``; after a valid ``step`` that
        is the side that just moved. A win adds +1 to the reward for player 1
        and -1 for player -1; a full board without a winner adds 0.3.
        """
        player = self.current_player * -1

        # Rows and columns.
        for i in range(self.size):
            row_full = all(self.board[i][j] == player for j in range(self.size))
            col_full = all(self.board[j][i] == player for j in range(self.size))
            if row_full or col_full:
                self.winner = player
                self.reward += 1 if player == 1 else -1
                return True

        # Both diagonals.
        main_diag = all(self.board[i][i] == player for i in range(self.size))
        anti_diag = all(self.board[i][self.size - i - 1] == player for i in range(self.size))
        if main_diag or anti_diag:
            self.winner = player
            self.reward += 1 if player == 1 else -1
            return True

        # Draw: no empty cell is left.
        if not any(self.board[i][j] == 0 for i in range(self.size) for j in range(self.size)):
            self.winner = 0
            self.reward += 0.3
            return True

        return False

    def step(self, act):
        """Apply the flat action index ``act`` (row * size + col) for the current player.

        A move onto an empty cell is placed and the turn passes to the other
        player. A move onto an occupied cell leaves the board and the turn
        unchanged, subtracts 0.1 from the reward and records the cell in
        ``invalid_move``.
        """
        x, y = divmod(act, self.size)
        if self.board[x][y] == 0:  # the requested cell is free
            self.board[x][y] = self.current_player
            self.invalid_move = None
            # Hand the turn to the other player.
            self.current_player *= -1
        else:  # occupied cell: apply a penalty
            self.reward -= 0.1
            self.invalid_move = (x, y)

    def pygame_init(self):
        """Initialise pygame, open a window of 100 px per cell and draw the board."""
        pygame.init()
        self.screen = pygame.display.set_mode((self.size * 100, self.size * 100))
        self.font = pygame.font.Font(None, 100)
        pygame.display.set_caption("Tic Tac Toe")
        self.pygame_render(self.board)

    def pygame_render(self, board):
        """Draw ``board`` (grid, marks and the last invalid move) to the window."""
        WHITE = (255, 255, 255)
        BLACK = (0, 0, 0)
        RED = (255, 0, 0, 150)
        self.screen.fill(WHITE)
        for x in range(1, self.size):
            pygame.draw.line(self.screen, BLACK, (x * 100, 0), (x * 100, self.size * 100), 3)
            pygame.draw.line(self.screen, BLACK, (0, x * 100), (self.size * 100, x * 100), 3)

        for i in range(self.size):
            for j in range(self.size):
                if board[i][j] == 1:
                    text = self.font.render('O', True, BLACK)
                    self.screen.blit(text, (j * 100 + 25, i * 100 + 15))
                elif board[i][j] == -1:
                    text = self.font.render('X', True, BLACK)
                    self.screen.blit(text, (j * 100 + 25, i * 100 + 15))

        # Outline the cell of the last rejected move in red.
        if self.invalid_move is not None:
            (i, j) = self.invalid_move
            pygame.draw.rect(self.screen, RED, (j * 100, i * 100, 100, 100), 3)

        pygame.display.flip()

pygame 2.5.2 (SDL 2.28.3, Python 3.10.14)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [69]:
def progress(agent1, agent2, size=3, render=True, max_episodes=None):
    """Play repeated games between two agents on a fresh TicTacToe board.

    Args:
        agent1: Agent moving as player 1; receives the board as-is.
        agent2: Agent moving as player -1; receives a sign-flipped copy of
            the board (its own marks appear as 1).
        size: Board edge length.
        render: When True, open a pygame window, redraw after every move and
            stop when the window is closed.
        max_episodes: Optional number of finished games (including "time up"
            resets) after which the loop stops. With the default None the
            loop only ends when the window is closed, so with render=False it
            runs until interrupted.
    """
    env = TicTacToe(size)

    if render:
        env.pygame_init()  # initialise pygame and open the game window
    running = True
    episodes = 0

    try:
        while running:
            if render:
                env.pygame_render(env.board)

            # Choose the action for the side to move.
            if env.current_player == 1:
                act = agent1.act(env.board)
            elif env.current_player == -1:
                board = [[1 if cell == -1 else -1 if cell == 1 else cell for cell in row] for row in env.board]
                act = agent2.act(board)

            env.step(act)
            if render:
                env.pygame_render(env.board)

            # Termination check.
            if env.done():
                print(f"Reward: {env.reward}/Winner: {'O' if env.winner == 1 else 'X' if env.winner == -1 else 'Draw'}")
                env.game_reset()
                episodes += 1
            elif env.reward < -5:
                # Too many invalid-move penalties: abandon this game.
                print("time up!")
                env.game_reset()
                episodes += 1

            if max_episodes is not None and episodes >= max_episodes:
                running = False

            if render:
                for event in pygame.event.get():  # process pygame events
                    if event.type == pygame.QUIT:  # window close button
                        running = False
    finally:
        # Shut pygame down even when the loop is left through an exception
        # (e.g. KeyboardInterrupt), so the window is not left open.
        pygame.quit()

# ランダムとランダムαと人間

In [6]:
class RandomAgent:
    """Agent that plays a uniformly random empty cell."""

    def __init__(self, size, my_turn):
        """Store the board edge length and the side (1 or -1) this agent plays."""
        self.size, self.my_turn = size, my_turn

    def act(self, board):
        """Return the flat index (row * size + col) of a randomly chosen empty cell."""
        n = self.size
        empty_cells = []
        for row in range(n):
            for col in range(n):
                if board[row][col] == 0:
                    empty_cells.append((row, col))
        row, col = random.choice(empty_cells)
        return row * n + col

class RandomalfaAgent:
    """Random agent that first takes any cell completing a line of ``my_turn`` marks."""

    def __init__(self, size, my_turn):
        """Store the board edge length and the mark value (1 or -1) this agent places."""
        self.size, self.my_turn = size, my_turn

    def act(self, board):
        """Return the flat index of a line-completing cell, else a random empty cell.

        Each empty cell is tried in row-major order by temporarily writing
        ``my_turn`` into ``board``; the cell is always restored to 0.
        """
        n = self.size
        empty_cells = [(r, c) for r in range(n) for c in range(n) if board[r][c] == 0]

        for r, c in empty_cells:
            board[r][c] = self.my_turn
            completes_line = self.check_win(board)
            board[r][c] = 0
            if completes_line:
                return r * n + c

        r, c = random.choice(empty_cells)
        return r * n + c

    def check_win(self, board):
        """Return True if any row, column or diagonal consists only of ``my_turn``."""
        n = self.size
        me = self.my_turn

        lines = []
        for k in range(n):
            lines.append([(k, j) for j in range(n)])  # row k
            lines.append([(j, k) for j in range(n)])  # column k
        lines.append([(k, k) for k in range(n)])          # main diagonal
        lines.append([(k, n - k - 1) for k in range(n)])  # anti-diagonal

        return any(all(board[r][c] == me for r, c in line) for line in lines)

class HumanAgent:
    """Agent whose moves come from left mouse clicks in the pygame window."""

    def __init__(self, size, my_turn):
        """Store the board edge length and the side (1 or -1) this agent plays."""
        self.size, self.my_turn = size, my_turn

    def act(self, board):
        """Block until an empty cell is left-clicked and return its flat index.

        Closing the window shuts pygame down and exits the interpreter via
        ``sys.exit()``.
        """
        while True:
            for event in pygame.event.get():  # drain pending pygame events
                if event.type == pygame.QUIT:
                    pygame.quit()
                    sys.exit()
                elif event.type == pygame.MOUSEBUTTONDOWN and event.button == 1:
                    mouse_x, mouse_y = pygame.mouse.get_pos()  # current pointer position
                    # 100 px per cell: vertical pixels give the row, horizontal the column.
                    row = mouse_y // 100
                    col = mouse_x // 100
                    if board[row][col] == 0:
                        return row * self.size + col  # flatten (row, col) to one index

In [32]:
# Pit two RandomalfaAgent instances against each other without rendering.
# With render=False, progress() has no path that sets `running` to False,
# so this cell keeps printing results until it is interrupted.
# NOTE(review): progress() hands agent2 a sign-flipped board (its own marks
# appear as 1) while agent2 is built with my_turn=-1 - confirm this pairing
# is intended.
size = 3
agent1 = RandomalfaAgent(size, 1)
agent2 = RandomalfaAgent(size,-1)
progress(agent1,agent2,render=False)

Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: 0.92/Winner: O
Reward: -1.06/Winner: X
Reward: 0.91/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: 0.94/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.21/Winner: Draw
Reward: -1

Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: -1.06/Winner: X
Reward: 0.21/Winner: Draw
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: 0.21/Winner: Draw
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: 0.95/Winner: O
Reward: 0.92/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.94/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: -1.0

Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.09/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.07/Winner: X
Reward: -1.06/Winner: X
Reward: 0.94/Winner: O
Reward: 0.92/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.07/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Rewa

Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.91/Winner: O
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner:

Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: -1.09/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.91/Winner: O
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.09/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward

Reward: -1.08/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.21/Winner: Draw
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: -1.09/Winner: X
Reward: 0.21/Winner: Draw
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: 0.94/Winner: O
Reward: -1.09/Winner: X
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: -1.07/Winner: X
Reward: -1.06/Winner: X
Reward

Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: 0.92/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.91/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: 0.21/Winner: Draw
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.92/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.08/Win

Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.21/Winner: Draw
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: 0.95/Winner: O
Reward: -1.08/Winner: X
Reward: 0.91/Winner: O
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.08/Winner: X
Reward: -1.09/Winner: X
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.21/Winner: Draw
Reward: -1.07/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.94/Winner: O
Reward: 0.21/Win

Reward: -1.07/Winner: X
Reward: -1.07/Winner: X
Reward: -1.08/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.94/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.06/Winner: X
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.9299999999999999/Winner: O
Reward: -1.05/Winner: X
Reward: 0.91/Winner: O
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.91/Winner: O
Reward: 0.92/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.21/Winner: Draw
Reward: -1.06/Winner: X
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: 0.92/Winner: O
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: -1.06/Winner: X
Re

Reward: -1.06/Winner: X
Reward: -1.05/Winner: X
Reward: 0.91/Winner: O
Reward: -1.09/Winner: X
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.06/Winner: X
Reward: 0.94/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: -1.06/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.94/Winner: O
Reward: -1.05/Winner: X
Reward: -1.08/Winner: X
Reward: -1.05/Winner: X
Reward: -1.09/Winner: X
Reward: -1.05/Winner: X
Reward: -1.05/Winner: X
Reward: -1.07/Winner: X
Reward: -1.07/Winner: X
Reward: -1.05/Winner: X
Reward: -1.06/Winner: X
Reward: 0.9299999999999999/Winner: O
Reward: 0.95/Winner: O
Reward: 0.21/Winn

KeyboardInterrupt: 

# モンテカルロ法

In [73]:
import random

class MCAgent:
    """Monte Carlo agent: scores every legal move by random playouts and plays the best."""

    def __init__(self, size, my_turn, n_trials=100):
        """Configure the agent.

        Args:
            size: Board edge length.
            my_turn: Mark value (1 or -1) this agent places.
            n_trials: Number of playouts run per candidate move.
        """
        self.size = size
        self.my_turn = my_turn
        self.n_trials = n_trials

    def win_or_rand(self, board, turn):
        """Return a flat move index for ``turn``: an immediate win if one exists, else random.

        ``board`` is not modified; each candidate is tried on a copy.
        """
        available_moves = [(i, j) for i in range(self.size) for j in range(self.size) if board[i][j] == 0]
        for move in available_moves:
            tempboard = [row[:] for row in board]
            tempboard[move[0]][move[1]] = turn
            if self.check_winner(tempboard, turn):
                return move[0] * self.size + move[1]
        move = random.choice(available_moves)
        return move[0] * self.size + move[1]

    def check_winner(self, board, player):
        """Return True if ``player`` fills any row, column or diagonal of ``board``."""
        for i in range(self.size):
            if all(board[i][j] == player for j in range(self.size)) or all(board[j][i] == player for j in range(self.size)):
                return True
        if all(board[i][i] == player for i in range(self.size)) or all(board[i][self.size-i-1] == player for i in range(self.size)):
            return True
        return False

    def trial(self, score, board, act):
        """Run one playout that starts with this agent playing ``act``.

        ``score[act]`` is incremented when this agent wins the playout,
        decremented when the opponent wins and left unchanged on a draw.
        ``board`` itself is not modified.
        """
        tempboard = [row[:] for row in board]
        x, y = divmod(act, self.size)
        tempboard[x][y] = self.my_turn
        mover = self.my_turn  # the side that has just placed a mark
        while True:
            # Check the side that just moved *before* passing the turn.
            # Flipping the turn first would test the wrong player, so a
            # winning move that fills the board would be scored as a draw.
            if self.check_winner(tempboard, mover):
                if mover == self.my_turn:
                    score[act] += 1
                else:
                    score[act] -= 1
                break
            if not any(tempboard[i][j] == 0 for i in range(self.size) for j in range(self.size)):
                break
            mover *= -1
            next_move = self.win_or_rand(tempboard, mover)
            x, y = divmod(next_move, self.size)
            tempboard[x][y] = mover

    def act(self, board):
        """Return the flat index of the empty cell with the best mean playout score.

        Raises:
            ValueError: If ``board`` has no empty cell.
        """
        acts = [i * self.size + j for i in range(self.size) for j in range(self.size) if board[i][j] == 0]
        scores = {act: 0 for act in acts}
        n = self.n_trials
        for act in acts:
            for _ in range(n):
                self.trial(scores, board, act)
            scores[act] /= n
        return max(scores, key=scores.get)


In [74]:
# Human (player 1, drawn as 'O') against the Monte Carlo agent in a pygame window.
# NOTE(review): progress() hands agent2 a sign-flipped board (its own marks
# appear as 1) while this MCAgent is built with my_turn=-1 - confirm this
# pairing is intended.
size = 3
agent1 = HumanAgent(size, 1)
agent2 = MCAgent(size,-1)
progress(agent1,agent2)

Reward: -1.07/Winner: X
Reward: 0.21/Winner: Draw
Reward: -1.09/Winner: X
Reward: -1.05/Winner: X
Reward: 0.21/Winner: Draw
Reward: 0.95/Winner: O
Reward: -1.07/Winner: X
Reward: 0.95/Winner: O
Reward: 0.95/Winner: O
Reward: 0.21/Winner: Draw
Reward: 0.21/Winner: Draw
Reward: 0.21/Winner: Draw
Reward: 0.9299999999999999/Winner: O
Reward: -1.09/Winner: X
Reward: -1.09/Winner: X
Reward: 0.95/Winner: O


SystemExit: 

# Q-Learning

In [97]:
import ast
import os
import pickle
import random
import sys

import numpy as np

class QLAgent:
    """Tabular Q-learning agent whose table is keyed by ``str(board)``."""

    def __init__(self, size,
                 gamma=0.9,  # discount factor
                 epsilon=0.2,  # exploration rate
                 alpha=0.3,  # learning rate
                 memory_size=500000):
        """Set the hyper-parameters and create an empty Q-table.

        Args:
            size: Board edge length; the agent has ``size**2`` actions.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Probability of choosing a random legal move in ``act``.
            alpha: Learning rate of the Q update.
            memory_size: Stored on the instance; not otherwise used by this class.
        """
        # Parameters
        self.input_size = (size, size)
        self.n_act = size**2
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        self.init_val_Q = 0
        self.memory_size = memory_size

        # Q-table state
        self.Q = {}     # maps str(board) -> list of n_act Q-values
        self.len_Q = 0  # number of observations registered in the table

    def act(self, obs):
        """Return an epsilon-greedy legal move (flat index) for the board ``obs``."""
        # The table is keyed by the string form of the board.
        obs = str(obs)

        # Register unseen observations with the initial Q-values.
        self._check_and_add_observation(obs)

        available_moves = self.get_possible_moves(obs)

        if np.random.rand() < self.epsilon:
            # Explore: uniformly random legal move.
            act = random.choice(available_moves)
        else:
            # Exploit: a legal move with maximal Q, ties broken at random.
            q_values = [self.Q[obs][move] for move in available_moves]
            max_q = max(q_values)
            max_q_moves = [move for move in available_moves if self.Q[obs][move] == max_q]
            act = random.choice(max_q_moves)
        return act

    def get_possible_moves(self, obs):
        """Return the flat indices of the empty (0) cells of ``obs``.

        ``obs`` is normally the ``str()`` of a nested list; it is parsed with
        ``ast.literal_eval`` (literals only) rather than ``eval``. A nested
        sequence is also accepted as-is.
        """
        board = ast.literal_eval(obs) if isinstance(obs, str) else obs
        obs_array = np.array(board)
        return [i * self.input_size[0] + j for i in range(self.input_size[0]) for j in range(self.input_size[1]) if obs_array[i, j] == 0]

    def _check_and_add_observation(self, obs):
        """Register the string key ``obs`` with initial Q-values if it is new."""
        if obs not in self.Q:
            self.Q[obs] = [self.init_val_Q] * self.n_act
            self.len_Q += 1
            if self.len_Q % 1000 == 0:
                print(f'The number of obs in Q-table: {self.len_Q}')

    def learn(self, obs, act, rwd, done, next_obs):
        """Move Q[obs][act] towards the one-step target.

        The target is ``rwd`` when ``done`` and otherwise
        ``rwd + gamma * max(Q[next_obs])``. Nothing happens when ``rwd`` is None.
        """
        if rwd is None:
            return
        # Convert both observations to their table keys.
        obs = str(obs)
        next_obs = str(next_obs)

        # Register both states so a transition whose obs never went through
        # act() does not raise KeyError on the update below.
        self._check_and_add_observation(obs)
        self._check_and_add_observation(next_obs)

        # Build the learning target.
        if done:
            target = rwd
        else:
            target = rwd + self.gamma * max(self.Q[next_obs])

        # Blend the old estimate with the target.
        self.Q[obs][act] = (1 - self.alpha) * self.Q[obs][act] + self.alpha * target

    def get_Q(self, obs):
        """Return the Q-values of ``obs`` as a numpy array, or None if it is unknown."""
        obs = str(obs)
        if obs in self.Q:
            val = self.Q[obs]
            Q = np.array(val)
        else:
            Q = None
        return Q

    def save_weights(self,filepath='agt_data/noname'):
        """Pickle the Q-table to ``filepath + '.pkl'``, creating the directory if needed."""
        filepath = filepath + '.pkl'
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filepath, mode='wb') as f:
            pickle.dump(self.Q, f)

    def load_weights(self,filepath='agt_data/noname'):
        """Load the Q-table from ``filepath + '.pkl'``.

        Unpickling can execute arbitrary code, so only load files that were
        produced by ``save_weights`` from a trusted source.
        """
        filepath = filepath + '.pkl'
        with open(filepath, mode='rb') as f:
            self.Q = pickle.load(f)
        # Keep the registered-observation counter in step with the loaded table.
        self.len_Q = len(self.Q)


In [111]:
def trainQL():
    """Train a single QLAgent by self-play for 100000 episodes and save its Q-table.

    The agent plays both sides; when it moves as player -1 it is shown a
    sign-flipped board and the sign of the reward is flipped as well.
    Existing weights at 'agt_data/tictactoe_QL.pkl' are loaded when present.
    """
    env = TicTacToe()

    # NOTE(review): QLAgent's second positional parameter is gamma, so this
    # constructs the agent with gamma=1 - confirm that is intended.
    agent = QLAgent(env.size,1)

    filepath='agt_data/tictactoe_QL'

    # On the first run there is nothing to load yet.
    if os.path.exists(filepath + '.pkl'):
        agent.load_weights(filepath)

    # Decay is computed from the starting value; dividing the current epsilon
    # every episode would compound and drive it to ~0 right after episode 10000.
    initial_epsilon = agent.epsilon

    episode = 1

    running = True

    while running:
        mover = env.current_player
        # Snapshot the board: env.step() mutates env.board in place, so keeping
        # a reference would make obs identical to next_obs after the move.
        obs = [row[:] for row in env.board]
        if mover == -1:
            # Show player -1 the board from its own point of view.
            obs = [[1 if cell == -1 else -1 if cell == 1 else cell for cell in row] for row in obs]

        act = agent.act(obs)
        # Advance the environment.
        env.step(act)
        next_obs = env.board
        done = env.done()
        # env.reward is from player 1's perspective; flip it for player -1.
        rwd = round(env.reward,2) * mover

        # Learn from this transition.
        agent.learn(obs, act, rwd, done, next_obs)

        if done:
            # Start the next game.
            env.game_reset()
            agent.epsilon = initial_epsilon/(episode//10000+1)
            episode += 1
            print(episode, end='\r')

        if episode > 100000:
            running = False

    # Save the learned Q-table, creating the target directory if needed.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    agent.save_weights(filepath)

In [112]:
# Run the Q-learning self-play training; the Q-table is written to 'agt_data/tictactoe_QL.pkl'.
trainQL()

100001

In [113]:
# Play the trained Q-learning agent (player 1, greedy because epsilon=0) against a human.
# NOTE(review): QLAgent's second positional parameter is gamma, so the 1 below
# sets gamma=1 rather than a player side - confirm that is intended.
size = 3
agent1 = QLAgent(size, 1,epsilon=0)
agent2 = HumanAgent(size,-1)
agent1.load_weights('agt_data/tictactoe_QL')

progress(agent1,agent2)

Reward: -1/Winner: X
Reward: -1/Winner: X
Reward: 0.3/Winner: Draw
Reward: 0.3/Winner: Draw
Reward: -1/Winner: X
Reward: -1/Winner: X


SystemExit: 

# DQN

In [42]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from collections import deque
import random
import csv
import os
import numpy as np

class Memory:
    """Bounded experience-replay buffer; the oldest entries are dropped first."""

    def __init__(self, memory_size=100, batch_size=30):
        """Create an empty buffer holding at most ``memory_size`` experiences.

        ``batch_size`` is accepted but not stored; the batch size is given
        to ``sample`` instead.
        """
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience):
        """Append one experience at the right end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct experiences drawn at random."""
        positions = random.sample(range(len(self.buffer)), batch_size)
        batch = []
        for position in positions:
            batch.append(self.buffer[position])
        return batch

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.buffer)


class DQNAgent:
    """DQN agent with a replay memory and a separate target network."""

    def __init__(self, size,
                 gamma = 0.9, # discount factor
                 epsilon = 0.1, # exploration rate
                 memory_size = 1, # number of experiences kept
                 batch_size = 1, # experiences used per training step
                 target_interval = 1 # steps between target-network syncs
                ):
        """Build the Q-network, the target network and the replay memory.

        Args:
            size: Board edge length; the network maps a (size, size) board to
                ``size**2`` Q-values.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Probability of a uniformly random action in ``act``.
            memory_size: Capacity of the replay memory.
            batch_size: Number of experiences sampled per ``_fit`` call.
            target_interval: Number of ``learn`` calls between weight copies
                from the Q-network to the target network.
        """
        # Parameters
        self.input_size = (size, size)
        self.n_act = size**2
        self.gamma = gamma
        self.epsilon = epsilon
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.target_interval = target_interval
        self.model = self._build_Qnet()
        self.time = 0

        # Training history
        self.hist_rwds = []
        self.hist_wnrs = []

        # Target network: same architecture, started from the same weights so
        # the first bootstrapped targets are not produced by an unrelated
        # random initialisation.
        self.model_target = self._build_Qnet()
        self.model_target.set_weights(self.model.get_weights())
        # Replay memory
        self.memory = Memory(memory_size=self.memory_size, batch_size=batch_size)

    def _build_Qnet(self):
        """Build and compile the Q-network (Flatten -> Dense(32) -> Dense(32) -> Dense(n_act))."""
        # NOTE(review): the hidden Dense layers have no activation, so the
        # whole network is a linear map - confirm this is intended.
        model = Sequential()
        model.add(Flatten(input_shape=self.input_size))
        model.add(Dense(32))
        model.add(Dense(32))
        model.add(Dense(self.n_act, activation='linear'))

        # Loss and optimiser for gradient training.
        model.compile(loss='mse', optimizer='Adam')

        return model

    def act(self, obs):
        """Return an epsilon-greedy action index for the board ``obs``.

        Occupied cells are not masked out, so the returned index may refer to
        a cell that is already taken.
        """
        if random.random() <= self.epsilon:
            act = random.randrange(self.n_act)
        else:
            # Predict the Q-values and take the arg-max.
            Q = self.get_Q(obs)
            act = Q.index(max(Q))
        return act

    def get_Q(self, obs, type='main'):
        """Return the Q-values of ``obs`` as a list.

        Args:
            obs: A single board.
            type: 'main' to query the Q-network, 'target' for the target network.

        Raises:
            ValueError: If ``type`` is neither 'main' nor 'target'.
        """
        # Add the batch axis expected by predict().
        obs_reshaped = np.array([obs])
        if type == 'main':
            Q = self.model.predict(obs_reshaped, verbose=0)[0, :]
        elif type == 'target':
            Q = self.model_target.predict(obs_reshaped, verbose=0)[0, :]
        else:
            raise ValueError(f"unknown network type: {type!r}")

        return Q.tolist()


    def learn(self, obs, act, rwd, done, next_obs):
        """Store one transition, run a training step and sync the target network.

        Nothing happens when ``rwd`` is None.
        """
        if rwd is None:
            return

        self.memory.add((obs, act, rwd, done, next_obs))

        # Train on a sampled batch.
        self._fit()

        # Every target_interval steps copy the Q-network weights to the target network.
        if self.time % self.target_interval == 0 and self.time > 0:
            self.model_target.set_weights(self.model.get_weights())
        print(self.time, end='\r')

        self.time += 1

    def _fit(self):
        """Train the Q-network on one batch sampled from the replay memory."""
        # Not enough stored experiences for a full batch yet.
        if len(self.memory) < self.batch_size:
            return

        outs = self.memory.sample(self.batch_size)

        # Arrays holding the batch of observations and training targets.
        obs_shape = self.input_size
        obss = np.zeros((self.batch_size,) + obs_shape)
        targets = np.zeros((self.batch_size, self.n_act))

        for i, (obs, act, rwd, done, next_obs) in enumerate(outs):
            # Current Q-network output for obs; only the taken action is changed.
            y = self.get_Q(obs)
            target = y[:]

            if not done:
                # Bootstrap from the target network. Using the main network
                # here would leave the target network unused.
                next_y = self.get_Q(next_obs, type='target')
                target_act = rwd + self.gamma * max(next_y)
            else:
                # Terminal transition: the target is the reward alone.
                target_act = rwd

            target[act] = target_act

            obss[i] = obs
            targets[i] = target

        # One gradient pass over the batch.
        self.model.fit(obss, targets, verbose=0, epochs=1)


    def save_weights(self, filepath='agt_data/noname'):
        """Save the model to ``filepath + '.keras'`` and the history to ``filepath + '.csv'``."""
        directory = os.path.dirname(filepath)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.model.save(filepath + '.keras', overwrite=True)

        # Write one row per recorded episode.
        with open(filepath + '.csv', "w", newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['episode','reward','winner'])
            for i, (rwd, wnr) in enumerate(zip(self.hist_rwds, self.hist_wnrs)):
                writer.writerow([i+1, rwd, wnr])

    def load_weights(self, filepath='agt_data/noname'):
        """Load the model from ``filepath + '.keras'`` and append the CSV history."""
        self.model = tf.keras.models.load_model(filepath + '.keras')
        # Keep the target network consistent with the loaded Q-network.
        self.model_target.set_weights(self.model.get_weights())

        # Skip the header row, then read reward and winner per episode.
        with open(filepath + '.csv', "r") as csv_file:
            reader = csv.reader(csv_file)
            next(reader)
            for row in reader:
                self.hist_rwds.append(float(row[1]))
                self.hist_wnrs.append(float(row[2]))

In [43]:
def train():
    """Train a DQNAgent (player 1) against a RandomalfaAgent (player -1) with rendering.

    Training resumes from 'agt_data/tictaktoe_DQN.keras' when it exists and
    stops after episode 50000 or when the pygame window is closed; the model
    and history are then saved. The DQN agent learns from every transition,
    including the opponent's moves.
    """
    env = TicTacToe()

    # NOTE(review): DQNAgent's second positional parameter is gamma, so this
    # constructs the agent with gamma=1 - confirm that is intended.
    agent1 = DQNAgent(env.size,1)
    agent2 = RandomalfaAgent(env.size,-1)

    filepath='agt_data/tictaktoe_DQN'

    episode = 1

    if os.path.exists(filepath + '.keras'):
        agent1.load_weights(filepath)
        episode += len(agent1.hist_rwds)

    env.pygame_init()
    running = True

    while running:
        # Snapshot the board: env.step() mutates env.board in place, so
        # storing a reference would make obs and next_obs the same (and later
        # changing) object inside the replay memory.
        obs = [row[:] for row in env.board]
        env.pygame_render(env.board)

        if env.current_player == 1:
            act = agent1.act(obs)
        elif env.current_player == -1:
            act = agent2.act(obs)

        # Advance the environment.
        env.step(act)
        next_obs = [row[:] for row in env.board]
        done = env.done()
        rwd = round(env.reward,2)

        # Draw the board after the move.
        env.pygame_render(env.board)

        # Learn from this transition.
        agent1.learn(obs, act, rwd, done, next_obs)

        if done:
            # Record the episode result.
            agent1.hist_rwds.append(rwd)
            agent1.hist_wnrs.append(env.winner)

            # Win rate of player 1 in percent.
            win = (agent1.hist_wnrs.count(1) / episode) * 100

            print(f"episode{episode}/Reward: {rwd}/Win %: {win} ")
            # Start the next game.
            env.game_reset()
            episode += 1

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

        if episode > 50000:
            running = False

    pygame.quit()

    # Save the model and history, creating the target directory if needed.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    agent1.save_weights(filepath)

In [50]:
# Start DQN training when this cell/script is executed directly.
if __name__ == "__main__":
    train()

  super().__init__(**kwargs)


4