In [None]:
import numpy as np
import random
from collections import defaultdict

class TicTacToeEnv():
    def __init__(self):
        super(TicTacToeEnv, self).__init__()
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.q_table = defaultdict(lambda: np.zeros(9)) 
        self.current_player = 1 
        self.gamma = 0.99
        self.alpha = 0.1

    def reset(self):
        self.board = np.zeros((3, 3), dtype=np.int8)
        self.current_player = 1  

    def choose_action(self, training=True):
        state = self.get_state(self.board)
        valid_moves = self.valid_moves(self.board)
        if training: 
            return random.choice(valid_moves)
        
        q_values = self.q_table[state]
        if self.current_player == 1:
            return max(valid_moves, key=lambda a: q_values[a])
        else:
            return min(valid_moves, key=lambda a: q_values[a])

    def step(self, action):
        row, col = divmod(action, 3)
        self.board[row, col] = self.current_player
        done, reward = self.check_winner()
        self.current_player *= -1  
        return self.board.copy(), reward, done

    
        

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:  # Row win
                return True, 1 if self.board[i, 0] == 1 else -1
            if abs(sum(self.board[:, i])) == 3:  # Column win
                return True, 1 if self.board[0, i] == 1 else -1

        if abs(sum(self.board.diagonal())) == 3:  # Diagonal win
            return True, 1 if self.board[0, 0] == 1 else -1
        if abs(sum(np.fliplr(self.board).diagonal())) == 3:  # Anti-diagonal win
            return True, 1 if self.board[0, 2] == 1 else -1

        if not (self.board == 0).any():  # Draw
            return True, 0
        return False, 0
    
    def valid_moves(self, board):
        return [i for i in range(9) if board.flatten()[i] == 0]
            
    
    def get_state(self, board):
        return tuple(board.flatten())
    
    def update_q_table(self, action, reward, state, next_board, done):
        next_state = self.get_state(next_board)
        
        # 상대방의 턴에 대한 best_next_q 계산 (최선의 방어)
        if done:
            best_next_q = 0
        else:
            next_q_values = self.q_table[next_state].tolist()
            # 상대방의 턴에서 최선의 방어를 가정한 Q-value
            best_next_q = -max(next_q_values) if self.current_player == 1 else max(next_q_values)

        # 현재 플레이어의 Q-value 업데이트
        self.q_table[state][action] += self.alpha * (reward + self.gamma * best_next_q - self.q_table[state][action])


    def render(self):
        symbols = {0: '.', 1: 'X', -1: 'O'}
        for row in self.board:
            print(" ".join(symbols[cell] for cell in row))
        print()


    def episode(self):
        done = False
        while not done:
            action = self.choose_action(training=True)
            state = self.get_state(self.board)
            
            next_state, reward, done = self.step(action)
            # print(reward)
            self.update_q_table(action, reward, state, next_state, done)
            state = next_state
    
  
        

In [2]:
# 학습 실행
env = TicTacToeEnv()
env.render()

num_episodes = 100000
for episode in range(num_episodes):
    env.reset()
    env.episode()
    if (episode + 1) % 10000 == 0:
        env.epsilon *= 0.9  
        print(f"Episode {episode+1} completed")

print("Training complete!")

. . .
. . .
. . .

Episode 10000 completed
Episode 20000 completed
Episode 30000 completed
Episode 40000 completed
Episode 50000 completed
Episode 60000 completed
Episode 70000 completed
Episode 80000 completed
Episode 90000 completed
Episode 100000 completed
Training complete!


In [91]:
env.epsilon = 0  # 탐험 없이 실행
for _ in range(10):  
    env.reset()
    env.render()
    done = False
    while not done:
        action = env.choose_action(training=False)
        _, _, done = env.step(action)
        env.render()


. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
. . .

X O X
O X .
. . .

X O X
O X O
. . .

X O X
O X O
X . .

. . .
. . .
. . .

X . .
. . .
. . .

X O .
. . .
. . .

X O X
. . .
. . .

X O X
O . .
