- 일단 중복 고려 안함. 이동 시 발생하는 reward 0.
- 오직 게임이 끝나고 win, lose, draw에 따라서만 reward 발생
- 원래는 모든 게임트리가 아니라 어느정도 깊이만 탐색함.
- 우선은 전체를 계산하는 다이내믹 프로그래밍 방식으로 구현하겠음.

In [2]:
import time
import os
import pickle
import numpy as np
import pandas as pd
from typing import Tuple
from collections import deque
import copy
from scipy.special import softmax
import random
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 틱택토 환경

In [3]:
class Environment():
    def __init__(self):
        self.n = 3
        self.num_actions = self.n**2
        self.present_state = np.zeros((self.n, self.n))
        self.action_space = np.arange(self.num_actions)
        self.reward_dict = {'win':1, 'lose':-1, 'draw': -0.1, 'good_action':0, 'overlapped':0}
        self.done = False


    def step(self, action_idx:int, max_player:bool):
        '''
        에이전트가 선택한 action에 따라 주어지는 next_state, reward, done
        '''
        x, y = np.divmod(action_idx, self.n)
        if max_player:
            sign = 1
        else:
            sign = -1

        is_overlap = self.is_overlap(action_idx)

        if is_overlap:
            next_state = self.present_state
            reward = self.reward_dict['overlapped']
            done, is_win = self.is_done(next_state)

        else:
            self.present_state[x,y] = sign
            next_state = self.present_state
            done, is_win = self.is_done(next_state)
            reward = self.reward_dict['good_action']

            if done:
                if is_win == "win":
                    reward = self.reward_dict['win']
                elif is_win == "lose":
                    reward = self.reward_dict['lose']
                else:
                    reward = self.reward_dict['draw']

        self.done = done

        return next_state, reward, done, is_win


    def reset(self):
        '''
        게임판 초기화
        '''
        self.present_state = np.zeros((self.n, self.n))
        self.done = False


    def render(self):
        '''
        print the current state
        '''
        render_state = np.array([['.','.','.'],
                                ['.','.','.'],
                                ['.','.','.']])
        render_str = ""
        for i in range(self.num_actions):
            x, y = np.divmod(i, 3)
            if self.present_state[x,y] == 1:
                render_state[x,y] = 'X'
            elif self.present_state[x,y] == -1:
                render_state[x,y] = 'O'

            render_str += f" {render_state[x,y]}"
            if (i+1) % 3 == 0:
                render_str += "\n" + "-"*11 + "\n"
            else:
                render_str += " |"

        print(render_str)


    def is_overlap(self, action_idx):
        '''
        action이 중복인지 판단하는 함수
        '''
        is_overlap = False
        x, y = np.divmod(action_idx, self.n)
        if self.present_state[x,y] != 0:
            is_overlap = True

        return is_overlap


    def is_done(self, state):
        '''
        틱택토 게임 종료 조건 및 승리 여부 확인하는 함수
        '''
        is_done, is_win = False, "null"

        # 무승부 여부 확인
        if (state==0).sum()==0:
            is_done, is_win = True, "draw"

        else:
            axis_sum = np.concatenate((state.sum(axis=0), state.sum(axis=1)))
            diag_sum = np.array([state.trace(), np.fliplr(state).trace()])

            sum_array = np.concatenate((axis_sum, diag_sum))
            max_sum = np.max(sum_array)
            min_sum = np.min(sum_array)

            if max_sum == 3:
                is_done, is_win = True, "win"
            elif min_sum == -3:
                is_done, is_win = True, "lose"
            else:
                is_done, is_win = False, "null"

        return is_done, is_win

In [4]:
env = Environment()
_, reward, done, is_win = env.step(0, False)
print(reward, done, is_win)
env.render()

0 False null
 O | . | .
-----------
 . | . | .
-----------
 . | . | .
-----------



In [5]:
_, reward, done, is_win = env.step(4, False)
print(reward, done, is_win)
env.render()

0 False null
 O | . | .
-----------
 . | O | .
-----------
 . | . | .
-----------



# minimax 알고리즘
- 입력 받은 상태에서 얻을 수 있는 최대값이 뭔지 알려주는 함수
- 개선점: 최대값을 얻을 수 있는 행동이 무엇인지 반환해야한다.
- 현재 모든 경우의 수에 대해 계산하는 minimax 함수이다. 틱택토 정도의 작은 상황에서는 가능하지만, 상태의 수가 많아지면 depth를 도입해 일정 깊이만큼만 탐색하도록 해야한다.

In [None]:
class Agent():
    def __init__(self, env, max_player:bool):
        self.env = env
        self.n = self.env.n
        self.num_actions = self.env.num_actions
        self.actions = torch.tensor(self.num_actions)

        self.best_action_list = []
        self.best_action = 0
        self.depth = 0

        self.max_player = max_player
        if self.max_player:
            self.sign = 1
        else:
            self.sign = -1


    def minimax(self, present_state, alpha, beta, max_player:bool):
        # self.depth += 1
        temp_env = Environment()
        state = copy.deepcopy(present_state)
        temp_env.present_state = state
        done, is_win = temp_env.is_done(state)
        reward = 0

        remain_actions = np.argwhere(state == 0)


        if (done == True):
            if is_win == "win":
                reward = self.sign*temp_env.reward_dict['win']
            elif is_win == "lose":
                reward = self.sign*temp_env.reward_dict['lose']
            else:
                reward = temp_env.reward_dict['draw']

            return reward

        if max_player:
            maxEval = -np.Inf
            for (x, y) in remain_actions:
                idx = 3*x + y
                child, _, _, _ = temp_env.step(idx, True)

                eval = self.minimax(child, alpha, beta, False)

                if maxEval < eval:
                #     self.best_action_list = []
                #     self.best_action_list.append(idx)
                    self.best_action = idx

                # if maxEval == eval:
                #     self.best_action_list.append(idx)

                maxEval = max(maxEval, eval)

                alpha = max(alpha, eval)
                if beta <= alpha:
                    break

            return maxEval

        else:
            minEval = np.Inf
            for (x, y) in remain_actions:
                idx = 3*x + y
                child, _, _, _ = temp_env.step(idx, False)

                eval = self.minimax(child, alpha, beta, True)
                minEval = min(minEval, eval)

                beta = min(alpha, eval)
                if beta <= alpha:
                    break

            return minEval


    def get_action(self, state, agent_turn):
        if not agent_turn:
            state = -state

        # self.best_action_list = []
        # self.minimax(state, -np.Inf, np.Inf, True)
        # self.best_action = random.choice(self.best_action_list)


        return self.best_action

# 테스트

In [None]:
env = Environment()
max_agent = Agent(env, True)
min_agent = Agent(env, False)
env.render()

 . | . | .
-----------
 . | . | .
-----------
 . | . | .
-----------



In [None]:
action = max_agent.get_action(env.present_state, True)
next_state, reward, done, is_win = env.step(action, True)
print(max_agent.best_action_list)
print(action, reward, done, is_win)
env.render()

[6, 7, 8, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]
3 0 False null
 . | . | .
-----------
 X | . | .
-----------
 . | . | .
-----------



In [None]:
action = min_agent.get_action(env.present_state, False)
next_state, reward, done, is_win = env.step(action, False)
print(min_agent.best_action_list)
print(action, reward, done, is_win)
env.render()

[8, 6, 7, 8]
8 0 False null
 . | . | .
-----------
 X | . | .
-----------
 . | . | O
-----------



In [None]:
action = max_agent.get_action(env.present_state, True)
next_state, reward, done, is_win = env.step(action, True)
print(max_agent.best_action_list)
print(action, reward, done, is_win)
env.render()

[5, 6, 7, 1, 2, 4, 5, 6, 7]
7 0 False null
 . | . | .
-----------
 X | . | .
-----------
 . | X | O
-----------



In [None]:
action = 0
next_state, reward, done, is_win = env.step(action, False)
print(action, reward, done, is_win)
env.render()

0 0 False null
 O | . | .
-----------
 X | O | .
-----------
 . | . | X
-----------



In [None]:
action = max_agent.get_action(env.present_state, True)
next_state, reward, done, is_win = env.step(action, True)
print(action, reward, done, is_win)
env.render()

6 0 False null
 O | . | .
-----------
 X | O | .
-----------
 X | . | X
-----------



In [None]:
action = min_agent.get_action(env.present_state, False)
next_state, reward, done, is_win = env.step(action, False)
print(action, reward, done, is_win)
env.render()

5 0 False null
 O | . | .
-----------
 X | O | O
-----------
 X | . | X
-----------



In [None]:
action = max_agent.get_action(env.present_state, True)
next_state, reward, done, is_win = env.step(action, True)
print(action, reward, done, is_win)
env.render()

1 0 False null
 O | X | .
-----------
 X | O | O
-----------
 X | . | X
-----------



In [None]:
action = min_agent.get_action(env.present_state, False)
next_state, reward, done, is_win = env.step(action, False)
print(action, reward, done, is_win)
env.render()

2 0 False null
 O | X | O
-----------
 X | O | O
-----------
 X | . | X
-----------



In [None]:
action = max_agent.get_action(env.present_state, True)
next_state, reward, done, is_win = env.step(action, True)
print(action, reward, done, is_win)
env.render()

7 1 True win
 O | X | O
-----------
 X | O | O
-----------
 X | X | X
-----------



In [None]:
env.reset()

In [None]:
action = 5
next_state, reward, done, is_win = env.step(action, True)
print(action, reward, done, is_win)
env.render()

5 0 False null
 X | O | .
-----------
 . | X | X
-----------
 . | . | O
-----------



In [None]:
action = 6
next_state, reward, done, is_win = env.step(action, False)
print(action, reward, done, is_win)
env.render()

6 0 False null
 X | O | .
-----------
 . | X | X
-----------
 O | . | O
-----------



## 수동 코딩으로 오류 잡음

In [None]:
temp_env = Environment()
temp_env.present_state = state
done, is_win = temp_env.is_done(state)
is_draw = temp_env.is_draw(state)
reward = 0

remain_actions = np.argwhere(state == 0)

In [None]:
remain_actions

array([[0, 2],
       [1, 0],
       [2, 1]])

In [None]:
x, y = remain_actions[0]
idx = 3*x + y
idx

2

In [None]:
# max turn
maxEval = -np.Inf
max(maxEval, 1)

1

In [None]:
child, _, _, _ = temp_env.step(idx, True)
child

array([[ 1., -1.,  1.],
       [ 0.,  1.,  1.],
       [-1.,  0., -1.]])

In [None]:
state = child

In [None]:
# min turn
temp_env = Environment()
temp_env.present_state = state
done, is_win = temp_env.is_done(state)
is_draw = temp_env.is_draw(state)
reward = 0

remain_actions = np.argwhere(state == 0)

In [None]:
print(done, is_win, is_draw)

False False False


In [None]:
x, y = remain_actions[0]
idx = 3*x + y
idx

3

In [None]:
minEval = np.Inf
min(minEval, 1)

1

In [None]:
child, _, _, _ = temp_env.step(idx, False)
child

array([[ 1., -1.,  1.],
       [-1.,  1.,  1.],
       [-1.,  0., -1.]])

In [None]:
state = child

In [None]:
# max turn
temp_env = Environment()
temp_env.present_state = state
done, is_win = temp_env.is_done(state)
is_draw = temp_env.is_draw(state)
reward = 0

remain_actions = np.argwhere(state == 0)

In [None]:
print(done, is_win, is_draw)

False False False


In [None]:
x, y = remain_actions[0]
idx = 3*x + y
idx

7

In [None]:
maxEval = -np.Inf
child, _, _, _ = temp_env.step(idx, True)
child

array([[ 1., -1.,  1.],
       [-1.,  1.,  1.],
       [-1.,  1., -1.]])

In [None]:
state = child

In [None]:
# 결과값
temp_env = Environment()
temp_env.present_state = state
done, is_win = temp_env.is_done(state)
is_draw = temp_env.is_draw(state)
reward = 0

remain_actions = np.argwhere(state == 0)

print(done, is_win, is_draw)

False False True


- 수동으로!!! 확인해서!!! 어디서 논리오류 난지 알아냄!!!