In [135]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [136]:
import numpy as np
import pickle

In [137]:
# Dimensions of the game board (a 3x3 tic-tac-toe grid).
BOARD_ROWS = 3
BOARD_COLS = 3

# Board State

The `State` class tracks the state of the board and judges the outcome of the game.

The game has two players, called p1 and p2.

Player 1 is represented by the symbol 1 in the board, player 2 is represented by the symbol -1, and a vacant tile on the board is represented by a 0.

In [138]:
class State:
    """Tic-tac-toe game state and the loops that play games on it.

    The board is a 2-D NumPy array holding 1 for player 1's tiles, -1 for
    player 2's tiles and 0 for vacant tiles.

    Args:
        p1: Player object that moves first (symbol 1).
        p2: Player object that moves second (symbol -1).

    Both players must provide `choose_action`, `add_state`, `feed_reward`
    and `reset`.
    """

    # Characters printed by show_board for each tile value.
    _TOKENS = {1: "x", -1: "o", 0: " "}

    def __init__(self, p1, p2):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.is_end = False
        self.board_hash = None
        # Player 1 starts the game.
        self.player_symbol = 1

    def get_hash(self):
        """Return (and cache in `board_hash`) a string key for the board."""
        # Flattening with -1 yields the same string as reshaping to
        # rows * cols, without depending on the module-level constants.
        self.board_hash = str(self.board.reshape(-1))
        return self.board_hash

    def _candidate_lines(self):
        """Return every line that wins the game when one player fills it.

        Order is rows, columns, main diagonal, anti-diagonal. Diagonals are
        only included for square boards.
        """
        lines = list(self.board) + list(self.board.T)
        n_rows, n_cols = self.board.shape
        if n_rows == n_cols:
            lines.append(np.diag(self.board))
            lines.append(np.diag(np.fliplr(self.board)))
        return lines

    def winner(self):
        """Judge the board and update `is_end`.

        Returns:
            1 if player 1 has a full line, -1 if player 2 has a full line,
            0 for a tie (board full, no winner), or None if the game is
            still going.
        """
        for line in self._candidate_lines():
            total = line.sum()
            # A line is won when every tile in it carries the same symbol,
            # so the target is the line length rather than a hard-coded 3.
            if total == len(line):
                self.is_end = True
                return 1
            if total == -len(line):
                self.is_end = True
                return -1

        # Tie game.
        if not self.available_positions():
            self.is_end = True
            return 0

        # Game still going.
        self.is_end = False
        return None

    def available_positions(self):
        """Return the (row, col) tuples of all vacant tiles, row by row."""
        n_rows, n_cols = self.board.shape
        return [
            (i, j)
            for i in range(n_rows)
            for j in range(n_cols)
            if self.board[i, j] == 0
        ]

    def update_state(self, position):
        """Place the current player's symbol at `position` and switch turns."""
        self.board[position] = self.player_symbol
        # Switch to the other player.
        self.player_symbol = -1 if self.player_symbol == 1 else 1

    def give_reward(self):
        """Feed the end-of-game reward to both players.

        The winner receives 1 and the loser 0. Any other result (a tie, or a
        game that has not ended) gives player 1 a reward of 0.1 and player 2
        a reward of 0.5.
        """
        result = self.winner()

        # Backpropagate the reward.
        if result == 1:
            self.p1.feed_reward(1)
            self.p2.feed_reward(0)
        elif result == -1:
            self.p1.feed_reward(0)
            self.p2.feed_reward(1)
        else:
            self.p1.feed_reward(0.1)
            self.p2.feed_reward(0.5)

    def reset(self):
        """Reset the board to a clean slate with player 1 to move."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board_hash = None
        self.is_end = False
        self.player_symbol = 1

    def _play_training_turn(self, player):
        """Let `player` move during self-play.

        Records the resulting board hash with the player. If the move ends
        the game, rewards are distributed and both players and the board are
        reset.

        Returns:
            True if the game ended on this move, otherwise False.
        """
        positions = self.available_positions()
        action = player.choose_action(positions, self.board, self.player_symbol)
        self.update_state(action)
        player.add_state(self.get_hash())

        if self.winner() is None:
            return False

        self.give_reward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def play_against_computer(self, rounds=100):
        """Play `rounds` self-play games between p1 and p2 for training.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print(f"Rounds {i}")

            while not self.is_end:
                # Short-circuit: player 2 only moves if player 1's move did
                # not end the game.
                if self._play_training_turn(self.p1) or self._play_training_turn(
                    self.p2
                ):
                    break

    def _finish_human_game_if_over(self):
        """Announce the result and reset the board if the game has ended.

        Returns:
            True if the game ended, otherwise False.
        """
        win = self.winner()
        if win is None:
            return False

        if win == 1:
            print(f"{self.p1.name} wins!")
        elif win == -1:
            print(f"{self.p2.name} wins!")
        else:
            print("tie!")

        self.reset()
        return True

    def play_against_human(self):
        """Play one interactive game: p1 (computer) versus p2 (human).

        The board is printed after every move. p2's `choose_action` is called
        with only the list of available positions.
        """
        while not self.is_end:
            # Player 1's turn
            positions = self.available_positions()
            p1_action = self.p1.choose_action(positions, self.board, self.player_symbol)
            self.update_state(p1_action)
            self.show_board()
            if self._finish_human_game_if_over():
                break

            # Player 2's turn
            positions = self.available_positions()
            p2_action = self.p2.choose_action(positions)
            self.update_state(p2_action)
            self.show_board()
            if self._finish_human_game_if_over():
                break

    def show_board(self):
        """Print the board, using "x" for 1, "o" for -1 and " " for 0."""
        n_rows, n_cols = self.board.shape
        # Each cell prints as "x | " after a leading "| ", hence 4 * cols + 1.
        separator = "-" * (4 * n_cols + 1)
        for i in range(n_rows):
            print(separator)
            out = "| "
            for j in range(n_cols):
                # Unknown tile values print as "?" instead of reusing a stale
                # token or raising NameError.
                token = self._TOKENS.get(float(self.board[i, j]), "?")
                out += token + " | "
            print(out)
        print(separator)

In [139]:
class Player:
    """Learning agent that keeps a state-value table for board positions.

    Args:
        name: Player name; also used in the policy file name.
        exp_rate: Probability of taking a random (exploratory) action.
    """

    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []  # hashes of the board after each of this player's moves
        self.lr = 0.2  # learning rate used in feed_reward
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9  # discount applied to the propagated reward
        self.states_value = {}  # state hash -> value

    def get_hash(self, board):
        """Return the string key for `board` (its flattened array repr)."""
        return str(board.reshape(-1))

    def choose_action(self, positions, current_board, symbol):
        """Pick a position to play.

        With probability `exp_rate` a random position is chosen; otherwise
        the position whose resulting board has the highest known value is
        chosen (unknown boards count as 0, later positions win ties).

        Args:
            positions: Non-empty sequence of available (row, col) tuples.
            current_board: Current board array; it is not modified.
            symbol: Symbol this player places on the board.

        Returns:
            The chosen (row, col) tuple.

        Raises:
            ValueError: If `positions` is empty.
        """
        if len(positions) == 0:
            raise ValueError("choose_action called with no available positions")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Take a random action.
            return positions[np.random.choice(len(positions))]

        # -inf guarantees the first candidate is always accepted.
        value_max = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.get_hash(next_board))
            if value is None:
                value = 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    def add_state(self, state):
        """Append a board hash to the list of states visited this game."""
        self.states.append(state)

    def feed_reward(self, reward):
        """Propagate `reward` backwards through the visited states.

        Each state's value moves towards `decay_gamma * reward` by a factor
        of `lr`; the updated value then becomes the reward for the state
        visited before it.
        """
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr * (
                self.decay_gamma * reward - self.states_value[st]
            )
            reward = self.states_value[st]

    def reset(self):
        """Forget the states visited in the game that just finished."""
        self.states = []

    def save_policy(self):
        """Pickle the state-value table to the file "policy_<name>"."""
        # The context manager closes the file even if pickling fails.
        with open("policy_" + str(self.name), "wb") as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file):
        """Replace the state-value table with the one pickled in `file`."""
        # NOTE: pickle.load can execute arbitrary code; only load policy
        # files you created yourself.
        with open(file, "rb") as fr:
            self.states_value = pickle.load(fr)

In [140]:
class HumanPlayer:
    """Interactive player that reads its moves from standard input.

    It exposes the same `add_state`, `feed_reward` and `reset` methods as
    the learning player, but they do nothing because a human does not learn
    from the value table.

    Args:
        name: Name shown when this player wins.
    """

    def __init__(self, name):
        self.name = name

    def choose_action(self, positions):
        """Prompt until the user enters an available (row, col) position.

        Args:
            positions: Collection of available (row, col) tuples.

        Returns:
            The chosen (row, col) tuple, guaranteed to be in `positions`.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric input used to crash the game; ask again instead.
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action

    def add_state(self, state):
        """No-op: the human player does not record visited states."""
        pass

    def feed_reward(self, reward):
        """No-op: the human player does not learn from rewards."""
        pass

    def reset(self):
        """No-op: there is no per-game state to clear."""
        pass

In [141]:
# Two learning agents, both using the default exploration rate (0.3),
# are trained by playing against each other.
p1 = Player("p1")
p2 = Player("p2")

# State drives the games and feeds the end-of-game rewards to both agents.
st = State(p1, p2)
print("training...")
# Play 50,000 self-play games; progress is printed every 1,000 rounds.
st.play_against_computer(50000)

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [142]:
# Persist each agent's learned state-value table; files are named "policy_<name>".
p1.save_policy()
p2.save_policy()

In [143]:
# Reload player 1's value table from the file written by save_policy.
p1.load_policy("policy_p1")

# Human vs Computer
Now let's see how our model does playing against a human.

In [144]:
# Computer player with exp_rate=0, so it plays greedily from the trained
# policy instead of exploring at random.
p1 = Player("computer", exp_rate=0)
p1.load_policy("policy_p1")

# The human moves second and is prompted for a row and column each turn.
p2 = HumanPlayer("human")

st = State(p1, p2)
st.play_against_human()

-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------


Input your action row: 1
Input your action col: 1


-------------
| x |   |   | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
-------------
| x |   |   | 
-------------
|   | o |   | 
-------------
| x |   |   | 
-------------


Input your action row: 1
Input your action col: 0


-------------
| x |   |   | 
-------------
| o | o |   | 
-------------
| x |   |   | 
-------------
-------------
| x |   |   | 
-------------
| o | o | x | 
-------------
| x |   |   | 
-------------


Input your action row: 0
Input your action col: 1


-------------
| x | o |   | 
-------------
| o | o | x | 
-------------
| x |   |   | 
-------------
-------------
| x | o |   | 
-------------
| o | o | x | 
-------------
| x | x |   | 
-------------


Input your action row: 2
Input your action col: 2


-------------
| x | o |   | 
-------------
| o | o | x | 
-------------
| x | x | o | 
-------------
-------------
| x | o | x | 
-------------
| o | o | x | 
-------------
| x | x | o | 
-------------
tie!
