# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

Credits:

* [Source](https://github.com/MJeremy2017/reinforcement-learning-implementation/blob/master/TicTacToe/ticTacToe.py)
* The code has been analyzed, fixed, and commented

Main changes: 

* Fixed diagonal win condition
* Added possibility for human player to start first
* Improved training time for better results

Tic Tac Toe
---
Two players against each other

<img style="float:left" src="board.png" alt="drawing" width="200"/>

In [7]:
import numpy as np
import pickle

In [8]:
# Board dimensions (rows, columns) shared by the State and Player classes.
BOARD_ROWS, BOARD_COLS = 3, 3

### Board State
---
Reflect & Judge the state

Two players, p1 and p2: p1 uses symbol 1, p2 uses symbol -1, and an empty cell is 0.

In [9]:
class State:
    """Tic-tac-toe board together with the game loops for two players.

    The board is a ``BOARD_ROWS`` x ``BOARD_COLS`` numpy array in which
    ``1`` marks a p1 move, ``-1`` marks a p2 move and ``0`` is an empty cell.
    """

    # game initialization: 1 empty board, 2 players
    def __init__(self, p1, p2):
        """Create an empty board; p1 is the player that moves first."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        # p1 plays first (p1 -> 1, p2 -> -1)
        self.playerSymbol = 1

    def getHash(self):
        """Return a string key for the current board and cache it in ``boardHash``."""
        self.boardHash = str(self.board.reshape(BOARD_COLS*BOARD_ROWS))
        return self.boardHash

    def _lines(self):
        """Yield every row, every column and both diagonals as 1-D arrays."""
        yield from self.board       # rows, top to bottom
        yield from self.board.T     # columns, left to right
        yield self.board.diagonal()
        yield np.fliplr(self.board).diagonal()

    def winner(self):
        """Evaluate the board and update ``isEnd`` accordingly.

        Returns:
            1 if p1 owns a complete line, -1 if p2 does, 0 for a tie
            (no empty cell left) and None while the game is still open.
        """
        for line in self._lines():
            total = line.sum()
            # a line is won when every cell in it carries the same symbol,
            # i.e. the sum equals +/- the line length (3 on the default board)
            if total == len(line):
                self.isEnd = True
                return 1
            if total == -len(line):
                self.isEnd = True
                return -1

        # tie: no available positions
        if not self.availablePositions():
            self.isEnd = True
            return 0
        # not end
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the empty cells as a list of ``(row, col)`` tuples."""
        return [
            (i, j)  # need to be tuple so it can index the board directly
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        # switch to another player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    # only when game ends
    # +1 given to the winning player, 0 to the losing one
    # +0.1 given to p1, +0.5 given to p2 in case of a tie -> since p1 starts
    # first, it looks fair to give more points to p2
    def giveReward(self):
        """Feed the end-of-game reward to both players."""
        result = self.winner()
        # backpropagate reward
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    # board reset
    def reset(self):
        """Clear the board and give the first move back to p1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def _selfPlayGame(self):
        """Play one training game, reward both players and reset everything."""
        players = (self.p1, self.p2)
        turn = 0
        while not self.isEnd:
            player = players[turn % 2]
            positions = self.availablePositions()
            action = player.chooseAction(positions, self.board, self.playerSymbol)
            # take action and update board state
            self.updateState(action)
            player.addState(self.getHash())

            if self.winner() is not None:
                # game over (win or draw): distribute rewards, then clean up
                self.giveReward()
                self.p1.reset()
                self.p2.reset()
                # reset() clears isEnd, so leave the loop explicitly
                self.reset()
                break
            turn += 1

    # training games
    def play(self, rounds=100):
        """Train p1 and p2 against each other for ``rounds`` games.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            self._selfPlayGame()

    def _playAgainstHuman(self, human):
        """Run one displayed game in which ``human`` is the interactive player.

        The human is asked only for a position; the other player also
        receives the board and the symbol it is playing.
        """
        order = ((self.p1, 1), (self.p2, -1))
        turn = 0
        while not self.isEnd:
            player, symbol = order[turn % 2]
            positions = self.availablePositions()
            if player is human:
                action = player.chooseAction(positions)
            else:
                action = player.chooseAction(positions, self.board, self.playerSymbol)
            # take action and update board state
            self.updateState(action)
            self.showBoard()
            # check board status if it is end
            win = self.winner()
            if win is not None:
                # right after a move only the mover can have won; otherwise tie
                if win == symbol:
                    print(player.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break
            turn += 1

    # play with human; Computer starts first
    def play2(self):
        """Play one game with the computer as p1 and the human as p2."""
        self._playAgainstHuman(self.p2)

    # play with human; Human starts first
    def play3(self):
        """Play one game with the human as p1 and the computer as p2."""
        # show the empty board so the human can pick the opening move
        self.showBoard()
        self._playAgainstHuman(self.p1)

    def showBoard(self):
        """Print the board; p1 is drawn as 'x', p2 as 'o'."""
        # every cell is 4 characters wide, plus the leading '|'
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                cell = self.board[i, j]
                if cell == 1:
                    token = 'x'
                elif cell == -1:
                    token = 'o'
                elif cell == 0:
                    token = ' '
                else:
                    # unexpected value: make it visible instead of silently
                    # reusing the previous cell's token
                    token = '?'
                out += token + ' | '
            print(out)
        print(separator)

In [10]:
class Player:
    """Learning agent that keeps a value for every board state it has seen.

    ``exp_rate`` balances exploration and exploitation: with the default
    0.3, about 30% of the moves are random (exploration) and the rest pick
    the best known follow-up state (exploitation).
    """

    def __init__(self, name, exp_rate=0.3):
        """Create an untrained player called ``name``."""
        self.name = name
        self.states = []    # record all positions taken during a game for the training process
        self.lr = 0.2       # step length
        self.exp_rate = exp_rate
        self.discount = 0.9     # gamma is high -> winning the game is more important than the immediate reward
        self.states_value = {}  # record all positions with their corresponding score

    def getHash(self, board):
        """Return the string key used to look ``board`` up in ``states_value``."""
        return str(board.reshape(BOARD_COLS*BOARD_ROWS))

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player using ``symbol``.

        Args:
            positions: non-empty list of free ``(row, col)`` cells.
            current_board: numpy board before the move; it is not modified.
            symbol: value written on the board for this player.

        Returns:
            The chosen ``(row, col)`` tuple.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if not positions:
            raise ValueError("no available positions to choose from")

        # uniform(0, 1) draws from [0, 1), so a strict '<' guarantees that
        # exp_rate=0 never explores and exp_rate=1 always does
        if np.random.uniform(0, 1) < self.exp_rate:
            # take random action
            return positions[np.random.choice(len(positions))]

        # otherwise pick the move whose resulting board has the best value;
        # unseen boards count as 0 and ties go to the last candidate
        best_value = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board))
            if value is None:
                value = 0
            if value >= best_value:
                best_value = value
                action = p
        return action

    # append a hash state
    def addState(self, state):
        """Remember a board hash reached during the current game."""
        self.states.append(state)

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the states of this game.

        Each state value moves towards ``discount * reward`` by a step of
        ``lr``; the updated value then acts as the reward for the state
        visited before it. States seen for the first time start at 0.
        """
        for st in reversed(self.states):
            old_value = self.states_value.get(st)
            if old_value is None:
                old_value = 0
            new_value = old_value + self.lr*(self.discount*reward - old_value)
            self.states_value[st] = new_value
            reward = new_value

    def reset(self):
        """Forget the states recorded for the game that just ended."""
        self.states = []

    def savePolicy(self):
        """Pickle ``states_value`` to the file ``policy_<name>``."""
        # 'with' closes the file even if pickling fails
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the policy pickled in ``file``."""
        # NOTE: pickle can execute arbitrary code while loading; only load
        # policy files that you created yourself.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

In [11]:
class HumanPlayer:
    """Interactive player whose moves are typed in on the console.

    It offers the same methods as ``Player`` so it can take either seat in
    a ``State``; the learning hooks are no-ops.
    """

    def __init__(self, name):
        """Create a human player shown as ``name`` in game messages."""
        self.name = name

    def chooseAction(self, positions, current_board=None, symbol=None):
        """Ask for a row and a column until they name a free cell.

        Args:
            positions: list of free ``(row, col)`` cells.
            current_board: ignored; accepted so the call can match
                ``Player.chooseAction``.
            symbol: ignored; accepted for the same reason.

        Returns:
            The chosen ``(row, col)`` tuple, guaranteed to be in ``positions``.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # a typo must not crash the game: ask again
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action
            print("That cell is not available, try again.")

    # append a hash state
    def addState(self, state):
        """Do nothing: a human keeps no state history."""
        pass

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        """Do nothing: a human does not learn from rewards."""
        pass

    def reset(self):
        """Do nothing: there is no per-game data to clear."""
        pass

### Training

In [12]:
# Self-play training: two learning agents share one board and play
# 200000 games against each other.
p1, p2 = Player("p1"), Player("p2")
st = State(p1, p2)
print("training...takes around 13 minutes to train, feel free to take a coffee :D")
st.play(rounds=200000)


training...takes around 13 minutes to train, feel free to take a coffee :D
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000
Rounds 50000
Rounds 51000
Rounds 52000
Rounds 53000
Rounds 54000
Rounds 55000
Rounds 56000
Rounds 57000
Rounds 58000
Rounds 59000
Rounds 60000
Rounds 61000
Rounds 62000
Rounds 63000
Rounds 64000
Rounds 65000
Rounds 66000
Rounds 67000
Rounds 68000
Rounds 69000
Rounds 70000
Rounds 71000
Ro

In [13]:
# Persist both trained value tables (files policy_p1 and policy_p2).
for _trained in (p1, p2):
    _trained.savePolicy()

In [14]:
# Reload the second player's policy into a non-exploring agent.
p2 = Player(name="computer", exp_rate=0)
p2.loadPolicy(file="policy_p2")
# Uncomment to list the ten highest-valued states:
#sorted(p2.states_value.items(), key=lambda e: e[1], reverse=True)[:10]


### Computer(p1) vs Human(p2)

In [15]:
# The model is already trained, so exploration is switched off (exp_rate=0).
p1 = Player(name="computer", exp_rate=0)
# policy_p1 is the agent that was trained as the starting player
p1.loadPolicy(file="policy_p1")
p2 = HumanPlayer(name="human")

# play2: the computer (p1) makes the first move
st = State(p1=p1, p2=p2)
st.play2()

-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
|   |   | o | 
-------------
-------------
|   |   | x | 
-------------
|   | x |   | 
-------------
|   |   | o | 
-------------
-------------
|   |   | x | 
-------------
|   | x |   | 
-------------
| o |   | o | 
-------------
-------------
|   |   | x | 
-------------
|   | x |   | 
-------------
| o | x | o | 
-------------
-------------
|   | o | x | 
-------------
|   | x |   | 
-------------
| o | x | o | 
-------------
-------------
|   | o | x | 
-------------
|   | x | x | 
-------------
| o | x | o | 
-------------
-------------
|   | o | x | 
-------------
| o | x | x | 
-------------
| o | x | o | 
-------------
-------------
| x | o | x | 
-------------
| o | x | x | 
-------------
| o | x | o | 
-------------
tie!


### Human(p1) vs Computer(p2)

In [16]:
# Exploration is switched off (exp_rate=0) for the trained agent.
p2 = Player(name="computer", exp_rate=0)
# policy_p2 is the agent that was trained as the second player
p2.loadPolicy(file="policy_p2")
p1 = HumanPlayer(name="human")

# play3: the human (p1) makes the first move
st = State(p1=p1, p2=p2)
st.play3()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | x | 
-------------
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
|   |   | x | 
-------------
-------------
| x |   |   | 
-------------
|   | o |   | 
-------------
|   |   | x | 
-------------
-------------
| x |   |   | 
-------------
| o | o |   | 
-------------
|   |   | x | 
-------------
-------------
| x |   | x | 
-------------
| o | o |   | 
-------------
|   |   | x | 
-------------
-------------
| x |   | x | 
-------------
| o | o | o | 
-------------
|   |   | x | 
-------------
computer wins!
