# Tic-Tac-Toe Game with Reinforcement Learning
## Using Value Iteration
 - Author : Rudransh Jaiswal

In [1]:
import numpy as np
import pickle

In [2]:
# Make the environment
class State:
    """Tic-tac-toe environment: owns the board and drives two players.

    Board cells hold 1 for player 1 (shown as 'x'), -1 for player 2 (shown
    as 'o') and 0 when empty. ``playerID`` is the mark of the player whose
    turn it is and starts at 1, so player 1 always moves first.
    """

    def __init__(self, player1, player2, size=3):
        """Create an empty ``size`` x ``size`` board for the two players.

        Args:
            player1: Player object that moves first (mark 1).
            player2: Player object that moves second (mark -1).
            size: Side length of the square board.
        """
        self.size = size
        self.p1 = player1
        self.p2 = player2
        self.board = np.zeros((size, size))
        self.boardHash = None
        self.playerID = 1  # p1: 1, p2: -1
        self.gameEnds = False

    def displayBoard(self):
        """Print the board, drawing p1 as 'x', p2 as 'o' and empty as ' '."""
        # Each cell is rendered as "x | " after a leading "| ", so the rule
        # is 4 * size + 1 characters wide (13 dashes for the default 3x3).
        separator = '-' * (4 * self.size + 1)
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for row in self.board:
            print(separator)
            print('| ' + ''.join(tokens[int(cell)] + ' | ' for cell in row))
        print(separator)

    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.board = np.zeros((self.size, self.size))
        self.boardHash = None
        self.playerID = 1
        self.gameEnds = False

    def getHash(self):
        """Return (and cache in ``boardHash``) a string key for the board."""
        self.boardHash = str(self.board.reshape((self.size) ** 2))
        return self.boardHash

    def availablePositions(self):
        """Return the empty cells as ``(row, col)`` tuples in row-major order."""
        return [
            (i, j)
            for i in range(self.size)
            for j in range(self.size)
            if self.board[i, j] == 0
        ]

    def decision(self):
        """Evaluate the board and update ``gameEnds`` accordingly.

        Returns:
            1 if player 1 owns a complete row, column or diagonal, -1 if
            player 2 does, 0 if the board is full with no complete line
            (draw), and None if the game should continue.
        """
        n = self.size
        line_sums = []
        for i in range(n):
            line_sums.append(self.board[i, :].sum())
            line_sums.append(self.board[:, i].sum())
        line_sums.append(sum(self.board[i, i] for i in range(n)))
        line_sums.append(sum(self.board[i, n - i - 1] for i in range(n)))

        # Winning lines are checked BEFORE the full-board test: a move that
        # fills the last empty cell can also complete a line, and that must
        # count as a win rather than a draw.
        for total in line_sums:
            if total == n:
                self.gameEnds = True
                return 1  # p1 wins
            if total == -n:
                self.gameEnds = True
                return -1  # p2 wins

        if not self.availablePositions():  # board full, nobody won: draw
            self.gameEnds = True
            return 0

        self.gameEnds = False  # default: continue the game
        return None

    def updateState(self, position):
        """Place the current player's mark at ``position`` and switch turns."""
        self.board[position] = self.playerID
        self.playerID = -1 if self.playerID == 1 else 1  # switch the player

    def giveReward(self):
        """Feed end-of-game rewards to both players.

        The winner receives 1 and the loser 0. For any other outcome of
        ``decision()`` player 1 receives 0.1 and player 2 receives 0.5.
        """
        dec = self.decision()
        if dec == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif dec == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def _trainingTurn(self, player):
        """Let ``player`` make one self-play move; return True if it ended the game.

        When the game ends, rewards are fed to both players and both the
        players and the board are reset for the next round.
        """
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.board, self.playerID)
        self.updateState(action)
        player.addState(self.getHash())

        if self.decision() is None:
            return False

        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def learn(self, rounds=100):
        """Play ``rounds`` games of p1 vs p2 so both can learn from rewards.

        Progress is printed every 2000 rounds.
        """
        for ii in range(rounds):
            if (ii + 1) % 2000 == 0:
                print("Learning : rounds ", ii + 1, " /", rounds)

            while not self.gameEnds:
                # p1 moves first; p2 only moves if p1's move did not end the
                # game (short-circuit `or`). The explicit break is required
                # because the end-of-game reset clears `gameEnds` again.
                if self._trainingTurn(self.p1) or self._trainingTurn(self.p2):
                    break

    def playWithHuman(self):
        """Play one game where p1 (the computer) moves first and p2 is a human.

        The board is printed after every move, the result is announced and
        the board is reset when the game is over.
        """
        while not self.gameEnds:
            positions = self.availablePositions()  # computer plays first
            p1_action = self.p1.chooseAction(positions, self.board, self.playerID)
            self.updateState(p1_action)
            self.displayBoard()
            dec = self.decision()
            if dec is not None:  # either p1 wins or the game is a tie
                if dec == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break

            # p2 (human) plays
            positions = self.availablePositions()
            p2_action = self.p2.chooseAction(positions)
            self.updateState(p2_action)
            self.displayBoard()
            dec = self.decision()
            if dec is not None:  # either p2 wins or the game is a tie
                if dec == -1:
                    print(self.p2.name, "wins!")
                else:
                    print("tie!")
                self.reset()
                break



In [3]:
class Player:
    """Learning agent that keeps a value estimate per visited board state.

    Actions are chosen epsilon-greedily: with probability ``exp_rate`` a
    random available cell is picked, otherwise the cell whose resulting
    board has the highest stored value.
    """

    def __init__(self, name, exp_rate=0.3, lr=0.2, gamma=0.9, size=3):
        """Create an agent with an empty value table.

        Args:
            name: Label used in messages and in the saved policy file name.
            exp_rate: Probability of picking a random move.
            lr: Learning rate used by ``feedReward``.
            gamma: Factor applied to the reward in ``feedReward``.
            size: Side length of the board (used when hashing boards).
        """
        self.name = name
        self.states = []  # board hashes visited during the current game
        self.size = size
        self.exp_rate = exp_rate
        self.lr = lr
        self.decay_gamma = gamma
        self.states_value = {}  # state hash -> value

    def getHash(self, board):
        """Return the string key for ``board`` (flattened array as text)."""
        return str(board.reshape((self.size) ** 2))

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player whose mark is ``symbol``.

        Args:
            positions: Non-empty list of ``(row, col)`` candidate cells.
            current_board: Board array before the move; it is not modified.
            symbol: Value written to the chosen cell when evaluating it.

        Returns:
            The chosen ``(row, col)`` tuple.

        Raises:
            ValueError: If ``positions`` is empty.
        """
        if not positions:
            raise ValueError("chooseAction needs at least one available position")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Explore: uniformly random available cell.
            return positions[np.random.choice(len(positions))]

        # Exploit: evaluate the board after each candidate move. States that
        # were never seen count as 0; ties go to the later candidate (>=).
        action = None
        value_max = float('-inf')
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board))
            if value is None:
                value = 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    def addState(self, state):
        """Record a visited state hash for the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Update the value of every recorded state, last state first.

        Each state's value moves toward ``decay_gamma * reward`` by a
        fraction ``lr``; the updated value then becomes the ``reward`` used
        for the state visited before it.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def reset(self):
        """Forget the states recorded for the current game."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to the file ``'policy_' + name``."""
        # `with` guarantees the file is closed even if pickling fails.
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def getPolicy(self):
        """Return the value table (the same dict object, not a copy)."""
        return self.states_value

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in ``file``.

        NOTE(security): unpickling can execute arbitrary code; only load
        policy files that come from a trusted source.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

    def usePolicy(self, states_value_input):
        """Use ``states_value_input`` as the value table (not copied)."""
        self.states_value = states_value_input

In [4]:
class HumanPlayer:
    """Player whose moves are typed in on standard input.

    ``addState``, ``feedReward`` and ``reset`` are no-ops so a human can be
    used wherever the game calls those methods on a player.
    """

    def __init__(self, name):
        """Store the display name used when announcing the winner."""
        self.name = name

    def chooseAction(self, positions):
        """Prompt until the user enters a cell contained in ``positions``.

        Args:
            positions: Iterable of allowed ``(row, col)`` tuples.

        Returns:
            The chosen ``(row, col)`` tuple of ints.
        """
        while True:  # until a feasible action is chosen
            rcip = input("Input r,c separated by space:")
            # Accept "r c" as prompted, and also "r,c"; tokens after the
            # first two are ignored.
            rc = rcip.replace(',', ' ').split()
            try:
                row, col = int(rc[0]), int(rc[1])
            except (ValueError, IndexError):
                # Malformed input (empty, one token, non-numeric): ask again
                # instead of crashing the game.
                print("Invalid input, please enter two integers, e.g. 0 2")
                continue
            action = (row, col)
            if action in positions:
                return action
            print("That position is not available, try again.")

    def addState(self, state):
        """Do nothing; a human does not record states."""
        pass

    def feedReward(self, reward):
        """Do nothing; a human does not learn from rewards."""
        pass

    def reset(self):
        """Do nothing; there is no per-game state to clear."""
        pass

In [5]:
# Train two learning agents against each other through 50000 self-play games.
p1 = Player(name="computer1")
p2 = Player(name="computer2")
st = State(player1=p1, player2=p2)
st.learn(rounds=50000)

Learning : rounds  2000  / 50000
Learning : rounds  4000  / 50000
Learning : rounds  6000  / 50000
Learning : rounds  8000  / 50000
Learning : rounds  10000  / 50000
Learning : rounds  12000  / 50000
Learning : rounds  14000  / 50000
Learning : rounds  16000  / 50000
Learning : rounds  18000  / 50000
Learning : rounds  20000  / 50000
Learning : rounds  22000  / 50000
Learning : rounds  24000  / 50000
Learning : rounds  26000  / 50000
Learning : rounds  28000  / 50000
Learning : rounds  30000  / 50000
Learning : rounds  32000  / 50000
Learning : rounds  34000  / 50000
Learning : rounds  36000  / 50000
Learning : rounds  38000  / 50000
Learning : rounds  40000  / 50000
Learning : rounds  42000  / 50000
Learning : rounds  44000  / 50000
Learning : rounds  46000  / 50000
Learning : rounds  48000  / 50000
Learning : rounds  50000  / 50000


In [6]:
# Persist player 1's learned value table (written to 'policy_computer1')
# and keep a reference to it for the game against a human.
p1.savePolicy()
policy = p1.getPolicy()

In [7]:
# Play the trained agent against a human; exp_rate=0 disables random moves.
p4 = HumanPlayer(name="tictacNoob")
p3 = Player(name="computer", exp_rate=0)
p3.usePolicy(states_value_input=policy)
st = State(player1=p3, player2=p4)
st.playWithHuman()

-------------
|   |   | x | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
Input r,c separated by space:0 0
-------------
| o |   | x | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
-------------
| o |   | x | 
-------------
|   |   | x | 
-------------
|   |   |   | 
-------------
Input r,c separated by space:2 2
-------------
| o |   | x | 
-------------
|   |   | x | 
-------------
|   |   | o | 
-------------
-------------
| o |   | x | 
-------------
|   | x | x | 
-------------
|   |   | o | 
-------------
Input r,c separated by space:2 0
-------------
| o |   | x | 
-------------
|   | x | x | 
-------------
| o |   | o | 
-------------
-------------
| o |   | x | 
-------------
| x | x | x | 
-------------
| o |   | o | 
-------------
computer wins!
