In [1]:
import numpy as np
import random
import time

random.seed()

class QLearner(object):
    # initializes Q matrix to start board only
    def __init__(self):
        self.Q = [[[0,0,0,0,0,0,0,0,0], [0,0,0,0,0,0,0,0,0]]]
     
    # This funcion chooses a random value between 0 and 8
    def randChoice(self, board):
        while True:
            rand_spot = np.random.randint(0,9)
            if board[rand_spot]==0:
                break
        return rand_spot
    
    #This function compares a board state with the saved boards in the Q matrix. It includes a search for possible transformations
    # of the same board. It returns first the 1st index of the matching board and its possible rotation number
    def searchMatrix(self, board, matrix):
        match = 0
        index = -1
        board1 = board #normal board
        board2 = [board[6], board[3], board[0], board[7], board[4], board[1], board[8], board[5], board[2]] 
        board3 = [board[6], board[7], board[8], board[3], board[4], board[5], board[0], board[1], board[2]] 
        board4 = [board[8], board[5], board[2], board[7], board[4], board[1], board[6], board[3], board[0]] 
        for i in range(len(matrix)):
            if matrix[i][0]==board1:
                #match = 1 #normal board
                #index = i
                return i, 1
            if matrix[i][0]==board1[::-1]:
                #match = 2 # 180 degree rotation
                #index = i
                return i, 2
            if matrix[i][0]==board2:
                #match = 3 #270 degree rotation
                #index = i
                return i, 3
            if matrix[i][0]==board2[::-1]:
                #match = 4 #90 degree rotation
                #index = i
                return i, 4
            if matrix[i][0]==board3:
                #match = 5 #horizontal flip
                #index = i
                return i, 5
            if matrix[i][0]==board3[::-1]:
                #match = 6 #vertical flip
                #index = i
                return i, 6
            if matrix[i][0]==board4:
                #match = 7 #top right diagonal flip
                #index = i
                return i, 7
            if matrix[i][0]==board4[::-1]:
                #match = 8 #top left diagonal flip
                #index = i
                return i, 8
        return -1, 0
    
    # This function takes an index from a transformed state of a board and converts it back to the corresponding
    # index on the non-transformed version of the board.
    def reverseTransform(self, index, transform_number):
        if transform_number==1:
            mydict = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8}
        if transform_number==2:
            mydict = {0:8, 1:7, 2:6, 3:5, 4:4, 5:3, 6:2, 7:1, 8:0}
        if transform_number==3:
            mydict = {0:6, 1:3, 2:0, 3:7, 4:4, 5:1, 6:8, 7:5, 8:2}
        if transform_number==4:
            mydict = {0:2, 1:5, 2:8, 3:1, 4:4, 5:7, 6:0, 7:3, 8:6}
        if transform_number==5:
            mydict = {0:6, 1:7, 2:8, 3:3, 4:4, 5:5, 6:0, 7:1, 8:2}
        if transform_number==6:
            mydict = {0:2, 1:1, 2:0, 3:5, 4:4, 5:3, 6:8, 7:7, 8:6}
        if transform_number==7:
            mydict = {0:8, 1:5, 2:2, 3:7, 4:4, 5:1, 6:6, 7:3, 8:0}
        if transform_number==8:
            mydict = {0:0, 1:3, 2:6, 3:1, 4:4, 5:7, 6:2, 7:5, 8:8}
        index_new = mydict[index]
        return index_new
    
    # This function takes the board state and spot choice and rewards for a win and penalizes for a loss or for playing in
    # an occupied space. Returns a number value 100 for win, -100 for loss and -110 for playing in used space
    def rewardFunction(self, board, choice, x=True):
        if board[choice]==1 or board[choice]==-1:
            return -110
        new_board = board
        if x:
            new_board[choice] = 1
        else:
            new_board[choice] = -1
        if (new_board[0]==1 and new_board[1]==1 and new_board[2]==1) or \
        (new_board[3]==1 and new_board[4]==1 and new_board[5]==1) or \
        (new_board[6]==1 and new_board[7]==1 and new_board[8]==1) or \
        (new_board[0]==1 and new_board[3]==1 and new_board[6]==1) or \
        (new_board[1]==1 and new_board[4]==1 and new_board[7]==1) or \
        (new_board[2]==1 and new_board[5]==1 and new_board[8]==1) or \
        (new_board[0]==1 and new_board[4]==1 and new_board[8]==1) or \
        (new_board[2]==1 and new_board[4]==1 and new_board[6]==1):
            return 100
        if (new_board[0]==-1 and new_board[1]==-1 and new_board[2]==-1) or \
        (new_board[3]==-1 and new_board[4]==-1 and new_board[5]==-1) or \
        (new_board[6]==-1 and new_board[7]==-1 and new_board[8]==-1) or \
        (new_board[0]==-1 and new_board[3]==-1 and new_board[6]==-1) or \
        (new_board[1]==-1 and new_board[4]==-1 and new_board[7]==-1) or \
        (new_board[2]==-1 and new_board[5]==-1 and new_board[8]==-1) or \
        (new_board[0]==-1 and new_board[4]==-1 and new_board[8]==-1) or \
        (new_board[2]==-1 and new_board[4]==-1 and new_board[6]==-1):
            return -100
        else:
            return 0
        
    def qInitialize(self, board):
        new = [0,0,0,0,0,0,0,0,0]
        for i in range(9):
            if board[i]==1 or board[i]==-1:
                new[i] = -400
        return new
    
    
    # This is the workhorse function that trains the Q matrix to play tic tac toe effectively
    # Populates self.Q with values so that it can play well.
    def learn(self, games, lrate, disc, ep):
        for i in range(games):
            if i%10000==0:
                print('game ', i)
            play = True #if someone wins, the loop is broken
            board = [0,0,0,0,0,0,0,0,0]
            j = 0 # this variable is used to ensure only nine total moves
            while play==True and j < 5:
                saved_board_index, transform_number = self.searchMatrix(tuple(board), self.Q)
                if saved_board_index==-1:
                    minilist = []
                    q_initial = self.qInitialize(board)
                    minilist.append(tuple(board))
                    minilist.append(q_initial)
                    self.Q.append(minilist)
                    #print('comp chooses ' + str(spot_choice))
                    index = len(self.Q) - 1
                    spot_choice = self.randChoice(board)
                    reward = self.rewardFunction(board, spot_choice)
                    board[spot_choice] = 1
                    
                else:
                    index = saved_board_index
                    if np.random.random()>ep:
                        spot_choice = np.argmax(self.Q[index][1])
                        spot_choice0 = self.reverseTransform(spot_choice, transform_number)
                           
                    else:
                        spot_choice = self.randChoice(self.Q[index][0])
                        spot_choice0 = self.reverseTransform(spot_choice, transform_number)
                        
                    #print('comp chooses ' + str(spot_choice0))
                    reward = self.rewardFunction(board, spot_choice0)
                    board[spot_choice0] = 1
                #print_board(board)
                if reward==-110:
                    self.Q[index][1][spot_choice] += lrate*(reward+disc*(-110) - self.Q[index][1][spot_choice])
                if reward==100:
                    self.Q[index][1][spot_choice] += lrate*(reward+disc*(100) - self.Q[index][1][spot_choice])
                    play = False
                            
                            
                if play and j<4:
                    rand_spot = self.randChoice(board)
                    #print('rand chooses ' + str(rand_spot))
                    reward = self.rewardFunction(board, int(rand_spot), x=False)
                    board[rand_spot] = -1
                    #print_board(board)
                    if reward==-100:
                        self.Q[index][1][spot_choice] += lrate*(reward+disc*(-100) - self.Q[index][1][spot_choice])
                        play = False
                    else:
                        index_prime, trans_num = self.searchMatrix(tuple(board), self.Q)
                        if index_prime==-1:
                            max_q_prime = 0
                        else:
                            max_q_prime = max(self.Q[index_prime][1])
                        self.Q[index][1][spot_choice] += lrate*(reward+disc*max_q_prime - self.Q[index][1][spot_choice])
                j += 1
        print(len(self.Q))

    def play(self, games):
        for i in range(games):
            play = True #if someone wins, the loop is broken
            board = [0,0,0,0,0,0,0,0,0]
            j = 0 # this variable is used to ensure only nine total move
            while play==True and j < 5:
                saved_board_index, transform_number = self.searchMatrix(tuple(board), self.Q)
                if saved_board_index==-1:
                    print('board not found in Q!')
                    spot_choice = self.randChoice(board)
                    print('comp chooses ' + str(spot_choice))
                    reward = self.rewardFunction(board, int(spot_choice))
                    board[spot_choice] = 1

                else:
                    #print(self.Q[saved_board_index][1])
                    #print(saved_board_index, transform_number)
                    index = saved_board_index
                    spot_choice0 = np.argmax(self.Q[index][1])
                    spot_choice = self.reverseTransform(spot_choice0, transform_number)
                    print('comp chooses ' + str(spot_choice))
                    reward = self.rewardFunction(board, int(spot_choice))
                    board[spot_choice] = 1
                print_board(board)
                if reward==100:
                    print('Computer wins!!')
                    play = False

                if play and j<4:
                    while True:
                        player_choice = int(input('Choose a spot '))-1
                        if board[player_choice]==0:
                            break
                        else:
                            print('Not valid spot')

                    reward = self.rewardFunction(board, player_choice, x=False)
                    board[player_choice] = -1
                    print_board(board)
                    if reward==-100:
                        print('Player wins!!')
                        play = False
                j += 1
            print('Game Over')
    
    
def print_board(board):
    new_board = [0,0,0,0,0,0,0,0,0]
    for i in range(9):
        if board[i] == 0:
            new_board[i] = '.'
        if board[i] == 1:
            new_board[i] = 'X'
        if board[i] == -1:
            new_board[i] = 'O'
    print(new_board[0], new_board[1], new_board[2])
    print(new_board[3], new_board[4], new_board[5])
    print(new_board[6], new_board[7], new_board[8], '\n')
                    
TTT = QLearner()
start = time.time()
TTT.learn(10000, .1, 1, .1)

end = time.time()
print('time: ', end-start)

game  0
898
time:  10.608197927474976


In [2]:
TTT.play(2)

comp chooses 2
. . X
. . .
. . . 

Choose a spot 1
O . X
. . .
. . . 

comp chooses 8
O . X
. . .
. . X 

Choose a spot 2
O O X
. . .
. . X 

comp chooses 5
O O X
. . X
. . X 

Computer wins!!
Game Over
comp chooses 2
. . X
. . .
. . . 

Choose a spot 6
. . X
. . O
. . . 

comp chooses 8
. . X
. . O
. . X 

Choose a spot 5
. . X
. O O
. . X 

comp chooses 3
. . X
X O O
. . X 

Choose a spot 2
. O X
X O O
. . X 

comp chooses 7
. O X
X O O
. X X 

Choose a spot 1
O O X
X O O
. X X 

comp chooses 6
O O X
X O O
X X X 

Computer wins!!
Game Over
