In [25]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
import time
import random


#The following block is where the ANN is actually built.
#The one I'm using has only one hidden layer of 20 neurons.
network = Sequential()
network.add(Dense(40, init='normal', input_dim=9))
network.add(Activation('relu'))
#network.add(Dense(50, init='normal'))
#network.add(Activation('relu'))
network.add(Dense(9, init='normal'))
network.add(Activation('sigmoid'))
network.compile(loss='mse', optimizer='sgd')

#This function takes the board and chooses randomly among the spots that are not taken.
def randChoice(board):
    while True:
        rand_spot = np.random.randint(0,9)
        if board[rand_spot]==0:
            break
    return rand_spot

#This function creates an expected output array with negative values in all spots that 
#are already full.
def training_board(inboard, output):
    board_train = output
    for i in range(9):
        if inboard[i]==1 or inboard[i]==-1:
            board_train[0][i] = -90
    return board_train

#This function takes the input board and the chosen spot as inputs.
#It determines whether the following state is a win or loss or illegal and give positive or negative rewards.
def rewardFunction(board, choice, x=True):
    if board[choice]==1 or board[choice]==-1:
        return -90
    new_board = board
    if x:
        new_board[choice] = 1
    else:
        new_board[choice] = -1
    if (new_board[0]==1 and new_board[1]==1 and new_board[2]==1) or \
    (new_board[3]==1 and new_board[4]==1 and new_board[5]==1) or \
    (new_board[6]==1 and new_board[7]==1 and new_board[8]==1) or \
    (new_board[0]==1 and new_board[3]==1 and new_board[6]==1) or \
    (new_board[1]==1 and new_board[4]==1 and new_board[7]==1) or \
    (new_board[2]==1 and new_board[5]==1 and new_board[8]==1) or \
    (new_board[0]==1 and new_board[4]==1 and new_board[8]==1) or \
    (new_board[2]==1 and new_board[4]==1 and new_board[6]==1):
        return 100
    if (new_board[0]==-1 and new_board[1]==-1 and new_board[2]==-1) or \
    (new_board[3]==-1 and new_board[4]==-1 and new_board[5]==-1) or \
    (new_board[6]==-1 and new_board[7]==-1 and new_board[8]==-1) or \
    (new_board[0]==-1 and new_board[3]==-1 and new_board[6]==-1) or \
    (new_board[1]==-1 and new_board[4]==-1 and new_board[7]==-1) or \
    (new_board[2]==-1 and new_board[5]==-1 and new_board[8]==-1) or \
    (new_board[0]==-1 and new_board[4]==-1 and new_board[8]==-1) or \
    (new_board[2]==-1 and new_board[4]==-1 and new_board[6]==-1):
        return -100
    else:
        return 0

#This function trains the neural network to play tic tac toe.
def trainNetwork(games, gamma, ep):
    #All of the training is contained in this loop which iterates according to number of games entered.
    for i in range(games):
        if i%10000==0:
            print('game ', i)
        
        #The play board is initialized here
        board = np.zeros((9,1))
        play = True #if someone wins, the loop is broken
        j = 0 # this variable is used to ensure only nine total moves
        
        #This while loop contains on complete game. Broken when game is finished.
        while play==True and j < 5:
            innum1 = board
            
            #ANN picks the highest output value with probability 1 - ep. Otherwise is chooses randomly.
            outnum1 = network.predict(innum1.reshape(1,9), batch_size=1)
            if random.random()>ep:
                move_choice = np.argmax(outnum1)
            else:
                move_choice = randChoice(board)
                
            #if i>30000 and i%1000==0:
                #print('Input: \n', innum1)
                #print('Output: \n', outnum1)
            
            #Reward is calculated for whichever choice
            reward = rewardFunction(innum1, move_choice)
            
            #Here a random unoccupied spot is chosen if the NN outputs the highest value at a taken spot.
            if reward==-90:
                move_choice = randChoice(board)
                reward = rewardFunction(innum1, move_choice)
            
            #Board spot is filled with a 1, which is an X
            board[move_choice] = 1
            
            #If the X's win, the neural networks is trained to predict the chosen value more often.
            #The loop is also broken
            if reward==100:
                expected_out = training_board(innum1,outnum1)
                #print('\n\n\n', expected_out)
                expected_out[0][move_choice] = reward
                network.fit(innum1.reshape(1,9), expected_out.reshape(1,9), batch_size=1, nb_epoch=1, verbose=0)
                play = False
            
            #This second part contains the random player's turn
            if play and j<4:
                #Random player chooses a spot, it is put in the board and it is determined if it has won.
                rand_spot = randChoice(board)
                reward = rewardFunction(board, int(rand_spot), x=False)
                board[rand_spot] = -1
                
                #If the random player wins, the ANN is trained with a negative value on the move it chose last,
                #since that allowed a loss. Loop is broken
                if reward == -100:
                    expected_out = training_board(innum1,outnum1)
                    #print('\n\n\n', expected_out)
                    expected_out[0][move_choice] = reward
                    network.fit(innum1.reshape(1,9), expected_out.reshape(1,9), batch_size=1, nb_epoch=1, verbose=0)
                    play = False
                
                #If it is not a win, the ANN is trained based on the highest valued next available state.
                else:
                    innum2 = board
                    outnum2 = network.predict(innum2.reshape(1,9), batch_size=1)
                    maxQ = max(outnum2[0])
                    expected_out = training_board(innum1,outnum1)
                    if innum1[move_choice]==0:
                        expected_out[0][move_choice] = gamma*maxQ
                    #print(innum1, '\n\n', expected_out)
                    network.fit(innum1.reshape(1,9), expected_out.reshape(1,9), batch_size=1, nb_epoch=1, verbose=0)
            j+=1
        
        #This statement drops ep each iteration until it reaches 0.1
        if ep > 0.1:
            ep -= 1/games
            
def print_board(board):
    new_board = [0,0,0,0,0,0,0,0,0]
    for i in range(9):
        if board[i] == 0:
            new_board[i] = '.'
        if board[i] == 1:
            new_board[i] = 'X'
        if board[i] == -1:
            new_board[i] = 'O'
    print(new_board[0], new_board[1], new_board[2])
    print(new_board[3], new_board[4], new_board[5])
    print(new_board[6], new_board[7], new_board[8], '\n')
    
    
    
def play(games):
    for i in range(games):
        board = np.zeros((9,1))
        play = True #if someone wins, the loop is broken
        j = 0 # this variable is used to ensure only nine total moves
        while play==True and j < 5:
            innum1 = board
            outnum1 = network.predict(innum1.reshape(1,9), batch_size=1)
            move_choice = np.argmax(outnum1)
            reward = rewardFunction(innum1, move_choice)
            print("AI's Move...")
            board[move_choice] = 1
            print_board(board)
            if reward==1:
                print('AI wins!!!')
                play = False
                    
            if play and j<4:
                rand_spot = randChoice(board)
                reward = rewardFunction(board, int(rand_spot), x=False)
                print("Random's Move...")
                board[rand_spot] = -1
                print_board(board)
                if reward == -1:
                    print('Player wins!!!')
                    play = False
            j+=1
    

########################################################################################################    
            
games = 10000
gamma = 0.99
ep = 0.5
start_time = time.time()
trainNetwork(games, gamma, ep)
end_time = time.time()
print('Training Time: ', end_time - start_time)

game  0
Training Time:  29.313282251358032


In [24]:
play(1)

AI's Move...
. . .
. . .
. X . 

Random's Move...
. . .
. O .
. X . 

AI's Move...
. . .
X O .
. X . 

Random's Move...
O . .
X O .
. X . 

AI's Move...
O . .
X O .
. X . 

Random's Move...
O . .
X O .
. X O 

AI's Move...
O . .
X X .
. X O 

Random's Move...
O O .
X X .
. X O 

AI's Move...
O O .
X X X
. X O 

