# Starting by building the game and the user interface

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop
import keras

Using TensorFlow backend.


In [74]:
def showBoard(board):
    """Print the board to stdout, one row per line.

    Each row is rendered with ``str`` exactly as ``print(row)`` would do;
    an empty board prints nothing.
    """
    rendered_rows = [str(row) for row in board]
    if rendered_rows:
        print('\n'.join(rendered_rows))

def checkFree(x, board):
    """Report whether a cell of the board is still empty.

    ``x`` is a flat cell index (the board is viewed as a 1x9 row via
    ``reshape``), not an (x, y) pair. A cell is free when it holds 0.

    Returns ``True`` if the cell is free, ``False`` otherwise.
    """
    flat_view = board.reshape(1, 9)
    return bool(flat_view[0][x] == 0)
def checkWin(toggle, board):
    """Check whether any diagonal, row or column sums to the target value.

    The target is 6 when ``toggle`` is truthy and 15 otherwise. With the
    markers used in this file (2 and 5) those are the sums of three 2s and
    three 5s respectively.

    Returns ``True`` as soon as one of the eight lines matches, else ``False``.
    """
    target = 6 if toggle else 15

    # Both diagonals first, then the three rows, then the three columns.
    line_sums = [
        board.diagonal().sum(),
        np.fliplr(board).diagonal().sum(),
    ]
    line_sums += [board[idx, :].sum() for idx in range(3)]
    line_sums += [board[:, idx].sum() for idx in range(3)]

    return any(total == target for total in line_sums)

def getAvailablePositions(board):
    """Return the flat indices (0-8, ascending) of every free cell.

    A cell counts as free when ``checkFree`` says so; a full board yields
    an empty list.
    """
    return [cell for cell in range(9) if checkFree(cell, board)]

def placePiece(x, y, nought_or_cross, board):
    """Return a copy of the board with one marker placed at ``[x, y]``.

    The input board is left untouched: its 3x3 cells are copied into a
    fresh float array, and ``nought_or_cross`` (the marker value) is
    written into the copy.
    """
    new_board = np.array(
        [[board[row][col] for col in range(3)] for row in range(3)],
        dtype=float,
    )
    new_board[x, y] = nought_or_cross
    return new_board

def getReward(result, num_moves):
    """Compute the reward for a move given the game result so far.

    Args:
        result: outcome label. ``'win'`` earns a base reward of +1 and
            ``'lose'`` (or the legacy spelling ``'lost'``) a base of -1;
            any other value (e.g. ``'draw'`` or ``'play'``) has a base of 0.
        num_moves: number of moves counted so far; every move costs 1/9.

    Returns:
        The base reward minus ``num_moves / 9``.
    """
    # The game loops in this file label a loss as 'lose'; the original check
    # only matched 'lost', so a loss fell through to the neutral branch.
    if result in ('lose', 'lost'):
        base = -1
    elif result == 'win':
        base = 1
    else:
        base = 0
    return base + (-num_moves / 9)
    
def getMove(action):
    """Translate an action number into a ``[row, col]`` board coordinate.

    Actions index a row-major table of the nine cells, so 0 is ``[0, 0]``
    and 8 is ``[2, 2]``.
    """
    coordinates = [[row, col] for row in range(3) for col in range(3)]
    return coordinates[action]

def convert_to_normal(board):
    """Flatten the 3x3 board into a fresh ``(1, 9)`` float array.

    Cells are read in row-major order; a 5 is re-coded as 1 and a 2 as -1.
    All other values are copied unchanged and the input is not modified.
    """
    encoded = np.array(
        [board[row][col] for row in range(3) for col in range(3)],
        dtype=float,
    ).reshape(1, 9)
    encoded[encoded == 5] = 1
    encoded[encoded == 2] = -1
    return encoded

def convert_to_normal_diff(board):
    """Flatten the 3x3 board into a fresh ``(9,)`` float vector.

    Same encoding as ``convert_to_normal`` (5 becomes 1, 2 becomes -1,
    everything else is copied as-is) but the result is one-dimensional.
    """
    raw = np.array(
        [board[row][col] for row in range(3) for col in range(3)],
        dtype=float,
    )
    return np.where(raw == 5, 1.0, np.where(raw == 2, -1.0, raw))
    

In [64]:
def createBrain():
    """Build and compile the Q-network used by one player.

    Architecture: a 9-value input, two hidden Dense layers (164 and 150
    units), each followed by a ReLU activation and Dropout(0.2), then a
    9-unit Dense layer with a linear activation. The model is compiled with
    mean-squared-error loss and the Adam optimizer.
    """
    model = Sequential()

    hidden_widths = (164, 150)
    for position, width in enumerate(hidden_widths):
        # Only the first layer declares the input shape (the 9 board cells).
        extra = {'input_shape': (9,)} if position == 0 else {}
        model.add(Dense(width, kernel_initializer='lecun_uniform', **extra))
        model.add(Activation('relu'))
        model.add(Dropout(0.2))

    # Linear output so the nine outputs can take any real value.
    model.add(Dense(9, kernel_initializer='lecun_uniform'))
    model.add(Activation('linear'))

    optimizer = keras.optimizers.Adam(
        lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0
    )
    model.compile(loss='mse', optimizer=optimizer)

    return model

In [88]:
import random

# Self-play training: two independent Q-networks (player 1 places 5,
# player 2 places 2) alternate moves; each learns from its own replay memory.

epsilon = 1        # exploration probability, decayed towards 0.1 below
gamma = 0.9        # discount factor applied to the bootstrapped next-state value
epochs = 100000    # number of self-play games
batchSize = 40     # transitions sampled per training step
buffer = 80        # capacity of each replay memory
replay = []        # player 1 memory: (state, action, reward, new_state, done)
replay2 = []       # player 2 memory, same layout

model1 = createBrain()
model2 = createBrain()

# One ring-buffer cursor per memory. The original shared a single cursor
# between both buffers, so each buffer's overwrite position was advanced by
# the other player's moves.
h = 0
h2 = 0

TERMINAL_RESULTS = ('lose', 'win', 'draw')


def _choose_action(model, board, explore_prob):
    """Epsilon-greedy pick of a free cell: random, or the highest-Q free cell."""
    free_cells = getAvailablePositions(board)
    if explore_prob > random.random():
        return np.random.choice(free_cells)
    # Only run the network when its output is actually used.
    qval = model.predict(convert_to_normal(board), batch_size=1)
    return free_cells[np.argmax(qval[0][free_cells])]


def _remember_and_train(model, memory, cursor, transition):
    """Store a transition; once the memory is full, also fit on a minibatch.

    Returns the (possibly advanced) ring-buffer cursor for ``memory``.
    """
    if len(memory) < buffer:
        # Still filling the memory: no training until it holds `buffer` items.
        memory.append(transition)
        return cursor

    # Memory full: overwrite the next slot, wrapping around at the end.
    cursor = cursor + 1 if cursor < (buffer - 1) else 0
    memory[cursor] = transition

    minibatch = random.sample(memory, batchSize)
    states = np.array([convert_to_normal_diff(item[0]) for item in minibatch])
    next_states = np.array([convert_to_normal_diff(item[3]) for item in minibatch])

    # One batched forward pass per side instead of two predictions per sample.
    current_q = model.predict(states, batch_size=batchSize)
    next_q = model.predict(next_states, batch_size=batchSize)

    targets = np.array(current_q, dtype=float)
    for row, (_, taken, gained, _, finished) in enumerate(minibatch):
        # FIX: the original test `result != 'lose' or result != 'win' or ...`
        # was always true and looked at the live game's result instead of the
        # sampled transition, so terminal transitions were bootstrapped too.
        if finished:
            targets[row][taken] = gained
        else:
            targets[row][taken] = gained + (gamma * np.max(next_q[row]))

    model.fit(states, targets, batch_size=batchSize, epochs=1, verbose=0)
    return cursor


for epoch in range(0, epochs):

    toggle = False
    board = np.zeros((3, 3))
    count1 = 0
    count2 = 0
    result = 'play'
    if epoch % 1000 == 0:
        print("Game #: %s" % (epoch,))
    while result not in TERMINAL_RESULTS:
        toggle = not toggle
        # toggle True -> player 1 (model1, marker 5); False -> player 2.
        if toggle:
            active_model, n_or_c = model1, 5
        else:
            active_model, n_or_c = model2, 2

        action = _choose_action(active_model, board, epsilon)
        x, y = getMove(action)
        new_board = placePiece(int(x), int(y), n_or_c, board)

        # checkWin(toggle, ...) looks for the opponent's line and
        # checkWin(not toggle, ...) for the mover's own line.
        # NOTE(review): the mover cannot complete the opponent's line, so the
        # 'lose' branch looks unreachable and the losing player never stores
        # a penalized terminal transition -- confirm and redesign if needed.
        if checkWin(toggle, new_board):
            result = 'lose'
        elif checkWin(not toggle, new_board):
            result = 'win'
        elif not getAvailablePositions(new_board):
            result = 'draw'

        done = result in TERMINAL_RESULTS

        if toggle:
            reward = getReward(result, count1)
            h = _remember_and_train(
                model1, replay, h, (board, action, reward, new_board, done))
            count1 = count1 + 1
        else:
            reward = getReward(result, count2)
            h2 = _remember_and_train(
                model2, replay2, h2, (board, action, reward, new_board, done))
            count2 = count2 + 1

        board = new_board
        #showBoard(board)
        # NOTE(review): this decay runs once per move, not once per game, so
        # epsilon reaches its 0.1 floor well before the last game -- confirm
        # that this schedule is intended.
        if epsilon > 0.1:
            epsilon -= (1/epochs)


Game #: 0
Game #: 1000
Game #: 2000
Game #: 3000
Game #: 4000
Game #: 5000
Game #: 6000
Game #: 7000
Game #: 8000
Game #: 9000
Game #: 10000
Game #: 11000
Game #: 12000
Game #: 13000
Game #: 14000
Game #: 15000
Game #: 16000
Game #: 17000
Game #: 18000
Game #: 19000
Game #: 20000
Game #: 21000
Game #: 22000
Game #: 23000
Game #: 24000
Game #: 25000
Game #: 26000
Game #: 27000
Game #: 28000
Game #: 29000
Game #: 30000
Game #: 31000
Game #: 32000
Game #: 33000
Game #: 34000
Game #: 35000
Game #: 36000
Game #: 37000
Game #: 38000
Game #: 39000
Game #: 40000
Game #: 41000
Game #: 42000
Game #: 43000
Game #: 44000
Game #: 45000
Game #: 46000
Game #: 47000
Game #: 48000
Game #: 49000
Game #: 50000
Game #: 51000
Game #: 52000
Game #: 53000
Game #: 54000
Game #: 55000
Game #: 56000
Game #: 57000
Game #: 58000
Game #: 59000
Game #: 60000
Game #: 61000
Game #: 62000
Game #: 63000
Game #: 64000
Game #: 65000
Game #: 66000
Game #: 67000
Game #: 68000
Game #: 69000
Game #: 70000
Game #: 71000
Game 

In [89]:
# Play one greedy (no exploration) game between the two trained models,
# printing the board after every move.
toggle = False
board = np.zeros((3,3))
result = 'play'

while result not in ('lose', 'win', 'draw'):
    toggle = not toggle

    # toggle True -> model1 places a 5; toggle False -> model2 places a 2.
    player_model, n_or_c = (model1, 5) if toggle else (model2, 2)

    qval = player_model.predict(convert_to_normal(board), batch_size=1)
    open_cells = getAvailablePositions(board)
    # Pick the free cell with the highest predicted Q-value.
    action = open_cells[np.argmax(qval[0][open_cells])]
    move = getMove(action)
    x, y = move[0], move[1]

    new_board = placePiece(int(x), int(y), n_or_c, board)

    if checkWin(toggle, new_board):
        result = 'lose'
    elif checkWin(not toggle, new_board):
        result = 'win'
        print(n_or_c, result)
    elif not getAvailablePositions(new_board):
        result = 'draw'

    board = new_board
    showBoard(board)
    print('--------------')


[ 0.  0.  0.]
[ 0.  0.  0.]
[ 0.  0.  5.]
--------------
[ 2.  0.  0.]
[ 0.  0.  0.]
[ 0.  0.  5.]
--------------
[ 2.  0.  0.]
[ 0.  5.  0.]
[ 0.  0.  5.]
--------------
[ 2.  0.  0.]
[ 0.  5.  0.]
[ 0.  2.  5.]
--------------
[ 2.  5.  0.]
[ 0.  5.  0.]
[ 0.  2.  5.]
--------------
[ 2.  5.  2.]
[ 0.  5.  0.]
[ 0.  2.  5.]
--------------
[ 2.  5.  2.]
[ 0.  5.  0.]
[ 5.  2.  5.]
--------------
[ 2.  5.  2.]
[ 2.  5.  0.]
[ 5.  2.  5.]
--------------
[ 2.  5.  2.]
[ 2.  5.  5.]
[ 5.  2.  5.]
--------------


## As can be seen, the game is played to a draw. However, there are some small issues: the players do not capitalize on the opposing player's errors. This could be due to insufficient training, as we only ran 100 games.

## Ideally we want the game to be a draw. This is the solved game state. 

## After 1000 games, player 2 is winning, which is not ideal.

## After 10000 games the other player is winning. I wonder if there is an issue with the way it understands the actions based on the turn. 

## Plays to a draw - This works
## Implemented experience replay with batch training and ran it for 100,000 episodes.