In [None]:
import numpy as np
import random
import time

# Seed both generators for reproducible runs: the stdlib generator drives the
# exploration test in trainNetwork, while NumPy's generator drives weight
# initialisation and random move selection.
random.seed(1)
np.random.seed(1)

# plagiarized code from book, mostly
class Network(object):
    """Feed-forward sigmoid network trained with Q-learning to play tic-tac-toe.

    ``trainNetwork`` and ``play`` represent the board as a (9, 1) NumPy column
    vector holding 0 for an empty square, 1 for the network's mark (X) and -1
    for the opponent's mark (O).  The network output is read as one Q-value
    per square.

    NOTE(review): every layer uses the sigmoid from this file, whose outputs
    lie in (0, 1), so the negative targets used for losses and illegal moves
    can be approached but never reached -- confirm this is intended.
    """

    # Reward returned by rewardFunction when the chosen square is occupied.
    ILLEGAL_MOVE_REWARD = -0.9999

    # Index triples for the three rows, three columns and two diagonals.
    WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self, sizes):
        """Create randomly initialised weights and biases.

        ``sizes`` lists the number of neurons per layer, input layer first.
        Every non-input layer gets a (y, 1) bias vector and a (y, x) weight
        matrix, where x is the size of the preceding layer.
        """
        self.num_layers = len(sizes)
        self.layers = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for y, x in zip(sizes[1:], sizes[:-1])]

    def randChoice(self, board):
        """Return the index of a uniformly chosen empty square of ``board``.

        Raises:
            ValueError: if the board has no empty square (the previous
                rejection-sampling loop would never terminate in that case).
        """
        empty = np.flatnonzero(np.asarray(board).reshape(-1) == 0)
        if empty.size == 0:
            raise ValueError('board has no empty squares')
        return int(np.random.choice(empty))

    def feedforward(self, a):
        """Return the output of the network if ``a`` is input."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def trainNetwork(self, games, eta, gamma, ep):
        """Train by playing ``games`` games as X against a random O player.

        Args:
            games: number of games to play.
            eta: learning rate handed to ``update``.
            gamma: discount applied to the best Q-value of the next state.
            ep: exploration threshold; a random legal move is played whenever
                ``random.random()`` does not exceed it.
        """
        for game in range(games):
            if game % 10000 == 0:
                print('game ', game)
            board = np.zeros((9, 1))
            playing = True  # cleared as soon as either side wins
            # j counts X's turns: X moves at most five times and O at most
            # four, which fills all nine squares.
            j = 0
            while playing and j < 5:
                # Snapshot the position *before* X moves so every update
                # below trains on the state the Q-values were computed for.
                state = board.copy()
                # Always evaluate the network so a target vector exists even
                # when the move itself is exploratory.  ``target`` starts as
                # the current Q-values and only the played entry is replaced.
                target = self.feedforward(state)
                if random.random() > ep:
                    move_choice = np.argmax(target)
                else:
                    move_choice = self.randChoice(board)
                reward = self.rewardFunction(board, move_choice)
                if reward == self.ILLEGAL_MOVE_REWARD:
                    # The greedy move hit an occupied square: penalise it
                    # three times, then fall back to a random legal move.
                    target[move_choice] = reward
                    for _ in range(3):
                        self.update(eta, state, target)
                    move_choice = self.randChoice(board)
                    reward = self.rewardFunction(board, move_choice)
                board[move_choice] = 1
                if reward == 1:
                    target[move_choice] = reward
                    self.update(eta, state, target)
                    playing = False
                # NOTE: a drawn game (X's fifth move without a win) ends here
                # without any update, as in the original training loop.

                if playing and j < 4:
                    rand_spot = self.randChoice(board)
                    reward = self.rewardFunction(board, rand_spot, x=False)
                    board[rand_spot] = -1
                    if reward == -1:
                        # O's reply won the game: blame X's last move.
                        target[move_choice] = reward
                        self.update(eta, state, target)
                        playing = False
                    else:
                        # Non-terminal step: bootstrap from the best Q-value
                        # of the position X faces next.
                        maxQ = np.max(self.feedforward(board))
                        target[move_choice] = gamma * maxQ
                        self.update(eta, state, target)
                j += 1

    def update(self, eta, x, y):
        """Take one gradient-descent step of size ``eta`` on the sample (x, y)."""
        nabla_b, nabla_w = self.backprop(x, y)
        self.weights = [w - eta * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - eta * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the per-layer gradients for one sample.

        Both lists are shaped like ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass, remembering every weighted input z and activation.
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activations[-1]) + b
            zs.append(z)
            activations.append(sigmoid(z))
        # Backward pass: output-layer error first ...
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, np.transpose(activations[-2]))
        # ... then propagate it towards the input; L counts layers from the end.
        for L in range(2, self.num_layers):
            sp = sigmoid_prime(zs[-L])
            delta = np.dot(np.transpose(self.weights[-L + 1]), delta) * sp
            nabla_b[-L] = delta
            nabla_w[-L] = np.dot(delta, np.transpose(activations[-L - 1]))
        return (nabla_b, nabla_w)

    @staticmethod
    def cost_derivative(a, y):
        """Return ``a - y``, the derivative of the cost w.r.t. the output ``a``."""
        return (a - y)

    def play(self, games):
        """Play ``games`` interactive games: the network is X, the user is O."""
        for _ in range(games):
            playing = True  # cleared as soon as either side wins
            board = np.zeros((9, 1))
            j = 0  # X's turn counter; five X turns and four O turns fill the board
            while playing and j < 5:
                q_values = self.feedforward(board)
                print(q_values)
                # Rank only empty squares so the computer can never overwrite
                # a square that is already taken.
                legal_q = np.where(board == 0, q_values, -np.inf)
                spot_choice = int(np.argmax(legal_q))
                print('Comp chooses ' + str(spot_choice))
                reward = self.rewardFunction(board, spot_choice)
                board[spot_choice] = 1
                print_board(board)
                if reward == 1:
                    print('Computer wins!!')
                    playing = False

                if playing and j < 4:
                    player_choice = self._ask_player_move(board)
                    reward = self.rewardFunction(board, player_choice, x=False)
                    board[player_choice] = -1
                    print_board(board)
                    if reward == -1:
                        print('Player wins!!')
                        playing = False
                j += 1
            print('Game Over')

    def _ask_player_move(self, board):
        """Prompt until the user enters the index of an empty square."""
        while True:
            try:
                choice = int(input('Choose a spot '))
            except ValueError:
                print('Not valid spot')
                continue
            # Reject out-of-range (including negative) indices as well as
            # occupied squares.
            if 0 <= choice < len(board) and board[choice] == 0:
                return choice
            print('Not valid spot')

    def rewardFunction(self, board, choice, x=True):
        """Score placing a mark on square ``choice`` without modifying ``board``.

        Args:
            board: current position (0 empty, 1 X, -1 O).
            choice: index of the square to play.
            x: True to place X (1), False to place O (-1).

        Returns:
            ``ILLEGAL_MOVE_REWARD`` if the square is occupied, 1 if the
            resulting position contains a completed X line, -1 if it contains
            a completed O line, otherwise 0.
        """
        # Work on a flattened private copy; the caller's board is left intact.
        cells = np.array(board, copy=True).reshape(-1)
        if cells[choice] == 1 or cells[choice] == -1:
            return self.ILLEGAL_MOVE_REWARD
        cells[choice] = 1 if x else -1
        # X lines are checked before O lines, matching the original ordering.
        for mark in (1, -1):
            if any(all(cells[i] == mark for i in line) for line in self.WIN_LINES):
                return mark
        return 0

    
def sigmoid(x):
    """Logistic function 1 / (1 + e**-x); applied element-wise to arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

def sigmoid_prime(x):
    """Derivative of the logistic function, s(x) * (1 - s(x))."""
    s = sigmoid(x)
    return s * (1 - s)

def print_board(board):
    """Print the nine squares as a 3x3 grid, followed by a blank line.

    Squares holding 0, 1 and -1 are shown as '.', 'X' and 'O'; any other
    value is shown as 0.
    """
    glyphs = ((0, '.'), (1, 'X'), (-1, 'O'))
    cells = []
    for idx in range(9):
        shown = 0
        for value, glyph in glyphs:
            if board[idx] == value:
                shown = glyph
        cells.append(shown)
    for row_start in (0, 3, 6):
        # Only the last row carries the extra newline argument.
        tail = ('\n',) if row_start == 6 else ()
        print(*cells[row_start:row_start + 3], *tail)
#################################################################################################################
# 9 inputs (one per board square), hidden layers of 20 and 10 neurons, and
# 9 outputs (one per candidate move).
NN = Network([9, 20, 10, 9])

# perf_counter is monotonic, so the measured interval is not affected by
# system clock adjustments during the long training run.
start = time.perf_counter()
# Arguments: 200000 games, learning rate 0.1, discount 1, exploration 0.1.
NN.trainNetwork(200000, 0.1, 1, 0.1)
end = time.perf_counter()
print("Time: ", end - start)

game  0
game  10000
game  20000
game  30000
game  40000


In [None]:
# Play one interactive game against the trained network; the human's moves
# are read from standard input via input().
NN.play(1)