## Tic-Tac-Toe Agent

#### Importing libraries

In [1]:
import collections
import numpy as np
import random
import pickle
import collections
from matplotlib import pyplot as plt
import time

In [2]:
class Qlearning:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q-table is a dict keyed by ``(state, action)`` where ``state`` is the
    board converted to a tuple. Pairs that have never been seen are lazily
    initialised to 1.0 the first time they are looked up (see ``getQ``).
    """

    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9, values=None):
        """Set the hyper-parameters and create an empty Q-table.

        Args:
            epsilon: probability of picking a random move instead of the
                greedy one.
            alpha: learning rate used in ``updateQ``.
            gamma: discount factor applied to the best next-state value.
            values: list copied into ``self.options`` at the start of every
                game (presumably the numbers this player may place -- confirm
                against the environment). Defaults to a fresh empty list.
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q = {}  # Q-table: (state tuple, action) -> value
        self.last_board = None  # state (as a tuple) seen at the last decision
        self.q_last = 0.0  # Q-value of the last chosen (state, action)
        self.state_action_last = None  # last chosen (state, action) key
        # A None sentinel replaces the former ``values=[]`` default so that
        # agents created without ``values`` no longer share one list object.
        self.values = [] if values is None else values
        self.options = []

    def game_begin(self):
        """Reset the per-game bookkeeping; the learned Q-table is kept."""
        self.last_board = None
        self.q_last = 0.0
        self.state_action_last = None
        self.options = self.values.copy()

    def epslion_greedy(self, state, possible_moves):
        """Choose a move with the epsilon-greedy strategy and remember it.

        With probability ``self.epsilon`` a uniformly random move is returned;
        otherwise the move with the highest Q-value is returned, with ties
        broken at random. The chosen (state, action) pair and its Q-value are
        stored for the next ``updateQ`` call.

        Args:
            state: current board; converted to a tuple for use as a dict key.
            possible_moves: non-empty sequence of candidate actions.

        Returns:
            The selected element of ``possible_moves``.
        """
        self.last_board = tuple(state)

        if random.random() < self.epsilon:
            # Exploration: any legal move, uniformly at random.
            move = random.choice(possible_moves)
            self.state_action_last = (self.last_board, move)
            self.q_last = self.getQ(self.last_board, move)
            return move

        # Exploitation: evaluate every candidate once and keep the best.
        q_values = [self.getQ(self.last_board, action) for action in possible_moves]
        best_q = max(q_values)

        if q_values.count(best_q) > 1:
            # More than one best option; choose among them randomly.
            best_indices = [i for i, q in enumerate(q_values) if q == best_q]
            chosen = random.choice(best_indices)
        else:
            chosen = q_values.index(best_q)

        move = possible_moves[chosen]
        self.state_action_last = (self.last_board, move)
        self.q_last = best_q
        return move

    def getQ(self, state, action):
        """Return Q(state, action), initialising unseen pairs to 1.0."""
        return self.Q.setdefault((state, action), 1.0)

    def updateQ(self, reward, state, possible_moves):
        """Apply the Q-learning update to the last chosen (state, action).

        Computes ``Q <- Q + alpha * (reward + gamma * max_a' Q(s', a') - Q)``
        where ``s'`` is ``state`` and ``a'`` ranges over ``possible_moves``.
        When ``possible_moves`` is empty the best next value is taken as 0.0.

        Args:
            reward: reward observed after the last action.
            state: resulting board (converted to a tuple for lookups).
            possible_moves: actions available from ``state``; may be empty.
        """
        next_state = tuple(state)
        max_q_next = max(
            (self.getQ(next_state, move) for move in possible_moves),
            default=0.0,
        )

        if self.state_action_last is None:
            # No action has been chosen since game_begin(); there is nothing
            # to update, and writing would create a junk ``None`` key.
            return

        self.Q[self.state_action_last] = self.q_last + self.alpha * (
            (reward + self.gamma * max_q_next) - self.q_last
        )

    def saveQ(self, file_name):
        """Pickle the Q-table to ``file_name + '.pkl'`` and print its size."""
        with open(file_name + '.pkl', 'wb') as handle:
            pickle.dump(self.Q, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print(len(self.Q))

    def loadQ(self, file_name):
        """Replace the Q-table with the one pickled in ``file_name + '.pkl'``."""
        # NOTE: pickle.load can execute arbitrary code; only load Q-tables
        # written by saveQ or obtained from a trusted source.
        with open(file_name + '.pkl', 'rb') as handle:
            self.Q = pickle.load(handle)
    

In [3]:
from TC_Env import TicTacToe

# Number of training iterations to run.
TRAINING_ITERATIONS = 200000

# Game instance that drives the training.
game = TicTacToe()

# Two separate learning agents: player1 is given the odd values,
# player2 the even ones.
player1 = Qlearning(values=[1, 3, 5, 7, 9])
player2 = Qlearning(values=[2, 4, 6, 8])

# Train both agents, then save the Q-tables.
game.startTraining(player1, player2, TRAINING_ITERATIONS, verbose=False)
game.saveStates()

Training Started
Training Complete
1650078
896386


In [4]:
# Unpack the two Q-tables returned by the game. Judging by the variable
# names these are the tables of the odd-value and even-value players --
# TODO confirm the return order against TC_Env.TicTacToe.getQ.
odd_player_Q, even_player_Q = game.getQ()

In [5]:
odd_player_Q

{((0, 0, 0, 0, 0, 0, 0, 0, 0), 3): -2.8262311171339585,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 1): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 2): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 4): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 5): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 6): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 7): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 8): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 9): 1.0,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 1): 2.3558314300000003,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 2): 0.16411,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 4): 0.02941390000000002,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 5): 3.594940372155051,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 6): 0.02941390000000002,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 7): -0.009410269999999978,
 ((0, 0, 3, 0, 0, 0, 0, 0, 6), 8): 0.0495796764281522,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 2): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 4): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 5): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 6): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 7): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 8): 1.0,
 ((0, 

In [6]:
even_player_Q

{((0, 0, 3, 0, 0, 0, 0, 0, 0), 1): -9.09999779434748,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 2): -9.099996849067828,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 4): -9.09999779434748,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 5): -9.09999779434748,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 6): -9.099996849067828,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 7): -9.09999779434748,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 8): -9.09999779434748,
 ((0, 0, 3, 0, 0, 0, 0, 0, 0), 9): -9.09999779434748,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 2): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 4): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 5): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 6): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 7): 1.0,
 ((9, 0, 3, 0, 0, 0, 0, 0, 6), 8): 1.0,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 1): -9.099998919230265,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 2): -9.099998919230265,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 3): -9.099998919230265,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 4): -9.099998919230265,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 5): -9.099999243461186,
 ((0, 0, 0, 0, 0, 0, 0, 5, 0), 6): -9.0999992434611

In [8]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 1)]

-4.30264394610117

In [9]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 2)]

-2.160378810317757

In [10]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 3)]

-2.8262311171339585

In [11]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 4)]

-2.9097058746905238

In [12]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 5)]

-3.07958605383777

In [13]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 6)]

-3.0223566221310194

In [14]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 7)]

-3.1384727683710043

In [15]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 8)]

-2.8261601825832

In [16]:
odd_player_Q[((0, 0, 0, 0, 0, 0, 0, 0, 0), 9)]

-0.5911816880020981