## Tic-Tac-Toe Agent

#### Importing libraries

In [1]:
import collections
import numpy as np
import random
import pickle
import collections
from matplotlib import pyplot as plt
import time

In [2]:
class Qlearning:
    # Hyper Parameters are set here
    def __init__(self, epsilon=0.2, alpha=0.3, gamma=0.9, values=[]):
        self.epsilon=epsilon
        self.alpha=alpha
        self.gamma=gamma
        self.Q = {} #Q table
        self.last_board=None
        self.q_last=0.0
        self.state_action_last=None
        self.values = values
        self.options = []

    # Reset 
    def game_begin(self):
        self.last_board = None
        self.q_last = 0.0
        self.state_action_last = None
        self.options = self.values.copy()

    # esplion greedy stratergy
    def epslion_greedy(self, state, possible_moves): #esplion greedy algorithm
        #return  action
        self.last_board = tuple(state)
        if(random.random() < self.epsilon):
            move = random.choice(possible_moves) ##action
            self.state_action_last=(self.last_board,move)
            self.q_last=self.getQ(self.last_board,move)
            return move
        else: #greedy strategy
            Q_list=[]
            for action in possible_moves:
                Q_list.append(self.getQ(self.last_board,action))
            maxQ=max(Q_list)

            if Q_list.count(maxQ) > 1:
                # more than 1 best option; choose among them randomly
                best_options = [i for i in range(len(possible_moves)) if Q_list[i] == maxQ]
                i = random.choice(best_options)
            else:
                i = Q_list.index(maxQ)
            self.state_action_last = (self.last_board, possible_moves[i])
            self.q_last = self.getQ(self.last_board, possible_moves[i])
            return possible_moves[i]

    
    def getQ(self, state, action): #get Q states
        if(self.Q.get((state,action))) is None:
            self.Q[(state,action)] = 1.0
        return self.Q.get((state,action))

    def updateQ(self, reward, state, possible_moves): # update Q states using Qleanning
        q_list=[]
        for moves in possible_moves:
            q_list.append(self.getQ(tuple(state), moves))
        if q_list:
            max_q_next = max(q_list)
        else:
            max_q_next = 0.0
        self.Q[self.state_action_last] = self.q_last + self.alpha * ((reward + self.gamma*max_q_next) - self.q_last)

    def saveQ(self,file_name):  #save table
        with open(file_name +'.pkl', 'wb') as handle:
            pickle.dump(self.Q, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print(len(self.Q.keys())) 

    def loadQ(self,file_name): # load table
        with open(file_name + '.pkl', 'rb') as handle:
            self.Q = pickle.load(handle)
    

In [3]:
from TC_Env import TicTacToe

game = TicTacToe() #game instance
player1 =  Qlearning(values=[1,3,5,7,9]) #player1 learning agent
player2 =  Qlearning(values=[2,4,6,8]) #player2 learning agent
game.startTraining(player1,player2, 200000, verbose=False) #train for 200,000 iterations
game.saveStates()  #save Qtables

Training Started
Training Complete
1758582
1758574


In [4]:
odd_player_Q, even_player_Q = game.getQ()

In [5]:
odd_player_Q

{((0, 0, 0, 0, 0, 0, 0, 0, 0), 1): -2.949019005866651,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 2): -1.6768168088678534,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 3): -3.3632495300201506,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 4): -0.6437370611219675,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 5): -2.406244625037909,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 6): -4.244868683433749,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 7): 0.11959546214057448,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 8): -1.7976227444962052,
 ((0, 0, 0, 0, 0, 0, 0, 0, 0), 9): -1.8123575410266912,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 1): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 2): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 3): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 4): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 6): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 7): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 8): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 9): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 1): -2.0823615371231057,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 2): -1.1509999999999998,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 3): -0.7559952072249101,
 ((0, 0, 0,

In [6]:
even_player_Q

{((0, 0, 0, 0, 1, 0, 0, 0, 0), 1): 1.035709542088035,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 2): -0.130574775247436,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 3): -4.0421384380868135,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 4): -1.2524168683982877,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 6): 0.8262069936737676,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 7): -0.36940010591487527,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 8): 0.49472193006816834,
 ((0, 0, 0, 0, 1, 0, 0, 0, 0), 9): 0.5730759511046681,
 None: 1.2940003726779474,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 1): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 2): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 3): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 4): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 6): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 7): 1.0,
 ((0, 0, 0, 0, 1, 0, 0, 0, 6), 8): 1.0,
 ((0, 0, 3, 0, 1, 0, 0, 0, 6), 1): 0.949,
 ((0, 0, 3, 0, 1, 0, 0, 0, 6), 2): 0.9343,
 ((0, 0, 3, 0, 1, 0, 0, 0, 6), 4): -1.1509999999999998,
 ((0, 0, 3, 0, 1, 0, 0, 0, 6), 6): 0.949,
 ((0, 0, 3, 0, 1, 0, 0, 0, 6), 7): 0.949,
 ((0, 0, 3, 0, 1, 0, 0, 