In [1]:
# Functions for setting up and playing TicTacToe
import numpy as numpy
import pickle
import copy
import random

# Generate an empty board
def getEmptyBoard():
    return [0,0,0,0,0,0,0,0,0]


# Generate a random board, never return a board that can't exist when following the game rules
def getRandomBoard():
    board = list(numpy.random.choice([0,1,2],(9)))
    if board.count(1) - board.count(2) < 0 or board.count(1) - board.count(2) > 1:
        return getRandomBoard()
    return board


# Show the board
def showBoard(board):
    S=""
    T=["·","O","X"]
    for i in [0,3,6]:
        S=S+T[board[i]]+" "+T[board[i+1]]+" "+T[board[i+2]]
        S=S+"\n"
    print (S)


# Check for winning line
def score(board):
    line=0
    if board[0]==board[1] and board[1]==board[2] and board[0]!=0:
        line=board[0]
    if board[3]==board[4] and board[4]==board[5] and board[3]!=0:
        line=board[3]
    if board[6]==board[7] and board[7]==board[8] and board[6]!=0:
        line=board[6]
        
    if board[0]==board[3] and board[3]==board[6] and board[0]!=0:
        line=board[0]
    if board[1]==board[4] and board[4]==board[7] and board[1]!=0:
        line=board[1]
    if board[2]==board[5] and board[5]==board[8] and board[2]!=0:
        line=board[2]

    if board[0]==board[4] and board[4]==board[8] and board[0]!=0:
        line=board[0]
    if board[2]==board[4] and board[4]==board[6] and board[2]!=0:
        line=board[2]  
    return line


# Check if the game is over
def gameOver(board):
    s=score(board)
    if board.count(0)==0 or s!=0:
        return True
    return False


# Make a move
def doMove(board,move,who):
    newBoard=copy.deepcopy(board)
    newBoard[move]=who
    return newBoard


# Get all legal moves
def getAllMoves(board):
    ret=[]
    for i in range(9):
        if board[i]==0:
            ret.append(i)
    return ret

In [2]:
# Test empty board
print("Empty board tests:")
testEmptyBoard = getEmptyBoard()
print(testEmptyBoard)
showBoard(testEmptyBoard)
print("Score:",score(testEmptyBoard))

# Test random boards
noOfTests = 3
print("\n%d random board tests:\n" % noOfTests)

for i in range(noOfTests):
    print("Run",i+1)
    testRandomBoard = getRandomBoard()
    showBoard(testRandomBoard)
    print("Score:",score(testRandomBoard),"\n")

# Test doMove
print("\nTest doMove():")
testDoMove=getEmptyBoard()
showBoard(testDoMove)
testDoMove=doMove(testDoMove,4,1)
showBoard(testDoMove)

# Test getAllMoves
print("\nTest getAllMoves():")
testGetAllMoves=getRandomBoard()
showBoard(testGetAllMoves)
print(getAllMoves(testGetAllMoves))

# Test gameOver
print("\nTest gameOver():")
for i in range(4):
    testGameOver=getRandomBoard()
    showBoard(testGameOver)
    print(gameOver(testGameOver),"\n---")

Empty board tests:
[0, 0, 0, 0, 0, 0, 0, 0, 0]
· · ·
· · ·
· · ·

Score: 0

3 random board tests:

Run 1
X X O
X · O
X O O

Score: 1 

Run 2
O O X
O · X
· X ·

Score: 0 

Run 3
· · X
· · X
· O O

Score: 0 


Test doMove():
· · ·
· · ·
· · ·

· · ·
· O ·
· · ·


Test getAllMoves():
X O X
O O O
O X X

[]

Test gameOver():
O · O
X O X
O · X

True 
---
O O X
X · O
O X ·

False 
---
X O O
O X O
· · X

True 
---
O · O
· · X
O · X

False 
---


In [3]:
# Generate a random strategy function
def addPossibleBoards(board,who):
    moves=getAllMoves(board)
    V=numpy.ones(len(moves))
    V=V/V.sum()
    if score(board)==0 and list(board).count(0)!=0:
        randomStrategy[tuple(board)]=V
    flipMove=[0,2,1]
    for move in moves:
        newBoard=doMove(board,move,who)
        addPossibleBoards(newBoard,flipMove[who])


# Generate randomStrategy
randomStrategy=dict()
board=getEmptyBoard()
addPossibleBoards(board,1)

# Show info about the random strategy
print(len(randomStrategy))
print(randomStrategy[tuple(getEmptyBoard())])
board=[1,1,2,2,1,0,0,0,2]
showBoard(board)
print(randomStrategy[tuple(board)])

# Save randomStrategy to a file
#pickle.dump(randomStrategy,open("./inClassRandomPolicy.p","wb"))

4520
[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
 0.11111111 0.11111111 0.11111111]
O O X
X O ·
· · X

[0.33333333 0.33333333 0.33333333]


In [11]:
import numpy as np

# Initialize the Q-table with some initial values
# The keys are tuples representing the states and the values are arrays of size 9 representing the Q-values for each possible move
# in that state
Q = dict()

# Define the learning rate and the discount factor
alpha = 0.1
gamma = 0.9

# Define the exploration policy, such as epsilon-greedy
epsilon = 0.1

# Define the number of episodes to train the policy
num_episodes = 10000

# flipMove is used to switch between players
flipMove=[0,2,1]

# Define who starts the game
who = 1

# Define a function to get the Q-value for a given state and action
def getQ(state, action):
    if tuple(state) not in Q:
        Q[tuple(state)] = np.zeros(9)
    return Q[tuple(state)][action]

# Define a function to update the Q-value for a given state and action
def updateQ(state, action, value):
    if tuple(state) not in Q:
        Q[tuple(state)] = np.zeros(9)
    Q[tuple(state)][action] = value

# Train the Q-learning policy
for i in range(num_episodes):
    # Start a new game
    board = getEmptyBoard()

    while not gameOver(board):
        # Select the next action using the exploration policy
        moves = getAllMoves(board)
        if np.random.uniform(0, 1) < epsilon:
            # Choose a random move with probability epsilon
            action = np.random.choice(moves)
        else:
            # Choose the move with the highest Q-value with probability 1 - epsilon
            action = np.argmax([getQ(board, move) for move in moves])

        # Make the selected move
        next_board = doMove(board, action, who)

        # Update the Q-value for the current state and action
        reward = score(next_board)
        if gameOver(next_board):
            updateQ(board, action, reward)
        else:
            next_moves = getAllMoves(next_board)
            updateQ(board, action, (1 - alpha) * getQ(board, action) + alpha * (reward + gamma * max([getQ(next_board, next_move) for next_move in next_moves])))
        
        board = next_board
        who = flipMove[who]

# Save the Q-table policy to a file
QlearningStrategy = Q

In [16]:
print(len(QlearningStrategy))
print(QlearningStrategy[tuple(getEmptyBoard())])
board=[1,1,2,2,1,0,0,0,2]
showBoard(board)
print(QlearningStrategy[tuple(board)])

# Save QlearningStrategy to a file
pickle.dump(QlearningStrategy,open("./QlearningStrategy.p","wb"))

5671
[1.16378942 0.63516962 0.84834151 0.71357569 0.66062935 0.57525428
 0.51118246 0.22759441 0.2409163 ]
O O X
X O ·
· · X

[0.20379356 0.         0.         0.         0.         2.
 0.         0.18       0.        ]


In [5]:
# Load policies from files
import pickle
randomPolicy=pickle.load(open("./inClassRandomPolicy.p","rb"))
print("randomPolicy length:", len(randomPolicy))
perfectPolicy=pickle.load(open("./perfectPolicy.p","rb"))
print("perfectPolicy length:", len(perfectPolicy))
#QlearningStrategy=pickle.load(open("./QlearningStrategy.p","rb"))
#print("QlearningStrategy length:", len(QlearningStrategy))

randomPolicy length: 4520
perfectPolicy length: 4520


In [6]:
# Discover policy files

## Perfect policy

#print(perfectPolicy)
#print(list(perfectPolicy.keys()))

# Test perfect policy
#for i in range(10):
#    keys=list(perfectPolicy.keys())
#    which=numpy.random.randint(0,4520)
#    board=keys[which]
#    showBoard(board)
#    print(perfectPolicy[board])
#    print("---")

## Random policy

#print(randomPolicy)
#print(list(randomPolicy.keys()))

# Test perfect policy
for i in range(10):
    keys=list(perfectPolicy.keys())
    which=numpy.random.randint(0,4520)
    board=keys[which]
    showBoard(board)
    print(perfectPolicy[board])
    print("---")

O X O
· · X
· · ·

[0. 1. 0. 0. 0.]
---
O X X
O O ·
· · ·

[1. 0. 0. 0.]
---
· · O
O · X
X X O

[1. 0. 0.]
---
O · ·
· · ·
· X O

[0. 0. 0. 1. 0. 0.]
---
· · ·
X O O
X X O

[0. 0. 1.]
---
· · O
O X X
O X ·

[1. 0. 0.]
---
O X X
O · ·
X · O

[1. 0. 0.]
---
X O O
X O O
· · X

[1. 0.]
---
X X ·
O · ·
· O O

[1. 0. 0. 0.]
---
X X O
O X O
· O ·

[0. 1.]
---


In [7]:
# Test perfect policy
for i in range(10):
    keys=list(perfectPolicy.keys())
    which=numpy.random.randint(0,4520)
    board=keys[which]
    showBoard(board)
    print(perfectPolicy[board])
    print("---")

· · X
O O ·
X O X

[0. 1. 0.]
---
X O O
X X O
· · ·

[0. 0. 1.]
---
· X ·
O O X
X · O

[1. 0. 0.]
---
O X ·
· · ·
X O O

[0. 0. 1. 0.]
---
O O X
· · X
O X ·

[1. 0. 0.]
---
O O X
· · X
X O O

[0. 1.]
---
· · ·
O X X
O O X

[1. 0. 0.]
---
· · X
· · ·
· O ·

[0. 0. 0. 0. 0. 0. 1.]
---
· X ·
O X O
X · O

[0. 1. 0.]
---
X X ·
· O X
O O ·

[1. 0. 0.]
---


In [29]:
# Functions for testing policies against eachother

# Play a game between two policies
def playTwoPolicies(policyA,policyB,verbose=False):
    flipMove=[0,2,1]
    who=1
    board=[0,0,0,0,0,0,0,0,0]
    done=False
    while not done:
        if verbose:
            showBoard(board)
        moves=getAllMoves(board)
        if who==1:
            p=policyA[tuple(board)]
        else:
            p=policyB[tuple(board)]
        if verbose:
            print(p,"\n---")
        p/=p.sum()
        choice=numpy.random.choice(moves,p=p)
        board=doMove(board,choice,who)
        s=score(board)
        if len(moves)==1 or s!=0:
            done=True
        who=flipMove[who]
    if verbose:
        showBoard(board)
    if s==0:
        return 0
    return flipMove[who]


# Sample games between two policies
def sampleGames(policyA,policyB,nrOfGames=100):
    result=[0,0,0]
    for n in range(nrOfGames):
        winner=playTwoPolicies(policyA,policyB)
        result[winner]+=1
    result=numpy.array(result)
    return result/result.sum()

In [31]:
# Play a game between two policies
playTwoPolicies(randomPolicy,randomPolicy,verbose=False)
#playTwoPolicies(randomPolicy,randomPolicy,verbose=True)

# Sample games between policies
#print("randomPolicy vs randomPolicy",sampleGames(randomPolicy,randomPolicy,nrOfGames=100000))
#print("perfectPolicy vs perfectPolicy",sampleGames(perfectPolicy,perfectPolicy,nrOfGames=100000))
#print("perfectPolicy vs randomPolicy",sampleGames(perfectPolicy,randomPolicy,nrOfGames=100000))
#print("randomPolicy vs perfectPolicy",sampleGames(randomPolicy,perfectPolicy,nrOfGames=100000))
#print("randomPolicy vs QlearningStrategy",sampleGames(randomPolicy,QlearningStrategy,nrOfGames=100000))


#print("QlearningStrategy vs QlearningStrategy",sampleGames(QlearningStrategy,QlearningStrategy,nrOfGames=100000))
#print("QlearningStrategy vs randomPolicy",sampleGames(QlearningStrategy,randomPolicy,nrOfGames=100000))
#print("QlearningStrategy vs perfectPolicy",sampleGames(QlearningStrategy,perfectPolicy,nrOfGames=100000))
#print("randomPolicy vs QlearningStrategy",sampleGames(randomPolicy,QlearningStrategy,nrOfGames=100000))
#print("perfectPolicy vs QlearningStrategy",sampleGames(perfectPolicy,QlearningStrategy,nrOfGames=100000))

1