In [173]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice,random
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

In [2]:
State = namedtuple('State', ['x', 'o'])

In [3]:
MAGIC = [2, 7, 6, 
         9, 5, 1, 
         4, 3, 8]

In [78]:
def print_board(pos):
    """Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos.x:
                print('X', end='|')
            elif MAGIC[i] in pos.o:
                print('O', end='|')
            else:
                print('.', end='|')
        print()
    print()

In [5]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))

def state_value(pos: State):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0
    
    

In [6]:
def random_game():
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9+1))
    while available:
        x = choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if win(state.o):
            break
    return trajectory

In [7]:
value_dictionary = defaultdict(float)
hit_state = defaultdict(int)
epsilon = 0.001

for steps in tqdm(range(100_000)):
    trajectory = random_game()
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] = value_dictionary[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary[hashable_state])
        

  0%|          | 0/100000 [00:00<?, ?it/s]

In [128]:
def best_move(state:State):
    """Montecarlo learning"""
    possible_moves={1,2,3,4,5,6,7,8,9} 
    available_moves= state.x.union(state.o)
    available_moves=possible_moves.difference(available_moves)
    next_moves=dict()
    if len(available_moves)==0:
        return -1
    for move in available_moves:
        next_move=deepcopy(state)
        next_move.x.add(move)
        hashable_move = (frozenset(next_move.x), frozenset(next_move.o))
        next_moves[hashable_move]=value_dictionary[hashable_move]
    
    m= max(list(next_moves.items()), key=lambda e: e[1])
    return State(set(m[0][0]),set(m[0][1]))
    
        

In [129]:
s=State({1,2,3},{4,5,7})
print_board(s)
best=best_move(s)
s

X|O|.|
.|O|X|
O|X|.|


State(x={1, 2, 3}, o={4, 5, 7})

In [118]:
def random_move(state: State)-> State:
    """plays random move, it is always O"""
    possible_moves={1,2,3,4,5,6,7,8,9} 
    available_moves= state.x.union(state.o)
    available_moves=possible_moves.difference(available_moves)
    new_state=deepcopy(state)
    m = choice(list(available_moves))
    new_state.o.add(m)
    return new_state

In [97]:
def available(state:State)-> bool:
    possible_moves={1,2,3,4,5,6,7,8,9}
    available =possible_moves.difference(state.x.union(state.o))
    return False if len(available)==0 else True

In [159]:
def possible_moves(state:State)->set:
    if win(state.x) or win(state.o):
        return {}
    possible_moves={1,2,3,4,5,6,7,8,9}
    available =possible_moves.difference(state.x.union(state.o))
    return available

In [149]:
def best_random_game(printable=False)->int:
    """game between MonteCarlo and purerandom player"""
    state=State(set(),set())
    if printable:
        print_board(state)
    
    while available(state):
        state=best_move(state)
        if printable:
            print_board(state)
        if win(state.x):
            break
        
        if available(state):
            state=random_move(state)
            if printable:
                print_board(state)
            if win(state.o):
                break
    return int(win(state.x))

In [150]:
best_random_game()

1

In [152]:
"""Results of Montecarlo vs purerandom player"""
wins=0
for _ in tqdm(range(10000)):
    wins+=best_random_game()
wins

  0%|          | 0/10000 [00:00<?, ?it/s]

9907

In [286]:
class Qlearning():
    q_table = {}
    win_REWARD = 10
    loss_REWARD = -10
    def __init__(self, k = None, epsilon = 0.98, alfa = 1, gamma = 1) -> None:        
        self.epsilon = epsilon                      # epsilon
        self.alfa = alfa        # alpha
        self.gamma = gamma      # gamma  
        self.loss_REWARD=-10
        self.win_REWARD=10
        
    def makeKey(self, state):
        """initializes Q Table"""
        if win(state.x) or win(state.o) or len(list(possible_moves(state)))==0:
            return
        poss_moves =list(possible_moves(state))
        an_action = list(possible_moves(state))[0]
        #print(f"poss moves: {poss_moves} an action: {an_action}")
        # creating a Q Table
        hashable_state = (frozenset(state.x), frozenset(state.o))
        if (hashable_state, an_action) not in self.q_table:
            for i in poss_moves:
                self.q_table[(hashable_state,i)] = np.random.uniform(0.0,0.01)
    
    def policy(self,state)->int:
        """returns next move for X"""
        if win(state.x) or win(state.o) or len(list(possible_moves(state)))==0:
            return -1
        hashable_state = (frozenset(state.x), frozenset(state.o))
        poss_moves = list(possible_moves(state))
        if random() < self.epsilon:
            # choose the action with the highest reward
            q_values = [self.q_table[(hashable_state,i)] for i in poss_moves]
            return poss_moves[np.argmax(q_values)]
        else:
            # choose a random action
            return choice(poss_moves)
    def max_val(self,state):
        hashable_state = (frozenset(state.x), frozenset(state.o))
        poss_moves = list(possible_moves(state))
        q_values = [self.q_table[(hashable_state,i)] for i in poss_moves]
        return poss_moves[np.argmax(q_values)]
    def update_q(self,state, action, next_state, next_action):
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hashable_next_state = (frozenset(next_state.x), frozenset(next_state.o))
        reward=0
        if win(next_state.x):
            reward+=self.win_REWARD
        elif win(next_state.o):
            reward+=self.loss_REWARD
        self.q_table[(hashable_state,action)]= (1-self.alfa)* self.q_table[(hashable_state,action)]+ self.alfa*(reward+ self.gamma* self.q_table[(hashable_next_state,next_action)])
        

In [282]:
def training(q,printable=False):
    state=State(set(),set())
    q.makeKey(state)
    if printable:
        print_board(state)
    
    while available(state):
        action=q.policy(state)
        prev_state=deepcopy(state)
        state.x.add(action)
        if printable:
            print_board(state)
        if win(state.x):
            break
        
        if available(state):
            state=random_move(state)
            if printable:
                print_board(state)
            if win(state.o):
                break
        if available(state):            
            q.makeKey(state)
            q.update_q(prev_state,action, state,q.policy(state))
    return int(win(state.x))

In [285]:
"""Results of Qlearning vs pure random player"""
wins=0
q=Qlearning()
for _ in tqdm(range(10000)):
    wins+=training(q)
wins

  0%|          | 0/10000 [00:00<?, ?it/s]

6717