Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [11]:
from itertools import permutations
import numpy as np
from random import choice
from collections import defaultdict
from tqdm.auto import tqdm
from tabulate import tabulate
import pandas as pd
import seaborn as sns

from copy import deepcopy
class TicTacToe():
    def __init__(self):
        self.MAP = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]]);   
        self.x_move=[];
        self.o_move=[];
        
    def display(self,mode="human"):
        board = np.zeros((3, 3), dtype=str)
        for r in range(3):
            for c in range(3):
             if self.MAP[r, c] in self.x_move:
                board[r,c] = "X"
             elif self.MAP[r, c] in self.o_move:
                board[r,c] = "O"
             else:
                board[r,c] = "-"
        if mode == "human":
            board = tabulate(board, tablefmt="fancy_grid")
        print(board)
        
            
    def eval_terminal(self):         ##{x = mosse fatte da x  primo insieme}{o=mosse fatte da o}
        if any(sum(h) == 12 for h in permutations(self.x_move, 3)):
           
            return 1
        elif any(sum(h) == 12 for h in permutations(self.o_move, 3)):
            
            return -1
        else:
       
         return 0
        
    

In [12]:
def random_game()-> list:
    trajectory = list()
    state = TicTacToe()
    available = set(range(0, 8+1))
    while available:
        x = choice(list(available))
        state.x_move.append(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if state.eval_terminal() or not available:
            break

        o = choice(list(available))
        state.o_move.append(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if state.eval_terminal():
            break
    return trajectory


### Reinforcement  Learning --Montecarlo

In [13]:
value_dictionary = defaultdict(float)
hit_state = defaultdict(int)
epsilon = 0.01

for steps in range(8000_000):
    trajectory = random_game()
    final_reward = trajectory[-1].eval_terminal()
    for state in trajectory:
        hashable_state = (frozenset(state.x_move), frozenset(state.o_move))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] = value_dictionary[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary[hashable_state])



### Benchmark

-Random
-Blocking Opponent

In [14]:
class Benchmark():
    def __init__(self,total_game,agent_policies,enemy_strategy):
        self.agent_win=0; 
        self.draw=0;
        self.agent_lost=0;     
        self.total_game=total_game;
        self.agent_policies=agent_policies;
        self.enemy_strategy=enemy_strategy
        
    def run(self):
        
        for _ in range(self.total_game):
            game(self)


    def add_win(self):
            self.agent_win+=1
    def add_draw(self):
            self.agent_lost+=1
    def add_lost(self):
            self.draw+=1

    def result(self):
     
        row = {
            "Enemy Strategy": [self.enemy_strategy.__name__],
            "Win %": [ round((self.agent_win / self.total_game) * 100, 2)],
            "Lost %": [ round((self.agent_lost / self.total_game) * 100,2)],
            "Draw %": [ round((self.draw / self.total_game) * 100,2)]
        }

        return row
         

def find_max_value_for_keys(state_dict, reference_key):
        x_set_ref, o_set_ref = reference_key
        max_value = float('-inf') 
        value=float('-inf')
        count=0
     
        for key in state_dict:
             x_set, o_set = key
             
             
             if (len(x_set) == len(x_set_ref) + 1 and x_set_ref.issubset(x_set)) or \
                (len(o_set) == len(o_set_ref) + 1 and o_set_ref.issubset(o_set)):
                     value = state_dict[key]
                     count+=1
           
                

             if  value > max_value:
                max_value = value   
    
        return max_value

def agent_choice(state,agent_policies,available):
    max_value = float('-inf')
    best_move = float('-inf')
    
    for free in available:
        
        tm_state=deepcopy(state)
        tm_state.x_move.append(free)
        value=find_max_value_for_keys(agent_policies,(frozenset(tm_state.x_move), frozenset(tm_state.o_move)))
        if value > max_value:
                max_value=value
                best_move=free
    
    return best_move



          
def game(benchmark):   #1 metrica
    trajectory = list()
    state = TicTacToe()
    flag_player=1
    won=-1; ##If won remain -1 it's a draw
    available = set(range(0, 8+1))
    while available:
        flag_player = (flag_player + 1) % 2 
        if(flag_player==0): ##agent turn
            agent = agent_choice(state,benchmark.agent_policies,available)
            if agent ==  float('-inf'): #no value for available_move
                agent=random_strategy(available,state)

            state.x_move.append(agent)
            available.remove(agent)

            if state.eval_terminal() or not available:
                won=1;
                benchmark.add_win()
                break
        if flag_player==1: #enemy turn        
            enemy = benchmark.enemy_strategy(available,state)#enemy_random_choice
            state.o_move.append(enemy)
            trajectory.append(deepcopy(state))
            available.remove(enemy)
            if state.eval_terminal()==-1:
                won=0;
                benchmark.add_lost()
                break
    #state.display()
    if(won==-1):
       benchmark.add_draw()


def random_strategy(available,state):
     return choice(list(available)) 

def blocking_opponent_strategy(available,state):
     flag=0 #flag blocked
     for move in available:
          tm_state=deepcopy(state)
          tm_state.x_move.append(move)
          if tm_state.eval_terminal()==1:
            flag=1
            return move
     if flag==0:
       return random_strategy(available,state)
     

def center_strategy(available,state):
     if  4 in available: #take central square
          return 4
     else:
          return random_strategy(available,state)
     


def corner_strategy(available,state):
     #take corner if is free
      corners=[1,5,3,7]
      for corner in corners:
        if  corner in available: #if the corner is available ,take it
            return corner
   
      return random_strategy(available,state)
        
def blocking_plus_center_strategy(available,state):
    flag=0 #flag blocked
    corners=[1,5,3,7]
    for move in available:
          tm_state=deepcopy(state)
          tm_state.x_move.append(move)
          if tm_state.eval_terminal()==1:
            flag=1
            return move
    if flag==0:
        if  4 in available: #take central square
          return 4
        else:
          return random_strategy(available,state)
     
def blocking_plus_center_plus_corner_strategy(available,state):
    flag=0 #flag blocked
    corners=[1,5,3,7]
    for move in available:
          tm_state=deepcopy(state)
          tm_state.x_move.append(move)
          if tm_state.eval_terminal()==1:
            flag=1
            return move
    if flag==0:
        if  4 in available: #take central square
          return 4
        else:
           for corner in corners:
                if  corner in available: #if the corner is available ,take it
                  return corner
   
           return random_strategy(available,state)
     
     
    

def display(rows):
    tabella= pd.DataFrame(data=rows)
    cm=sns.light_palette("green",as_cmap=True)
    tabella.style.background_gradient(cmap=cm)
    tabella.style.bar(color='#fffaf')

    return tabella.head(len(rows))
    
     

### Benchmark Run

In [17]:

enemy_strategy=[random_strategy,center_strategy,blocking_opponent_strategy,corner_strategy,blocking_plus_center_strategy,blocking_plus_center_plus_corner_strategy]

rows=[]
for strategy in enemy_strategy:
    bench= Benchmark(1_000,value_dictionary,strategy)
    bench.run()
    rows.append(bench.result())
    
display(rows)






Unnamed: 0,Enemy Strategy,Win %,Lost %,Draw %
0,[random_strategy],[82.0],[0.0],[18.0]
1,[center_strategy],[80.2],[0.0],[19.8]
2,[blocking_opponent_strategy],[73.3],[0.0],[26.7]
3,[corner_strategy],[100.0],[0.0],[0.0]
4,[blocking_plus_center_strategy],[71.8],[0.0],[28.2]
5,[blocking_plus_center_plus_corner_strategy],[0.0],[0.0],[100.0]
