In [1]:
%load_ext autoreload
%autoreload 2

# Importar entorno y familiarizarse

In [2]:
from boardgame2 import ReversiEnv
import numpy as np

# Crear 3 tipos de jugador
- Random: Selecciona uniformemente una de las acciones válidas
- Greedy: Selecciona la acción que le da más ganancia inmediata (cantidad de piezas que come). Si hay más de una acción que da máxima ganancia, muestrear uniformemente entre ellas
- Optimum (solo para 4x4): Usando resultados de la PI optima obtenida por policy iteration

Tener en cuenta que:
- ReversiEnv tiene los métodos get_valid y next_step y no es necesario mantener el estado del entorno
- env.PASS ([-1,  0]) es una acción válida posible y debería usarse cuando get_valid devuelve una matriz de ceros

Para el óptimo en 4x4, usar la PI obtenida en la notebook anterior, guardada en /mdp

In [3]:
import random

In [4]:
class GreedyPlayer():
    """Player that picks the move leaving the opponent with the fewest pieces.

    Ties between equally good moves are broken uniformly at random. When no
    move is valid the player returns ``env.PASS``.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if env is None:
            if board_shape is None:
                print("board_shape and env can't be both None")
            env = ReversiEnv(board_shape=board_shape)
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        self.env = env
        # Side length, read from the environment's board.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return the greedy action for ``board`` from this player's side.

        The action is returned as-is, or as ``row * board_shape + col`` when
        ``flatten_action`` is set.
        """
        state = (board, self.player)
        candidates = np.argwhere(self.env.get_valid(state))

        if len(candidates) == 0:
            action = self.env.PASS
        else:
            opponent = -self.player
            # (move, opponent pieces left after the move), in argwhere order.
            scored = []
            for move in map(tuple, candidates):
                board_after = self.env.get_next_state(state, move)[0]
                scored.append((move, np.count_nonzero(board_after == opponent)))

            # Fewest opponent pieces left is the same as most pieces captured.
            fewest = min(count for _, count in scored)
            best_moves = [move for move, count in scored if count == fewest]
            action = random.choice(best_moves)

        if not self.flatten_action:
            return action
        return action[0] * self.board_shape + action[1]
        
class RandomPlayer():
    """Player that picks uniformly at random among the currently valid moves.

    Parameters
    ----------
    player : int
        Color this player controls (1 or -1).
    board_shape : optional
        Forwarded to ``ReversiEnv`` when ``env`` is not supplied.
    env : optional
        Environment used to query valid moves; built from ``board_shape``
        when omitted.
    flatten_action : bool
        When True, ``predict`` returns ``row * board_shape + col`` instead of
        the (row, col) action.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if (env is None) and (board_shape is None):
            print("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player
        self.flatten_action = flatten_action
        # Side length, read from the environment's board.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return a uniformly sampled valid action, or ``env.PASS`` if none exists."""
        state = (board, self.player)

        valid_board = self.env.get_valid(state)
        valid_actions = np.argwhere(valid_board)

        if len(valid_actions) == 0:
            action = self.env.PASS
        else:
            # Pick the row by index instead of random.choice(ndarray): a 2-D
            # array is not a plain sequence, and CPython 3.11.0/3.11.1 raise
            # ValueError when random.choice evaluates its truthiness.
            row = random.randrange(len(valid_actions))
            action = tuple(valid_actions[row])

        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        else:
            return action
        

class DictPolicyPlayer():
    """Player that looks its move up in a precomputed policy dictionary.

    Parameters
    ----------
    player : int
        Color this player controls (1 or -1).
    board_shape : optional
        Forwarded to ``ReversiEnv`` when ``env`` is not supplied.
    env : optional
        Environment providing ``PASS`` and the board size; built from
        ``board_shape`` when omitted.
    flatten_action : bool
        When True, ``predict`` returns ``row * board_shape + col`` instead of
        the stored action.
    dict_folder : str
        Path of the ``.npy`` file holding the pickled policy dictionary
        (despite the name, this is a file path, not a folder).
    """

    def __init__(self, player=1, board_shape=4, env=None, flatten_action=False, dict_folder='mdp/pi_mdp.npy'):
        # NOTE(security): allow_pickle=True unpickles arbitrary Python objects;
        # only load policy files that come from a trusted source.
        self.pi_dict = np.load(dict_folder, allow_pickle=True).item()
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player
        self.flatten_action = flatten_action
        # Read the side length from the environment (as the other players do)
        # so the flattened index stays correct when a custom env is supplied.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return the stored action for ``board``, or ``env.PASS`` if it is not in the policy."""
        # Keys are the flattened board multiplied by this player's color,
        # presumably so a policy stored from player 1's point of view can
        # serve either color (verify against how the dictionary was built).
        key = tuple(board.flatten() * self.player)

        action = self.pi_dict.get(key, self.env.PASS)

        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        else:
            return action

In [5]:
# One instance of each strategy, all playing as player 1 on a 4x4 board.
gp, rp, pp = (
    player_cls(player=1, board_shape=4)
    for player_cls in (GreedyPlayer, RandomPlayer, DictPolicyPlayer)
)

# Verificar que el pass funciona OK

In [9]:
# Build a board with no playable move to exercise the pass branch of every strategy.
invalid_board = np.zeros((4, 4), dtype=int)
invalid_board[0, 0] = 1
invalid_board[3, 3] = -1

pass_checks = (
    ("Random action: ", rp),
    ("Greedy action: ", gp),
    ("Optimum action: ", pp),
)
for trial in range(10):
    for caption, agent in pass_checks:
        print(caption, agent.predict(invalid_board))
    print()

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]

Random action:  [-1  0]
Greedy action:  [-1  0]
Optimum action:  [-1  0]



### Verifico también que las estrategias funcionen bien con un tablero valido:

In [10]:
# Position used to check that every strategy returns a move on a playable board.
board = np.array(
    [[1, 0, 0, 0], [-1, 1, -1, 0], [0, -1, 1, 0], [-1, 1, 0, 0]]
)

move_checks = (
    ("Random action: ", rp),
    ("Greedy action: ", gp),
    ("Optimum action: ", pp),
)
for trial in range(10):
    for caption, agent in move_checks:
        print(caption, agent.predict(board))
    print()

Random action:  (2, 0)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (2, 0)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (2, 0)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (0, 2)
Greedy action:  (2, 0)
Optimum action:  (2, 0)

Random action:  (1, 3)
Greedy action:  (2, 0)
Optimum action:  (2, 0)



### Verifico que jueguen bien como player 2 (-1):

In [12]:
# Position used to check the strategies when they play as player -1.
board = np.array(
    [[0, 0, 0, 0], [0, 1, 1, 1], [0, -1, 1, 0], [0, 0, 0, 0]]
)

# Switch the existing players to the other color (this mutates rp, gp and pp).
for agent in (rp, gp, pp):
    agent.player = -1

second_seat_checks = (
    ("Random action: ", rp),
    ("Greedy action: ", gp),
    ("Optimum action: ", pp),
)
for trial in range(10):
    for caption, agent in second_seat_checks:
        print(caption, agent.predict(board))
    print()

Random action:  (2, 3)
Greedy action:  (2, 3)
Optimum action:  (0, 3)

Random action:  (0, 1)
Greedy action:  (0, 1)
Optimum action:  (0, 3)

Random action:  (0, 3)
Greedy action:  (2, 3)
Optimum action:  (0, 3)

Random action:  (0, 1)
Greedy action:  (2, 3)
Optimum action:  (0, 3)

Random action:  (0, 3)
Greedy action:  (0, 3)
Optimum action:  (0, 3)

Random action:  (0, 1)
Greedy action:  (0, 1)
Optimum action:  (0, 3)

Random action:  (2, 3)
Greedy action:  (0, 3)
Optimum action:  (0, 3)

Random action:  (2, 3)
Greedy action:  (2, 3)
Optimum action:  (0, 3)

Random action:  (0, 3)
Greedy action:  (2, 3)
Optimum action:  (0, 3)

Random action:  (0, 1)
Greedy action:  (0, 3)
Optimum action:  (0, 3)



# Completar la función que dado dos jugadores imprima estadísticas de las partidas

Por ejemplo:
(Las estadísticas son relativas al jugador que se pasa primero a la función)

Wins as first: 0.35

Wins as second: 0.55

Plays as first: 2457

Plays as second: 2543

Avg game duration: 5.937

In [13]:
def arena_stats(Player_1, Player_2, board_shape, N=500):
    """Play ``N`` games between two player classes and print stats for ``Player_1``.

    Parameters
    ----------
    Player_1, Player_2 : classes
        Player classes instantiated with ``player``, ``board_shape`` and
        ``flatten_action`` keyword arguments and exposing ``predict(board)``.
        All printed statistics refer to ``Player_1``.
    board_shape :
        Board size forwarded to ``ReversiEnv`` and to both players.
    N : int
        Number of games to play.

    Each game randomly assigns color 1 or -1 to ``Player_1``. Games where it
    gets color 1 are counted as "first" (this assumes the side numbered 1 is
    the one ``env.reset()`` lets move first -- TODO confirm). A win is counted
    when the final ``reward`` equals ``Player_1``'s color, a tie when it is 0.
    Rates with an empty denominator (``N == 0`` or a seat never drawn) are
    printed as ``nan`` instead of dividing by zero.
    """

    def _ratio(numerator, denominator):
        # No games in this bucket means there is no meaningful rate.
        return numerator / denominator if denominator else float("nan")

    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    ties = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
    player_2 = Player_2(player=-1, board_shape=board_shape, flatten_action=False)
    for _ in range(N):
        # Sometimes one player starts, sometimes the other.
        first_player = np.random.choice([-1, 1])
        player_1.player = first_player
        player_2.player = -first_player

        player_1_is_first = bool(first_player == 1)
        if player_1_is_first:
            plays_as_first += 1
        else:
            plays_as_second += 1

        done = False
        n_steps = 0
        (board, player) = env.reset()

        while not done:
            # Player_1 moves whenever the side to move is its color.
            if first_player == player:
                action = player_1.predict(board)
            else:
                action = player_2.predict(board)
            (board, player), reward, done, info = env.step(action)
            n_steps += 1
        total_steps += n_steps

        if reward == first_player:
            if player_1_is_first:
                wins_as_first += 1
            else:
                wins_as_second += 1
        if reward == 0:
            ties += 1

    print(f'Wins as first: {_ratio(wins_as_first, plays_as_first)}')
    print(f'Wins as second: {_ratio(wins_as_second, plays_as_second)}')
    print(f'Ties: {_ratio(ties, N)}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {_ratio(total_steps, N)}')
        
    

In [14]:
# Policy-dictionary player vs. greedy player on a 4x4 board, 2000 games (stats are for the first argument).
arena_stats(DictPolicyPlayer, GreedyPlayer, 4, N=2000)

Wins as first: 0.8404040404040404
Wins as second: 1.0
Ties: 0.0
Plays as first: 990
Plays as second: 1010
Avg game duration: 11.722


In [15]:
# Policy-dictionary player vs. random player on a 4x4 board, 1000 games (stats are for the first argument).
arena_stats(DictPolicyPlayer, RandomPlayer, 4, N=1000)

Wins as first: 0.772635814889336
Wins as second: 1.0
Ties: 0.024
Plays as first: 497
Plays as second: 503
Avg game duration: 11.669


In [16]:
# Same matchup with the arguments swapped, so the stats now describe the random player.
arena_stats(RandomPlayer, DictPolicyPlayer, 4, N=1000)

Wins as first: 0.0
Wins as second: 0.12573673870333987
Ties: 0.018
Plays as first: 491
Plays as second: 509
Avg game duration: 11.637


In [17]:
# Random player vs. greedy player on a 4x4 board, 1000 games (stats are for the random player).
arena_stats(RandomPlayer, GreedyPlayer, 4, N=1000)

Wins as first: 0.4085603112840467
Wins as second: 0.5493827160493827
Ties: 0.09
Plays as first: 514
Plays as second: 486
Avg game duration: 11.658


In [18]:
# Random player against itself on a 4x4 board, using the default number of games (N=500).
arena_stats(RandomPlayer, RandomPlayer, 4)

Wins as first: 0.34051724137931033
Wins as second: 0.5783582089552238
Ties: 0.108
Plays as first: 232
Plays as second: 268
Avg game duration: 11.73


In [19]:
# Greedy player against itself on a 4x4 board, using the default number of games (N=500).
arena_stats(GreedyPlayer, GreedyPlayer, 4)

Wins as first: 0.42105263157894735
Wins as second: 0.525691699604743
Ties: 0.072
Plays as first: 247
Plays as second: 253
Avg game duration: 11.64


In [22]:
# Random player vs. greedy player on an 8x8 board, 1000 games (stats are for the random player).
arena_stats(RandomPlayer, GreedyPlayer, 8, N=1000)

Wins as first: 0.374
Wins as second: 0.352
Ties: 0.036
Plays as first: 500
Plays as second: 500
Avg game duration: 57.157


In [23]:
# Greedy player vs. random player on an 8x8 board, 1000 games (stats are for the greedy player).
arena_stats(GreedyPlayer, RandomPlayer, 8, N=1000)

Wins as first: 0.59375
Wins as second: 0.5557692307692308
Ties: 0.054
Plays as first: 480
Plays as second: 520
Avg game duration: 58.177


# Guardar todas las clases de jugadores en un players.py para que luego se puedan importar de la siguiente forma:

from players import RandomPlayer

from players import GreedyPlayer