In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from boardgame2 import ReversiEnv
import numpy as np
from players import RandomPlayer
import gym

# SelfPlayer

En esta notebook se pide armar un entorno al cual se le pase como parámetro la clase de jugador local (DictPlayer, RandomPlayer, GreedyPlayer), y que el entorno devuelva el siguiente paso luego de jugar con el jugador local. Algunas condiciones:
- En la función reset() se sortea si el jugador local juega primero o segundo.
- El entorno siempre devolverá el tablero como si le tocara jugar al jugador 1, sea primero o segundo.
- La clase se instancia con los siguientes parámetros:
    - LocalPlayer
    - board_shape
    
- El método step recibirá como parámetro la acción, pero codificada no como action = [columna, fila], sino como: action = action[0] * board_shape + action[1]
- self.action_space tiene que estar definido acorde al espacio de acción. Por ejemplo: self.action_space = gym.spaces.Discrete(board_shape**2)
- self.observation_space también: self.observation_space = gym.spaces.Box(-1, 1, (1, board_shape,board_shape))


# Ejemplos

El jugador local juega segundo, entonces el reset() devuelve (notar que no se devuelve el player porque siempre juega el 1):

In [None]:
[[ 0,  0,  0,  0],
 [ 0,  1, -1,  0],
 [ 0, -1,  1,  0],
 [ 0,  0,  0,  0]]

El jugador local juega primero entonces el reset() devuelve:

En cuanto a la recompensa, tener en cuenta que deberá devolver:
- 1 si gana el jugador externo
- -1 si gana el LocalPlayer

In [3]:
class SelfPlayEnv(ReversiEnv):
    """Reversi environment in which the opponent's moves are played internally.

    The external agent interacts through ``reset``/``step`` only; whenever it
    is the local player's turn, an instance of ``LocalPlayer`` picks the move
    inside the environment. Observations are the bare board (no player id),
    sign-flipped so that the side about to move is always represented as 1.

    Parameters
    ----------
    board_shape : int
        Side length of the square board. Also sizes the action space
        (``board_shape ** 2`` flat indices) and the observation space.
    LocalPlayer : class
        Player class, instantiated as
        ``LocalPlayer(board_shape=board_shape, flatten_action=False)``.
        Required, despite the ``None`` default kept for signature compatibility.

    Raises
    ------
    TypeError
        If ``LocalPlayer`` is not provided.
    """

    def __init__(self, board_shape=8, LocalPlayer=None):
        # Fail fast with a readable message instead of the opaque
        # "'NoneType' object is not callable" raised further down.
        if LocalPlayer is None:
            raise TypeError(
                "SelfPlayEnv requires a LocalPlayer class (e.g. RandomPlayer); got None"
            )
        super().__init__(board_shape=board_shape)

        self.players = [-1, 1]
        self.local_player = LocalPlayer(board_shape=board_shape, flatten_action=False)
        self.board_shape = board_shape
        # Defined here as well so that the attribute exists before the first reset().
        self.n_step = 0

        self.action_space = gym.spaces.Discrete(board_shape**2)
        self.observation_space = gym.spaces.Box(-1, 1, (1, board_shape, board_shape))

    def play(self, observation):
        """Let the local player choose a move for ``observation`` and apply it.

        Returns the base environment's step result unchanged:
        ``((board, next_player), reward, done, info)``. ``current_player_num``
        is updated to ``next_player`` as a side effect.
        """
        action = self.local_player.predict(observation)
        (observation, self.current_player_num), reward, done, info = super().step(action)

        return (observation, self.current_player_num), reward, done, info

    def encode_observation(self, observation, valid_actions=False):
        """Return the board from the point of view of the player about to move.

        Only the board is returned, never the player id; this is what is fed
        to the neural network. With ``valid_actions=True`` the result is a
        stacked array ``[board, valid_moves_mask]`` where the mask is computed
        for player 1 on the flipped board.
        """
        # Multiplying by the current player's sign makes the side to move appear as 1
        # (assumes pieces are stored with the same +1/-1 values as the player numbers).
        board = observation * self.current_player_num
        if valid_actions:
            return np.array([board, self.get_valid((board, 1))])
        return board

    def reset(self):
        """Start a new game and return the first observation for the external agent.

        The local player's side is drawn at random from ``self.players``. If
        the local player is the one to move first, its opening move is played
        before returning.
        """
        self.n_step = 0
        self.local_player_num = np.random.choice(self.players)
        self.local_player.player = self.local_player_num
        self.observation, self.current_player_num = super().reset()
        self.allow_pass = True

        if self.current_player_num == self.local_player_num:
            (self.observation, self.current_player_num), _, done, _ = self.play(self.observation)
            assert not done, "game ended on the local player's opening move"

        return self.encode_observation(self.observation)

    def encode_action(self, action):
        """Convert a flat action index into ``[action // board_shape, action % board_shape]``.

        Needed because the network outputs a single value between 0 and
        ``board_shape**2 - 1`` while the base environment takes a coordinate pair.
        """
        return [action // self.board_shape, action % self.board_shape]

    def decode_action(self, action):
        """Inverse of ``encode_action``: flatten a coordinate pair into one index."""
        return action[0] * self.board_shape + action[1]

    def step(self, action):
        """Apply the external agent's flat ``action``, then let the local player reply.

        The local player keeps moving for as long as the game is not over and
        it is still its turn (the loop condition covers consecutive turns).

        Returns ``(observation, reward, done, info)`` where ``observation`` is
        the encoded board, ``info`` is always an empty dict, and ``reward`` is
        the base environment's last reward multiplied by ``-local_player_num``.
        Per the notebook's specification this is meant to give 1 when the
        external agent wins and -1 when the local player wins (assumes the
        base environment reports the winner's sign as the reward).
        """
        self.n_step += 1
        move = self.encode_action(action)

        (self.observation, self.current_player_num), reward, done, _ = super().step(move)

        while not done and self.current_player_num == self.local_player_num:
            (self.observation, self.current_player_num), reward, done, _ = self.play(self.observation)

        encoded_observation = self.encode_observation(self.observation)
        # Re-express the reward relative to the external agent.
        reward = float(reward * -self.local_player_num)

        return encoded_observation, reward, done, {}

In [4]:
# Small 4x4 environment whose internal opponent is a RandomPlayer instance.
env = SelfPlayEnv(board_shape=4, LocalPlayer=RandomPlayer)

In [5]:
env.reset()

array([[ 0,  0,  0,  0],
       [ 0,  1, -1,  0],
       [ 0, -1,  1,  0],
       [ 0,  0,  0,  0]], dtype=int8)

# Probar entorno

In [6]:
def sample_valid_actions(state, game_env=None):
    """Pick a random legal move for player 1 and return it as a flat index.

    Parameters
    ----------
    state : np.ndarray
        2-D board as returned by the environment (always from player 1's view).
    game_env : object, optional
        Environment exposing ``get_valid((board, player))``. Defaults to the
        notebook-level ``env``, which keeps the original one-argument call
        working; pass it explicitly once ``env`` has been rebound to something
        else (e.g. a vectorised environment).

    Returns
    -------
    int
        ``row * board_shape + col`` of the sampled move.

    Raises
    ------
    ValueError
        If the board has no valid move for player 1.
    """
    if game_env is None:
        game_env = env
    board_shape = state.shape[0]
    # The player is always 1 because the environment flips the board accordingly.
    player = 1
    valid_actions = np.argwhere(game_env.get_valid((state, player)) == 1)
    if len(valid_actions) == 0:
        # np.random.randint(0) would raise a far less descriptive ValueError.
        raise ValueError("no valid actions available for player 1 on this board")
    action = valid_actions[np.random.randint(len(valid_actions))]
    return action[0] * board_shape + action[1]

In [7]:
# Play one full episode with random legal moves, printing each board and the action taken on it.
board = env.reset()
while True:
    print(board)
    action = sample_valid_actions(board)
    print(f'action: {action}')

    board, reward, done, _ = env.step(action)
    if done:
        break

[[ 0  0  0  0]
 [ 0  1 -1  0]
 [ 0 -1  1  0]
 [ 0  0  0  0]]
action: 7
[[ 0  0  0  0]
 [ 0  1  1  1]
 [ 0 -1 -1 -1]
 [ 0  0  0  0]]
action: 15
[[ 0  0  0 -1]
 [ 0  1 -1  1]
 [ 0 -1  1  1]
 [ 0  0  0  1]]
action: 2
[[ 0 -1 -1 -1]
 [ 0 -1  1  1]
 [ 0 -1  1  1]
 [ 0  0  0  1]]
action: 12
[[ 0 -1 -1 -1]
 [ 0 -1  1  1]
 [ 0 -1  1  1]
 [ 1 -1  0  1]]
action: 8
[[ 0 -1 -1 -1]
 [ 0 -1 -1  1]
 [ 1  1 -1  1]
 [ 1 -1 -1  1]]
action: 4


# Entornos vectoriales

In [8]:
from multi_env import make_reversi_vec_env

In [9]:
# Build n_envs parallel 8x8 SelfPlayEnv instances, each constructed with env_kwargs.
# NOTE: this rebinds the notebook-level name `env`, which sample_valid_actions()
# reads as a global; re-run the single-env cell before calling that helper again.
board_shape = 8
n_envs = 10
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [10]:
env.reset().shape

(10, 1, 8, 8)

- Notar que la entrada tiene como primer componente la cantidad de entornos en paralelo (10), luego la cantidad de canales (1), y finalmente las dimensiones del tablero 

- Imprimir obs y ver que hay distintas entradas posibles dependiendo de quién juega primero y de qué jugó el LocalPlayer si le tocó primero

In [14]:
obs = env.reset()

In [15]:
print(obs)

[[[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0. -1. -1. -1.  0.  0.]
   [ 0.  0.  0.  1. -1.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]]]


 [[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0. -1.  1.  0.  0.  0.]
   [ 0.  0.  0. -1. -1.  0.  0.  0.]
   [ 0.  0.  0. -1.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]]]


 [[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0. -1.  1.  0.  0.  0.]
   [ 0.  0.  0. -1. -1.  0.  0.  0.]
   [ 0.  0.  0. -1.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]]]


 [[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0

### Guardar el SelfPlayEnv en el módulo multi_env para poder importarlo después desde otra notebook

# Instanciamos el modelo con MLP

In [16]:
from stable_baselines3 import PPO

In [17]:
# Untrained PPO agent using the 'MlpPolicy' policy on the vectorised env; verbose=0 keeps it silent.
model = PPO(
    'MlpPolicy',
    env,
    verbose=0,
)

In [18]:
model.policy

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=64, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [19]:
obs = env.reset()

In [20]:
obs.shape

(10, 1, 8, 8)

In [21]:
model.predict(obs)

(array([47, 28, 31, 58, 41, 54,  0, 11, 54, 59]), None)

Observaciones:
- Lo primero que hace Stable-Baselines3 si usamos MlpPolicy es un flatten de la observación
- Las acciones predichas por el modelo (sin entrenar) tienen una alta probabilidad de ser inválidas