# Creating Gym Geese Environment

To train with the Stable-Baselines3 framework, we first need to wrap the Kaggle environment in a Gym environment.
These kernels helped me gain a better understanding of that process:
https://www.kaggle.com/ryches/stable-baselines3-starter-wip &
https://www.kaggle.com/victordelafuente/dqn-goose-with-stable-baselines3-pytorch

In [None]:
# Install the stable-baselines3 package
!pip install stable-baselines3

My agent will learn against three copies of the [risk averse greedy goose](https://www.kaggle.com/ilialar/risk-averse-greedy-goose) agent.

In [None]:
%%writefile greedy-goose.py

from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col
import numpy as np
import random

def get_nearest_cells(x, y, rows=7, columns=11):
    """Return the four cells orthogonally adjacent to ``(x, y)``.

    The board wraps around at its edges, so every cell has exactly four
    neighbours.

    Args:
        x: Row index of the cell.
        y: Column index of the cell.
        rows: Number of rows on the board (default 7).
        columns: Number of columns on the board (default 11).

    Returns:
        A list of ``(row, column)`` tuples in the order: previous row,
        previous column, next row, next column.
    """
    result = []
    for offset in (-1, 1):
        # Python's % always yields a non-negative result for a positive
        # modulus, so no extra "+ rows" / "+ columns" term is needed.
        result.append(((x + offset) % rows, y))
        result.append((x, (y + offset) % columns))
    return result

def find_closest_food(table):
    """Return the code of the first step on a shortest path to food.

    The search floods the board outwards from the head one layer per
    recursive call. Every newly reached empty cell is stamped with the code
    of the *first* step that leads to it, so when a food cell is reached
    the stamp of the neighbour it was reached from is the answer.

    Cell values in ``table``:
        0   empty, not reached yet
        -1  obstacle
        -4  possible obstacle
        -2  food
        -3  head
        1-4 reached; the value is the first-step code
            (1 = +1 along axis 0, 2 = -1 along axis 0,
             3 = +1 along axis 1, 4 = -1 along axis 1)

    Args:
        table: 2-D numpy array describing the board. It is not modified.

    Returns:
        The first-step code (1-4) when food is reachable; otherwise
        ``table.max()`` of the last flood layer, which callers must check
        for being a valid code.
    """
    new_table = table.copy()
    food_mask = table == -2
    has_food = food_mask.any()

    # (direction of the step, axis, code)
    possible_moves = [
        (1, 0, 1),
        (-1, 0, 2),
        (1, 1, 3),
        (-1, 1, 4)
    ]

    # shuffle possible options to add variability between equal paths
    random.shuffle(possible_moves)

    updated = False
    for roll, axis, code in possible_moves:
        # shifted_table[cell] holds the value of the neighbour that a step
        # in this direction would have come from (with wrap-around).
        shifted_table = np.roll(table, roll, axis)

        if has_food:
            food_neighbours = shifted_table[food_mask]
            # food right next to the head: this very step reaches it
            if (food_neighbours == -3).any():
                return code

        # stamp empty cells next to the head with this step's code
        mask = np.logical_and(new_table == 0, shifted_table == -3)
        if mask.any():
            updated = True
        new_table += code * mask

        if has_food:
            # Any food cell next to an already reached cell ends the search.
            # Every food cell is inspected, not only the first one in
            # row-major order, so a nearer second food item is not missed.
            reached = food_neighbours[food_neighbours > 0]
            if reached.size:
                return reached[0]

        # otherwise extend the flood: empty cells inherit the first-step
        # code of the reached neighbour
        mask = np.logical_and(new_table == 0, shifted_table > 0)
        if mask.any():
            updated = True
        new_table += shifted_table * mask

    # keep flooding while new cells are still being reached
    if updated:
        return find_closest_food(new_table)
    # nothing left to expand: fall back to whatever code is on the board
    return table.max()

# Code (1-4) of the move returned on the previous call, or None when there
# has been no previous move in the current episode.
last_step = None


def agent(obs_dict, config_dict):
    """Choose the next move of the risk-averse greedy goose.

    Builds a 7x11 board where obstacles, possibly dangerous cells, food and
    the own head are encoded as numbers, asks ``find_closest_food`` for the
    first step towards food, and falls back to a possibly dangerous
    neighbour (or a random move) when no step is available.

    Args:
        obs_dict: Raw observation dict from the environment; wrapped in
            ``Observation`` and also read directly for ``'step'``.
        config_dict: Raw configuration dict; wrapped in ``Configuration``.

    Returns:
        One of ``'SOUTH'``, ``'NORTH'``, ``'EAST'``, ``'WEST'``.

    Side effects:
        Updates the module-level ``last_step`` with the chosen move code.
    """
    global last_step

    observation = Observation(obs_dict)
    configuration = Configuration(config_dict)
    player_index = observation.index
    player_goose = observation.geese[player_index]
    player_head = player_goose[0]

    # Do not carry the previous episode's last move into a new episode.
    # NOTE(review): assumes the environment reports step 0 on the first
    # call of every episode - confirm against the runner in use.
    if obs_dict['step'] == 0:
        last_step = None

    table = np.zeros((7, 11))
    # 0 - empty cells
    # -1 - obstacles
    # -4 - possible obstacles
    # -2 - food
    # -3 - head
    # 1,2,3,4 - reachable on the current step cell, number is the id of the first step direction

    legend = {
        1: 'SOUTH',
        2: 'NORTH',
        3: 'EAST',
        4: 'WEST'
    }

    # let's add food to the map; also remember the food cells separately,
    # because the markers on the table may be overwritten below
    food_cells = set()
    for food in observation.food:
        x, y = row_col(food, configuration.columns)
        food_cells.add((x, y))
        table[x, y] = -2  # food

    # let's add all cells that are forbidden
    for i, opp_goose in enumerate(observation.geese):
        if len(opp_goose) == 0:
            continue

        is_close_to_food = False

        if i != player_index:
            x, y = row_col(opp_goose[0], configuration.columns)
            # an opponent's head can move to any neighbouring cell
            for x, y in get_nearest_cells(x, y):
                if (x, y) in food_cells:
                    is_close_to_food = True

                # never downgrade a certain obstacle (a body segment written
                # for an earlier goose) to a merely possible one
                if table[x, y] != -1:
                    table[x, y] = -4  # possibly forbidden cells

        # usually we ignore the last tail cell but there are exceptions
        tail_change = -1
        # presumably the step on which geese shrink by one segment, so one
        # more tail cell will be free - TODO confirm against the game rules
        if obs_dict['step'] % 40 == 39:
            tail_change -= 1

        # we assume that the goose will eat the food (its tail then stays)
        if is_close_to_food:
            tail_change += 1
        if tail_change >= 0:
            tail_change = None

        for n in opp_goose[:tail_change]:
            x, y = row_col(n, configuration.columns)
            table[x, y] = -1  # forbidden cells

    # going back is forbidden according to the new rules
    x, y = row_col(player_head, configuration.columns)
    if last_step is not None:
        if last_step == 1:
            table[(x + 6) % 7, y] = -1
        elif last_step == 2:
            table[(x + 8) % 7, y] = -1
        elif last_step == 3:
            table[x, (y + 10) % 11] = -1
        elif last_step == 4:
            table[x, (y + 12) % 11] = -1

    # add head position
    table[x, y] = -3

    # the first step toward the nearest food
    step = int(find_closest_food(table))

    # if there are no available steps try to go to a possibly dangerous cell
    if step not in [1, 2, 3, 4]:
        if table[(x + 8) % 7, y] == -4:
            step = 1
        elif table[(x + 6) % 7, y] == -4:
            step = 2
        elif table[x, (y + 12) % 11] == -4:
            step = 3
        elif table[x, (y + 10) % 11] == -4:
            step = 4
        # else - do a random step and lose
        else:
            step = np.random.randint(4) + 1

    last_step = step
    return legend[step]

# Implementing PPO Agent

We convert the default env into a custom Gym env

In [None]:
# Import the hungry_geese environment
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col, greedy_agent, GreedyAgent
from kaggle_environments import evaluate, make

# Importing of gym and numpy
import gym
import numpy as np
from gym import spaces

# We create our custom environment
class GeeseGym(gym.Env):
    """Gym wrapper around the Kaggle ``hungry_geese`` environment.

    The learning agent plays in position 0 against three opponents that are
    registered under the name ``'greedy'``. Observations are a
    ``(rows, columns, 1)`` float board; actions are integers 0-3.
    """

    # Position in this tuple is the integer action produced by the policy.
    ACTION_NAMES = ("NORTH", "EAST", "WEST", "SOUTH")

    # Environment initialization
    def __init__(self):
        super().__init__()

        # We generate the default hungry_geese env & configuration
        self.env = make("hungry_geese")
        self.config = self.env.configuration

        # We only have 4 actions
        self.action_space = spaces.Discrete(len(self.ACTION_NAMES))

        # Observation values are normalized to lie within [-1, 1]
        self.observation_space = spaces.Box(low=-1,
                                            high=1,
                                            shape=(self.config.rows,
                                                   self.config.columns,
                                                   1),
                                            dtype=np.float64)
        # ``None`` marks the seat controlled through the trainer
        self.trainer = self.env.train([None, 'greedy', 'greedy', 'greedy'])

    # We need to transform the information generated by the default hungry_geese environment
    # into a custom board normalized from -1 to 1.
    # We also attribute specific values to some elements like the food and heads to help the RL algorithm.

    def transform_obs(self, obs, config):
        """Encode an observation as a ``(rows, columns, 1)`` float board.

        Args:
            obs: Either a list of agent states (the first one is used) or a
                single observation mapping.
            config: Unused; kept for interface compatibility. The board
                size is taken from ``self.config``.

        Returns:
            A float64 numpy array where cells hold 0 (empty), 0.95 (own
            head), 0.90 (own body), 0.15 (other heads), 0.1 (other bodies)
            or 1 (food).
        """
        rows, columns = self.config.rows, self.config.columns
        board = np.zeros((rows, columns), dtype=np.float64)

        # We retrieve the useful information from the default observation
        if isinstance(obs, list):
            obs = obs[0].observation
        else:
            obs = Observation(obs)
        my_index = obs["index"]

        my_goose = obs.geese[my_index]
        # Pick the opponents by position so that equal-looking geese
        # (e.g. several empty ones) are never confused with our own.
        other_geese = [goose for index, goose in enumerate(obs.geese)
                       if index != my_index]

        def paint(positions, value):
            # row_col converts a flat cell index into (row, column)
            for position in positions:
                r, c = row_col(position, columns)
                board[r, c] = value

        # Later calls overwrite earlier ones, so the order matters.
        # Slicing keeps this safe when our goose is empty (dead).
        paint(my_goose[:1], 0.95)
        paint(my_goose[1:], 0.90)
        for goose in other_geese:
            paint(goose[:1], 0.15)
        for goose in other_geese:
            paint(goose[1:], 0.1)
        paint(obs.food, 1)

        # We need a (rows, columns, 1) dimensional array
        return board.reshape(rows, columns, 1)

    def reset(self):
        """Start a new episode and return its first encoded observation."""
        # Reset through the trainer (not the raw env) so that reset() and
        # step() operate on the same trainer-managed episode.
        self.obs = self.trainer.reset()
        self.transformed_obs = self.transform_obs(self.obs, self.config)
        return self.transformed_obs

    def transform_action(self, agent_value):
        """Map an integer action (0-3) to the env's action string.

        Returns ``None`` when ``agent_value`` matches no known action.
        """
        # Compare with == rather than indexing so that any numeric type
        # the algorithm hands over is accepted.
        for index, name in enumerate(self.ACTION_NAMES):
            if agent_value == index:
                return name
        return None

    def step(self, agent_value):
        """Play one action and return ``(observation, reward, done, info)``.

        The environment's own reward is discarded: every step yields +1,
        except the step on which ``done`` becomes true, which yields -1.
        When the episode is done the last encoded observation is returned
        again.
        """
        # We transform the action and then pass it to the default env
        action = self.transform_action(agent_value)
        self.obs, reward, done, info = self.trainer.step(action)
        if done:
            reward = -1
        else:
            reward = 1
            self.transformed_obs = self.transform_obs(self.obs, self.config)
        return self.transformed_obs, reward, done, info

We train the agent on 6 vectorized envs

In [None]:
import os
# Importing the PPO algorithm and the make_vec_env packages
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Number of environment copies stepped side by side during training
N_ENVS = 6
# Total number of environment steps to train for. Kept as an int because
# that is the type the learn() API documents for total_timesteps.
TOTAL_TIMESTEPS = 4_000_000

# We create multiple vectorized environments
geese_env = make_vec_env(GeeseGym, n_envs=N_ENVS)

# We create the PPO agent, train it and store the result as "ppo_goose"
model = PPO('MlpPolicy', geese_env, verbose=1)
model.learn(total_timesteps=TOTAL_TIMESTEPS)
model.save("ppo_goose")