This kernel attempts to train a PPO agent with self-play using the stable-baselines3 library.
`EvalCallback` is configured to evaluate the model every 20 timesteps. If a better model is found, it is loaded as the opponent by the child callback `LoadNewOpponentsFromBestModelCallback`. This callback calls `HungryGeeseEnv.load_new_opponents_from_best_model` to actually load the new model as the opponents.
The code works as is. However, there is scope for performance improvement. **Please help me improve the performance and point out what I might be missing here.**

(Action and observation transformation functions are referenced from [this](https://www.kaggle.com/ryches/stable-baselines3-starter-wip) kernel.)

**TODO**

- Try increasing the number of layers in the policy network




[Original NB](https://www.kaggle.com/maheshabnave999/hungry-geese-self-play-agent-using-stable-baseli#Custom-Environment)

In [None]:
!pip install stable-baselines3



# Imports 

In [None]:
import matplotlib.pyplot as plt
import gym
from gym import spaces
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common import logger, results_plotter
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, BaseCallback

from shutil import copyfile
import os

from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecTransposeImage
from stable_baselines3.common.monitor import Monitor, load_results

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col
from kaggle_environments import evaluate, make

In [None]:
# NOTE(review): `EED` is not referenced anywhere in the visible code; it looks
# like a truncated `SEED` -- confirm before renaming.
EED = 17

# Training / evaluation budget.
NUM_TIMESTEPS = 10_000_000
EVAL_FREQ = 10_000
EVAL_EPISODES = 100
BEST_THRESHOLD = 0.01  # must achieve a mean score above this to replace prev best self

# Terminal rewards.
REWARD_LOST, REWARD_WON = -1, 1

N_CPU = os.cpu_count()

# Output locations: everything lives below LOGDIR.
LOGDIR = os.path.join(".", "logs", "custom_ppo_1")
MONITOR_LOGS_DIR, TB_LOGS_DIR, MODEL_DIR, CHECKPOINTS_DIR = (
    os.path.join(LOGDIR, _sub_dir)
    for _sub_dir in ("monitor_logs", "tensorboard_logs", "model", "checkpoints")
)

In [None]:
# Create every output directory up front. `exist_ok=True` makes each call
# idempotent and avoids the check-then-create race of `os.path.exists`.
for _output_dir in (LOGDIR, TB_LOGS_DIR, MONITOR_LOGS_DIR, MODEL_DIR, CHECKPOINTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Custom Environment 

In [None]:
class HungryGeeseEnv(gym.Env):
    """Single-agent Gym view of the Kaggle ``hungry_geese`` game.

    The learning agent occupies seat 0 of ``env.train``; every other seat is
    played by an entry of ``opponents``.

    * Actions: ``Discrete(4)``, mapped to the names in ``ACTION_NAMES``.
    * Observations: ``uint8`` array of shape ``(rows, columns, 3)``. Channel 0
      marks the observing goose, channel 1 all other geese, channel 2 food;
      occupied cells hold 255, everything else 0.
    * Reward: the reward returned by the Kaggle trainer, reduced by the
      shaping penalties in ``shape_reward``.
    """

    # Discrete action index -> action name sent to the Kaggle environment.
    ACTION_NAMES = ("NORTH", "EAST", "WEST", "SOUTH")
    # Opponent line-up used when the caller does not supply one.
    DEFAULT_OPPONENTS = ('random', 'greedy', 'greedy-goose.py')

    def __init__(self, opponents=None, debug=False, warmup_episode_count = 100, warmup_timesteps=5000):
        """Create the Kaggle environment and its seat-0 trainer.

        Args:
            opponents: agents for seats 1..n (names, paths or callables).
                ``None`` selects ``DEFAULT_OPPONENTS``.
            debug: currently unused; kept for interface compatibility.
            warmup_episode_count: minimum number of ``reset`` calls before
                self-play opponents may be loaded.
            warmup_timesteps: minimum number of ``step`` calls before
                self-play opponents may be loaded.
        """
        super(HungryGeeseEnv, self).__init__()
        # A fresh list per instance: a list default would be shared by all envs.
        if opponents is None:
            opponents = list(self.DEFAULT_OPPONENTS)
        self.opponents = opponents
        self.env = make("hungry_geese") #, debug=self.debug)
        self.config = self.env.configuration
        self.trainer = self.env.train([None, *opponents])

        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(self.config.rows, self.config.columns, 3),
            dtype=np.uint8,
        )
        self.reward_range = (-1, 1000)  #TODO why this range?

        self._reset_episode_state()

        self.episode_count = 0
        self.timesteps = 0
        self.warmup_episode_count = warmup_episode_count
        self.warmup_timesteps = warmup_timesteps

    def _reset_episode_state(self):
        """Reset the trackers that only make sense within one episode."""
        n_opponents = len(self.opponents)
        self.opponents_old_lengths = [1] * n_opponents
        self.opponents_new_lengths = [1] * n_opponents
        self.last_vert_actions_count = 0
        self.last_horz_actions_count = 0
        self.last_action = -1

    def update_opponent_lengths(self, obs):
        """Shift the current opponent lengths to "old" and read new ones from ``obs``.

        ``obs`` is the list returned by ``trainer.step``; element 0 is the
        observation whose ``geese`` entry lists every goose, seat 0 first.
        """
        self.opponents_old_lengths = self.opponents_new_lengths
        self.opponents_new_lengths = [len(geese) for geese in obs[0]['geese'][1:]]

    def update_last_actions(self, action):
        """Track how many times in a row the same vertical/horizontal action was taken.

        Actions 0 and 3 count as vertical, every other action as horizontal.
        """
        is_vertical = action == 0 or action == 3
        if self.last_action == action:
            if is_vertical:
                self.last_vert_actions_count += 1 #counts last consecutive vertical actions
            else:
                self.last_horz_actions_count += 1 #counts last consecutive horizontal actions
        elif is_vertical:
            self.last_vert_actions_count = 1
            self.last_horz_actions_count = 0
        else:
            self.last_horz_actions_count = 1
            self.last_vert_actions_count = 0
        # Remember the action; without this the "same action" branch above can
        # never be taken and the straight-line penalty never fires.
        self.last_action = action

    def shape_reward(self, reward):
        """Return ``reward`` minus the shaping penalties.

        * 10 per opponent whose length increased during the last step.
        * 10 when more than 11 consecutive identical horizontal actions, or
          more than 7 consecutive identical vertical actions, were taken.
        """
        geese_length_diff = np.array(self.opponents_new_lengths) - np.array(self.opponents_old_lengths)
        total_geese_length_increase = (geese_length_diff > 0).sum()
        reward -= total_geese_length_increase * 10

        # prevent agent from taking straight trajectory
        if self.last_horz_actions_count > 11:
            reward -= 10
        if self.last_vert_actions_count > 7:
            reward -= 10

        return reward

    def step(self, action):
        """Play ``action`` for seat 0 and return ``(obs, reward, done, info)``."""
        self.update_last_actions(action)
        my_action = self.transform_action(action)

        self.timesteps += 1

        # trainer.step returns [observation, reward, done, info] for seat 0.
        self.obs = self.trainer.step(my_action)
        self.update_opponent_lengths(self.obs)
        x_obs = self.transform_step_observation(self.obs, self.config)
        x_reward = self.shape_reward(self.obs[1])
        done = self.obs[2]
        info = self.obs[3]

        return x_obs, x_reward, done, info

    def reset(self):
        """Start a new episode and return its first encoded observation."""
        self.episode_count += 1
        # Lengths and move streaks from the previous episode must not leak
        # into the reward shaping of the new one.
        self._reset_episode_state()
        self.obs = self.trainer.reset()
        x_obs = self.transform_observation(self.obs, self.config)
        return x_obs

    def load_new_opponents_from_best_model(self):
        """Replace every opponent with the saved ``best_model`` policy.

        Nothing is loaded while fewer than ``warmup_episode_count`` episodes
        or fewer than ``warmup_timesteps`` steps have been played. Otherwise
        the model stored at ``MODEL_DIR/best_model`` is loaded, a new trainer
        is created with that policy in every opponent seat, and the
        environment is reset (which also counts as a new episode).

        Returns:
            Always ``True``.
        """
        if self.episode_count < self.warmup_episode_count or self.timesteps < self.warmup_timesteps:
            return True

        print("Loading new opponents from current best model for self play!!!")
        loaded_model = PPO.load(os.path.join(MODEL_DIR, "best_model"))

        def agent_ppo(obs, config):
            # The opponent must see its own goose in channel 0, so encode the
            # board from the seat given by the observation's `index`.
            own_index = getattr(obs, "index", None) or 0
            board = self.transform_observation(obs, self.config, own_index=own_index)
            action, _ = loaded_model.predict(board, deterministic=True)
            return self.transform_action(action)

        self.opponents = [agent_ppo]*len(self.opponents)
        self.trainer = self.env.train([None, *self.opponents])
        self.reset()
        return True

    def transform_actions(self, actions):
        """Map an iterable of discrete actions to their action names."""
        return [self.transform_action(action) for action in actions]

    def transform_action(self, action):
        """Map a discrete action (0..3) to its name; ``None`` for anything else."""
        # Compare with `==` instead of indexing so NumPy scalars/arrays as
        # returned by `model.predict` keep working.
        for code, name in enumerate(self.ACTION_NAMES):
            if action == code:
                return name
        return None

    @staticmethod
    def _encode_board(geese, food, config, own_index=0):
        """Build the ``(rows, columns, 3)`` uint8 board from cell index lists."""
        planes = np.zeros((config.columns * config.rows, 3), dtype=np.uint8)
        own_cells = geese[own_index]
        other_cells = [
            cell
            for seat, goose in enumerate(geese)
            if seat != own_index
            for cell in goose
        ]
        for channel, cells in enumerate((own_cells, other_cells, food)):
            # Explicit integer dtype keeps empty cell lists valid as an index.
            planes[np.asarray(list(cells), dtype=np.intp), channel] = 255
        return planes.reshape((config.rows, config.columns, 3))

    def transform_step_observation(self, obs, config):
        """Encode the observation found at ``obs[0]`` of a ``trainer.step`` result."""
        return self.transform_observation(obs[0], config)

    def transform_observation(self, obs, config, own_index=0):
        """Encode an observation exposing ``geese`` and ``food`` attributes.

        Args:
            obs: observation with ``geese`` (list of cell-index lists) and
                ``food`` (list of cell indices).
            config: object exposing ``rows`` and ``columns``.
            own_index: position in ``obs.geese`` of the observing goose.

        Returns:
            ``uint8`` array of shape ``(rows, columns, 3)``.
        """
        return self._encode_board(obs.geese, obs.food, config, own_index)

In [None]:
# Unused. Useful for vectorised environments 

def make_monitored_gym(rank=0):  # TODO pass config
    """Return a zero-argument factory for a Monitor-wrapped HungryGeeseEnv.

    The factory writes its monitor log to ``LOGDIR/<rank>`` and allows early
    resets.
    """
    def build_env():
        monitor_path = os.path.join(LOGDIR, str(rank))
        # TODO allow_early_resets
        return Monitor(HungryGeeseEnv(), monitor_path, allow_early_resets=True)  # TODO pass config

    return build_env

def make_gym(rank=0):  # TODO pass config
    """Return a zero-argument factory for a plain (unmonitored) HungryGeeseEnv.

    ``rank`` is accepted for symmetry with ``make_monitored_gym`` but is not
    used by the factory.
    """
    def build_env():
        return HungryGeeseEnv()  # TODO pass config

    return build_env

In [None]:
# Training environment: one HungryGeeseEnv (default opponents) wrapped in
# Monitor; no monitor log file path is passed here.
env = Monitor(HungryGeeseEnv())

class LoadNewOpponentsFromBestModelCallback(BaseCallback):
    """Callback that asks an environment to switch to self-play opponents.

    On every ``_on_step`` call it invokes
    ``env.load_new_opponents_from_best_model()`` on the environment given at
    construction time.

    Args:
        env: object exposing ``load_new_opponents_from_best_model()``.
        verbose: verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, env, verbose: int = 0):
        super().__init__(verbose=verbose)
        self.env = env

    def _on_step(self):
        # Use the stored environment, not whatever the global name `env`
        # happens to be bound to when the callback fires.
        self.env.load_new_opponents_from_best_model()
        # Returning True tells stable-baselines3 to continue.
        return True
    
# Child callback, triggered by EvalCallback whenever a new best model is found.
load_new_opponents_from_best_model_callback = LoadNewOpponentsFromBestModelCallback(env)

# Periodic snapshots of the model being trained.
checkpoint_callback = CheckpointCallback(
    save_freq=1000,
    save_path=CHECKPOINTS_DIR,
    name_prefix="rl_model",
)

# Separate evaluation environment; the best model is saved to MODEL_DIR.
# NOTE(review): EVAL_FREQ, EVAL_EPISODES and NUM_TIMESTEPS are defined earlier
# in this file but the literals below are used instead -- confirm which values
# are intended.
eval_env = VecTransposeImage(DummyVecEnv([lambda: Monitor(HungryGeeseEnv())]))
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=MODEL_DIR,
    log_path=LOGDIR,
    eval_freq=20,
    deterministic=True,
    render=False,
    callback_on_new_best=load_new_opponents_from_best_model_callback,
)

# tb_log_name = "ppo_vs_ppo_bfs" is not passed here (TODO check if works).
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    n_steps=2048 * 16,
    batch_size=128,
    n_epochs=50,
    tensorboard_log=TB_LOGS_DIR,
    learning_rate=.01,
)

model.learn(
    total_timesteps=100000,
    callback=[checkpoint_callback, eval_callback],
)

In [None]:
def transform_observation(obs, config):
    """Encode a hungry_geese observation as a ``(rows, columns, 3)`` uint8 board.

    Channel 0 marks the observing goose, channel 1 every other goose and
    channel 2 the food; occupied cells hold 255, all others 0.

    The observing goose is ``obs.geese[obs.index]`` when the observation
    carries an ``index`` attribute, otherwise ``obs.geese[0]``. Using the
    index matters whenever the agent is not seated first, because otherwise
    another goose would be encoded as "self".

    Args:
        obs: object exposing ``geese`` (list of cell-index lists), ``food``
            (list of cell indices) and optionally ``index``.
        config: object exposing ``rows`` and ``columns``.

    Returns:
        ``numpy.ndarray`` of dtype ``uint8`` and shape ``(rows, columns, 3)``.
    """
    own_index = getattr(obs, "index", None) or 0
    planes = np.zeros((config.columns * config.rows, 3), dtype=np.uint8)

    own_cells = obs.geese[own_index]
    other_cells = [
        cell
        for seat, goose in enumerate(obs.geese)
        if seat != own_index
        for cell in goose
    ]
    for channel, cells in enumerate((own_cells, other_cells, obs.food)):
        # Explicit integer dtype keeps empty cell lists valid as an index.
        planes[np.asarray(list(cells), dtype=np.intp), channel] = 255

    return planes.reshape((config.rows, config.columns, 3))
    
def transform_actions(actions):
    """Translate a single discrete action (0..3) into its action name.

    0 -> "NORTH", 1 -> "EAST", 2 -> "WEST", 3 -> "SOUTH"; any other value
    yields ``None``.
    """
    for code, direction in enumerate(("NORTH", "EAST", "WEST", "SOUTH")):
        if actions == code:
            return direction
    return None

In [None]:
import torch

# Paths of the saved best model, the copy written below, and the raw weights.
MODEL = os.path.join(LOGDIR, "model", "best_model")
SAVE_MODEL = os.path.join(LOGDIR, "model", "last_model")
STATE_DICT = os.path.join(LOGDIR, "state_dict")

# Reload the best model and store a copy under SAVE_MODEL.
loaded_model = PPO.load(MODEL)
loaded_model.save(SAVE_MODEL)
print(loaded_model.policy)

# Move the policy to the CPU, then print and save its state dict.
_cpu_state_dict = loaded_model.policy.to('cpu').state_dict()
print(_cpu_state_dict)
torch.save(_cpu_state_dict, STATE_DICT)


In [None]:
# Reload the best saved model for the evaluation below.
MODEL = os.path.join(LOGDIR,"model","best_model")
print(MODEL)
loaded_model = PPO.load(MODEL)
#print(loaded_model.policy)
# NOTE: this rebinds the module-level name `env` (bound to the Monitor-wrapped
# training environment earlier in this file) to a raw Kaggle environment.
env = make('hungry_geese', debug=True)

def agent_ppo(obs, config):
    """Kaggle agent that plays the deterministic action of ``loaded_model``.

    The board is encoded with the module-level ``env.configuration``; the
    ``config`` argument passed in by the caller is not used.
    """
    board = transform_observation(obs, env.configuration)
    predicted_action, _ = loaded_model.predict(board, deterministic=True)
    return transform_actions(predicted_action)
    
# env.run([agent_ppo,'random','greedy'])
# env.render(mode="ipython")

from kaggle_environments import evaluate

# Play 10 episodes with the PPO agent in the second seat, between the
# built-in 'random' and 'greedy' agents.
evaluate("hungry_geese", ["random", agent_ppo, "greedy"], num_episodes=10)