In [None]:
# Install the system build packages, then the Python packages used by this notebook.
!apt-get update -qq
!apt-get install -qq -y cmake libopenmpi-dev python3-dev zlib1g-dev
!python -m pip install -q --upgrade pip
# kaggle-environments is upgraded to the latest release; TensorFlow and
# stable-baselines below are pinned to exact versions.
!pip install -q --upgrade kaggle-environments
!pip install -q 'tensorflow==1.15.0'
!pip install -q 'stable-baselines[mpi]==2.10.0'

In [None]:
%%writefile subproc_vec_env.py
# Original Source: https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/common/vec_env/subproc_vec_env.py

# Changes Made: Modified to preserve terminal observation

# Original License:
# The MIT License

# Copyright (c) 2017 OpenAI (http://openai.com)
# Copyright (c) 2018-2019 Stable-Baselines Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


import multiprocessing
from collections import OrderedDict
from typing import Sequence

import gym
import numpy as np

from stable_baselines.common.vec_env.base_vec_env import VecEnv, CloudpickleWrapper


def _worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()
    env = env_fn_wrapper.var()
    env_done = False
    while True:
        try:
            cmd, data = remote.recv()
            if cmd == 'step':
                if env_done:
                    observation = env.reset()
                    reward, done, info = 0, False, {}
                    env_done = False
                else:
                    observation, reward, done, info = env.step(data)
                    env_done = done
                # save final observation where user can get it, then reset
                # info['terminal_observation'] = observation
                # observation = env.reset()
                remote.send((observation, reward, done, info))
            elif cmd == 'seed':
                remote.send(env.seed(data))
            elif cmd == 'reset':
                observation = env.reset()
                env_done = False
                remote.send(observation)
            elif cmd == 'render':
                remote.send(env.render(data))
            elif cmd == 'close':
                env.close()
                remote.close()
                break
            elif cmd == 'get_spaces':
                remote.send((env.observation_space, env.action_space))
            elif cmd == 'env_method':
                method = getattr(env, data[0])
                remote.send(method(*data[1], **data[2]))
            elif cmd == 'get_attr':
                remote.send(getattr(env, data))
            elif cmd == 'set_attr':
                remote.send(setattr(env, data[0], data[1]))
            else:
                raise NotImplementedError("`{}` is not implemented in the worker".format(cmd))
        except EOFError:
            break


class SubprocVecEnv(VecEnv):
    """
    Creates a multiprocess vectorized wrapper for multiple environments, distributing each environment to its own
    process, allowing significant speed up when the environment is computationally complex.
    For performance reasons, if your environment is not IO bound, the number of environments should not exceed the
    number of logical cores on your CPU.
    .. warning::
        Only 'forkserver' and 'spawn' start methods are thread-safe,
        which is important when TensorFlow sessions or other non thread-safe
        libraries are used in the parent (see issue #217). However, compared to
        'fork' they incur a small start-up cost and have restrictions on
        global variables. With those methods, users must wrap the code in an
        ``if __name__ == "__main__":`` block.
        For more information, see the multiprocessing documentation.
    :param env_fns: ([callable]) A list of functions that will create the environments
        (each callable returns a `Gym.Env` instance when called).
    :param start_method: (str) method used to start the subprocesses.
           Must be one of the methods returned by multiprocessing.get_all_start_methods().
           Defaults to 'forkserver' on available platforms, and 'spawn' otherwise.
    """

    def __init__(self, env_fns, start_method=None):
        self.waiting = False
        self.closed = False
        n_envs = len(env_fns)

        if start_method is None:
            # 'fork' is not a thread safe method (see issue #217), so the
            # default is 'forkserver' where available and 'spawn' otherwise.
            forkserver_available = 'forkserver' in multiprocessing.get_all_start_methods()
            start_method = 'forkserver' if forkserver_available else 'spawn'
        ctx = multiprocessing.get_context(start_method)

        # One duplex pipe per environment: `remotes` stay in this process,
        # `work_remotes` are handed to the workers.
        self.remotes, self.work_remotes = zip(*[ctx.Pipe(duplex=True) for _ in range(n_envs)])
        self.processes = []
        for work_remote, remote, env_fn in zip(self.work_remotes, self.remotes, env_fns):
            args = (work_remote, remote, CloudpickleWrapper(env_fn))
            # daemon=True: if the main process crashes, we should not cause things to hang
            process = ctx.Process(target=_worker, args=args, daemon=True)  # pytype:disable=attribute-error
            process.start()
            self.processes.append(process)
            # The worker owns its end now; close this process's copy.
            work_remote.close()

        # The spaces of the first environment are used for the whole vectorized env.
        self.remotes[0].send(('get_spaces', None))
        observation_space, action_space = self.remotes[0].recv()
        VecEnv.__init__(self, len(env_fns), observation_space, action_space)

    def step_async(self, actions):
        """
        Send one action to each worker without waiting for the results.

        :param actions: iterable with one action per environment, in environment order.
        """
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        self.waiting = True

    def step_wait(self):
        """
        Collect the results of the step started by ``step_async``.

        :return: flattened observations, stacked rewards, stacked dones and a tuple of info dicts.
        """
        results = [remote.recv() for remote in self.remotes]
        self.waiting = False
        obs, rews, dones, infos = zip(*results)
        return _flatten_obs(obs, self.observation_space), np.stack(rews), np.stack(dones), infos

    def seed(self, seed=None):
        """
        Seed every environment; environment ``idx`` receives ``seed + idx``.

        :param seed: (int or None) base seed. ``None`` is forwarded unchanged to every
            environment instead of failing on ``None + idx``.
        :return: ([object]) the value returned by each environment's ``seed`` call.
        """
        for idx, remote in enumerate(self.remotes):
            remote.send(('seed', None if seed is None else seed + idx))
        return [remote.recv() for remote in self.remotes]

    def reset(self):
        """
        Reset every environment.

        :return: the flattened initial observations, one entry per environment.
        """
        for remote in self.remotes:
            remote.send(('reset', None))
        obs = [remote.recv() for remote in self.remotes]
        return _flatten_obs(obs, self.observation_space)

    def close(self):
        """Ask every worker to shut down and wait for the processes to exit (no-op if already closed)."""
        if self.closed:
            return
        if self.waiting:
            # Drain the replies of an unfinished step so the pipes are empty.
            for remote in self.remotes:
                remote.recv()
        for remote in self.remotes:
            remote.send(('close', None))
        for process in self.processes:
            process.join()
        self.closed = True

    def get_images(self) -> Sequence[np.ndarray]:
        """Request an 'rgb_array' render from every worker and return the replies in environment order."""
        for pipe in self.remotes:
            # gather images from subprocesses
            # `mode` will be taken into account later
            pipe.send(('render', 'rgb_array'))
        imgs = [pipe.recv() for pipe in self.remotes]
        return imgs

    def get_attr(self, attr_name, indices=None):
        """Return attribute from vectorized environment (see base class)."""
        target_remotes = self._get_target_remotes(indices)
        for remote in target_remotes:
            remote.send(('get_attr', attr_name))
        return [remote.recv() for remote in target_remotes]

    def set_attr(self, attr_name, value, indices=None):
        """Set attribute inside vectorized environments (see base class)."""
        target_remotes = self._get_target_remotes(indices)
        for remote in target_remotes:
            remote.send(('set_attr', (attr_name, value)))
        # Wait for every acknowledgement so the pipes stay in sync.
        for remote in target_remotes:
            remote.recv()

    def env_method(self, method_name, *method_args, indices=None, **method_kwargs):
        """Call instance methods of vectorized environments."""
        target_remotes = self._get_target_remotes(indices)
        for remote in target_remotes:
            remote.send(('env_method', (method_name, method_args, method_kwargs)))
        return [remote.recv() for remote in target_remotes]

    def _get_target_remotes(self, indices):
        """
        Get the connection object needed to communicate with the wanted
        envs that are in subprocesses.
        :param indices: (None,int,Iterable) refers to indices of envs.
        :return: ([multiprocessing.Connection]) Connection object to communicate between processes.
        """
        indices = self._get_indices(indices)
        return [self.remotes[i] for i in indices]


def _flatten_obs(obs, space):
    """
    Flatten observations, depending on the observation space.
    :param obs: (list<X> or tuple<X> where X is dict<ndarray>, tuple<ndarray> or ndarray) observations.
                A list or tuple of observations, one per environment.
                Each environment observation may be a NumPy array, or a dict or tuple of NumPy arrays.
    :return (OrderedDict<ndarray>, tuple<ndarray> or ndarray) flattened observations.
            A flattened NumPy array or an OrderedDict or tuple of flattened numpy arrays.
            Each NumPy array has the environment index as its first axis.
    """
    assert isinstance(obs, (list, tuple)), "expected list or tuple of observations per environment"
    assert len(obs) > 0, "need observations from at least one environment"

    if isinstance(space, gym.spaces.Dict):
        assert isinstance(space.spaces, OrderedDict), "Dict space must have ordered subspaces"
        assert isinstance(obs[0], dict), "non-dict observation for environment with Dict observation space"
        return OrderedDict([(k, np.stack([o[k] for o in obs])) for k in space.spaces.keys()])
    elif isinstance(space, gym.spaces.Tuple):
        assert isinstance(obs[0], tuple), "non-tuple observation for environment with Tuple observation space"
        obs_len = len(space.spaces)
        return tuple((np.stack([o[i] for o in obs]) for i in range(obs_len)))
    else:
        return np.stack(obs)

In [None]:
import os
import math
import random
import numpy as np
import pandas as pd

from collections import OrderedDict

from matplotlib import pyplot as plt

from stable_baselines import PPO2, results_plotter
from stable_baselines.bench import Monitor
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.common.env_checker import check_env
from stable_baselines.common.policies import MlpPolicy
# from stable_baselines.common.vec_env import SubprocVecEnv  # issue with terminal observation
from subproc_vec_env import SubprocVecEnv

from gym import Env, spaces
from kaggle_environments import make, evaluate
from kaggle_environments.envs.halite.helpers import Board, Point

from tqdm.notebook import tqdm

%matplotlib inline

In [None]:
# os.cpu_count() returns None when the number of cores cannot be determined.
# N_CPU is later used as a worker count and as a minibatch count, so fall
# back to a single core to keep it a positive integer.
N_CPU = os.cpu_count() or 1
print('CPU Cores =', N_CPU)

In [None]:
# Directory that receives the per-environment monitor logs.
LOG_DIR = './log/'
# Directory holding the source files of the opponent bots.
AGENT_DIR = '../input/swarm-intelligence-with-sdk/'
# Directory and file name of a previously saved model; it is loaded later if the file exists.
MODEL_DIR = '../input/stable-baselines-starter'
MODEL_FILE = 'halite.pkl'

os.makedirs(LOG_DIR, exist_ok=True)

# Maps each bot file in AGENT_DIR to the name it gets in the working directory.
agents = {'idle_bot.py': 'idle.py',
          'beetle_bot.py': 'beetle.py', 
          'duo_bot.py': 'duo.py', 
          'submission.py': 'swarm.py', 
          'attack_bot.py': 'attack.py'}

# Copy every bot next to the notebook so it can be referenced by its short file name.
for infile, outfile in agents.items():
    with open(os.path.join(AGENT_DIR, infile), 'rt') as f:
        agent_src = f.read()

    with open(outfile, 'wt') as f:
        f.write(agent_src)

In [None]:
# Larger alternative (same values as HaliteGym's fallback defaults), kept for reference:
# GAME_CONFIG = {'episodeSteps': 400, 'size': 21, 'num_agents': 4}
# Configuration in use: a smaller board and shorter episodes.
GAME_CONFIG = {'episodeSteps': 100, 'size': 11, 'num_agents': 4}

# Wider opponent pool, kept for reference:
# GAME_AGENTS = ['random', 'idle.py', 'beetle.py', 'duo.py', 'swarm.py', 'attack.py']
# Pool the opponents are sampled from for every episode: four copies of the idle bot.
GAME_AGENTS = ['idle.py'] * 4

In [None]:
def sort_cells(cells):
    """
    Return the board cells in a fixed order: x ascending, then y ascending.

    The board side is taken as the integer square root of the number of cells.

    :param cells: mapping from ``Point`` to cell.
    :return: (OrderedDict) the same cells keyed by ``Point`` in the fixed order.
    """
    size = int(len(cells) ** 0.5)
    points = (Point(x, y) for x in range(size) for y in range(size))
    return OrderedDict((point, cells[point]) for point in points)

In [None]:
# Action tables: the index chosen by the policy selects an entry of the list.
# None means that no command is issued for that ship / shipyard.
SHIP_ACTIONS = [None, 'CONVERT', 'NORTH', 'EAST', 'SOUTH', 'WEST']
YARD_ACTIONS = [None, 'SPAWN']

N_SHIP_ACTIONS = len(SHIP_ACTIONS)
N_YARD_ACTIONS = len(YARD_ACTIONS)

# Number of ships / shipyards the policy can command; the action vector has
# MAX_SHIPS ship slots followed by MAX_YARDS shipyard slots.
MAX_SHIPS = 5
MAX_YARDS = 5

def transform_actions(actions, obs, config):
    """
    Translate the policy's action vector into Halite commands.

    The first MAX_SHIPS entries of ``actions`` index SHIP_ACTIONS and the next
    MAX_YARDS entries index YARD_ACTIONS. Slots are handed out to the current
    player's ships and shipyards in ``sort_cells`` order; units beyond the
    available slots get no command.

    :param actions: sequence of action indices (ship slots first, then shipyard slots).
    :param obs: raw observation used to build the board.
    :param config: game configuration used to build the board.
    :return: (dict) unit id -> command, containing only units with a non-None command.
    """
    board = Board(obs, config)
    me = board.current_player

    commands = {}
    ship_slot = 0
    yard_slot = MAX_SHIPS
    yard_slot_end = MAX_SHIPS + MAX_YARDS

    for cell in sort_cells(board.cells).values():
        if ship_slot < MAX_SHIPS and cell.ship in me.ships:
            command = SHIP_ACTIONS[actions[ship_slot]]
            ship_slot += 1
            if command is not None:
                commands[cell.ship.id] = command

        if yard_slot < yard_slot_end and cell.shipyard in me.shipyards:
            command = YARD_ACTIONS[actions[yard_slot]]
            yard_slot += 1
            if command is not None:
                commands[cell.shipyard.id] = command

    return commands

In [None]:
# Number of feature planes produced per board cell by transform_observation.
N_FEATURES = 14
# Divisors used to scale halite amounts before the observation is clipped to [0, 1]:
# a single ship's cargo, the player's totals, and the lead over the best opponent.
MAX_SHIP_HALITE = 500
MAX_PLAYER_HALITE = 1000
MAX_DIFF_HALITE = 500

def transform_observation(done, obs, config):
    """
    Encode a raw Halite observation as a ``(size, size, N_FEATURES)`` float32 array in [0, 1].

    ``x_obs[x, y, k]`` is feature ``k`` of the cell at ``Point(x, y)``. The
    features, in order, are: step fraction, final-step flag, player halite,
    player cargo, halite lead, cargo lead, cell halite, own shipyard, own ship,
    own ship cargo, opponent shipyard, opponent ship, opponent ship cargo, and
    whether one of the player's ships is on the cell or directly next to it.
    The first six features are global values repeated for every cell.

    NOTE: previous versions reshaped a ``(N_FEATURES, size*size)`` array
    directly, which mixed features and cells. Models trained on that layout
    see their inputs in a different order and should be retrained.

    :param done: whether this observation ends the episode (stored as the final-step flag).
    :param obs: raw observation; must provide ``obs['step']``.
    :param config: game configuration (``size``, ``episodeSteps``, ``maxCellHalite``).
    :return: (np.ndarray) float32 array of shape ``(config.size, config.size, N_FEATURES)``.
    """
    board = Board(obs, config)
    me = board.current_player

    # Look these up once instead of for every cell of the board.
    my_ships = me.ships
    my_shipyards = me.shipyards

    board_cells = sort_cells(board.cells)

    step = []
    final_step = []
    halite = []
    cargo = []
    halite_diff = []
    cargo_diff = []
    cell_yield = []
    me_yard = []
    me_ship = []
    me_ship_cargo = []
    opp_yard = []
    opp_ship = []
    opp_ship_cargo = []
    directions = []

    # Values that are identical for every cell.
    step_val = obs['step'] / config.episodeSteps
    final_step_val = int(done)

    my_cargo = sum(s.halite for s in my_ships)
    halite_val = me.halite / MAX_PLAYER_HALITE
    cargo_val = my_cargo / MAX_PLAYER_HALITE

    # default=0 keeps single-player games (no opponents) from raising on max().
    halite_diff_val = me.halite - max((p.halite for p in board.opponents), default=0)
    halite_diff_val = (halite_diff_val + MAX_DIFF_HALITE) / (2 * MAX_DIFF_HALITE)

    cargo_diff_val = (my_cargo -
                      max((sum(s.halite for s in p.ships) for p in board.opponents), default=0))
    cargo_diff_val = (cargo_diff_val + MAX_DIFF_HALITE) / (2 * MAX_DIFF_HALITE)

    for c in board_cells.values():
        step.append(step_val)
        final_step.append(final_step_val)

        halite.append(halite_val)
        cargo.append(cargo_val)
        halite_diff.append(halite_diff_val)
        cargo_diff.append(cargo_diff_val)

        # Cells holding less than 4 halite are reported as empty.
        cell_yield.append(0 if c.halite < 4 else c.halite / config.maxCellHalite)

        if c.ship is None:
            me_ship.append(0)
            me_ship_cargo.append(0)
            opp_ship.append(0)
            opp_ship_cargo.append(0)

        elif c.ship in my_ships:
            me_ship.append(1)
            me_ship_cargo.append(c.ship.halite / MAX_SHIP_HALITE)
            opp_ship.append(0)
            opp_ship_cargo.append(0)

        else:
            me_ship.append(0)
            me_ship_cargo.append(0)
            opp_ship.append(1)
            opp_ship_cargo.append(c.ship.halite / MAX_SHIP_HALITE)

        if c.shipyard is None:
            me_yard.append(0)
            opp_yard.append(0)

        elif c.shipyard in my_shipyards:
            me_yard.append(1)
            opp_yard.append(0)

        else:
            me_yard.append(0)
            opp_yard.append(1)

        # 1 when one of the player's ships is on this cell or on a direct neighbour.
        if (c.ship in my_ships or
            c.north.ship in my_ships or
            c.east.ship in my_ships or
            c.south.ship in my_ships or
            c.west.ship in my_ships):
            directions.append(1)
        else:
            directions.append(0)

    # Stack on the last axis to get (size*size, N_FEATURES); the reshape then
    # only splits the cell axis into (x, y) and keeps each cell's features together.
    x_obs = np.stack((step,
                      final_step,
                      halite,
                      cargo,
                      halite_diff,
                      cargo_diff,
                      cell_yield,
                      me_yard,
                      me_ship,
                      me_ship_cargo,
                      opp_yard,
                      opp_ship,
                      opp_ship_cargo,
                      directions), axis=-1)

    x_obs = x_obs.reshape(config.size, config.size, N_FEATURES)
    x_obs = x_obs.astype(np.float32).clip(0, 1)

    return x_obs

In [None]:
# Terminal rewards returned by transform_reward, sized from the configured episode length.
REWARD_WON = GAME_CONFIG['episodeSteps']
REWARD_LOST = -REWARD_WON

# Divisor for the per-step reward delta; also the size of its bonuses and penalties.
MAX_DELTA = 500

def transform_reward(done, last_obs, obs, config):
    """
    Compute the shaped reward for the transition ``last_obs -> obs``.

    * REWARD_LOST when the player has no ships and either no shipyard or not
      enough halite to spawn one.
    * When ``done``: REWARD_WON if the player alone holds the highest halite
      among the players still able to act, REWARD_LOST otherwise.
    * Otherwise a small positive constant plus, when ``last_obs`` is given, the
      change in asset value with bonuses and penalties, divided by MAX_DELTA
      and clipped to [-1, 1].

    :param done: whether the episode has ended.
    :param last_obs: previous raw observation, or None for the first step.
    :param obs: current raw observation.
    :param config: game configuration (``spawnCost``, ``convertCost``).
    :return: the reward for this step.
    """
    board = Board(obs, config)
    me = board.current_player

    nships, nyards = len(me.ships), len(me.shipyards)
    halite = me.halite
    cargo = sum(ship.halite for ship in me.ships)

    # No ships left and no way to get a new one: the player is out.
    if nships == 0 and (nyards == 0 or halite < config.spawnCost):
        return REWARD_LOST

    if done:
        # Only players that still have ships, or can spawn one, are ranked.
        scores = []
        for player in board.players.values():
            if len(player.ships) > 0 or (len(player.shipyards) > 0 and
                                         player.halite >= config.spawnCost):
                scores.append(player.halite)

        sole_leader = halite == max(scores) and scores.count(halite) == 1
        return REWARD_WON if sole_leader else REWARD_LOST

    if last_obs is None:
        delta = 0
    else:
        last_me = Board(last_obs, config).current_player
        last_cargo = sum(ship.halite for ship in last_me.ships)

        delta_ships = (nships - len(last_me.ships)) * config.spawnCost
        delta_yards = (nyards - len(last_me.shipyards)) * (config.convertCost + config.spawnCost)
        delta_halite = halite - last_me.halite
        delta_cargo = cargo - last_cargo

        delta = delta_ships + delta_yards + delta_halite + delta_cargo

        # Bonuses for banking / collecting halite, penalties for owning no yard or no ship.
        adjustments = ((delta_halite > 0, MAX_DELTA),
                       (delta_cargo > 0, MAX_DELTA // 2),
                       (nyards == 0, -MAX_DELTA),
                       (nships == 0, -MAX_DELTA))
        for applies, amount in adjustments:
            if applies:
                delta += amount

        delta = float(np.clip(delta / MAX_DELTA, -1, 1))

    return delta + 1 / MAX_DELTA

In [None]:
def get_actions(model, obs, config, deterministic=False):
    """
    Ask the model for the next Halite commands given a raw observation.

    :param model: object with a ``predict(x, deterministic=...)`` method returning ``(actions, state)``.
    :param obs: raw observation of the current step.
    :param config: game configuration.
    :param deterministic: forwarded to ``model.predict``.
    :return: the commands produced by ``transform_actions``.
    """
    # The observation is always encoded as a non-final step here.
    features = transform_observation(False, obs, config)
    predicted, _ = model.predict(features, deterministic=deterministic)
    return transform_actions(predicted, obs, config)

In [None]:
class HaliteGym(Env):
    """
    Gym environment that lets one learning agent play Halite against scripted bots.

    Each episode samples the opponents from GAME_AGENTS and places the learning
    agent at a random seat. Observations, actions and rewards are converted by
    ``transform_observation``, ``transform_actions`` and ``transform_reward``.
    """

    def __init__(self, config=None):
        """
        :param config: (dict or None) optional overrides for ``size``, ``num_agents``
            and ``episodeSteps``; missing keys fall back to 21, 4 and 400.
        """
        # None instead of a shared mutable `{}` default.
        config = {} if config is None else config

        self.agents = GAME_AGENTS

        size = config.get('size', 21)
        num_agents = config.get('num_agents', 4)
        episodeSteps = config.get('episodeSteps', 400)

        cfg = {'size': size, 'num_agents': num_agents, 'episodeSteps': episodeSteps}

        self.halite_env = make('halite', configuration=cfg, debug=True)
        self.config = self.halite_env.configuration

        self.env = None
        self.obs = self.reset_env()
        self.last_obs = None

        # One discrete choice per ship slot, followed by one per shipyard slot.
        self.action_space = spaces.MultiDiscrete([N_SHIP_ACTIONS] * MAX_SHIPS +
                                                 [N_YARD_ACTIONS] * MAX_YARDS)

        self.observation_space = spaces.Box(low=0, high=1,
                                            shape=(self.config.size,
                                                   self.config.size,
                                                   N_FEATURES),
                                            dtype=np.float32)

        self.reward_range = (REWARD_LOST, REWARD_WON)

        self.spec = None
        self.metadata = None

    def reset_env(self):
        """
        Start a new game against freshly sampled opponents.

        :return: the raw initial observation of the learning agent.
        """
        game_agents = random.sample(self.agents, self.config.num_agents-1)
        # None marks the seat of the learning agent.
        position = random.randint(0, self.config.num_agents-1)
        game_agents.insert(position, None)
        self.env = self.halite_env.train(game_agents)
        return self.env.reset()

    def reset(self):
        """
        Reset the environment.

        :return: the encoded initial observation.
        """
        self.obs = self.reset_env()
        self.last_obs = None
        x_obs = transform_observation(False, self.obs, self.config)
        return x_obs

    def step(self, actions):
        """
        Apply the policy's action vector and advance the game by one step.

        :param actions: action vector matching ``action_space``.
        :return: (observation, reward, done, info) with the encoded observation and shaped reward.
        """
        next_actions = transform_actions(actions, self.obs, self.config)

        self.last_obs = self.obs
        self.obs, reward, done, info = self.env.step(next_actions)

        x_reward = transform_reward(done, self.last_obs, self.obs, self.config)

        # A losing reward ends the episode even if the game itself goes on.
        if x_reward <= REWARD_LOST:
            done, info = True, {}

        # Encode the observation after `done` is final so that its final-step
        # flag agrees with the `done` value returned to the caller.
        x_obs = transform_observation(done, self.obs, self.config)

        return x_obs, x_reward, done, info

# Multi-Agent Environment: 
# https://github.com/openai/multiagent-particle-envs/blob/master/multiagent/environment.py

In [None]:
# Run stable-baselines' environment checker on the custom environment.
# NOTE(review): HaliteGym() is built with its fallback defaults here, not with
# GAME_CONFIG, so the checked board differs from the one used for training — confirm intended.
test_env = HaliteGym()
check_env(test_env)

In [None]:
def make_env(config, rank=0):
    """
    Build a factory that creates one monitored HaliteGym environment.

    :param config: configuration dict passed to ``HaliteGym``.
    :param rank: worker index; its string form is the monitor log name inside LOG_DIR.
    :return: (callable) zero-argument function returning the wrapped environment.
    """
    def _init():
        return Monitor(HaliteGym(config),
                       os.path.join(LOG_DIR, str(rank)),
                       allow_early_resets=True)

    return _init

In [None]:
# One monitored HaliteGym per CPU core, each in its own subprocess; the index
# is passed as `rank` and names that worker's monitor log.
env = SubprocVecEnv([make_env(GAME_CONFIG, i) for i in range(N_CPU)])
# Single-process alternative:
# env = Monitor(HaliteGym(GAME_CONFIG), LOG_DIR, allow_early_resets=True)

In [None]:
model_path = os.path.join(MODEL_DIR, MODEL_FILE)

# Continue from a previously saved model when one is available, otherwise start fresh.
if os.path.isfile(model_path):
    print('Loading model')
    model = PPO2.load(model_path)
    # Attach the vectorized environment created above to the loaded model.
    model.set_env(env)
    
else:
    print('Making model')
    # n_steps is one full episode per environment per update, and the number
    # of minibatches equals the number of environments (N_CPU).
    model = PPO2(policy=MlpPolicy, 
                 env=env, 
                 n_cpu_tf_sess=N_CPU, 
                 verbose=0, 
                 n_steps=GAME_CONFIG['episodeSteps'], 
                 nminibatches=N_CPU, 
                 noptepochs=4, 
                 seed=None,
                 _init_setup_model=True, 
                 learning_rate=0.00025,
                 gamma=0.99, 
                 ent_coef=0.01,  
                 vf_coef=0.5, 
                 max_grad_norm=0.5, 
                 lam=0.95, 
                 cliprange=0.2, 
                 cliprange_vf=None,
                 policy_kwargs=None, 
                 tensorboard_log=None, 
                 full_tensorboard_log=False)

In [None]:
class ProgressBar(BaseCallback):
    """Training callback that shows a tqdm bar advancing by one after every rollout."""

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Created when training starts and dropped again when it ends.
        self.pbar = None

    def _on_training_start(self):
        """Create the bar; its total is the ``n_updates`` value found in the training locals."""
        total_updates = self.locals['n_updates']
        self.pbar = tqdm(total=total_updates)

    def _on_rollout_start(self):
        """Redraw the bar at the start of each rollout."""
        self.pbar.refresh()

    def _on_step(self):
        """Never interrupt training."""
        return True

    def _on_rollout_end(self):
        """Advance the bar by one."""
        self.pbar.update()

    def _on_training_end(self):
        """Close the bar and release it."""
        self.pbar.close()
        self.pbar = None

In [None]:
# Total number of environment steps to train for.
TIMESTEPS = 400000
progressbar = ProgressBar()
# Train the model; the callback displays a progress bar while training runs.
model = model.learn(total_timesteps=TIMESTEPS, callback=progressbar)

In [None]:
# Save the trained model in the working directory under MODEL_FILE.
model.save(MODEL_FILE)

In [None]:
plt.style.use(['seaborn-whitegrid'])

# Plot the monitor logs found in LOG_DIR, first against timesteps, then against episodes.
results_plotter.plot_results([LOG_DIR], TIMESTEPS, 
                             results_plotter.X_TIMESTEPS, 'Halite Timesteps')

results_plotter.plot_results([LOG_DIR], TIMESTEPS, 
                             results_plotter.X_EPISODES, 'Halite Episodes')

In [None]:
# One monitor CSV per worker, named after the rank passed to make_env.
log_files = [os.path.join(LOG_DIR, f'{i}.monitor.csv') for i in range(N_CPU)]
# log_files = [os.path.join(LOG_DIR, 'monitor.csv')]

# For every log that exists, plot rolling means of the 'r' and 'l' columns side by side.
for i, log_file in enumerate(log_files):
  if os.path.isfile(log_file):
    # The first line of the file is skipped before the CSV header is read.
    df = pd.read_csv(log_file, skiprows=1)

    fig = plt.figure(figsize=(8, 2))
    plt.subplot(1, 2, 1, label=log_file)
    # Rolling window of TIMESTEPS//1000 rows.
    df['r'].rolling(window=TIMESTEPS//1000).mean().plot(title=f'Rewards {i}')

    plt.subplot(1, 2, 2, label=log_file)
    df['l'].rolling(window=TIMESTEPS//1000).mean().plot(title=f'Lengths {i}')

    plt.tight_layout()
    plt.show()

In [None]:
def run_test(model, deterministic=False):
    """
    Play one game with the model against sampled opponents and render the result.

    :param model: trained model passed to ``get_actions``.
    :param deterministic: forwarded to ``get_actions``.
    """
    env = make('halite', configuration=GAME_CONFIG, debug=True)
    config = env.configuration

    # Sample the opponents and put the model (None) at a random seat.
    game_agents = random.sample(GAME_AGENTS, config.num_agents-1)
    seat = random.randint(0, config.num_agents-1)
    game_agents.insert(seat, None)
    print('Agents:', game_agents)

    trainer = env.train(game_agents)
    obs = trainer.reset()

    while not env.done:
        commands = get_actions(model, obs, config, deterministic=deterministic)
        # Only the next observation is needed; the loop ends on env.done.
        obs, _reward, _done, _info = trainer.step(commands)

    env.render(mode='ipython', width=640, height=480)

In [None]:
# Play and render one game with deterministic action selection.
run_test(model, deterministic=True)

In [None]:
# Play and render one game with non-deterministic action selection.
run_test(model, deterministic=False)