The code in this notebook is mainly borrowed from https://www.kaggle.com/yuricat/smart-geese-trained-by-reinforcement-learning 

Thank you for sharing this code publicly!

Install libraries
```bash
git clone https://github.com/DeNA/HandyRL.git
cd HandyRL
pip3 install -r requirements.txt
pip3 install -r handyrl/envs/kaggle/requirements.txt
```

In [None]:
# requires Internet Access.
!git clone https://github.com/DeNA/HandyRL.git
!pip install -r HandyRL/requirements.txt
!pip install -r HandyRL/handyrl/envs/kaggle/requirements.txt

Place this file in the directory from which you run main.py.

`config.yaml`
```yaml

env_args:
    #env: 'TicTacToe'
    #env: 'Geister'
    env: 'HungryGeese'
    source: 'handyrl.envs.kaggle.hungry_geese'
    #env: 'handyrl.envs.parallel_tictactoe'  # specify by path

train_args:
    turn_based_training: False
    observation: True
    gamma: 0.8
    forward_steps: 32
    compress_steps: 4
    entropy_regularization: 2.0e-3
    entropy_regularization_decay: 0.3
    update_episodes: 500
    batch_size: 128
    minimum_episodes: 10000
    maximum_episodes: 50000
    epochs: 10 # increase as needed
    num_batchers: 2
    eval_rate: 0.1
    worker:
        num_parallel: 6
    lambda: 0.7
    policy_target: 'TD' # 'UPGO' 'VTRACE' 'TD' 'MC'
    value_target: 'TD' # 'VTRACE' 'TD' 'MC'
    seed: 0
    restart_epoch: 0


```
Refer to https://github.com/DeNA/HandyRL/blob/master/docs/parameters.md for the meaning of each parameter.

Refer to https://www.kaggle.com/c/hungry-geese/discussion/218190 for publicly available models and parameters

Change the parallel-processing settings (`num_batchers`, `worker.num_parallel`) according to your hardware limitations.

In [None]:
%%writefile config.yaml
# HandyRL settings written to config.yaml by this cell.
# Parameter meanings: https://github.com/DeNA/HandyRL/blob/master/docs/parameters.md
env_args:
    # Active environment is HungryGeese; the commented-out entries are alternatives.
    #env: 'TicTacToe'
    #env: 'Geister'
    env: 'HungryGeese'
    source: 'handyrl.envs.kaggle.hungry_geese'
    #env: 'handyrl.envs.parallel_tictactoe'  # specify by path

train_args:
    turn_based_training: False
    observation: True
    gamma: 0.8
    forward_steps: 32
    compress_steps: 4
    entropy_regularization: 2.0e-3
    entropy_regularization_decay: 0.3
    update_episodes: 500
    batch_size: 128
    minimum_episodes: 10000
    maximum_episodes: 80000
    epochs: 50 # Set more epochs to learn more. -1 for running forever.
    # num_batchers and worker.num_parallel control parallel processing;
    # adjust them to the hardware available.
    num_batchers: 4
    eval_rate: 0.1
    worker:
        num_parallel: 4
    lambda: 0.7
    policy_target: 'TD' # 'UPGO' 'VTRACE' 'TD' 'MC'
    value_target: 'TD' # 'VTRACE' 'TD' 'MC'
    seed: 0
    restart_epoch: 0


Train using

```bash
python3 main.py --train
```

Models are saved at `models/*.pth`

By default, models are evaluated against a random player. You may change the opponent to GreedyAgent.



In [None]:
%run HandyRL/main.py --train

In [None]:
!ls models

In [None]:
import torch
import pickle
import bz2
import base64

# Load the trained weights onto the CPU regardless of the device they were
# saved from, so that the pickled payload built below can also be unpickled
# on a CPU-only machine.
state_dict = torch.load('models/latest.pth', map_location='cpu') # using latest.pth, you could also use i.pth (i=0, 1, 2, ...)
# Save model parameters as base64 for submission
PARAM = base64.b64encode(bz2.compress(pickle.dumps(state_dict)))

# Save param
# with open("PARAM.txt", "w") as f:
#     f.write(repr(PARAM))

# From here on PARAM is the *source text* of a bytes literal, ready to be
# pasted into a Python file.
PARAM = repr(PARAM) # prepend `b'` and append `'`
print(PARAM[:10] + "..." + PARAM[-10:])

In [None]:
%%writefile submission.py

# This is a lightweight ML agent trained by self-play.
# https://github.com/DeNA/HandyRL
# copied from https://www.kaggle.com/yuricat/smart-geese-trained-by-reinforcement-learning


import pickle
import bz2
import base64
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


# Neural Network for Hungry Geese

class TorusConv2d(nn.Module):
    """2-D convolution with wrap-around (torus) padding and optional batch norm.

    The input is padded circularly along height and width by half the kernel
    size (integer division) before an unpadded ``nn.Conv2d`` is applied, so
    for odd kernel sizes the output keeps the input's spatial size.

    Args:
        input_dim: Number of input channels.
        output_dim: Number of output channels.
        kernel_size: ``(height, width)`` pair of kernel sizes.
        bn: If truthy, apply ``nn.BatchNorm2d`` to the convolution output.
    """

    def __init__(self, input_dim, output_dim, kernel_size, bn):
        super().__init__()
        self.edge_size = (kernel_size[0] // 2, kernel_size[1] // 2)
        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size=kernel_size)
        self.bn = nn.BatchNorm2d(output_dim) if bn else None

    def forward(self, x):
        """Apply the convolution (and batch norm, if enabled) to ``x``.

        Args:
            x: Tensor indexed as (batch, channel, height, width).

        Returns:
            The convolved tensor, batch-normalised when ``bn`` was requested.
        """
        pad_h, pad_w = self.edge_size
        # F.pad lists the last dimension first: (left, right, top, bottom).
        # Circular padding is also correct when a pad is 0, whereas slicing
        # with ``x[..., -0:]`` would select (and duplicate) the whole axis.
        h = F.pad(x, (pad_w, pad_w, pad_h, pad_h), mode='circular')
        h = self.conv(h)
        return h if self.bn is None else self.bn(h)


class GeeseNet(nn.Module):
    """Residual torus-convolution network with a policy head and a value head.

    The network takes a 17-channel board tensor and returns a dict with
    ``'policy'`` (4 logits per sample) and ``'value'`` (one tanh-squashed
    scalar per sample).
    """

    def __init__(self):
        super().__init__()
        num_blocks = 12
        channels = 32
        kernel = (3, 3)
        self.conv0 = TorusConv2d(17, channels, kernel, True)
        residual_convs = [
            TorusConv2d(channels, channels, kernel, True)
            for _ in range(num_blocks)
        ]
        self.blocks = nn.ModuleList(residual_convs)
        self.head_p = nn.Linear(channels, 4, bias=False)
        self.head_v = nn.Linear(channels * 2, 1, bias=False)

    def forward(self, x):
        """Compute policy logits and a value estimate for a batch of boards.

        Args:
            x: Input tensor indexed as (batch, channel, height, width).

        Returns:
            ``{'policy': ..., 'value': ...}``.
        """
        feat = F.relu_(self.conv0(x))
        for block in self.blocks:
            # Residual connection around each torus convolution.
            feat = F.relu_(feat + block(feat))

        batch, channels = feat.size(0), feat.size(1)
        # Channel 0 of the input masks the features before spatial summation.
        first_plane = x[:, :1]
        masked_sum = (feat * first_plane).view(batch, channels, -1).sum(-1)
        spatial_mean = feat.view(batch, channels, -1).mean(-1)

        policy = self.head_p(masked_sum)
        value_in = torch.cat([masked_sum, spatial_mean], 1)
        value = torch.tanh(self.head_v(value_in))

        return {'policy': policy, 'value': value}


# Input for Neural Network

def make_input(obses):
    """Encode the observation history as a (17, 7, 11) float32 array.

    Only the last observation and, if present, the one before it are read.
    Player planes are rotated so that the observing player (``obs['index']``)
    always occupies offset 0 within each group of four planes.

    Plane layout:
        0-3   head cell of each goose
        4-7   last cell of each goose
        8-11  every cell of each goose
        12-15 head cell of each goose in the previous observation
        16    food cells

    Args:
        obses: Non-empty sequence of observations, each supporting the keys
            ``'geese'``, ``'index'`` and ``'food'``.

    Returns:
        ``numpy.ndarray`` of shape (17, 7, 11) and dtype float32.
    """
    current = obses[-1]
    me = current['index']
    planes = np.zeros((17, 7 * 11), dtype=np.float32)

    for player, cells in enumerate(current['geese']):
        if not cells:
            # An empty cell list contributes nothing.
            continue
        rel = (player - me) % 4
        planes[rel, cells[0]] = 1
        planes[4 + rel, cells[-1]] = 1
        for cell in cells:
            planes[8 + rel, cell] = 1

    if len(obses) > 1:
        previous = obses[-2]
        for player, cells in enumerate(previous['geese']):
            if cells:
                planes[12 + (player - me) % 4, cells[0]] = 1

    for cell in current['food']:
        planes[16, cell] = 1

    return planes.reshape(-1, 7, 11)


# Load PyTorch Model

# The token on the right-hand side below is a template placeholder, not valid
# Python. A later notebook cell substitutes it with a bytes literal holding the
# base64-encoded, bz2-compressed, pickled state dict.
PARAM = %PARAM%

# NOTE: pickle.loads can execute arbitrary code, so only embed parameters
# that you produced yourself.
state_dict = pickle.loads(bz2.decompress(base64.b64decode(PARAM)))
model = GeeseNet()
model.load_state_dict(state_dict)
# Inference only: put the network (including its BatchNorm2d layers) in
# evaluation mode.
model.eval()


# Main Function of Agent

# History of every observation passed to agent(), oldest first.
obses = []


def agent(obs, _):
    """Return the move whose policy output is largest for the current board.

    The observation is appended to the module-level ``obses`` history, the
    history is encoded with ``make_input`` and the module-level ``model`` is
    evaluated without gradient tracking.

    Args:
        obs: Current observation.
        _: Unused second positional argument.

    Returns:
        One of ``'NORTH'``, ``'SOUTH'``, ``'WEST'``, ``'EAST'``.
    """
    obses.append(obs)
    # Add a leading batch dimension of size 1.
    features = torch.from_numpy(make_input(obses)).unsqueeze(0)
    with torch.no_grad():
        output = model(features)
    logits = output['policy'].squeeze(0).detach().numpy()

    action_names = ('NORTH', 'SOUTH', 'WEST', 'EAST')
    return action_names[np.argmax(logits)]

In [None]:
# Substitute the placeholder in submission.py with the encoded parameters,
# rewriting the file in place.
with open('submission.py', 'r+') as f:
    source = f.read()
    f.seek(0)  # file offsets are integers
    f.write(source.replace('%PARAM%', PARAM))
    # Drop any leftover bytes in case the new content is shorter than the old.
    f.truncate()

In [None]:
from kaggle_environments import make
# Create a Hungry Geese environment with debug output enabled.
env = make("hungry_geese", debug=True)

env.reset()
# Run one episode: two 'greedy' agents against two copies of submission.py.
env.run(['greedy', 'greedy', 'submission.py', 'submission.py'])
# Show the episode inline in the notebook.
env.render(mode="ipython", width=800, height=700)

In [None]:
# Display the environment's state as the cell output.
env.state