# git clone libraries

To submit an agent built with TF-Agents to Kaggle, the following four items are needed:

1. A Python file that defines the agent (`main.py`)
2. The git-cloned TF-Agents package directory (`tf_agents`)
3. The git-cloned gin-config package directory (`gin`)
4. The trained weights (the saved `model` directory)

I found that these four items have to be bundled together with `tar` into a single `submission.tar.gz`.

Below is a brief example.

---

- V1: init release
- V2: changed params
- V3: added `conv_layer_params` (CNN)

In [None]:
# Clone the TF-Agents repository, keep only its `tf_agents` package directory
# in the working directory, and delete the rest of the checkout.
!git clone https://github.com/tensorflow/agents.git
!mv agents/tf_agents .
!rm -rf agents

In [None]:
# Clone gin-config, keep only its `gin` package directory in the working
# directory, and delete the rest of the checkout.
!git clone https://github.com/google/gin-config
!mv gin-config/gin .
!rm -rf gin-config

# Import

In [None]:
import numpy as np

import tensorflow as tf
print('tf.version:', tf.version.VERSION)

import tf_agents

from kaggle_environments import make

In [None]:
import random
import os
def seed_everything(seed=42):
    """Seed TensorFlow, NumPy and Python's ``random`` with the same value.

    The seed is also stored (as a string) in the ``PYTHONHASHSEED``
    environment variable.
    """
    seeders = (tf.random.set_seed, np.random.seed, random.seed)
    for apply_seed in seeders:
        apply_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


seed_everything()

# Parameters

In [None]:
# for model
fc_layer_params = (512, 256)  # passed to QNetwork as `fc_layer_params`
# for training
learning_rate = 1e-5  # learning rate of the Adam optimizer
replay_buffer_max_length = 10_000  # `max_length` of the replay buffer
batch_size = 64  # `sample_batch_size` when sampling the replay buffer
num_eval_episodes = 10  # episodes requested per compute_avg_return() call

num_iterations = 400_000  # number of train_one_iteration() calls
collect_steps_per_iteration = 1  # `num_steps` of the collect driver
eval_interval = 1_000  # evaluate whenever the train step is a multiple of this

# Mod observation function

In [None]:
def get_board(obs, config, last_position):
    """Encode a Hungry Geese observation as a head-centred 3-channel board.

    NOTE: the source of this function is appended verbatim to ``main.py``
    (via ``inspect.getsource``), so it must stay a single self-contained
    function that depends only on ``np``.

    Channels of the returned board:
        0: cells occupied by the observing goose (``obs['index']``).
        1: inhibited cells - the observing goose's previous head cell (1),
           other geese's bodies (0.5) and other geese's heads (1).
        2: food.

    Args:
        obs: observation mapping with the keys ``'index'``, ``'geese'``
            (one list of flat cell indices per goose, head first) and
            ``'food'`` (list of flat cell indices).
        config: mapping with the keys ``'rows'`` and ``'columns'``.
        last_position: flat cell index of the observing goose's head on the
            previous step, or ``None`` if there is none.

    Returns:
        A tuple ``(board, current_position)``. ``board`` is a ``float16``
        array of shape ``(rows, columns, 3)``. ``current_position`` is the
        flat cell index of the observing goose's head, or ``None`` when that
        goose has no cells left.
    """
    rows, columns = config['rows'], config['columns']
    n_cells = rows * columns
    center = 0
    current_position = None
    X = np.zeros((n_cells, 3))
    # Compare against None explicitly: cell 0 is a valid position but falsy.
    if last_position is not None:
        X[last_position, 1] = 1
    my_index = obs['index']
    # Iterate over however many geese the observation holds instead of
    # assuming exactly four.
    for n, goose in enumerate(obs['geese']):
        if len(goose) == 0:  # goose has no cells left: nothing to draw
            continue
        if n == my_index:  # mine
            X[goose, 0] = 1
            current_position = goose[0]
            center = n_cells // 2 + goose[0] + 1
        else:
            X[goose, 1] = 0.5
            X[goose[0], 1] = 1
    X[obs['food'], 2] = 1
    # Centre the board on my head by cyclically shifting the *flattened*
    # cells, so the head lands on flat index n_cells - n_cells // 2 - 1.
    # Same result as tiling the board three times and slicing a window
    # starting at `center`.
    X = np.roll(X, -center, axis=0)
    X = X.reshape((rows, columns, 3)).astype(np.float16)
    return X, current_position

# Custom environment

In [None]:
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

class GeeseEnv(py_environment.PyEnvironment):
    """TF-Agents Python environment around Kaggle's ``hungry_geese`` game.

    The learner plays the last of four seats against three ``"greedy"``
    opponents. Observations are the ``(rows, columns, 3)`` float16 boards
    built by ``get_board``; actions are integers 0..3 that index
    ``self.choices``.

    Args:
        discount: discount attached to every non-terminal transition
            (defaults to 0.9).
    """

    def __init__(self, discount=0.9):
        # Let the TF-Agents base class set up its own bookkeeping.
        super().__init__()
        self.choices = ['NORTH', 'SOUTH', 'WEST', 'EAST']
        self._env = make("hungry_geese", debug=False)
        # `None` marks the seat controlled through reset()/step() below.
        self.env = self._env.train(["greedy", "greedy", "greedy", None])
        self.config = self._env.configuration
        self.discount = discount
        self.last_position = None
        self._episode_ended = False
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=3, name='action')

        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(self.config.rows, self.config.columns, 3),
            dtype=np.float16, minimum=0, maximum=1,
            name='observation')

    def action_spec(self):
        """Return the spec of the scalar int32 action in [0, 3]."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the (rows, columns, 3) float16 observation."""
        return self._observation_spec

    def _reset(self):
        """Start a new game and return its first time step."""
        obs = self.env.reset()
        # A new episode has no previous head position; passing the stale value
        # from the previous episode would mark an unrelated cell as inhibited.
        state, self.last_position = get_board(obs, self.config, None)
        self._episode_ended = False
        return ts.restart(state)

    def _step(self, action):
        """Apply ``action`` and return the resulting time step.

        If the previous step ended the episode, the action is ignored and a
        fresh episode is started instead.
        """
        if self._episode_ended:
            return self.reset()
        obs, reward, done, _ = self.env.step(self.choices[int(action)])
        state, self.last_position = get_board(obs, self.config, self.last_position)
        # End the episode when the whole game is over, or as soon as the
        # trainer reports this agent as done.
        # NOTE(review): `done` is assumed to be the per-agent "no longer
        # active" flag of kaggle-environments' trainer -- confirm.
        self._episode_ended = bool(done) or bool(self._env.done)
        if self._episode_ended:
            return ts.termination(state, reward)
        return ts.transition(state, reward, discount=self.discount)

In [None]:
# Check environment
from tf_agents.environments import utils

# Exercise a fresh GeeseEnv for 5 episodes with TF-Agents' environment validator.
utils.validate_py_environment(GeeseEnv(), episodes=5)

In [None]:
# Convert python env to tensorflow env
from tf_agents.environments import tf_py_environment

# Two independent environment instances: one for collecting training data,
# one for evaluation.
train_env = tf_py_environment.TFPyEnvironment(GeeseEnv())
eval_env = tf_py_environment.TFPyEnvironment(GeeseEnv())

# Agent

In [None]:
from tf_agents.networks import q_network

# Q-network built from the training environment's specs: convolutional layers
# plus the fully connected layer sizes from `fc_layer_params`.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params,
    # presumably (filters, kernel_size, stride) per layer -- confirm against
    # the QNetwork documentation
    conv_layer_params=[(32,3,1),(64,3,1)],
)

In [None]:
from tf_agents.agents.dqn import dqn_agent
from tf_agents.utils import common

# Adam optimizer using the learning rate from the parameters cell.
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Counter handed to the agent as its train-step counter.
train_step_counter = tf.Variable(0)

# DQN agent driving `q_net`, with an element-wise squared TD-error loss.
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [None]:
def compute_avg_return(environment, policy, num_episodes=10):
    """Return the mean undiscounted episode return of ``policy``.

    Plays ``num_episodes`` complete episodes; each one starts with
    ``environment.reset()`` and runs until a time step reports ``is_last()``.

    Args:
        environment: environment exposing ``reset()`` and ``step(action)``
            that return time steps with ``is_last()`` and ``reward``.
        policy: policy exposing ``action(time_step)`` whose result has an
            ``action`` attribute.
        num_episodes: number of episodes to average over (must be positive).

    Returns:
        The first element of ``.numpy()`` of the averaged return.

    Raises:
        ValueError: if ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError('num_episodes must be a positive integer')

    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0

        # The roll-out must sit inside the episode loop; otherwise only a
        # single episode is played while the sum is still divided by
        # num_episodes.
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

# Data Collection
https://www.tensorflow.org/agents/tutorials/10_checkpointer_policysaver_tutorial?hl=en#data_collection

In [None]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Replay buffer storing items that match the agent's collect data spec,
# capped at `replay_buffer_max_length`.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

In [None]:
from tf_agents.drivers import dynamic_step_driver

# Driver that steps `train_env` with the agent's collect policy for
# `collect_steps_per_iteration` steps per run() and passes the collected data
# to the replay buffer.
collect_driver = dynamic_step_driver.DynamicStepDriver(
    train_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=collect_steps_per_iteration)

# Initial data collection
_ = collect_driver.run()

In [None]:
# Dataset view of the replay buffer: batches of `batch_size` samples, each
# sample spanning 2 steps (`num_steps=2`).
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

# Iterator consumed by the training loop via next(iterator).
iterator = iter(dataset)

# Training

In [None]:
# Save model to ./model directory
import shutil
from tf_agents.policies import policy_saver

best_return = 0


def save_model_if_best(agent, avg_return):
    """Export ``agent.policy`` to ``./model`` when ``avg_return`` is a new best.

    Nothing happens unless ``avg_return`` is strictly greater than the
    module-level ``best_return``. Otherwise any existing ``model`` directory
    is removed, the policy is saved there, a message is printed and
    ``best_return`` is updated.
    """
    global best_return
    is_new_best = avg_return > best_return
    if not is_new_best:
        return

    export_dir = 'model'
    # Start from a clean directory; a missing directory is not an error.
    shutil.rmtree(export_dir, ignore_errors=True)
    policy_saver.PolicySaver(agent.policy, batch_size=None).save(export_dir)
    print(f'saved model, best return={avg_return:,.0f}')
    best_return = avg_return

In [None]:
%%time 

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
# History of evaluation results; the pre-training value is its first entry.
returns = [avg_return]

def train_one_iteration():
    """Collect data, run one training update and evaluate periodically.

    Whenever the agent's train step is a multiple of ``eval_interval`` the
    current policy is evaluated, the result is printed and appended to
    ``returns``, and the policy is saved if it is the best so far.
    """
    # Gather fresh experience into the replay buffer.
    collect_driver.run()

    # Draw one batch from the buffer and update the agent's network with it.
    batch, _ = next(iterator)
    agent.train(batch)

    current_step = agent.train_step_counter.numpy()
    if current_step % eval_interval != 0:
        return

    mean_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1:,.0f}'.format(current_step, mean_return))
    save_model_if_best(agent, mean_return)
    returns.append(mean_return)

# Main training loop: `num_iterations` collect/train iterations.
for _ in range(num_iterations):
    train_one_iteration()

In [None]:
import matplotlib.pyplot as plt

# x-axis: the train steps at which an evaluation was recorded
# (step 0 plus every `eval_interval` steps).
steps = range(0, num_iterations + 1, eval_interval)
plt.ylabel('Average Return')
plt.xlabel('Step')
plt.plot(steps, returns)

In [None]:
# Evaluate the agent's current policy once more after training.
compute_avg_return(eval_env, agent.policy, num_eval_episodes)

# Make submission file

In [None]:
%%writefile main.py

import sys
from pathlib import Path
import numpy as np
import tensorflow as tf

# Base directory for the bundled packages and the saved model.
# presumably the directory the submission is unpacked to on Kaggle -- confirm
p = Path('/kaggle_simulations/agent/')
if p.exists():
    # Make modules located in that directory importable.
    sys.path.append(str(p))
else:
    # NOTE: '__file__' is a string literal here, not the __file__ variable, so
    # this evaluates to the current working directory.
    p = Path('__file__').resolve().parent
    
# tf_agents
from tf_agents.networks import q_network
from tf_agents.trajectories import time_step as ts


In [None]:
# Save get_board function
import inspect

# Append the source text of get_board() to main.py so that the submitted
# agent file contains the same observation encoder.
path = 'main.py'
with open(path, 'a') as f:
    s = inspect.getsource(get_board)
    f.write(s)

In [None]:
%%writefile -a main.py

# Head cell returned by get_board() on the previous call to main();
# module-level so that it is carried from one call to the next.
last_position = None
# Load the exported policy from the `model` directory under `p`.
saved_policy = tf.compat.v2.saved_model.load(str(p/'model'))
policy_state = saved_policy.get_initial_state(batch_size=1)

def main(obs, config):
    """Agent entry point: return the move chosen by the saved policy.

    Encodes ``obs`` with ``get_board`` (updating the module-level
    ``last_position``), queries the saved policy and maps the resulting
    action index to one of 'NORTH', 'SOUTH', 'WEST', 'EAST'.
    """
    global last_position
    state, last_position = get_board(obs, config, last_position)
    # Batch of one time step; presumably the positional fields are
    # (step_type, reward, discount, observation) -- confirm against TF-Agents.
    time_step = ts.TimeStep([0], [0], [0], [state])
    action = saved_policy.action(time_step, policy_state)
    action = ['NORTH', 'SOUTH', 'WEST', 'EAST'][int(action.action)]
    return action

In [None]:
# Play one game locally: three "greedy" agents against the generated main.py,
# then show every agent's reward in the final step.
env = make("hungry_geese", debug=True)
env.reset()
steps = env.run(['greedy', 'greedy', 'greedy', 'main.py'])
[res.reward for res in steps[-1]]

In [None]:
# make submission.tar.gz
# Bundle the saved model, the agent file and the two cloned packages.
!tar -czf submission.tar.gz model main.py gin tf_agents

In [None]:
# clean up
# Remove everything that was packed into the archive, then list what is left.
!rm -rf gin tf_agents model main.py
!ls