# Smash that upvote button if you learned something new!

In [None]:
!pip install tf-agents

# Imports

In [None]:
from abc import ABC
from random import choice
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import *

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image


# Build the training env

In [None]:
# Switch TensorFlow to its 2.x behavior before any TF-Agents objects are built.
tf.compat.v1.enable_v2_behavior()


def get_board(env):
    """Flatten the current game state into a 1-D numeric board.

    The grid size comes from ``env.configuration``; food and goose positions
    come from the first agent's observation (``env.state[0].observation``).

    Cell encoding: 10 = empty, 5 = food, ``i`` = any segment of goose ``i``.
    Geese are written after food, so a goose wins when both share a cell.

    Returns:
        NumPy integer array of length ``columns * rows``.
    """
    EMPTY_CELL, FOOD_CELL = 10, 5

    grid = env.configuration
    board = np.full([grid.columns * grid.rows], EMPTY_CELL, dtype=int)

    observation = env.state[0].observation

    # One (value, cells) layer per thing drawn on the board, in paint order.
    layers = [(FOOD_CELL, observation.food)]
    layers.extend(enumerate(observation.geese))

    for value, cells in layers:
        for cell in cells:
            board[cell] = value

    return board


class GeeseEnv(py_environment.PyEnvironment):
    """TF-Agents Python environment wrapping a two-player Hungry Geese game.

    The learning agent controls goose 0; the second goose picks a uniformly
    random direction every step. Observations are the flat board produced by
    ``get_board``, shaped ``(1, 1, columns * rows)`` with dtype int32.
    Actions are integers 0..3 mapped to NORTH, SOUTH, WEST, EAST.
    """

    # Order matters: the action index chosen by the policy is looked up here.
    _DIRECTIONS = ['NORTH', 'SOUTH', 'WEST', 'EAST']

    def __init__(self):
        super().__init__()

        self._env = make("hungry_geese")
        # The number of agents
        self._NUM_AGENTS = 2

        # Reset once so the board size is known when building the specs.
        self._env.reset(num_agents=self._NUM_AGENTS)

        # Always the flat 1-D board; it is wrapped to the spec shape on output.
        self._state = get_board(self._env)

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=3, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1, 1, self._state.shape[0]), dtype=np.int32, minimum=0, maximum=10,
            name='observation')
        self._episode_ended = False

    def action_spec(self):
        """Return the spec of the scalar action (an int in 0..3)."""
        return self._action_spec

    def observation_spec(self):
        """Return the spec of the (1, 1, board_size) int32 observation."""
        return self._observation_spec

    def _observation(self):
        """Return the cached board as an int32 array matching the observation spec."""
        return np.array([[self._state]], dtype=np.int32)

    def _reset(self):
        """Start a new game and return its first time step."""
        self._env.reset(num_agents=self._NUM_AGENTS)
        self._state = get_board(self._env)
        self._episode_ended = False
        return ts.restart(self._observation())

    def _step(self, action):
        """Play ``action`` for goose 0 (and a random move for the opponent).

        Args:
            action: Integer index into NORTH, SOUTH, WEST, EAST.

        Returns:
            A transition time step, or a termination time step once the
            underlying game reports it is done.
        """
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        actions = [self._DIRECTIONS[int(action)], choice(self._DIRECTIONS)]
        self._env.step(actions)

        # Snapshot the board AFTER the move so the returned observation is the
        # state that resulted from `action`, not the one that preceded it.
        self._state = get_board(self._env)

        # NOTE(review): this is the reward field of the latest step for agent 0,
        # used as-is; confirm whether it is incremental or cumulative.
        reward = self._env.steps[-1][0].reward
        if reward is None:
            # Presumably only reported for errored agents; a None reward cannot
            # be converted to a tensor, so treat it as 0 — TODO confirm.
            reward = 0

        self._episode_ended = bool(self._env.done)

        observation = self._observation()
        if self._episode_ended:
            return ts.termination(observation, reward)
        return ts.transition(observation, reward=reward, discount=1.0)


In [None]:
# Build one environment and dump its specs as a quick sanity check.
env = GeeseEnv()
print('action_spec:', env.action_spec())
for field_name in ('observation', 'step_type', 'discount', 'reward'):
    print('time_step_spec.' + field_name + ':',
          getattr(env.time_step_spec(), field_name))

# Build the Bot and Train
Feel free to experiment with the hyperparameters. This code is adapted from the [TF-Agents DQN tutorial](https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial).

In [None]:
# Number of passes through the training loop (one train call per pass).
num_iterations = 10000  # @param {type:"integer"}

# Steps collected with the random policy to seed the replay buffer.
initial_collect_steps = 100  # @param {type:"integer"}
# Environment steps collected with the agent's collect policy per iteration.
collect_steps_per_iteration = 1  # @param {type:"integer"}
# Passed as `max_length` when the replay buffer is created.
replay_buffer_max_length = 100000  # @param {type:"integer"}

# Sample batch size used when reading the replay buffer as a dataset.
batch_size = 64  # @param {type:"integer"}
# Learning rate handed to the Adam optimizer.
learning_rate = 1e-3  # @param {type:"number"}
# Print the training loss every `log_interval` train steps.
log_interval = 200  # @param {type:"integer"}

# Episodes averaged by each call to compute_avg_return.
num_eval_episodes = 10  # @param {type:"integer"}
# Evaluate (and save the policy) every `eval_interval` train steps.
eval_interval = 1000  # @param {type:"integer"}

In [None]:

# Repeated from the environment cell so this cell can be run on its own.
tf.compat.v1.enable_v2_behavior()

# Separate environment instances for training and evaluation, so evaluation
# episodes do not disturb the game the collector is in the middle of.
train_py_env = GeeseEnv()
eval_py_env = GeeseEnv()

# Wrap the Python environments so they can be used with TF policies/agents.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# A single hidden fully-connected layer of 1000 units.
fc_layer_params = (1000,)

# Q-network built from the environment's observation and action specs.
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# Counter handed to the agent; read in the training loop to drive logging/eval.
train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

# `agent.policy` is used for evaluation/saving, `agent.collect_policy` for
# gathering training data.
eval_policy = agent.policy
collect_policy = agent.collect_policy

# Uniformly random policy: used for the baseline return and to seed the buffer.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())


def compute_avg_return(environment, policy, num_episodes=10):
    """Average the undiscounted episode return of ``policy``.

    Runs ``num_episodes`` full episodes on ``environment``, sums the reward of
    every step taken, and divides the grand total by ``num_episodes``.

    Args:
        environment: Object with ``reset()`` and ``step(action)`` returning
            time steps that expose ``is_last()`` and ``reward``.
        policy: Object whose ``action(time_step)`` result has an ``action``.
        num_episodes: Number of episodes to average over.

    Returns:
        The first element of the averaged return converted with ``.numpy()``.
    """
    def play_one_episode():
        # Roll a single episode to completion and hand back its summed reward.
        current = environment.reset()
        collected = 0.0
        while True:
            if current.is_last():
                return collected
            current = environment.step(policy.action(current).action)
            collected += current.reward

    return_sum = 0.0
    for _episode in range(num_episodes):
        return_sum += play_one_episode()

    return (return_sum / num_episodes).numpy()[0]


# Baseline: average return of the random policy (the value is displayed as the
# cell result only when this is the last expression of a cell).
compute_avg_return(eval_env, random_policy, num_eval_episodes)

# Replay buffer storing trajectories that match the agent's collect data spec.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)


def collect_step(environment, policy, buffer):
    """Take one environment step with ``policy`` and store it in ``buffer``.

    The environment's current time step, the policy's action for it, and the
    resulting time step are combined with ``trajectory.from_transition`` and
    appended to the buffer via ``add_batch``.
    """
    before = environment.current_time_step()
    decision = policy.action(before)
    after = environment.step(decision.action)

    buffer.add_batch(trajectory.from_transition(before, decision, after))


def collect_data(env, policy, buffer, steps):
    """Call ``collect_step`` ``steps`` times, filling ``buffer`` from ``env``."""
    step_indices = range(steps)
    for _step_index in step_indices:
        collect_step(env, policy, buffer)


# Seed the replay buffer with experience gathered by the random policy.
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

# The manual collect loop above is common enough in RL that TF-Agents ships
# standard implementations of it. For more details see the drivers module.
# https://www.tensorflow.org/agents/api_docs/python/tf_agents/drivers


# Read the buffer as a dataset of batches; num_steps=2 requests two steps per
# sampled item (presumably the current and next step the DQN loss needs).
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

iterator = iter(dataset)

# Debug output: shows the iterator object, not any data.
print(iterator)

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

# Saver used to export agent.policy at every evaluation point.
tf_policy_saver = policy_saver.PolicySaver(agent.policy)

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Current value of the agent's train step counter; drives logging and eval.
    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)
        # Export the policy to the same path each time; this is the directory
        # the later cells load and package.
        tf_policy_saver.save('sub/current_policy')


# Load the Saved Policy

In [None]:
# Reload the policy that the training loop exported to sub/current_policy.
saved_policy = tf.compat.v2.saved_model.load('sub/current_policy')

# Test the Policy with an empty state

In [None]:
# Smoke test: feed the reloaded policy a hand-built, all-zero observation.
from tf_agents.trajectories import time_step as ts

# 77 is hard-coded; presumably the default board size (columns * rows) that
# the environment's observation spec was built from — verify if the board changes.
blank_board = np.zeros([1,1,77], dtype=np.int32)
print(blank_board.shape)
# Every tensor below carries a leading batch dimension of size 1.
# NOTE(review): step_type 0 is presumably StepType.FIRST — confirm.
step_type = tf.convert_to_tensor(
    [0], dtype=tf.int32, name='step_type')
reward = tf.convert_to_tensor(
    [0], dtype=tf.float32, name='reward')
discount = tf.convert_to_tensor(
    [1], dtype=tf.float32, name='discount')
observations = tf.convert_to_tensor(
    [blank_board], dtype=tf.int32, name='observations')
timestep = ts.TimeStep(step_type, reward, discount, observations)

# `time_step` is not read again in this cell; only `timestep` is used below.
time_step = None
action_step = saved_policy.action(timestep)
print(action_step)


# Final File

In [None]:
%%writefile sub/main.py

# NOTE: tf-agents must already be installed in the runtime that loads this file.
# A notebook shell escape ("!pip install ...") is not valid Python; because
# %%writefile copies this cell verbatim into sub/main.py, leaving it here would
# make the submission fail with a SyntaxError on import.

from abc import ABC
from random import choice
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.policies import policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import *

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image


from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col

import os

# Kaggle is expected to unpack a submission archive under this directory
# (TODO confirm against the competition docs). When the policy is found there
# it is loaded from that location; otherwise fall back to the notebook-relative
# path so local runs behave exactly as before.
_KAGGLE_POLICY_DIR = os.path.join('/kaggle_simulations/agent', 'current_policy')
_POLICY_DIR = (
    _KAGGLE_POLICY_DIR if os.path.isdir(_KAGGLE_POLICY_DIR)
    else 'sub/current_policy'
)

saved_policy = tf.compat.v2.saved_model.load(_POLICY_DIR)

def get_board(ob, co):
    """Encode an observation as the flat numeric board the policy was trained on.

    Cell encoding: 10 = empty, 5 = food, otherwise a goose label. Geese are
    written after food, so a goose wins when both share a cell.

    During training the learning goose always had index 0. In a real match
    this agent may be given any index, so the labels of goose 0 and of our
    own goose (``ob.index``) are swapped: our goose is always encoded as 0.
    When ``ob`` has no ``index`` (or it is 0) the labels are the raw indices.

    Args:
        ob: Observation exposing ``food``, ``geese`` and optionally ``index``.
        co: Configuration exposing ``columns`` and ``rows``.

    Returns:
        NumPy integer array of length ``columns * rows``.
    """
    empty_cell = 10
    food_cell = 5

    numeric_board = np.full([co.columns * co.rows], empty_cell, dtype=int)

    for pos in ob.food:
        numeric_board[pos] = food_cell

    own_index = getattr(ob, 'index', 0)
    if own_index is None:
        own_index = 0

    for index, goose in enumerate(ob.geese):
        if index == own_index:
            label = 0
        elif index == 0:
            # Goose 0 takes over the label our goose gave up.
            label = own_index
        else:
            label = index
        for position in goose:
            numeric_board[position] = label

    return numeric_board

def agent(obs_dict, config_dict):
    """Choose the next move for our goose using the saved DQN policy.

    The observation is encoded with ``get_board``, wrapped in a batch-of-one
    ``TimeStep`` and passed to ``saved_policy``; the returned action index is
    mapped to a direction name.

    Args:
        obs_dict: Observation supplied by the game (must expose ``food`` and
            ``geese`` as attributes).
        config_dict: Game configuration (must expose ``columns`` and ``rows``).

    Returns:
        One of 'NORTH', 'SOUTH', 'WEST', 'EAST'.
    """
    this_board = np.array([[get_board(obs_dict, config_dict)]])

    # Each tensor carries a leading batch dimension of size 1.
    step_type = tf.convert_to_tensor(
        [0], dtype=tf.int32, name='step_type')
    reward = tf.convert_to_tensor(
        [0], dtype=tf.float32, name='reward')
    discount = tf.convert_to_tensor(
        [1], dtype=tf.float32, name='discount')
    observations = tf.convert_to_tensor(
        [this_board], dtype=tf.int32, name='observations')
    timestep = ts.TimeStep(step_type, reward, discount, observations)

    action = saved_policy.action(timestep)

    # Same order as in training, so the index means the same direction.
    choices = ['NORTH', 'SOUTH', 'WEST', 'EAST']

    # The action tensor is batched; take its single element explicitly rather
    # than relying on int() accepting a one-element array.
    action_index = int(np.asarray(action.action).reshape(-1)[0])

    # Named `direction` so the imported random.choice is not shadowed.
    direction = choices[action_index]

    print(f"choice:{direction}")
    return direction

# Execute the Submission

In [None]:
from kaggle_environments import evaluate, make, utils

# Setup a hungry_geese environment.
env = make("hungry_geese", debug = True)
# `%%writefile` only writes sub/main.py; it does not execute the cell, so the
# notebook-level name `agent` still refers to the DqnAgent built for training.
# Run the written file itself, which is also what gets submitted.
env.run(["sub/main.py", "random"])
env.render(mode="ipython", width=600, height=650)

# Compress to submit

In [None]:
import tarfile
import os.path

def make_tarfile(output_filename, source_dir):
    """Write ``source_dir`` into a gzip-compressed tar at ``output_filename``.

    The directory is stored under ``os.path.basename(source_dir)``. Note that
    the basename of a path ending in a slash is the empty string, so in that
    case the contents presumably land at the archive root — verify if the
    layout matters.
    """
    with tarfile.open(name=output_filename, mode="w:gz") as archive:
        root_name = os.path.basename(source_dir)
        archive.add(source_dir, arcname=root_name)

# Package everything under ./sub/ (main.py and the saved policy) for upload.
make_tarfile('submission.tar.gz', './sub/')