In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import sys
import PIL.Image

import tensorflow as tf
import logging

from sklearn import preprocessing
import random
import matplotlib.pyplot as plt
import seaborn as sns

from kaggle_environments import evaluate, make
from kaggle_environments.envs.hungry_geese.hungry_geese import *

In [None]:
# Reproducibility and runtime configuration for the whole notebook.
seed = 123
# Seed every generator the notebook draws from: actions are sampled with
# np.random.choice further down, so seeding TensorFlow alone is not enough.
random.seed(seed)
np.random.seed(seed)
tf.compat.v1.set_random_seed(seed)
# Restrict TensorFlow to one intra-op and one inter-op thread
# (presumably to reduce run-to-run nondeterminism — confirm).
session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
# Turn off every logging call, whatever its level.
logging.disable(sys.maxsize)

In [None]:
# Play one complete demo game between two built-in "random" agents and draw
# the replay inline in the notebook.
env = make("hungry_geese", debug=True)
env.run(["random","random"])
env.render(mode="ipython",width=800, height=600)

In [None]:
# Display the configuration of the environment created above.
env.configuration

In [None]:
# Display the full environment specification.
env.specification

In [None]:
# Display the reward part of the specification.
env.specification.reward

In [None]:
# Display the action part of the specification.
env.specification.action

In [None]:
# Display the observation part of the specification.
env.specification.observation

In [None]:
%%writefile simple_agent.py
# NOTE(review): bots_stats is not read or written by any code visible in this
# notebook — confirm whether it is still needed.
bots_stats = {}



from kaggle_environments.envs.hungry_geese.hungry_geese import *
from random import choice, sample


def simple_agent(observation, configuration):
    """Greedy rule-based goose: step onto the safe neighbouring cell nearest to food.

    Args:
        observation: raw observation; wrapped in ``Observation``.
        configuration: raw configuration; wrapped in ``Configuration``.

    Returns:
        The ``name`` of the chosen ``Action``, or the result of
        ``random_agent()`` when no neighbouring cell passes the safety checks.
    """
    observation = Observation(observation)
    configuration = Configuration(configuration)
    rows, columns = configuration.rows, configuration.columns

    food = observation.food
    geese = observation.geese
    opponents = [
        goose
        for index, goose in enumerate(geese)
        if index != observation.index and len(goose) > 0
    ]

    # Don't move adjacent to any heads.
    # The board dimensions are passed as (columns, rows), the same order that
    # translate() receives below; with them swapped the neighbours are
    # computed using the wrong board width.
    head_adjacent_positions = {
        neighbour
        for opponent in opponents
        for neighbour in adjacent_positions(opponent[0], columns, rows)
    }
    # Don't move into any bodies (every segment except the tail).
    bodies = {position for goose in geese for position in goose[0:-1]}
    # Don't move into tails of geese whose head is adjacent to food: if that
    # goose eats, its tail does not move away.
    tails = {
        opponent[-1]
        for opponent in opponents
        if any(
            neighbour in food
            for neighbour in adjacent_positions(opponent[0], columns, rows)
        )
    }

    # Score every safe move by its distance to the closest food.
    position = geese[observation.index][0]
    actions = {
        action: min_distance(new_position, food, columns)
        for action in Action
        for new_position in [translate(position, action, columns, rows)]
        if (
            new_position not in head_adjacent_positions and
            new_position not in bodies and
            new_position not in tails
        )
    }

    # A non-empty dict means at least one safe move exists.
    if actions:
        return min(actions, key=actions.get).name

    # No safe move: fall back to a random one.
    return random_agent()

In [None]:
# The %%writefile cell above only writes simple_agent.py to disk; it does not
# define the function in this kernel, so import it before use.
from simple_agent import simple_agent

# Play one game with simple_agent in seat 0 against the built-in "random" agent.
trainer = env.train([None, "random"])
observation = trainer.reset()
while not env.done:
    my_action = simple_agent(observation, env.configuration)
    print("My Action", my_action)
    # trainer.step returns (observation, reward, done, info). Print the real
    # reward rather than the head cell index, which also raised IndexError
    # once our goose was removed from the board.
    observation, reward, _done, _info = trainer.step(my_action)
    print("Reward gained", reward)

In [None]:
# Draw the replay of the game that was just played.
env.render(mode="ipython",width=800, height=600)

In [None]:
def ActorModel(num_actions,in_):
    """Build the policy head on top of the tensor ``in_``.

    Two tanh Dense layers (128 then 32 units) followed by a softmax Dense
    layer with ``num_actions`` units.

    Returns:
        The output tensor of the final softmax layer.
    """
    hidden = in_
    for units in (128, 32):
        hidden = tf.keras.layers.Dense(units, activation='tanh')(hidden)
    return tf.keras.layers.Dense(num_actions, activation='softmax')(hidden)

In [None]:
def CriticModel(in_):
    """Build the value head on top of the tensor ``in_``.

    Two Dense layers (128 then 32 units), each followed by a ReLU layer, and
    a final single-unit Dense layer with no activation.

    Returns:
        The output tensor of the final Dense layer.
    """
    hidden = in_
    for units in (128, 32):
        hidden = tf.keras.layers.Dense(units)(hidden)
        hidden = tf.keras.layers.ReLU()(hidden)
    return tf.keras.layers.Dense(1)(hidden)

In [None]:
# Shared input for both heads: each sample is a (2, 1) tensor.
# NOTE(review): presumably one row per goose holding a single board cell —
# confirm against how the training cell builds its state tensor.
input_ = tf.keras.layers.Input(shape=[2, 1,])
# One Keras model with two outputs: the 4-way softmax from ActorModel and the
# single-unit output from CriticModel, both fed by the same input.
model = tf.keras.Model(inputs=input_, outputs=[ActorModel(4,input_),CriticModel(input_)])

In [None]:
# Adam optimizer used to update the actor-critic model.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
optimizer = tf.keras.optimizers.Adam(learning_rate=7e-4)

In [None]:
# Loss object applied to the critic's value estimates in the training cell.
huber_loss = tf.keras.losses.Huber()
# Buffers filled during training and cleared after every update.
action_probs_history = []
critic_value_history = []
rewards_history = []
# Progress counters updated by the training cell.
running_reward = 0
episode_count = 0
# Number of discrete actions the policy samples from.
num_actions = 4
# float32 machine epsilon; added to the std-dev denominator when returns are
# normalized so the division cannot be by zero.
eps = np.finfo(np.float32).eps.item()
gamma = 0.99  # Discount factor for past rewards
# Fresh environment plus a trainer for seat 0 against the built-in "random" agent.
env = make("hungry_geese", debug=True)
trainer = env.train([None,"random"])

In [None]:
# Encoder used to turn a sampled action id back into a move name.
# Gotcha: LabelEncoder sorts its classes, so ids follow alphabetical order
# (EAST=0, NORTH=1, SOUTH=2, WEST=3), not the order of the list below.
le = preprocessing.LabelEncoder()
label_encoded = le.fit_transform(['NORTH', 'EAST', 'SOUTH', 'WEST'])
# Show the ids assigned to the four names, in the order they were passed.
label_encoded

In [None]:
   
# Module-level bookkeeping: `geese_states` is a state registry that no visible cell uses yet, and `geese_` counts calls to update_L1().
geese_states = dict()
geese_ = 0


def update_L1():
    """Add one to the module-level ``geese_`` counter."""
    global geese_
    geese_ = geese_ + 1
    
def advanced_agent(obs, config, action=None):
    """Return the move name for our goose.

    When ``action`` is given it is treated as an integer id produced by the
    policy network and decoded with the module-level LabelEncoder ``le``; the
    decoded name is returned unchanged. This is exactly what the previous
    implementation did for every call, because its ``if act:`` guard tested a
    non-empty string and was therefore always true.

    When ``action`` is ``None`` the agent falls back to the greedy rule:
    move to the safe neighbouring cell closest to food, or to a random move
    when no neighbouring cell is safe.

    Args:
        obs: raw observation; wrapped in ``Observation``.
        config: raw configuration; wrapped in ``Configuration``.
        action: optional integer action id understood by ``le``.

    Returns:
        The move name as a ``str``.

    Side effects:
        Calls ``update_L1()`` once per successful call.
    """
    if action is not None:
        # Decode first so an unknown id raises before the call counter moves.
        act = str(le.inverse_transform([action])[0])
        update_L1()
        return act

    observation = Observation(obs)
    configuration = Configuration(config)
    rows, columns = configuration.rows, configuration.columns

    food = observation.food
    geese = observation.geese
    opponents = [
        goose
        for index, goose in enumerate(geese)
        if index != observation.index and len(goose) > 0
    ]

    # Don't move adjacent to any heads.
    # The board dimensions are passed as (columns, rows), the same order that
    # translate() receives below; with them swapped the neighbours are
    # computed using the wrong board width.
    head_adjacent_positions = {
        neighbour
        for opponent in opponents
        for neighbour in adjacent_positions(opponent[0], columns, rows)
    }
    # Don't move into any bodies (every segment except the tail).
    bodies = {position for goose in geese for position in goose[0:-1]}
    # Don't move into tails of geese whose head is adjacent to food: if that
    # goose eats, its tail does not move away.
    tails = {
        opponent[-1]
        for opponent in opponents
        if any(
            neighbour in food
            for neighbour in adjacent_positions(opponent[0], columns, rows)
        )
    }

    # Score every safe move by its distance to the closest food.
    position = geese[observation.index][0]
    actions = {
        candidate: min_distance(new_position, food, columns)
        for candidate in Action
        for new_position in [translate(position, candidate, columns, rows)]
        if (
            new_position not in head_adjacent_positions and
            new_position not in bodies and
            new_position not in tails
        )
    }
    update_L1()
    if actions:
        return min(actions, key=actions.get).name

    return random_agent()

In [None]:
# Re-create the trainer for seat 0 (the None entry) against the built-in
# "random" agent; this repeats the assignment made in the setup cell.
trainer = env.train([None, "random"])

In [None]:
# Actor-critic training: play one episode, then apply one gradient update.

# Hard cap on episodes so "Run All" terminates even if the solved threshold
# is never reached.
MAX_EPISODES = 1000
# Running-reward level at which training stops early.
SOLVED_RUNNING_REWARD = 550
# Divisor applied to each environment reward before it is stored.
# NOTE(review): the divisor and the threshold are inherited from the earlier
# version of this cell and were not tuned for the environment's own reward —
# revisit both.
REWARD_SCALE = 5000

for _ in range(MAX_EPISODES):
    # Start from empty buffers so an interrupted earlier run cannot leak
    # stale entries into this update.
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    state = trainer.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for _step in range(env.configuration.episodeSteps):
            # Encode the board as one head cell per goose (-1 once a goose is
            # gone). state.geese itself is ragged as soon as a goose grows or
            # dies, and a ragged list cannot be converted to the fixed
            # (2, 1) tensor the model expects.
            heads = [[goose[0] if goose else -1] for goose in state.geese]
            state_ = tf.expand_dims(tf.convert_to_tensor(heads, dtype=tf.float32), 0)

            # Predict action probabilities and the value estimate. The Dense
            # layers act on the last axis, so the outputs are (1, 2, 4) and
            # (1, 2, 1); row 0 is the one used for our goose.
            action_probs, critic_value = model(state_)
            critic_value_history.append(critic_value[0, 0, 0])

            # Sample an action and store the log-probability of that action.
            action = np.random.choice(num_actions, p=np.squeeze(action_probs)[0])
            action_probs_history.append(tf.math.log(action_probs[0, 0, action]))

            # Apply the sampled action in the environment.
            move = advanced_agent(state, env.configuration, action)
            state, reward, done, _info = trainer.step(move)
            # The environment may report no reward; count that as zero.
            gain = (reward or 0) / REWARD_SCALE
            rewards_history.append(gain)
            episode_reward += gain

            # One update per episode: stop collecting once the game is over.
            if done or env.done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Discounted return at every timestep (the critic's targets):
        # rewards further in the future are multiplied by more factors of gamma.
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)
        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(action_probs_history, critic_value_history, returns):
            # The critic estimated a future return of `value`; the action taken
            # had log-probability `log_prob` and the observed return was `ret`.
            # The actor is pushed towards actions that did better than the
            # critic expected.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic is pushed towards the observed return.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )
        loss_value = sum(actor_losses) + sum(critic_losses)

    # Backpropagation
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    if running_reward > SOLVED_RUNNING_REWARD:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

In [None]:
# Play one fresh game with the trained policy so it can be rendered below.
# Reset first: without it the loop either continues a half-played game or,
# if the last game already finished, does nothing at all.
state = trainer.reset()
while not env.done:
    # Same state encoding as during training: one head cell per goose,
    # -1 once a goose is gone, giving the (1, 2, 1) tensor the model expects.
    heads = [[goose[0] if goose else -1] for goose in state.geese]
    state_ = tf.expand_dims(tf.convert_to_tensor(heads, dtype=tf.float32), 0)
    # Only the policy output is needed here; nothing is recorded for training.
    action_probs, _critic_value = model(state_)
    # Row 0 of the squeezed (2, 4) output holds the probabilities for our goose.
    action = np.random.choice(num_actions, p=np.squeeze(action_probs)[0])
    move = advanced_agent(state, env.configuration, action)
    state = trainer.step(move)[0]

In [None]:
# Draw the replay of the game played by the trained policy.
env.render(mode="ipython",width=800, height=600)