In [None]:
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation,\
Configuration, Action, row_col, adjacent_positions, translate, min_distance,random_agent, GreedyAgent

from kaggle_environments import make
import numpy as np
from random import choice, shuffle

import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
from collections import deque
import time

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

import collections
import statistics
import tqdm
eps = np.finfo(np.float32).eps.item()

In [None]:
# ACTIONS = [e.name for e in Action]
# Move names in the order used to index them (action index -> move name).
ACTIONS = ['NORTH', 'SOUTH', 'WEST', 'EAST']

env = make("hungry_geese")
display(env.reset())

# Trainer in which seat 0 (None) is driven by us and the other three seats
# are filled with the built-in "greedy" agent.
trainer = env.train([None, "greedy", "greedy", "greedy"])
obs = trainer.reset()
PREV_GEESE = obs['geese']

TRAINER = trainer

display(PREV_GEESE)

seed = 42
# env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)
# `shuffle` comes from the stdlib `random` module, which has its own
# generator; seed it as well so player seating is reproducible.
random.seed(seed)

In [None]:
class ActorCritic(tf.keras.Model):
    """Shared-trunk network with a policy head and a value head.

    Three Conv2D -> BatchNormalization -> ReLU stages are followed by
    Flatten, Dense(128) -> BatchNormalization -> ReLU, and two linear heads
    that both read the same 128-unit feature vector.
    """

    def __init__(self, num_actions: int = 4):
        """Create all layers.

        Args:
            num_actions: number of units in the actor head.
        """
        super().__init__()

        # Convolutional trunk (no padding argument is given, so the Keras
        # default applies).
        self.conv1 = tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 5))
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.conv2 = tf.keras.layers.Conv2D(filters=256, kernel_size=(3, 3))
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.conv3 = tf.keras.layers.Conv2D(filters=128, kernel_size=(3, 3))
        self.bn3 = tf.keras.layers.BatchNormalization()

        self.flatter = tf.keras.layers.Flatten()

        # Fully connected layer shared by both heads.
        self.fc1 = tf.keras.layers.Dense(128)
        self.bn4 = tf.keras.layers.BatchNormalization()

        # Output heads: unnormalised action scores and a scalar value.
        self.actor = tf.keras.layers.Dense(num_actions)
        self.critic = tf.keras.layers.Dense(1)

        # One ReLU layer object reused after every normalisation.
        self.relu = tf.keras.layers.ReLU()

    def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        """Forward pass.

        Args:
            inputs: batch of encoded boards.

        Returns:
            Tuple ``(actor output, critic output)``; the actor output is not
            passed through a softmax here.
        """
        hidden = inputs
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in conv_stages:
            hidden = self.relu(norm(conv(hidden)))

        flat = self.flatter(hidden)
        common = self.relu(self.bn4(self.fc1(flat)))

        policy_scores = self.actor(common)
        state_value = self.critic(common)
        return policy_scores, state_value

In [None]:
# Instantiate the network, build it for inputs of shape (batch, 7, 11, 17)
# and print the layer summary.
model = ActorCritic()
model.build(input_shape=(None, 7, 11, 17))
model.summary()

In [None]:
def centerize(b):
    """Cyclically shift a board stack so the first marked cell of channel 0 is centred.

    The board size is taken from ``b`` itself, so any ``(channels, rows,
    cols)`` array is accepted; for a 7x11 board the target cell is row 3,
    column 5.

    Args:
        b: array of shape ``(channels, rows, cols)``. Channel 0 is scanned
            for non-zero cells and the first one found (row-major order)
            becomes the anchor.

    Returns:
        A new array of the same shape with every channel rolled by the same
        offsets (wrapping around both axes). If channel 0 has no non-zero
        cell, ``b`` itself is returned unchanged.
    """
    rows, cols = np.where(b[0])
    if len(rows) == 0:
        # Nothing to anchor on.
        return b

    n_rows, n_cols = b.shape[1], b.shape[2]
    # Output row i is read from source row (i - centre + anchor) mod n_rows,
    # which places the anchor row at the centre index; same for columns.
    row_order = (np.arange(n_rows) - n_rows // 2 + rows[0]) % n_rows
    col_order = (np.arange(n_cols) - n_cols // 2 + cols[0]) % n_cols

    b = b[:, row_order, :]
    b = b[:, :, col_order]
    return b

def make_state(obs, prev_obs):
    """Encode an observation as a (7, 11, 17) float32 array.

    Goose slots are rotated so that slot 0 is the goose at ``obs['index']``.
    Channel layout before the final transpose:
        0-3    head cell of each goose
        4-7    last cell (tail tip) of each goose
        8-11   every cell occupied by each goose
        12-15  head cell of each goose in ``prev_obs`` (left empty when
               ``prev_obs`` is falsy)
        16     food cells
    The stack is then recentred with ``centerize`` and moved to
    channels-last order.

    Args:
        obs: mapping with ``'geese'`` (list of cell-index lists), ``'index'``
            and ``'food'``.
        prev_obs: observation of the previous step, or a falsy value.

    Returns:
        ``np.ndarray`` of shape (7, 11, 17).
    """
    board = np.zeros((17, 7 * 11), dtype=np.float32)

    def slot(goose_id):
        # Slot of a goose relative to our own index.
        return (goose_id - obs['index']) % 4

    for goose_id, cells in enumerate(obs['geese']):
        if len(cells) > 0:
            board[0 + slot(goose_id), cells[0]] = 1    # head
            board[4 + slot(goose_id), cells[-1]] = 1   # tail tip
        for cell in cells:                             # whole body
            board[8 + slot(goose_id), cell] = 1

    # Heads from the previous step.
    if prev_obs:
        for goose_id, cells in enumerate(prev_obs['geese']):
            if len(cells) > 0:
                board[12 + slot(goose_id), cells[0]] = 1

    for cell in obs['food']:
        board[16, cell] = 1

    stacked = centerize(board.reshape(-1, 7, 11))
    return np.transpose(stacked, (1, 2, 0))

In [None]:
class RLAgent:
    """Agent that picks a move from the outputs of an actor-critic network."""

    def __init__(self, net, stochastic):
        """Store the network and the action-selection mode.

        Args:
            net: callable invoked as ``net(batch, training=False)`` returning
                ``(action scores, value)``.
            stochastic: when truthy, sample the action from the softmax
                distribution; otherwise take the argmax.
        """
        self.prev_obs = None
        self.net = net
        self.stochastic = stochastic

    def raw_outputs(self, state):
        """Run the network on a single state and choose an action.

        Args:
            state: one encoded board (a batch axis is added here).

        Returns:
            Tuple ``(action, probability of that action, value)``. The action
            is a NumPy integer in stochastic mode and a tensor from
            ``tf.math.argmax`` otherwise.
        """
        batch = tf.expand_dims(state, 0)
        scores, value = self.net(batch, training=False)

        probs = tf.squeeze(tf.nn.softmax(scores))
        value = tf.squeeze(value)

        if not self.stochastic:
            action = tf.math.argmax(probs)
        else:
            action = np.random.choice(range(4), p=probs.numpy())

        return action, probs[action], value

    def __call__(self, observation, configuration):
        """Agent entry point: return the move name for ``observation``.

        The remembered previous observation is cleared when
        ``observation['step']`` is 0. ``configuration`` is not used.
        """
        if observation['step'] == 0:
            self.prev_obs = None

        encoded = make_state(observation, self.prev_obs)
        action = self.raw_outputs(encoded)[0]
        self.prev_obs = observation
        return ACTIONS[action]

In [None]:
# Smoke test: build a fresh network and run a single forward pass through an
# RLAgent on the trainer's reset observation.
model = ActorCritic()
model.build(input_shape=(None, 7, 11, 17))

player = RLAgent(model, True)

opponent = RLAgent(model, True)

# NOTE(review): `[opponent]*3` puts the SAME RLAgent object in three seats, so
# all three would share one `prev_obs` attribute — confirm this is intended if
# this list is ever used to play actual steps.
players = [None]+[opponent]*3

trainer = env.train(players)
obs = trainer.reset()

# The same observation is passed as both current and previous observation.
player.raw_outputs(make_state(obs, obs))

In [None]:
# trainer.reset()
# for x in range(30):
#     obs, r, d, i = trainer.step(ACTIONS[1])
#     if d:
#         display(x)
#         break
#     env.render(mode="ipython", width=500, height=450)

In [None]:
def get_rank(obs, prev_obs):
    """Placing of our goose at the end of a game (1 is best).

    If our goose still has cells in ``obs``, its rank is the number of geese
    in ``obs`` that are at least as long as it (itself included). Otherwise
    lengths are taken from ``prev_obs``: the rank is the number of geese that
    are empty in ``obs`` and were at least as long as ours in ``prev_obs``,
    plus the number of geese still present in ``obs``.

    Args:
        obs: final observation, with ``'geese'`` and ``'index'``.
        prev_obs: the observation before it, with the same keys.

    Returns:
        Integer rank.
    """
    final_geese = obs['geese']
    me = obs['index']
    my_final_len = len(final_geese[me])
    alive = [i for i, goose in enumerate(final_geese) if len(goose) > 0]

    if me in alive:
        return sum(1 for goose in final_geese if len(goose) >= my_final_len)

    # We did not survive: compare against the other casualties using the
    # lengths from the step before.
    earlier_geese = prev_obs['geese']
    my_earlier_len = len(earlier_geese[prev_obs['index']])
    not_shorter_casualties = sum(
        1
        for i, goose in enumerate(earlier_geese)
        if i not in alive and len(goose) >= my_earlier_len
    )
    return not_shorter_casualties + len(alive)
    
def get_rewards(env_reward, obs, prev_obs, done):
    """Shaped reward for one transition.

    Terminal step (``done`` truthy): a rank-based reward of 200 / -50 / -150 /
    -200 for ranks 1-4 (from ``get_rank``), with an extra -200 when the
    previous step satisfied ``(step + 1) % 40 == 0`` while our goose had
    exactly one cell.

    Non-terminal step: ``max(0.5 * (env_reward - 1), 0)``, after first
    subtracting 1 from ``env_reward`` when the previous step was step 0.

    Args:
        env_reward: reward reported by the environment for this step.
        obs: observation after the step.
        prev_obs: observation before the step (``'geese'``, ``'index'``,
            ``'step'`` are read).
        done: whether the episode ended on this step.

    Returns:
        The shaped reward.
    """
    geese = prev_obs['geese']
    index = prev_obs['index']
    step = prev_obs['step']

    if not done:
        if step == 0:
            env_reward -= 1  # the first step is treated as a special case
        return max(0.5 * (env_reward - 1), 0)

    rank_rewards = (200, -0.25 * 200, -0.75 * 200, -1 * 200)
    reward = rank_rewards[get_rank(obs, prev_obs) - 1]

    died_from_hunger = ((step + 1) % 40 == 0) and (len(geese[index]) == 1)
    if died_from_hunger:
        reward += -200
    return reward

In [None]:
def greedy_action(model, observation, state):
    """Pick the move chosen by the built-in GreedyAgent and score it with the model.

    One GreedyAgent instance is kept on the function object and reused
    for the whole episode, so that any state the agent keeps between moves
    survives from one call to the next. A fresh instance is created when the
    observation's ``'step'`` is 0 or unavailable.
    NOTE(review): the upstream GreedyAgent is understood to remember its last
    action to avoid reversing — confirm against the installed
    kaggle_environments version; reusing the instance is harmless otherwise.

    Args:
        model: network invoked as ``model(batch, training=False)`` returning
            ``(action scores, value)``.
        observation: raw environment observation for our goose.
        state: encoded board for the same observation.

    Returns:
        Tuple ``(action index into ACTIONS, model probability of that action,
        model value estimate)``.
    """
    step = observation.get('step') if hasattr(observation, 'get') else None
    agent = getattr(greedy_action, '_agent', None)
    if agent is None or not step:
        # New episode (or step unknown): start from a clean greedy agent.
        agent = GreedyAgent(Configuration({'rows': 7, 'columns': 11}))
        greedy_action._agent = agent

    action = ACTIONS.index(agent(Observation(observation)))

    # Score the greedy move with the current network.
    state = tf.expand_dims(state, 0)
    logits, values = model(state, training=False)
    probs = tf.squeeze(tf.nn.softmax(logits))
    values = tf.squeeze(values)

    return action, probs[action], values
    

In [None]:
def run_episode(
    model,
    player,
    env,
    players,
    data_buffer,
    gamma,
    episode
    ):
    """Play one game and append its transitions to ``data_buffer``.

    For ``episode`` below 50 the game is played against three built-in
    ``'greedy'`` agents and our moves come from ``greedy_action``; afterwards
    ``players`` is shuffled in place, used as the seating, and moves come
    from ``player.raw_outputs``.

    Args:
        model: network used by ``greedy_action`` during the first episodes.
        player: agent exposing ``raw_outputs(state)``.
        env: environment exposing ``train(agents)``.
        players: seating list passed to ``env.train`` (reordered in place).
        data_buffer: dict of lists with keys ``'states'``, ``'logits'``,
            ``'values'``, ``'actions'``, ``'rewards'``, ``'returns'``,
            ``'dones'``; extended in place.
        gamma: discount factor forwarded to ``get_expected_return``.
        episode: index of the current training iteration.

    Returns:
        The same ``data_buffer`` object.
    """
    GREEDY_EPISODES = 50
    imitate_greedy = episode < GREEDY_EPISODES

    if imitate_greedy:
        trainer = env.train([None, 'greedy', 'greedy', 'greedy'])
    else:
        shuffle(players)
        trainer = env.train(players)

    observation = trainer.reset()
    state = make_state(observation, None)

    step_rewards = []
    step_dones = []
    done = False

    while not done:
        prev_observation = observation

        if imitate_greedy:
            action, logit, value = greedy_action(model, observation, state)
        else:
            action, logit, value = player.raw_outputs(state)

        # Record the decision before stepping the environment.
        for key, item in (('states', state), ('logits', logit),
                          ('values', value), ('actions', action)):
            data_buffer[key].append(item)

        observation, reward, done, _ = trainer.step(ACTIONS[action])
        step_rewards.append(get_rewards(reward, observation, prev_observation, done))
        step_dones.append(done)

        state = make_state(observation, prev_observation)

    data_buffer['rewards'] += step_rewards
    data_buffer['returns'] += get_expected_return(step_rewards, gamma)
    data_buffer['dones'] += step_dones

    return data_buffer

In [None]:
def collect_data(
    model,
    env,
    min_data_threshold,
    gamma,
    episode,
                ):
    """Play whole episodes until at least ``min_data_threshold`` states are stored.

    A sampling ``RLAgent`` is created for our seat and three argmax
    ``RLAgent`` opponents for the other seats, all sharing ``model``.

    Args:
        model: network shared by every agent.
        env: environment forwarded to ``run_episode``.
        min_data_threshold: minimum number of collected states.
        gamma: discount factor forwarded to ``run_episode``.
        episode: training iteration index forwarded to ``run_episode``.

    Returns:
        Dict of lists keyed by ``'states'``, ``'actions'``, ``'logits'``,
        ``'values'``, ``'rewards'``, ``'dones'``, ``'returns'``.
    """
    buffer_keys = ('states', 'actions', 'logits', 'values',
                   'rewards', 'dones', 'returns')
    data_buffer = {key: [] for key in buffer_keys}

    player = RLAgent(model, stochastic=True)
    opponents = [RLAgent(model, stochastic=False) for _ in range(3)]

    while len(data_buffer['states']) < min_data_threshold:
        # A new seating list is built for every episode.
        seating = [None] + opponents
        run_episode(model, player, env, players=seating,
                    data_buffer=data_buffer, gamma=gamma, episode=episode)

    return data_buffer
        

In [None]:
def get_expected_return(
    rewards,
    gamma: float, 
    standardize: bool = False) :
    """Discounted return for every timestep of one episode.

    ``returns[t] = rewards[t] + gamma * returns[t + 1]``, with the return
    after the last step taken as 0.

    Args:
        rewards: per-step rewards in chronological order.
        gamma: discount factor.
        standardize: when true, subtract the mean and divide by the standard
            deviation plus the module-level ``eps``.

    Returns:
        List of returns, same length as ``rewards``.
    """
    running = 0.0
    backwards = []

    # Walk the episode from the last step to the first.
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running
        backwards.append(running)

    backwards.reverse()
    returns = np.array(backwards)

    if standardize:
        centre = np.mean(returns)
        spread = np.std(returns)
        returns = (np.array(returns) - centre) / (spread + eps)

    return returns.tolist()

In [None]:
# Huber loss summed (not averaged) over the batch; used for the critic term.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,
    action_probs_old: tf.Tensor,
    values: tf.Tensor,  
    advantage: tf.Tensor,
    action_indices: tf.Tensor,
    returns:tf.Tensor,
    actions_one_hot:tf.Tensor,
    clip_ratio: float = 1e-30) -> tf.Tensor:
    """Combined actor-critic loss.

    actor term  = -sum(log(action_probs) * (returns - values))
    critic term = summed Huber loss between ``values`` and ``returns``

    Only ``action_probs``, ``values`` and ``returns`` are read. The
    ``advantage`` passed in is not used (it is recomputed here), and
    ``action_probs_old``, ``action_indices``, ``actions_one_hot`` and
    ``clip_ratio`` are not used at all.

    Args:
        action_probs: probabilities of the actions that were taken.
        values: critic estimates for the same steps.
        returns: discounted returns for the same steps.

    Returns:
        Scalar tensor: critic term plus actor term.
    """
    # Recomputed from the current critic output.
    current_advantage = returns - values

    log_probs = tf.math.log(action_probs)
    actor_loss = -tf.math.reduce_sum(log_probs * current_advantage)

    critic_loss = huber_loss(values, returns)

    return critic_loss + actor_loss

class GeeseLoss(tf.keras.losses.Loss):
    """Keras ``Loss`` wrapper that unpacks its inputs and defers to ``compute_loss``."""

    def call(self, y_true, y_pred):
        """Compute the actor-critic loss.

        NOTE(review): the nesting read here is
        ``y_pred = (logits, values)`` and
        ``y_true = (old_logits, (advantage, actions, returns))`` — confirm
        this matches how the targets are packed wherever this loss is used.

        Returns:
            The value returned by ``compute_loss``.
        """
        logits = y_pred[0]
        values = y_pred[1]

        logit_old = y_true[0]
        advantage = y_true[1][0]

        actions = y_true[1][1]
        returns = y_true[1][2]

        # `compute_loss` has a required seventh positional parameter
        # (`actions_one_hot`); leaving it out raises TypeError. Depth 4
        # matches the four entries of ACTIONS.
        actions_one_hot = tf.one_hot(tf.cast(actions, tf.int32), 4)

        return compute_loss(logits, logit_old, values, advantage, actions,
                            returns, actions_one_hot)

In [None]:
def model_fit(X, Y, batch_size, epochs, optimizer):
    """Custom training loop over the collected transitions.

    The data is split into full batches only (``drop_remainder=True``). For
    each batch the network is run in training mode, the probability of each
    taken action is gathered, and one optimizer step is applied to the value
    of ``compute_loss``. The summed loss is printed once per epoch.

    Note: the network is the module-level ``model``; it is not a parameter
    of this function.

    Args:
        X: array of encoded states.
        Y: tuple ``(old action probabilities, advantages, actions, returns)``
            with one entry per state.
        batch_size: number of samples per batch.
        epochs: number of passes over the data.
        optimizer: Keras optimizer used for the updates.
    """
    batches = tf.data.Dataset.from_tensor_slices((X, Y)).batch(
        batch_size, drop_remainder=True)

    for epoch in range(epochs):
        epoch_loss = 0.0

        for x_batch, y_batch in batches:
            logits_old, advantages, actions, returns = y_batch

            # (row, action) pairs used to pick each sample's taken action.
            row_action_pairs = [[row, act] for row, act in enumerate(actions)]
            action_one_hot = tf.one_hot(actions, 4)

            # Everything inside the tape is recorded for differentiation.
            with tf.GradientTape() as tape:
                scores, raw_values = model(x_batch, training=True)
                probs = tf.nn.softmax(scores)
                flat_values = tf.squeeze(raw_values)

                action_indices = tf.cast(row_action_pairs, tf.int32)
                taken_probs = tf.gather_nd(probs, action_indices)

                # Give all three a trailing axis so their shapes line up.
                action_probs = tf.expand_dims(taken_probs, 1)
                values = tf.expand_dims(flat_values, 1)
                returns = tf.expand_dims(returns, 1)

                loss_value = compute_loss(
                    action_probs, logits_old, values, advantages,
                    action_indices, returns, action_one_hot)

            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

            epoch_loss += float(loss_value)

        print(f'Epoch: {epoch} || losses : {epoch_loss}')

In [None]:
def train_model(data_buffer, model, optimizer):
    """Turn a data buffer into arrays and run ``model_fit`` on them.

    Training uses a batch size of 256 for 2 epochs. ``model`` is accepted
    here but is not forwarded to ``model_fit``.

    Args:
        data_buffer: dict with ``'states'``, ``'logits'``, ``'values'``,
            ``'actions'`` and ``'returns'`` lists of equal length.
        model: not used inside this function.
        optimizer: Keras optimizer forwarded to ``model_fit``.
    """
    states = np.array(data_buffer['states'])
    advantages = np.array(data_buffer['returns']) - np.array(data_buffer['values'])

    # Target order expected by model_fit:
    # (old probabilities, advantages, actions, returns).
    targets = (
        np.array(data_buffer['logits'], dtype=np.float32),
        np.array(advantages, dtype=np.float32),
        np.array(data_buffer['actions'], dtype=np.int32),
        np.array(data_buffer['returns'], dtype=np.float32),
    )

    batch_size, epochs = 256, 2
    model_fit(states, targets, batch_size, epochs, optimizer)
    
    
#     model.fit(X, Y, batch_size=batch_size, verbose=1, shuffle=True, epochs=1)
         
         
    

In [None]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


def train_step(
    model: tf.keras.Model,
    optimizer,
    env,
    gamma: float, 
    min_data_threshold: int,
    episode: int) -> tf.Tensor:
    """One training iteration: collect data, then fit on it.

    Args:
        model: network used to act and to be trained.
        optimizer: Keras optimizer.
        env: environment to play in.
        gamma: discount factor.
        min_data_threshold: minimum number of states to collect.
        episode: index of this iteration.

    Returns:
        ``np.sum`` of every reward collected during this iteration (which may
        span several games).
    """
    rollout = collect_data(model, env, min_data_threshold, gamma, episode)
    train_model(rollout, model, optimizer)
    return np.sum(rollout['rewards'])

In [None]:
%%time

min_episodes_criterion = 1000
max_episodes = 1000
max_steps_per_episode = 100
min_data_threshold = 1030

# Training stops early once the mean of the retained per-iteration rewards
# exceeds this value AND at least `min_episodes_criterion` iterations ran.
# NOTE(review): `i` only goes up to max_episodes - 1, so with
# max_episodes == min_episodes_criterion the early stop can never trigger.
reward_threshold = 1000
running_reward = 0

# Discount factor for future rewards
gamma = 0.8
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
# optimizer=tf.keras.optimizers.SGD(learning_rate=0.001)

model = ActorCritic(4)
# model.compile(loss=GeeseLoss(), optimizer=tf.keras.optimizers.Adam(lr=0.0004), metrics=None)
# Rewards of the most recent iterations.
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

# Set only when the loop exits through the reward criterion, so the final
# message does not claim success after simply running out of iterations.
solved = False

with tqdm.trange(max_episodes) as t:
    for i in t:
        episode_reward = int(train_step(model, optimizer, env, gamma, min_data_threshold, episode=i))

        episodes_reward.append(episode_reward)
        running_reward = statistics.mean(episodes_reward)

        t.set_description(f'Episode {i}')
        t.set_postfix(
            episode_reward=episode_reward, running_reward=running_reward)

        # Checkpoint every 50 iterations.
        if i % 50 == 0:
            model.save('my_model', save_format='tf')

        if running_reward > reward_threshold and i >= min_episodes_criterion:
            solved = True
            break

if solved:
    print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')
else:
    print(f'\nStopped after episode {i} without reaching the reward threshold: '
          f'average reward: {running_reward:.2f}')

In [None]:
# Save the model to the 'my_model' directory in TensorFlow SavedModel format.
model.save('my_model', save_format='tf')
# model = keras.models.load_model('my_model')


In [None]:
# Demo game: four separate argmax (stochastic=False) agents that all share
# the same `model`, rendered inline.
env.reset()
players = [RLAgent(model, False) for _ in range(4)]
env.run(players)
env.render(mode="ipython", width=500, height=450)

In [None]:
# Scratch check of tf.random.categorical. The function expects unnormalised
# log-probabilities (logits), so the probabilities in `a` are converted with
# a log first; passing them directly would sample from softmax(a) instead.
a = np.array([[0.1, 0.4, 0.3,0.2]])
x = tf.random.categorical(tf.math.log(a), 1)
print(x)
print(x[0,0])

In [None]:
# Scratch check of tf.gather_nd: each row of `a` is a (row, column) index
# into `b`, so `c` holds one element of `b` per row of `a`.
a = tf.convert_to_tensor(np.array([[0,2], [1,0], [2,3]]))
b = tf.convert_to_tensor(np.array([[1,2,0.3,4], [4,5,6,7], [8,9,10,11]]))

c = tf.gather_nd(b, a)
# Prints the element-wise difference of `c` with itself.
print(c-c)