In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Kaggle environments.
!git clone https://github.com/Kaggle/kaggle-environments.git
!cd kaggle-environments && pip install .

# GFootball environment.
!apt-get update -y
!apt-get install -y libsdl2-gfx-dev libsdl2-ttf-dev

# Make sure that the Branch in git clone and in wget call matches !!
!git clone -b v2.5 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib

!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.5.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

In [None]:
# We will define this magic to run and write the cell at the same time
# This will facilitate the generation of submission file
from IPython.core.magic import register_cell_magic
@register_cell_magic
def write_and_run(line, cell):
    """Cell magic that saves the cell source to a file and then executes it.

    Usage: ``%%write_and_run [-a] <path>``

    Args:
        line: the magic's argument string. The last token is the target file;
            when there are exactly two tokens and the first is ``-a`` the cell
            is appended, otherwise the file is overwritten.
        cell: the cell source, written verbatim and then run in the kernel.

    Raises:
        ValueError: if no target file is given.
    """
    argz = line.split()
    if not argz:
        raise ValueError("write_and_run: missing target file (usage: %%write_and_run [-a] <path>)")
    file = argz[-1]
    mode = 'a' if len(argz) == 2 and argz[0] == '-a' else 'w'
    with open(file, mode) as f:
        f.write(cell)
        # Keep successive appended cells on separate lines even if a cell
        # does not end with a newline.
        if not cell.endswith('\n'):
            f.write('\n')
    get_ipython().run_cell(cell)

In [None]:
import gym
from kaggle_environments import make
from typing import Tuple, Any
from tensorflow.keras import backend as K
import os
import threading
import uuid
from queue import Queue
import dill
from scipy.signal import lfilter
from threading import Lock
import json
from typing import List
import multiprocessing as mp
import time
import seaborn as sns

In [None]:
! mkdir -p /kaggle_simulations/agent/

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
import zlib

## PPO Agent

PPO is known for its ease of use and good results, so I hope it will know how to play football.

We're going to build 3 modules:
a more convenient environment wrapper to support parallel episode collection,
a transformer with multi-head self-attention layers that will help to embed the players' units, and finally the PPO agent.

## Agent structure

![diagrams](https://raw.githubusercontent.com/tchaye59/GRFootball/main/diagrams.jpg)

## FootEnv : custom environment Wrapper
We use this wrapper to preprocess the GFootball environment players_raw data.

* **units** : Information on both left and right side players is parsed as units. 
* **scalars** : Contain other information.

In [None]:
right_agent_path = '/kaggle/input/gfootball-template-bot/submission.py'

In [None]:
# FootEnv: 
class FootEnv(gym.Env):
    """Gym-style wrapper around the kaggle-environments "football" environment.

    The wrapper controls the first (``None``) agent slot and lets
    ``right_agent`` play the other side. Raw observations are converted by
    ``OBSParser.parse`` into a ``(units, scalars)`` state.

    Args:
        right_agent: opponent passed to ``env.train`` next to the controlled slot.
        env_id: identifier only used for logging by the collectors.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, right_agent=right_agent_path, env_id=0):
        super(FootEnv, self).__init__()
        self.env_id = env_id
        self.agents = [None, right_agent]  # We will step on the None agent
        self.env = make("football", configuration={"save_video": False,
                                                   "scenario_name": "11_vs_11_kaggle",
                                                   "running_in_notebook": True})
        # Created by reset(); step() cannot be used before that.
        self.trainer = None

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        ``info`` is extended with the current ``l_score`` and ``r_score``;
        the returned reward is the custom reward computed by ``OBSParser``.

        Raises:
            RuntimeError: if called before ``reset()``.
        """
        if self.trainer is None:
            raise RuntimeError("FootEnv.step() called before reset()")
        obs, reward, done, info = self.trainer.step([action])
        obs = obs['players_raw'][0]
        state, (l_score, r_score, custom_reward) = OBSParser.parse(obs)
        info['l_score'] = l_score
        info['r_score'] = r_score
        return state, custom_reward, done, info

    def reset(self):
        """Start a new episode and return the parsed initial state."""
        self.trainer = self.env.train(self.agents)
        obs = self.trainer.reset()
        obs = obs['players_raw'][0]
        state, _ = OBSParser.parse(obs)
        return state

    def render(self, **kwargs):
        """Forward rendering to the wrapped environment."""
        return self.env.render(**kwargs)

    def close(self):
        """No resources to release."""
        pass

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py
# OBSParser : used to parse observation
class OBSParser(object):
    """Turns one raw player observation into network inputs."""

    @staticmethod
    def parse(obs):
        """Split ``obs`` into per-player units and global scalars.

        Returns:
            ``((units, scalars), (l_score, r_score, reward))`` where ``units``
            has shape ``(1, n_left + n_right, 8)``, ``scalars`` has shape
            ``(1, n_scalars)`` (both float32) and ``reward`` is the score
            difference ``l_score - r_score``.
        """

        def team_units(side):
            # One row per feature, transposed to one row per player:
            # x, y, dx, dy, tired factor, yellow card, active, role.
            positions = obs[side + '_team']
            directions = obs[side + '_team_direction']
            feature_rows = [
                [p[0] for p in positions],
                [p[1] for p in positions],
                [d[0] for d in directions],
                [d[1] for d in directions],
                obs[side + '_team_tired_factor'],
                obs[side + '_team_yellow_card'],
                obs[side + '_team_active'],
                obs[side + '_team_roles'],
            ]
            return np.r_[feature_rows].T

        # left players first, then right players
        stacked = np.concatenate([team_units('left'), team_units('right')], axis=0)
        units = stacked.astype(np.float32)

        # one-hot encoding of the game mode (7 slots)
        mode_one_hot = [0] * 7
        mode_one_hot[obs['game_mode']] = 1

        flat = []
        flat.extend(obs['ball'])
        flat.extend(obs['ball_direction'])
        flat.extend(obs['ball_rotation'])
        flat.append(obs['ball_owned_team'])
        flat.append(obs['ball_owned_player'])
        flat.extend(obs['score'])
        flat.append(obs['steps_left'])
        flat.extend(mode_one_hot)
        flat.extend(obs['sticky_actions'])
        scalars = np.asarray(flat).astype(np.float32)

        # current score and the reward derived from it
        l_score, r_score = obs['score']
        reward_info = (l_score, r_score, l_score - r_score)

        # add the leading batch axis expected by the models
        return (units[np.newaxis, :], scalars[np.newaxis, :]), reward_info

In [None]:
# Environment factory: handy when several collectors each need their own environment.
def env_fn(env_id=1,right_agent=right_agent_path):
    """Build a fresh ``FootEnv`` with the given id and right-side agent."""
    new_env = FootEnv(right_agent=right_agent, env_id=env_id)
    return new_env

Let's test **FootEnv** and **OBSParser**:

In [None]:
# Smoke test: play at most five steps with a fixed action and print what comes back.
env = env_fn()
state = env.reset()
done = False
i = 0
for i in range(1, 6):
    state, reward, done, info = env.step(5)
    print('reward ', reward, info)
    if done:
        break
print(f"Units shape {state[0].shape}, Scalars shape {state[1].shape}")

## TransformerBlock & MultiHeadSelfAttention:

I'm trying to replicate the Entity encoder part of the paper [Grandmaster level in StarCraft II using multi-agent reinforcement learning](https://www.nature.com/articles/s41586-019-1724-z.epdf).
They use a Transformer architecture to embed the StarCraft game entities. We will do the same for players' units.



[The core idea behind the Transformer model is self-attention—the ability to attend to different positions of the input sequence to compute a representation of that sequence.  
It makes no assumptions about the temporal/spatial relationships across the data. This is ideal for processing a set of objects (for example, StarCraft units).](https://www.tensorflow.org/tutorials/text/transformer)

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py
"""
## Implement multi head self attention as a Keras layer
"""
class MultiHeadSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: size of the input/output embedding; must be divisible by
            ``num_heads``.
        num_heads: number of attention heads.

    Raises:
        ValueError: if ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = keras.layers.Dense(embed_dim)
        self.key_dense = keras.layers.Dense(embed_dim)
        self.value_dense = keras.layers.Dense(embed_dim)
        self.combine_heads = keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Unmasked attention; same as ``scaled_dot_product_attention`` with no mask."""
        return self.scaled_dot_product_attention(query, key, value, None)

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Return ``(output, attention_weights)`` for the given q/k/v.

        When ``mask`` is given, ``mask * -1e9`` is added to the logits before
        the softmax.
        """
        # (..., seq_len_q, seq_len_k)
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # scale matmul_qk by sqrt(depth of the keys)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # add the mask to the scaled tensor.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
        # add up to 1.
        attention_weights = tf.nn.softmax(
            scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

        output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

        return output, attention_weights

    def separate_heads(self, x, batch_size):
        """Reshape ``(batch, seq, embed)`` to ``(batch, heads, seq, projection_dim)``."""
        x = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, training=None, mask=None):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        # each becomes (batch_size, num_heads, seq_len, projection_dim)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        attention, weights = self.scaled_dot_product_attention(
            query, key, value, mask)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

    def get_config(self):
        """Return only the constructor arguments.

        Sub-layers are rebuilt by ``__init__``; putting layer objects in the
        config makes it non-serialisable and unusable for re-creating the layer.
        """
        config = super(MultiHeadSelfAttention, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
        })
        return config


"""
## Implement a Transformer block as a layer
"""
class TransformerBlock(keras.layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two stages applies dropout, a residual connection and layer
    normalisation.

    Args:
        embed_dim: embedding size of the inputs and outputs.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate used after attention and after the feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so get_config can describe how to rebuild the block.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"),
             keras.layers.Dense(embed_dim), ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None, mask=None):
        # Pass training/mask by keyword so they cannot be mistaken for extra inputs.
        attn_output = self.att(inputs, training=training, mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return only the constructor arguments (sub-layers are rebuilt by ``__init__``)."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

## UnitsEncoder
Now we prepare the `UnitsEncoder` layer, which embeds the players' units.

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py
class UnitsEncoder(keras.layers.Layer):
    """Embeds the per-player unit features with one Transformer block.

    Inputs are layer-normalised, projected to ``embed_dim`` by a dense layer
    and then passed through a ``TransformerBlock``.

    Args:
        embed_dim: size of each unit embedding.
        num_heads: attention heads of the transformer block.
        ff_dim: hidden size of the block's feed-forward network.
        rate: dropout rate of the block.
        name: optional layer name.
        **kwargs: forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, embed_dim=2 ** 7, num_heads=8, ff_dim=128, rate=0.0, name=None, **kwargs):
        # Forward extra options as keywords; unpacking the dict with a single
        # star would pass its keys as positional arguments.
        super(UnitsEncoder, self).__init__(name=name, **kwargs)
        self.supports_masking = True
        self.embed_dim = embed_dim
        # Kept so get_config can describe how to rebuild the layer.
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.dense = keras.layers.Dense(embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, rate)
        self.layernorm = keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, training=None, mask=None):
        # The attention code multiplies the mask, so it must share the input dtype.
        if mask is not None and mask.dtype != inputs.dtype:
            mask = tf.cast(mask, inputs.dtype)
        inputs = self.layernorm(inputs)
        inputs = self.dense(inputs)
        return self.transformer_block(inputs, training=training, mask=mask)

    def get_output_shape_for(self, input_shape):
        # Same result as compute_output_shape; presumably kept for an older Keras API.
        return input_shape[0], input_shape[1], self.embed_dim

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[1], self.embed_dim

    def get_config(self):
        """Return only the constructor arguments (sub-layers are rebuilt by ``__init__``)."""
        config = super(UnitsEncoder, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

### Define used variables

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py

# PPO clip range epsilon: the probability ratio is clipped to [1 - eps, 1 + eps].
# A value of 2 would give the range [-1, 3], i.e. effectively no clipping.
LOSS_CLIPPING = 0.2
ENTROPY_LOSS = 5e-3  # weight of the entropy term in ppo_loss
GAMMA = 0.99  # reward discount factor
N_ACTIONS = 19
LR = 0.0001  # actor learning rate; PPOPolicy uses LR * 10 for the critic
BATCH_SIZE = 1024
EPOCHS = 10  # passes over the collected data per policy update
LAMBDA = 0.95  # presumably the GAE lambda; compute_advantages has its own default

In [None]:
# Directory searched for a previous checkpoint (info.json and model weights).
restore_path = '/kaggle/input/data' # if we want to restore the previous checkpoint
# Base directory for the TensorBoard logs and info.json; empty means relative to the working directory.
data_path = ''
# Shared lock used by the episode collectors to keep their progress prints from interleaving.
lock = Lock()

## Prepare Actor&Critic

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py

# Encoder trunk used by both the actor and the critic
def build_shared(units, scalars) -> keras.layers.Dense:
    """Encode the units and scalars inputs into one 256-wide feature tensor.

    The units go through ``UnitsEncoder``; the scalars go through two
    consecutive layer normalisations and two 64-unit dense layers, and are then
    repeated once per unit and concatenated to the unit embeddings. Three
    Dense(128) + MaxPooling1D(2) stages, a flatten and a Dense(256) follow.
    """
    scalars = keras.layers.LayerNormalization()(scalars)

    # per-unit embeddings
    units_encoder = UnitsEncoder(embed_dim=2 ** 7, num_heads=4, ff_dim=128, name='entities_encoder')(units)

    # scalar features
    scalars_encoder = keras.layers.LayerNormalization()(scalars)
    for dense_name in ('scalars_encoder1', 'scalars_encoder2'):
        scalars_encoder = keras.layers.Dense(64, activation='relu', name=dense_name)(scalars_encoder)

    # broadcast the scalar code to every unit and merge
    n_units = units_encoder.shape[1]
    scalars_encoder = keras.layers.RepeatVector(n_units)(scalars_encoder)
    encoder = keras.layers.concatenate([units_encoder, scalars_encoder], axis=-1)

    # three dense + pooling stages shrink the unit axis
    for _ in range(3):
        encoder = keras.layers.Dense(128, activation='relu')(encoder)
        encoder = keras.layers.MaxPooling1D(2)(encoder)

    encoder = keras.layers.Flatten()(encoder)
    return keras.layers.Dense(256, activation='relu')(encoder)

# Actor network
def build_actor(verbose=True, lr=1e-4):
    """Build and compile the policy network.

    The model takes five inputs: units, scalars, advantage, old action
    probabilities and the one-hot action label. The last three only feed the
    PPO loss attached with ``add_loss``; the output is a softmax over 19 actions.

    Args:
        verbose: print the model summary when truthy.
        lr: Adam learning rate.
    """
    n_actions = 19

    def make_input(shape, name):
        return keras.layers.Input(shape=shape, name=name)

    # observation inputs
    units_input = make_input((22, 8), 'units_input')
    scalars_input = make_input((31,), 'scalars_input')

    # inputs that only feed the loss
    advantage = make_input((1,), 'advantage')
    old_action = make_input((n_actions,), 'old_action')
    action_lbl = make_input((n_actions,), 'action_lbl')

    # shared trunk, then the policy head
    features = build_shared(units_input, scalars_input)
    action = keras.layers.Dense(n_actions, activation=keras.activations.softmax)(features)

    model = keras.models.Model(
        [units_input, scalars_input, advantage, old_action, action_lbl],
        action,
    )
    model.add_loss(ppo_loss(action_lbl, action, advantage, old_action))
    model.compile(optimizer=keras.optimizers.Adam(lr))

    if verbose:
        model.summary()
    return model

# Critic network
def build_critic(verbose=True, lr=1e-4):
    """Build and compile the value network (units + scalars -> one value, MSE loss).

    Args:
        verbose: print the model summary when truthy.
        lr: Adam learning rate.
    """
    # observation inputs
    units_input = keras.layers.Input(shape=(22, 8), name='units_input')
    scalars_input = keras.layers.Input(shape=(31,), name='scalars_input')

    # shared trunk, then the value head
    features = build_shared(units_input, scalars_input)
    value_dense = keras.layers.Dense(1, name='value')(features)

    model = keras.models.Model([units_input, scalars_input], value_dense)
    model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr))

    if verbose:
        model.summary()
    return model


def ppo_loss(label_layer, prediction_layer, advantage, old_prediction, clip=True):
    """Clipped PPO surrogate loss with an entropy bonus.

    Args:
        label_layer: one-hot encoding of the action that was taken.
        prediction_layer: current action probabilities.
        advantage: advantage estimate, one value per sample.
        old_prediction: action probabilities recorded when the action was taken.
        clip: when true, clip the probability ratio to
            ``[1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING]``.

    Returns:
        Scalar loss: the negated mean of (surrogate + ENTROPY_LOSS * entropy).
    """
    eps = 1e-10
    # Probability of the taken action under the new and old policies. Summing
    # over the action axis keeps one ratio per sample, so the zero entries of
    # the one-hot product no longer enter the mean (or get clipped to 1 - eps).
    prob = K.sum(label_layer * prediction_layer, axis=-1, keepdims=True)
    old_prob = K.sum(label_layer * old_prediction, axis=-1, keepdims=True)
    r = prob / (old_prob + eps)
    clipped = r
    if clip:
        clipped = K.clip(r, 1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING)
    surrogate = K.minimum(r * advantage, clipped * advantage)
    # Entropy is -sum(p * log p); it is added to the maximised objective so
    # that minimising the loss encourages exploration rather than penalising it.
    entropy = -K.sum(prediction_layer * K.log(prediction_layer + eps), axis=-1, keepdims=True)
    return -K.mean(surrogate + ENTROPY_LOSS * entropy)

In [None]:
# Build each network once with the default verbose=True so that its summary is printed.
print("Actor:")
build_actor()
print("Critic:")
build_critic()

## Memory
To store states, rewards ... for each episode

In [None]:
class Memory:
    """Rollout buffer holding the transitions of one episode."""

    def __init__(self):
        # inputs
        self.units = []
        self.scalars = []
        # actions: one-hot action and the probabilities it was sampled from
        self.actions_matrix = []
        self.actions_probs = []
        # rewards
        self.rewards = []
        # dones
        self.terminal = []

    def isEmpty(self):
        """Return True when no transition has been stored."""
        return len(self.rewards) == 0

    def store(self, obs, actions, reward, done):
        """Append one transition.

        Args:
            obs: ``(units, scalars)`` state.
            actions: ``(action, action_matrix, action_prob)``; the first item
                is ignored and ``None`` entries are skipped.
            reward: reward received for the step.
            done: whether the episode ended on this step.
        """
        # inputs
        units, scalars = obs
        self.units.append(units)
        self.scalars.append(scalars)

        # actions
        _, actions_matrix, actions_probs = actions
        if actions_matrix is not None: self.actions_matrix.append(actions_matrix)
        if actions_probs is not None: self.actions_probs.append(actions_probs)
        # reward
        self.rewards.append(reward)
        self.terminal.append(done)

    def discount(self, x, gamma=GAMMA):
        """Discounted cumulative sum: ``y[t] = x[t] + gamma * y[t + 1]``."""
        # Filtering the reversed sequence with y[n] = x[n] + gamma * y[n - 1]
        # and reversing back yields the sum from each step to the end.
        return lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def discount_rewards(self, GAMMA=0.99):
        """Discounted return of the stored rewards."""
        return self.discount(self.rewards, GAMMA)

    def normalize(self, x):
        """Standardise ``x`` to zero mean / unit std (std floored at 1e-6)."""
        mean = np.mean(x)
        std = np.std(x)
        return (x - mean) / np.maximum(std, 1e-6)

    def compute_advantages(self, pred_value, GAMMA=0.99, LAMBDA=0.95, normalize=True):
        """Generalised advantage estimation (Schulman et al.).

        Args:
            pred_value: critic value for every stored step.
            GAMMA: reward discount factor.
            LAMBDA: GAE smoothing factor.
            normalize: standardise the advantages when true.

        Returns:
            ``(discounted_returns, advantages)`` as float32 arrays.
        """
        rewards = np.array(self.rewards, dtype=np.float32)
        pred_value_t = pred_value
        pred_value_t1 = np.concatenate([pred_value[1:], [0.]])
        # No bootstrapping across episode ends. An explicit boolean mask avoids
        # integer indexing if the done flags are stored as 0/1.
        pred_value_t1[np.asarray(self.terminal, dtype=bool)] = 0
        delta = rewards + GAMMA * pred_value_t1 - pred_value_t
        advantage = self.discount(delta, GAMMA * LAMBDA)
        if normalize:
            advantage = self.normalize(advantage)
        # Use the same discount factor for the returns as for the advantages.
        returns = np.array(self.discount_rewards(GAMMA), dtype=np.float32)
        return returns, advantage.astype(np.float32)

    def compute_normal_advantages(self, pred_value, GAMMA=0.99):
        """Plain advantage: discounted return minus the predicted value."""
        rewards = np.array(self.discount_rewards(GAMMA), dtype=np.float32)
        advantage = rewards - pred_value
        return rewards.astype(np.float32), advantage.astype(np.float32)

    def get_all_as_tensors(self):
        """Return ``((units, scalars), actions_matrix, actions_probs, rewards, dones)``.

        ``rewards`` are the discounted returns; units and scalars are
        concatenated along the first axis.
        """
        rewards = np.array(self.discount_rewards(), dtype=np.float32)
        units = tf.concat(self.units, axis=0)
        scalars = tf.concat(self.scalars, axis=0)

        actions_matrix = tf.convert_to_tensor(self.actions_matrix, dtype=tf.float32)
        actions_probs = tf.convert_to_tensor(self.actions_probs, dtype=tf.float32)
        dones = np.array(self.terminal, dtype=np.float32)
        return (units, scalars), actions_matrix, actions_probs, rewards, dones

## PPOPolicy

In [None]:
%%write_and_run -a /kaggle_simulations/agent/main.py

# The policy class is straightforward, easy to understand
class PPOPolicy:
    """Actor/critic pair with action selection, PPO updates and persistence.

    Args:
        val: when true, ``get_action`` picks the most probable action instead
            of sampling (validation mode).
    """

    def __init__(self, val=False):
        self.actor = build_actor(lr=LR,verbose=0)
        self.critic = build_critic(lr=LR*10,verbose=0)

        self.val = val  # validation or training

    def get_values(self, X):
        """Return the critic's value prediction for every state in ``X`` as a flat array."""
        return self.critic.predict(X).flatten()

    def get_action(self, X):
        """Choose an action for the single state ``X``.

        Returns:
            ``(action, action_matrix, action_prob)``: the action index, its
            one-hot encoding and the probabilities produced by the actor.
        """
        action_prob = self.actor.predict(X)[0]
        n_actions = action_prob.size
        if self.val:
            action = np.argmax(action_prob, axis=-1)
        else:
            # float32 softmax outputs may not sum to 1 within the tolerance
            # np.random.choice enforces; renormalise in float64 for sampling
            # only and return the actor's own probabilities.
            sampling_p = np.asarray(action_prob, dtype=np.float64)
            sampling_p = sampling_p / sampling_p.sum()
            action = np.random.choice(n_actions, p=sampling_p)

        # one-hot encoding of the chosen action
        action_matrix = np.zeros(n_actions, np.float32)
        action_matrix[action] = 1

        return action, action_matrix, action_prob

    def train(self, memories):
        """Update actor and critic from the collected episodes.

        Args:
            memories: list of ``Memory`` objects, one per episode.

        Returns:
            ``(actor_losses, critic_losses)``: per-epoch loss histories, or two
            empty lists when ``memories`` is empty.
        """
        # Local import: this class is also written to main.py by the cell magic.
        import time

        if not memories:
            return [],[]
        actor_ds, critic_ds = None, None
        n_samples = 0
        # Turn every memory into tf.data slices and chain them together.
        for i, memory in enumerate(memories):
            print(f"Add Memory {i + 1}/{len(memories)}")
            inputs, actions_matrix, actions_probs, rewards, dones = memory.get_all_as_tensors()
            c_inputs = inputs
            pred_values = self.get_values(c_inputs)

            # Generalized Advantage Estimation
            rewards, advantage = memory.compute_advantages(pred_values)
            rewards = rewards[:, np.newaxis]
            advantage = advantage[:, np.newaxis]
            n_samples += int(rewards.shape[0])

            labels = actions_matrix
            a_inputs = *inputs, advantage, actions_probs, labels

            actor_part = tf.data.Dataset.from_tensor_slices((a_inputs, labels))
            critic_part = tf.data.Dataset.from_tensor_slices((c_inputs, rewards))
            actor_ds = actor_part if actor_ds is None else actor_ds.concatenate(actor_part)
            critic_ds = critic_part if critic_ds is None else critic_ds.concatenate(critic_part)

        # train
        print("Updating...")
        # A buffer covering every sample gives a real shuffle; a small buffer
        # leaves each batch almost in time order.
        buffer_size = max(n_samples, 1)
        actor_ds = actor_ds.shuffle(buffer_size).batch(BATCH_SIZE).prefetch(2)
        critic_ds = critic_ds.shuffle(buffer_size).batch(BATCH_SIZE).prefetch(2)

        s = time.time()
        a_losses = self.actor.fit(actor_ds, epochs=EPOCHS, verbose=False)
        a_time = time.time() - s
        # time.time() differences are seconds, not milliseconds
        print(f">>>Actor updated: {a_time}s")
        s = time.time()
        c_losses = self.critic.fit(critic_ds, epochs=EPOCHS, verbose=False)
        c_time = time.time() - s
        print(f">>>Critic updated: {c_time}s")
        print(f"Total Duration: {a_time + c_time}")

        return a_losses.history['loss'], c_losses.history['loss']

    def save(self, path):
        """Write the weights to ``path + '.actor.h5'`` and ``path + '.critic.h5'``."""
        self.actor.save_weights(path + '.actor.h5')
        self.critic.save_weights(path + '.critic.h5')

    def load(self, path):
        """Load whichever of the two weight files exist next to ``path``."""
        # Local import: this class is also written to main.py by the cell magic.
        import os

        actor_file = path + '.actor.h5'
        critic_file = path + '.critic.h5'
        # Check each file on its own so a missing one does not abort the restore.
        if os.path.exists(actor_file):
            self.actor.load_weights(actor_file)
        if os.path.exists(critic_file):
            self.critic.load_weights(critic_file)

## EpisodeCollector & ParallelEpisodeCollector

* EpisodeCollector : runs a single environment
* ParallelEpisodeCollector : runs multiple environments in parallel

In [None]:
# EpisodeCollector
class EpisodeCollector(threading.Thread):
    """Plays episodes in one environment and returns them as ``Memory`` objects.

    It can be used directly through ``collect`` or started as a thread, in
    which case ``run`` puts the collected episodes on ``result_queue``.
    """
    # Counters shared by all collectors, accessed via the class name.
    n_episode = 1
    reward_sum = 0
    max_episode = 0

    def __init__(self, env: FootEnv, policy: PPOPolicy, result_queue=None, replays_dir=None):
        super().__init__()
        self.result_queue = result_queue
        self.env = env
        self.policy = policy
        self.replays_dir = replays_dir
        # Instance attribute, distinct from the class counter: the minimum
        # number of episodes collect() plays.
        self.n_episode = -1

    def clone(self):
        """Return a fresh, not-yet-started collector with the same settings."""
        obj = EpisodeCollector(self.env, self.policy)
        obj.result_queue = self.result_queue
        obj.replays_dir = self.replays_dir
        obj.n_episode = self.n_episode
        return obj

    def run(self):
        """Thread entry point: collect and publish the episodes."""
        self.result_queue.put(self.collect(1))

    def collect(self, n=1):
        """Play ``max(n, self.n_episode)`` episodes and return their memories."""
        n = max(n, self.n_episode)
        return [self.collect_() for _ in range(n)]

    def collect_(self):
        """Play one episode to the end and return its ``Memory``.

        The memory is also dumped with dill when ``replays_dir`` is set.
        """
        memory = Memory()
        done = False
        # Collectors may run in parallel threads; guard the shared counter.
        with lock:
            EpisodeCollector.n_episode += 1
        obs = self.env.reset()
        i = 0
        while not done:
            actions = self.policy.get_action(obs)
            # actions = (action, action_matrix, action_prob)
            new_obs, reward, done, info = self.env.step(actions[0])
            # store data
            memory.store(obs, actions, reward, done)

            if done or i % 1000 == 0:
                with lock:
                    print(
                        f"Episode: {EpisodeCollector.n_episode}/{EpisodeCollector.max_episode} | "
                        f"Step: {i} | "
                        f"Env ID: {self.env.env_id} | "
                        f"Reward: {reward} | "
                        f"Total Rewards: {EpisodeCollector.reward_sum} | "
                        f"{info}"
                    )

            obs = new_obs
            i += 1
        with lock:
            # count the total goals scored by the agent
            EpisodeCollector.reward_sum += info['l_score']
        if self.replays_dir:
            with open(os.path.join(self.replays_dir, f'replay-{uuid.uuid4().hex}.dill'), 'wb') as f:
                dill.dump(memory, f)
        return memory

# ParallelEpisodeCollector
class ParallelEpisodeCollector:
    """Runs one ``EpisodeCollector`` thread per environment.

    Args:
        env_fn: factory called as ``env_fn(env_id=i)`` for each job.
        n_jobs: number of environments / collector threads.
        policy: policy shared by all collectors.
        replays_dir: optional directory passed to every collector.
    """

    def __init__(self, env_fn, n_jobs, policy: PPOPolicy, replays_dir=None, ):
        self.n_jobs = n_jobs
        # Store the policy; a bare annotation would leave the attribute unset.
        self.policy = policy
        self.envs = [env_fn(env_id=i) for i in range(n_jobs)]
        self.result_queue = Queue()
        self.replays_dir = replays_dir
        self.collectors = [EpisodeCollector(env,
                                            policy=policy,
                                            result_queue=self.result_queue,
                                            replays_dir=replays_dir) for env in self.envs]

    def collect(self, n_steps=1):
        """Collect about ``n_steps`` episodes split across the collectors.

        Each collector plays ``max(1, int(n_steps / n_collectors))`` episodes.

        Returns:
            Flat list with the memories from every collector.
        """
        if not n_steps: n_steps = 1
        result_queue = self.result_queue
        for i, collector in enumerate(self.collectors):
            # A thread can only be started once, so run a fresh clone each time.
            collector = collector.clone()
            self.collectors[i] = collector
            collector.n_episode = max(1, int(n_steps / len(self.collectors)))
            print("Starting collector {}".format(i))
            collector.start()
        tmp = []
        # Every collector puts exactly one list of memories on the queue.
        for _ in self.collectors:
            tmp.extend(result_queue.get())
        for collector in self.collectors:
            collector.join()
        return tmp

### init 

In [None]:
# Output locations: TensorBoard event files and the JSON file holding the training progress.
tf_logs_path = os.path.join(data_path, 'tf_log') # For tensorboard
info_path = os.path.join(data_path, 'info.json')
writer = tf.summary.create_file_writer(tf_logs_path)
os.makedirs(tf_logs_path, exist_ok=True)

# Policy: weights are saved in the agent directory that is later packed into the submission.
policy_path = os.path.join('/kaggle_simulations/agent/', 'model')
val_policy_path = os.path.join('/kaggle_simulations/agent/', 'model_val')
policy = PPOPolicy()


# Restore the previous training state (counters and weights) if a checkpoint exists under restore_path.
best_reward = float('-inf')
best_val_reward = 0.
n_episodes = 0
rewards=[]
if os.path.exists(os.path.join(restore_path, 'info.json')):
    with open(os.path.join(restore_path, 'info.json'), 'r') as f:
        info = json.load(f)
        best_reward = info['best_reward']
        best_val_reward = info['best_val_reward']
        n_episodes = info['n_episodes']
    policy.load(os.path.join(restore_path, 'model'))

# Define the episode collector
PARALLEL_COLLECTOR = False # ParallelEpisodeCollector has not been working since the last update of kaggle-environments
collector = None
n_collect = 5 # collect 5 episodes per training step
if PARALLEL_COLLECTOR:
    collector = ParallelEpisodeCollector(env_fn, mp.cpu_count(), policy)
else:
    collector = EpisodeCollector(env_fn(), policy)

In [None]:
def train(steps):
    """Alternate episode collection and policy updates until ``steps`` episodes are played.

    After each update the policy is saved when the mean final reward matches or
    beats the best so far. On every 10th iteration (except the first) two
    greedy validation episodes are played; they do not count towards the
    episode total.

    Args:
        steps: episode count at which training stops.
    """
    global best_reward, best_val_reward,rewards,n_episodes

    EpisodeCollector.max_episode = steps
    EpisodeCollector.n_episode = n_episodes
    i = 0
    while EpisodeCollector.n_episode < EpisodeCollector.max_episode:
        print("Collect episodes...")
        memories = collector.collect(n_collect)
        print("Updating the policy...")
        losses = policy.train(memories)

        reward = record(memories, EpisodeCollector.n_episode, losses)
        # Save the best policy
        if reward >= best_reward:
            best_reward = reward
            print("Saving best policy...")
            policy.save(policy_path)
        print(
            f"Episode: {n_episodes} | "
            f"Reward: {int(reward)} | "
            f"Best Reward: {int(best_reward)} | "
            f"Episode Rewards: {[mem.rewards[-1] for mem in memories]} | "
        )

        # Validation
        if i % 10 == 0 and i != 0:
            print("Agent validation...")
            policy.val = True
            try:
                memories = collector.collect(2)
            finally:
                # Always go back to sampling mode, even if collection fails.
                policy.val = False

            # Validation episodes must not count as training episodes.
            EpisodeCollector.n_episode -= len(memories)
            rew = sum([mem.rewards[-1] for mem in memories if not mem.isEmpty()]) / len(memories)
            print(f"Validation reward : {rew}")
            with writer.as_default():
                tf.summary.scalar("val_reward", rew, step=EpisodeCollector.n_episode)
                writer.flush()
            if rew >= best_val_reward:
                best_val_reward = rew
                print("Saving best validation policy...")
                policy.save(val_policy_path)
        i += 1


def record(memories: List[Memory], current_step, losses):
    """Log one training step and persist the progress counters.

    Args:
        memories: episodes collected for this step.
        current_step: step value used for the TensorBoard scalars.
        losses: ``(actor_losses, critic_losses)`` as returned by ``policy.train``.

    Returns:
        Mean final reward over ``memories``, or 0 when ``memories`` is empty.
    """
    global n_episodes, info_path,rewards
    if not memories:
        return 0
    n_episodes += len(memories)
    reward = sum([memory.rewards[-1] for memory in memories]) / len(memories)
    rewards.append(reward)

    with writer.as_default():
        # Skip missing or empty loss histories instead of dividing by zero.
        if losses[0]: tf.summary.scalar("Actor loss", sum(losses[0]) / len(losses[0]), step=current_step)
        if losses[1]: tf.summary.scalar("Critic loss", sum(losses[1]) / len(losses[1]), step=current_step)
        tf.summary.scalar("best_reward", best_reward, step=current_step)
        writer.flush()

    with open(info_path, 'w') as f:
        json.dump({
            'best_reward': best_reward,
            'n_episodes': n_episodes,
            'best_val_reward': best_val_reward,
        }, f)
    return reward

## Start training

In [None]:
# Start training
n_collect = 1
train(20) # The training is long. I'll just show a few episodes and continue on my laptop

In [None]:
print('Plot rewards')
# smooth first
def exponential_average(old, new, b1=0.99):
    """Blend ``new`` into the running average ``old``; ``b1`` is the weight kept from ``old``."""
    kept = old * b1
    added = (1 - b1) * new
    return kept + added

# Smooth the recorded rewards with a running exponential average.
rewards_ = []
old = 0
for r in rewards:
    old = exponential_average(old,r)
    rewards_.append(old)

# Pass x and y by keyword: seaborn 0.12 and later no longer accept them positionally.
sns.lineplot(x=list(range(len(rewards_))), y=rewards_)

Now all that remains is to train the agent and hope that it converges. However, training this agent from scratch is a very hard task. In my next notebook, I will show how to train it to imitate a rule-based agent, which will then serve as a base.

In [None]:
#build_actor().save_weights('/kaggle_simulations/agent/model.actor.h5')

In [None]:
#! rm /kaggle_simulations/agent/main.py

## Build submission.tar.gz

In [None]:
%%writefile -a /kaggle_simulations/agent/main.py

import time


# Rebuild the actor and restore the trained weights packed next to main.py.
actor = build_actor(verbose=False)
actor.load_weights('/kaggle_simulations/agent/model.actor.h5')

# The training graph has five inputs, but advantage, old_action and action_lbl
# only feed the PPO loss. Acting needs the two observation inputs, so use an
# inference-only view that shares the actor's layers and weights.
policy_net = keras.models.Model(actor.inputs[:2], actor.output)

# Warm-up call so that one-off set-up cost is paid at import time, not on the first step.
policy_net([np.zeros((1, 22, 8), np.float32), np.zeros((1, 31), np.float32)], training=False)


def agent(obs):
    """Kaggle agent entry point: return the most probable action as a one-item list."""
    obs = obs['players_raw'][0]
    state, _ = OBSParser.parse(obs)
    # Calling the model directly avoids the per-call overhead of Model.predict
    # on single-sample inputs, which matters for the per-step time limit.
    probs = policy_net(list(state), training=False)
    action = np.argmax(probs.numpy(), axis=-1)[0]
    return [int(action)]

In [None]:
! ls /kaggle_simulations/agent/

In [None]:
! cd /kaggle_simulations/agent/ && tar -czvf submission.tar.gz  main.py model.actor.h5
! mv  /kaggle_simulations/agent/submission.tar.gz /kaggle/working/submission.tar.gz

### Test the agent

In [None]:
import sys 
sys.path.append(os.path.abspath("/kaggle_simulations/agent/"))
import main

In [None]:
# Known issue: the agent can time out, because importing libraries such as TensorFlow
# and loading the model weights takes time. No workaround found yet.
from kaggle_environments import make
# Play one full match: the imported agent function on the left, the same main.py loaded from file on the right.
env = make("football", 
           debug=True,
           configuration={"save_video": True, 
                          "scenario_name": "11_vs_11_kaggle", 
                          "running_in_notebook": True,
                          #"actTimeout": 30,
                         })  
# env.run returns a list; its last element holds one entry per player.
output = env.run([main.agent, "/kaggle_simulations/agent/main.py"])[-1]
print('Left player: reward = %s, status = %s, info = %s' % (output[0]['reward'], output[0]['status'], output[0]['info']))
print('Right player: reward = %s, status = %s, info = %s' % (output[1]['reward'], output[1]['status'], output[1]['info']))
env.render(mode="human", width=800, height=600)

!!! **Facing two problems:**

ParallelEpisodeCollector getting locked and the timeout issue

I think I'm done. My goal was to build an easily customizable agent 😅, not to train it (no resources). Still, I'll be happy to see agents based on this one at the top of the LB. The code can be adapted to any A2C-like algorithm: you just have to change the ppo_loss and the advantage function.

In [None]:
!rm -rf football
!rm -rf kaggle-environments
!rm -rf kaggle-football