In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields (directory, subdirectories, files); the subdirectory list is ignored.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path of each file found under /kaggle/input.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Create the directory tree for the saved model. exist_ok=True keeps this cell
# re-runnable: without it os.makedirs raises FileExistsError once the path exists.
os.makedirs("/kaggle/kaggle_simulations/agent/saved_model", exist_ok=True)

In [None]:
# Displays True when the saved_model directory is present on disk.
os.path.exists("/kaggle/kaggle_simulations/agent/saved_model")

In [None]:
# GFootball environment.
# kaggle_environments provides the simulation harness imported later in this notebook.
!pip install kaggle_environments
# SDL2 gfx/ttf development packages, installed before gfootball itself.
!apt-get update -y
!apt-get install -y libsdl2-gfx-dev libsdl2-ttf-dev
# Check out the v2.3 branch of Google Research Football and download the matching
# prebuilt engine binary into the checkout's third_party/gfootball_engine/lib directory.
!git clone -b v2.3 https://github.com/google-research/football.git
!mkdir -p football/third_party/gfootball_engine/lib
!wget https://storage.googleapis.com/gfootball/prebuilt_gameplayfootball_v2.3.so -O football/third_party/gfootball_engine/lib/prebuilt_gameplayfootball.so
# Install gfootball from the checkout; GFOOTBALL_USE_PREBUILT_SO=1 presumably makes the
# installer use the downloaded .so instead of compiling the engine - TODO confirm.
!cd football && GFOOTBALL_USE_PREBUILT_SO=1 pip3 install .

In [None]:
import gym
import gfootball
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Create the 11-vs-11 Kaggle scenario (env id with the "SMM" suffix) and take its first observation.
# NOTE(review): smm_env / smm_state are not referenced again in the visible notebook.
smm_env = gym.make("GFootball-11_vs_11_kaggle-SMM-v0")
smm_state = smm_env.reset()

In [None]:
from gym.core import ObservationWrapper
from gym.spaces import Box

import cv2


class PreprocessGFootball(ObservationWrapper):
    """Observation wrapper that divides frames by 255 and casts them to float32.

    The declared observation space is ``Box(0.0, 1.0)`` with shape
    ``(72, 96, 4)``. Frames are neither resized nor cropped by this wrapper.
    """

    def __init__(self, env):
        """Wrap ``env`` and declare the (72, 96, 4) observation space."""
        ObservationWrapper.__init__(self, env)

        self.img_size = (72, 96, 4)
        self.observation_space = Box(low=0.0, high=1.0, shape=self.img_size)

    def observation(self, img):
        """Return a float32 copy of ``img`` scaled by 1/255."""
        scaled = img / 255
        as_float32 = scaled.astype(np.float32)
        return np.array(as_float32)

In [None]:
import gym
# spawn game instance for tests
env = gym.make("GFootball-11_vs_11_kaggle-SMM-v0")  # create raw env
env = PreprocessGFootball(env)  # observations are divided by 255 and cast to float32

# Keep the declared observation shape and the size of the discrete action space,
# then start an episode to obtain the first observation.
observation_shape = env.observation_space.shape
n_actions = env.action_space.n
obs = env.reset()

In [None]:
import numpy as np
from gym.spaces.box import Box
from gym.core import Wrapper


class FrameBuffer(Wrapper):
    """Wrapper exposing the last ``n_frames`` observations stacked on the channel axis.

    The newest frame occupies the leading channels; older frames are shifted
    towards the end and the oldest one is dropped.
    """

    def __init__(self, env=None, n_frames=4):
        """Wrap ``env`` and allocate a zero-filled stack holding ``n_frames`` frames."""
        super().__init__(env)
        rows, cols, channels = env.observation_space.shape
        stacked_shape = [rows, cols, channels * n_frames]
        self.observation_space = Box(0.0, 1.0, stacked_shape)
        self.framebuffer = np.zeros(stacked_shape, 'float32')

    def reset(self):
        """Zero the stack, reset the wrapped env and return the stack with its first frame."""
        self.framebuffer = np.zeros_like(self.framebuffer)
        first_frame = self.env.reset()
        self.update_buffer(first_frame)
        return self.framebuffer

    def step(self, action):
        """Step the wrapped env once and return ``(stack, reward, done, info)``."""
        frame, reward, done, info = self.env.step(action)
        self.update_buffer(frame)
        return self.framebuffer, reward, done, info

    def update_buffer(self, img):
        """Put ``img`` at the front of the stack and drop the oldest frame's channels."""
        # Number of channels contributed by a single frame of the wrapped env.
        n_new = self.env.observation_space.shape[-1]
        kept = self.framebuffer[:, :, :-n_new]
        self.framebuffer = np.concatenate((img, kept), axis=-1)


In [None]:
def make_env():
    """Build the env used for training: raw football env -> 1/255 scaling -> 4-frame stack."""
    raw_env = gym.make("GFootball-11_vs_11_kaggle-SMM-v0")
    scaled_env = PreprocessGFootball(raw_env)
    return FrameBuffer(scaled_env, n_frames=4)

In [None]:
# Build the fully wrapped env, start an episode, and report the number of
# discrete actions together with the stacked observation shape.
env = make_env()
env.reset()
n_actions = env.action_space.n
state_dim = env.observation_space.shape
print(f"n_actions : {n_actions}\nstate_dim : {state_dim}")

In [None]:
from keras.layers import Conv2D, Dense, Flatten,InputLayer
from keras.models import Sequential, Model

class Conv_DQN_Model(Model):
    """Convolutional Q-network.

    Three 3x3 stride-2 ReLU convolutions (16, 32, 64 filters), a flatten,
    a 256-unit ReLU dense layer and a linear head with one output per action.
    ``call`` always prepends an axis to its input before the convolutions.
    """

    def __init__(self, state_shape, num_actions):
        """
        :param state_shape: shape handed to the InputLayer
        :param num_actions: width of the output layer (one Q-value per action)
        """
        super(Conv_DQN_Model, self).__init__()
        self.input_layer = InputLayer(input_shape = state_shape,)
        self.conv1 = Conv2D(filters = 16, kernel_size = (3,3), strides=(2), activation='relu')
        self.conv2 = Conv2D(filters = 32, kernel_size = (3,3), strides=(2), activation='relu')
        self.conv3 = Conv2D(filters = 64, kernel_size = (3,3), strides=(2), activation='relu')
        self.flat = Flatten()
        self.dense1 = Dense(256,activation='relu')
        # Size the head from the constructor argument rather than a notebook-level
        # global, so the model does not depend on names defined in other cells.
        self.output_layer = Dense(num_actions, activation='linear')

    #@tf.function
    def call(self, inputs):
        """Run the forward pass and return the Q-values produced by the linear head."""
        t = self.input_layer(inputs)
        # An extra leading axis is added unconditionally before the conv stack.
        t = tf.expand_dims(t, axis=0)
        t = self.conv1(t)
        t = self.conv2(t)
        t = self.conv3(t)
        t = self.flat(t)
        t = self.dense1(t)
        output = self.output_layer(t)
        return output

In [None]:
class DQNAgent:
    """Epsilon-greedy agent backed by a ``Conv_DQN_Model`` Q-network."""

    def __init__(self, name, state_shape, n_actions, epsilon=0, reuse=False):
        """A simple DQN agent

        :param name: accepted for interface compatibility; not used
        :param state_shape: observation shape forwarded to the network
        :param n_actions: number of discrete actions (size of the Q-value output)
        :param epsilon: probability of taking a random action in ``sample_actions``
        :param reuse: accepted for interface compatibility; not used
        """
        self.agent = Conv_DQN_Model(state_shape, n_actions)
        # Remembered so the output-shape check in get_symbolic_qvalues does not
        # depend on a global variable named n_actions.
        self.n_actions = n_actions
        self.epsilon = epsilon

    def save_agent(self,path) :
        """Save the underlying Keras model to ``path``."""
        self.agent.save(path)

    def weights(self) :
        """Return the network weights as returned by Keras ``get_weights``."""
        return self.agent.get_weights()

    def load_weights(self, _agent) :
        """Copy the weights of another DQNAgent (``_agent``) into this agent's network."""
        self.agent.set_weights(_agent.weights())

    def get_symbolic_qvalues(self, state_t):
        """takes agent's observation, returns qvalues. Both are tf Tensors"""
        qvalues = self.agent(state_t)

        assert tf.debugging.is_numeric_tensor(qvalues) and qvalues.shape.ndims == 2, \
            "please return 2d tf tensor of qvalues [you got %s]" % repr(qvalues)
        assert int(qvalues.shape[1]) == self.n_actions, \
            "expected %s Q-values per state, got %s" % (self.n_actions, qvalues.shape[1])
        return qvalues

    def get_qvalues(self, state_t):
        """Same as symbolic step except it returns a numpy array"""
        qvalues = self.get_symbolic_qvalues(state_t)
        qvalues = qvalues.numpy()
        return qvalues

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy.

        :param qvalues: array of shape [batch, n_actions]
        :returns: array of shape [batch] with one action index per row
        """
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape
        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)
        # 1 -> explore (random action), 0 -> exploit (argmax action)
        should_explore = np.random.choice([0, 1], batch_size, p=[1-epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)

In [None]:
# Online agent; with epsilon=0.5 sample_actions picks a random action half of the time.
agent = DQNAgent("dqn_agent", state_dim, n_actions, epsilon=0.5)

In [None]:
def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):
    """Play ``n_games`` episodes of at most ``t_max`` steps and return the mean total reward.

    With ``greedy=True`` the argmax action is taken; otherwise the action is
    drawn through ``agent.sample_actions``.
    """
    episode_returns = []
    for _game in range(n_games):
        state = env.reset()
        total = 0
        steps_taken = 0
        while steps_taken < t_max:
            qvalues = agent.get_qvalues(state)
            if greedy:
                action = qvalues.argmax(axis=-1)[0]
            else:
                action = agent.sample_actions(qvalues)[0]
            state, step_reward, done, _info = env.step(action)
            total += step_reward
            if done:
                break
            steps_taken += 1

        episode_returns.append(total)
    return np.mean(episode_returns)
# Smoke run: one non-greedy game with the freshly created agent; the cell displays the reward.
evaluate(env, agent, n_games=1)

In [None]:
import random

class ReplayBuffer(object):
    """Fixed-capacity ring buffer of (obs, action, reward, next_obs, done) transitions."""

    def __init__(self, size):
        """Create Replay buffer.
        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0  # slot that the next add() writes to

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """Store one transition, overwriting the oldest slot once the buffer is full."""
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        """Gather the transitions at ``idxes`` into five parallel numpy arrays."""
        obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], []
        for i in idxes:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            # np.asarray avoids a copy when possible but still converts scalars and
            # lists; np.array(..., copy=False) raises on NumPy 2 when a copy is needed.
            obses_t.append(np.asarray(obs_t))
            actions.append(np.asarray(action))
            rewards.append(reward)
            obses_tp1.append(np.asarray(obs_tp1))
            dones.append(done)
        return (
            np.array(obses_t),
            np.array(actions),
            np.array(rewards),
            np.array(obses_tp1),
            np.array(dones)
        )

    def sample(self, batch_size):
        """Sample a batch of experiences (uniformly, with replacement).
        Parameters
        ----------
        batch_size: int
            How many transitions to sample.
        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        Raises
        ------
        ValueError
            if the buffer is empty
        """
        n_stored = len(self._storage)
        if n_stored == 0:
            # random.randint(0, -1) would also raise ValueError, with a less helpful message.
            raise ValueError("cannot sample from an empty replay buffer")
        idxes = [random.randint(0, n_stored - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)

In [None]:
def play_and_record(agent, env, exp_replay, n_steps=1):
    """
    Reset the env, play exactly n_steps and record every (s, a, r, s', done) to the replay buffer.
    Whenever a game ends, the transition is stored with done=True and the env is reset.

    NOTE: the env is reset on entry, so every call starts from a fresh episode.

    :param agent: object providing get_qvalues(states) and sample_actions(qvalues)
    :param env: environment with reset() and step(action) -> (obs, reward, done, info)
    :param exp_replay: buffer providing add(s, a, r, next_s, done)
    :param n_steps: number of environment steps to play
    :returns: return sum of rewards over time
    """
    # initial state
    s = env.reset()
    reward = 0
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])
        action = agent.sample_actions(qvalues)[0]
        next_s, r, done, _info = env.step(action)
        exp_replay.add(s, action, r, next_s, done)
        reward += r
        # Advance the current state: without this every action would be chosen from,
        # and every transition recorded against, the first observation of the episode.
        s = env.reset() if done else next_s
    return reward

In [None]:
# testing your code. This may take a minute...
# Fill a 1000-slot buffer with 1000 steps of play and print the summed reward.
exp_replay = ReplayBuffer(10**3)
reward = play_and_record(agent, env, exp_replay, n_steps=10**3)
print(reward)
# if you're using your own experience replay buffer, some of those tests may need correction.
# just make sure you know what your code does

In [None]:
# Second agent with its own network, used as the target network; epsilon keeps its default of 0.
target_network = DQNAgent("target_network", state_dim, n_actions)

In [None]:
def load_weigths_into_target_network(agent, target_network):
    """Overwrite the weights of ``target_network`` with the current weights of ``agent``."""
    source_agent = agent
    target_network.load_weights(source_agent)

In [None]:
# placeholders that will be fed with exp_replay.sample(batch_size)
# These are tf.Variable objects standing in for TF1 placeholders. obs_ph / next_obs_ph
# are created with shape state_dim (a single state, no batch axis) and random contents.
obs_ph = tf.Variable(tf.random.uniform(state_dim,minval=0,maxval=1,dtype=tf.dtypes.float32),trainable = True)
#tf.compat.v1.placeholder(tf.float32, shape=(None,) + state_dim)
# Scalar action index, reward and done flag for the same single transition.
actions_ph = tf.Variable(1,trainable=True,dtype=tf.int32)
rewards_ph = tf.Variable(1,trainable=True,dtype=tf.float32)
next_obs_ph = tf.Variable(tf.random.uniform(state_dim,minval=0,maxval=1,dtype=tf.dtypes.float32),trainable = True)
is_done_ph = tf.Variable(0.,dtype=tf.float32)

# NOTE(review): computed once here from the current value of is_done_ph (0.); assuming eager
# execution, later assignments to is_done_ph will not change is_not_done - confirm intent.
is_not_done = 1 - is_done_ph
gamma = 0.99  # discount factor used in the TD target

In [None]:
def train_step(td=False, optimizer=None):
    """Compute the one-step TD loss and, unless ``td`` is set, apply one gradient update.

    The transition is read from the module-level variables ``obs_ph``, ``actions_ph``,
    ``rewards_ph``, ``next_obs_ph`` and ``is_done_ph``. The target is
    ``r + gamma * max_a' Q_target(s', a') * (1 - done)``.

    :param td: if True, only evaluate and return the loss; no weights are changed
    :param optimizer: optional Keras optimizer; when omitted, an Adam optimizer is
        created on the first updating call and reused on later calls
    :returns: scalar TD-loss tensor
    """
    with tf.GradientTape() as tape:
        current_qvalues = agent.get_symbolic_qvalues(obs_ph)
        current_action_qvalues = tf.reduce_sum(tf.one_hot(actions_ph, n_actions) * current_qvalues, axis=1)
        next_qvalues_target = target_network.get_symbolic_qvalues(next_obs_ph)
        next_state_values_target = tf.reduce_max(next_qvalues_target, axis=1)
        # Read the done flag at call time so the mask follows the variable's current value.
        not_done = 1.0 - is_done_ph
        reference_qvalues = rewards_ph + gamma * next_state_values_target * not_done
        td_loss = tf.reduce_mean(tf.square(current_action_qvalues - reference_qvalues))

    if td:
        return td_loss

    if optimizer is None:
        # Cache one optimizer on the function so its state persists between calls.
        optimizer = getattr(train_step, "_optimizer", None)
        if optimizer is None:
            # The learning rate is a default chosen here, not taken from elsewhere in the notebook.
            optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
            train_step._optimizer = optimizer

    # Only the online network is trained; the target network's weights are not in this list.
    variables = agent.agent.trainable_variables
    gradients = tape.gradient(td_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return td_loss

In [None]:
from tqdm import trange
import pandas as pd
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

def moving_average(x, span=100, **kw):
    """Return the exponentially weighted mean of ``x`` as a numpy array.

    ``span`` and any extra keyword arguments are forwarded to pandas ``ewm``;
    ``ignore_na=True`` is always passed.
    """
    frame = pd.DataFrame({'x': np.asarray(x)})
    smoothed = frame.x.ewm(span=span, ignore_na=True, **kw).mean()
    return smoothed.values

# Histories appended to by the training loop and plotted there.
mean_rw_history = []
td_loss_history = []

In [None]:
# Replace the earlier test buffer and pre-fill it before training.
# The buffer holds 2500 transitions, so only the most recent 2500 of these 10000 steps remain.
exp_replay = ReplayBuffer(2500)
play_and_record(agent, env, exp_replay, n_steps=10000)


def sample_batch(exp_replay, batch_size):
    """Sample ``batch_size`` transitions and map each input variable to its batch.

    tf.Variable objects cannot be used directly as dictionary keys in TF2, so the
    mapping is keyed by ``variable.ref()``. Use ``ref.deref()`` to get the variable back:

        for ref, value in sample_batch(exp_replay, 1).items():
            ref.deref().assign(value)

    NOTE(review): the arrays carry a leading batch axis while the variables are created
    with single-transition shapes; confirm shapes match before assigning.

    :returns: dict {variable.ref(): np.array} for obs, actions, rewards, next obs and done flags
    """
    obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(batch_size)
    targets = (obs_ph, actions_ph, rewards_ph, next_obs_ph, is_done_ph)
    values = (obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch)
    return {variable.ref(): value for variable, value in zip(targets, values)}

In [None]:
#tf.compat.v1.disable_tensor_equality()
for i in trange(30000):
    # play
    play_and_record(agent, env, exp_replay, 10)
    # train
    # NOTE(review): with td=True train_step returns the loss before any gradient is
    # applied, so this call only measures the TD loss - confirm that is intended.
    loss_t = train_step(True)
    td_loss_history.append(loss_t.numpy())

    # adjust agent parameters
    if i % 500 == 0:
        # Every 500 iterations: sync the target network with the online agent,
        # decay epsilon (floored at 0.01) and evaluate on a fresh env.
        load_weigths_into_target_network(agent, target_network)
        agent.epsilon = max(agent.epsilon * 0.99, 0.01)
        mean_rw_history.append(evaluate(make_env(), agent, n_games=3))

    if i % 100 == 0:
        clear_output(True)
        print("buffer size = %i, epsilon = %.5f" % (len(exp_replay), agent.epsilon))
        #if len(exp_replay) > 30000 : break

        # Stop early if the most recent loss is NaN (the previous check tested the
        # constant False and could never fail).
        assert not np.isnan(td_loss_history[-1]), "TD loss became NaN"

        # Create the figure first so both panels are drawn on the same canvas.
        plt.figure(figsize=[12, 4])
        plt.subplot(1, 2, 1)
        plt.title("mean reward per game")
        plt.plot(mean_rw_history)
        plt.grid()

        plt.subplot(1, 2, 2)
        plt.title("TD loss history (moving average)")
        plt.plot(moving_average(np.array(td_loss_history), span=100, min_periods=100))
        plt.grid()
        plt.show()

In [None]:
# Directory the trained model and main.py are written to. exist_ok=True keeps the
# cell re-runnable instead of raising FileExistsError when the path already exists.
os.makedirs("/kaggle_simulations/agent/saved_model", exist_ok=True)

In [None]:
# Make the agent directory the working directory for the following cells.
os.chdir("/kaggle_simulations/agent")

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Save the agent's Keras model to the path that main.py loads it from.
agent.save_agent("/kaggle_simulations/agent/saved_model")

In [None]:
%%writefile /kaggle_simulations/agent/main.py

import gym
import gfootball
import numpy as np
import pandas as pd
import tensorflow as tf

from gym.core import ObservationWrapper
from gym.spaces import Box
import cv2

from gym.spaces.box import Box
from gym.core import Wrapper


from gfootball.env import observation_preprocessing
from tensorflow import keras


class FrameBuffer(Wrapper):
    """Stack of the most recent ``n_frames`` (72, 96, 4) frames, concatenated on the channel axis."""

    def __init__(self, env=gym.Env, n_frames=4):
        """Initialise the wrapper and allocate a zero-filled (72, 96, 4 * n_frames) stack."""
        super().__init__(env)
        rows, cols, channels = 72, 96, 4
        stacked_shape = [rows, cols, channels * n_frames]
        self.observation_space = Box(0.0, 1.0, stacked_shape)
        self.framebuffer = np.zeros(stacked_shape, 'float32')

    def reset(self):
        """Zero the stack, reset the wrapped env and return the stack with its first frame."""
        self.framebuffer = np.zeros_like(self.framebuffer)
        first_frame = self.env.reset()
        self.update_buffer(first_frame)
        return self.framebuffer

    def step(self, action):
        """Step the wrapped env once and return ``(stack, reward, done, info)``."""
        frame, reward, done, info = self.env.step(action)
        self.update_buffer(frame)
        return self.framebuffer, reward, done, info

    def update_buffer(self, img):
        """Put ``img`` at the front of the stack, drop the oldest 4 channels, return the stack."""
        n_new = 4  # channels per frame
        kept = self.framebuffer[:, :, :-n_new]
        self.framebuffer = np.concatenate((img, kept), axis=-1)
        return self.framebuffer


from keras.layers import Conv2D, Dense, Flatten,InputLayer
from keras.models import Sequential, Model

class Conv_DQN_Model(Model):
    """Convolutional Q-network.

    Three 3x3 stride-2 ReLU convolutions (16, 32, 64 filters), a flatten,
    a 256-unit ReLU dense layer and a linear head with one output per action.
    ``call`` always prepends an axis to its input before the convolutions.
    """

    def __init__(self, state_shape=(72,96,16), num_actions=19):
        """
        :param state_shape: shape handed to the InputLayer
        :param num_actions: width of the output layer (one Q-value per action)
        """
        super(Conv_DQN_Model, self).__init__()
        self.input_layer = InputLayer(input_shape = state_shape,batch_size=1)
        self.conv1 = Conv2D(filters = 16, kernel_size = (3,3), strides=(2), activation='relu')
        self.conv2 = Conv2D(filters = 32, kernel_size = (3,3), strides=(2), activation='relu')
        self.conv3 = Conv2D(filters = 64, kernel_size = (3,3), strides=(2), activation='relu')
        self.flat = Flatten()
        self.dense1 = Dense(256,activation='relu')
        # Use the constructor argument: no module-level n_actions exists in this file,
        # so referring to one here would raise NameError on instantiation.
        self.output_layer = Dense(num_actions, activation='linear')

    #@tf.function
    def call(self, inputs):
        """Run the forward pass and return the Q-values produced by the linear head."""
        t = self.input_layer(inputs)
        # An extra leading axis is added unconditionally before the conv stack.
        t = tf.expand_dims(t, axis=0)
        t = self.conv1(t)
        t = self.conv2(t)
        t = self.conv3(t)
        t = self.flat(t)
        t = self.dense1(t)
        output = self.output_layer(t)
        return output
    

class DQNAgent:
    """Epsilon-greedy agent backed by a ``Conv_DQN_Model`` Q-network."""

    def __init__(self, name="agent", state_shape=(72,96,16), n_actions=19, epsilon=0, reuse=False):
        """A simple DQN agent

        :param name: accepted for interface compatibility; not used
        :param state_shape: observation shape forwarded to the network
        :param n_actions: number of discrete actions (size of the Q-value output)
        :param epsilon: probability of taking a random action in ``sample_actions``
        :param reuse: accepted for interface compatibility; not used
        """
        self.agent = Conv_DQN_Model(state_shape, n_actions)
        # Remembered for the output-shape check in get_symbolic_qvalues; this file
        # defines no module-level n_actions to compare against.
        self.n_actions = n_actions
        self.epsilon = epsilon

    def weights(self) :
        """Return the network weights as returned by Keras ``get_weights``."""
        return self.agent.get_weights()

    def load_weights(self, _agent) :
        """Copy the weights of another DQNAgent (``_agent``) into this agent's network."""
        self.agent.set_weights(_agent.weights())

    def get_symbolic_qvalues(self, state_t):
        """takes agent's observation, returns qvalues. Both are tf Tensors"""
        qvalues = self.agent(state_t)

        assert tf.debugging.is_numeric_tensor(qvalues) and qvalues.shape.ndims == 2, \
            "please return 2d tf tensor of qvalues [you got %s]" % repr(qvalues)
        assert int(qvalues.shape[1]) == self.n_actions, \
            "expected %s Q-values per state, got %s" % (self.n_actions, qvalues.shape[1])
        return qvalues

    def get_qvalues(self, state_t):
        """Same as symbolic step except it returns a numpy array"""
        qvalues = self.get_symbolic_qvalues(state_t)
        qvalues = qvalues.numpy()
        return qvalues

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy.

        :param qvalues: array of shape [batch, n_actions]
        :returns: array of shape [batch] with one action index per row
        """
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape
        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)
        # 1 -> explore (random action), 0 -> exploit (argmax action)
        should_explore = np.random.choice([0, 1], batch_size, p=[1-epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)
  
            
# Loaded once at import time: the saved Q-network and the frame stack shared by all calls.
dqn_agent = keras.models.load_model("/kaggle_simulations/agent/saved_model")
frame_buffer = FrameBuffer()


def agent(obs):
    """Entry point called with the raw observation; returns a one-element action list.

    The first player's raw observation is converted with ``generate_smm``,
    divided by 255 as float32, pushed onto the module-level frame stack, and
    the argmax of the loaded model's output is returned.
    """
    player_obs = obs['players_raw'][0]
    smm_frame = observation_preprocessing.generate_smm([player_obs])[0]
    scaled_frame = (smm_frame/255).astype(np.float32)
    stacked = frame_buffer.update_buffer(scaled_frame)
    qvalues = dqn_agent(stacked).numpy()
    best_action = np.argmax(qvalues)

    return [int(best_action)]

In [None]:
from typing import Tuple, Dict, List, Any

from kaggle_environments import make

env = make("football", debug=True,configuration={"save_video": True,
                                                 "scenario_name": "11_vs_11_kaggle"})

# Define players
left_player = "/kaggle_simulations/agent/main.py"  # A custom agent, eg. random_agent.py or example_agent.py
right_player = "/kaggle_simulations/agent/main.py"  # eg. A built in 'AI' agent or the agent again


output: List[Tuple[Dict[str, Any], Dict[str, Any]]] = env.run([left_player, right_player])

# output holds one (left, right) pair of agent states per step, so the result is read
# from the last step. Indexing output[0] / output[1] would instead combine both agents'
# rewards at the first and second step.
# NOTE(review): assumes 'reward' in the final step carries the episode result - confirm.
final_left, final_right = output[-1]
print(f"Final score: {final_left['reward']} : {final_right['reward']}")
env.render(mode="human", width=800, height=600)

In [None]:
# Prepare a submission package containing the trained model and the main execution logic.
# The archive is written to /kaggle/working/submit.tar.gz and holds main.py plus saved_model.
!cd /kaggle_simulations/agent && tar -czvf /kaggle/working/submit.tar.gz main.py saved_model