```yaml
atari:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  n_timesteps: !!float 1e7
  buffer_size: 100000
  learning_rate: !!float 1e-4
  batch_size: 32
  learning_starts: 100000
  target_update_interval: 1000
  train_freq: 4
  gradient_steps: 1
  exploration_fraction: 0.1
  exploration_final_eps: 0.01
  optimize_memory_usage: True

# Almost Tuned
CartPole-v1:
  n_timesteps: !!float 5e4
  policy: 'MlpPolicy'
  learning_rate: !!float 2.3e-3
  batch_size: 64
  buffer_size: 100000
  learning_starts: 1000
  gamma: 0.99
  target_update_interval: 10
  train_freq: 256
  gradient_steps: 128
  exploration_fraction: 0.16
  exploration_final_eps: 0.04
  policy_kwargs: "policy_kwargs"

# Tuned
MountainCar-v0:
  n_timesteps: !!float 1.2e5
  policy: 'MlpPolicy'
  learning_rate: !!float 4e-3
  batch_size: 128
  buffer_size: 10000
  learning_starts: 1000
  gamma: 0.98
  target_update_interval: 600
  train_freq: 16
  gradient_steps: 8
  exploration_fraction: 0.2
  exploration_final_eps: 0.07
  policy_kwargs: "policy_kwargs"

# Tuned
LunarLander-v2:
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  learning_rate: !!float 6.3e-4
  batch_size: 128
  buffer_size: 50000
  learning_starts: 0
  gamma: 0.99
  target_update_interval: 250
  train_freq: 4
  gradient_steps: -1
  exploration_fraction: 0.12
  exploration_final_eps: 0.1
  policy_kwargs: "dict(net_arch=[256, 256])"

# Tuned
Acrobot-v1:
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  learning_rate: !!float 6.3e-4
  batch_size: 128
  buffer_size: 50000
  learning_starts: 0
  gamma: 0.99
  target_update_interval: 250
  train_freq: 4
  gradient_steps: -1
  exploration_fraction: 0.12
  exploration_final_eps: 0.1
  policy_kwargs: "dict(net_arch=[256, 256])"
  ```

---

# <center> GFootball Stable-Baselines3 </center>

---
<center><img src="https://raw.githubusercontent.com/DLR-RM/stable-baselines3/master/docs/_static/img/logo.png" width="308" height="268" alt="Stable-Baselines3"></center>
<center><small>Image from Stable-Baselines3 repository</small></center>

---
This notebook uses the [Stable-Baselines3](https://github.com/DLR-RM/stable-baselines3) library to train a [PPO](https://openai.com/blog/openai-baselines-ppo/) reinforcement learning agent on [GFootball Academy](https://github.com/google-research/football/tree/master/gfootball/scenarios) scenarios, applying the architecture from the paper "[Google Research Football: A Novel Reinforcement Learning Environment](https://arxiv.org/abs/1907.11180)".

In [None]:
import sys
sys.path.append("..")
# sys.path.append("../imitation_learning")
import os
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
# from kaggle_environments import make
# from kaggle_environments.envs.football.helpers import *
from gfootball.env import create_environment, observation_preprocessing, wrappers
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
from stable_baselines3.common.policies import BasePolicy, register_policy
from IPython.display import HTML
import time
from datetime import date
# from visualizer import visualize
from matplotlib import pyplot as plt
from stable_baselines3 import DQN
import torch
from models.MlpClassifierModel import MlpClassifierModel
%matplotlib inline

In [None]:
# Seed PyTorch's global RNG once for reproducibility. The previous second call,
# torch.manual_seed(torch.initial_seed()), only re-applied the same seed.
SEED = 42
torch.manual_seed(SEED)

---
# Football Gym
> [Stable-Baselines3: Custom Environments](https://stable-baselines3.readthedocs.io/en/master/guide/custom_env.html)<br/>
> [SEED RL Agent](https://www.kaggle.com/piotrstanczyk/gfootball-train-seed-rl-agent): stacked observations

In [None]:
class FootballGym(gym.Env):
    """Gym wrapper around a single GFootball environment.

    The wrapped environment is built with ``create_environment`` using the
    ``simple115v2`` representation, no frame stacking, one agent-controlled
    left player and no agent-controlled right players. The action and
    observation spaces are taken directly from the wrapped environment.
    """
    spec = None
    metadata = None
#     metadata = {'render.modes': ['human']}

    def __init__(self, config=None, render=False, rewards='scoring'):
        """
        :param config: optional dict; its ``"env_name"`` and ``"rewards"`` entries
            override the default scenario and the ``rewards`` argument
        :param render: forwarded to ``create_environment``
        :param rewards: reward spec used when ``config`` has no ``"rewards"`` entry
        """
        super().__init__()
        overrides = {} if config is None else config
        env_kwargs = dict(
            env_name=overrides.get("env_name", "academy_empty_goal_close"),
            stacked=False,
            representation="simple115v2",
            rewards=overrides.get("rewards", rewards),
            write_goal_dumps=False,
            write_full_episode_dumps=False,
            render=render,
            write_video=False,
            dump_frequency=1,
            logdir=".",
            extra_players=None,
            number_of_left_players_agent_controls=1,
            number_of_right_players_agent_controls=0,
        )
        self.env = create_environment(**env_kwargs)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_range = (-1, 1)
        # Only cleared in reset(); the stacking transform that would fill it is disabled.
        self.obs_stack = deque([], maxlen=4)

    def reset(self):
        """Clear the observation stack and return the wrapped env's initial observation."""
        self.obs_stack.clear()
        return self.env.reset()

    def step(self, action):
        """Apply ``action`` (wrapped in a one-element list) and return the transition.

        The reward is converted to a plain Python float before being returned.
        """
        observation, raw_reward, finished, extras = self.env.step([action])
        return observation, float(raw_reward), finished, extras
    
# check_env(env=FootballGym(), warn=True)

---
# Football CNN
> [Stable-Baselines3: Custom Policy Network](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html)<br/>
> [Google Research Football: A Novel Reinforcement Learning Environment](https://arxiv.org/abs/1907.11180)

---
> [Stable-Baselines3: PPO](https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html)<br/>
> [Stable-Baselines3: Vectorized Environments](https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html)<br/>
> [Stable-Baselines3: Custom Policy Network](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html)<br/>
> [GFootball: A Novel Reinforcement Learning Environment](https://arxiv.org/abs/1907.11180)<br/>
> [GFootball: Academy Scenarios](https://github.com/google-research/football/tree/master/gfootball/scenarios)<br/>

In [None]:
import pytorch_lightning as pl

In [None]:
class MlpClassifierModel(pl.LightningModule):
    """MLP trunk made of three (Linear -> BatchNorm1d -> activation -> Dropout) stages.

    The final ``Linear(hidden_size, num_classes)`` layer is commented out, so
    ``forward`` returns ``hidden_size`` features and ``num_classes`` is unused.
    """
    def __init__(self, hparams, input_size: int = 115, p_dropout: float = 0.25, num_classes:int = 19):
        """
        :param hparams: dict providing 'lr', 'batch_size', 'activation' and 'hidden_size'
        :param input_size: width of the input vector
        :param p_dropout: dropout probability used after every stage
        :param num_classes: accepted for interface compatibility; not used here
        """
        super().__init__()

        self.hparams.update(hparams)
        self.lr = hparams['lr']
        self.batch_size = hparams['batch_size']
        self.activation = hparams['activation']
        width = hparams['hidden_size']

        # 'LeakyReLU' and 'ReLU' pick those activations; anything else falls back to GELU.
        activation_classes = {'LeakyReLU': nn.LeakyReLU, 'ReLU': nn.ReLU}
        act_func = activation_classes.get(self.activation, nn.GELU)()

        # The same activation instance is placed in all three stages.
        stages = []
        in_features = input_size
        for _ in range(3):
            stages += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                act_func,
                nn.Dropout(p_dropout),
            ]
            in_features = width
        self.model = nn.Sequential(*stages)
        self.init_weights()

    def init_weights(self):
        """Kaiming-uniform weights and zero biases for Linear; unit scale / zero shift for affine BatchNorm1d."""
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                torch.nn.init.kaiming_uniform_(layer.weight)
                torch.nn.init.constant_(layer.bias, 0)
            elif isinstance(layer, nn.BatchNorm1d) and layer.affine:
                torch.nn.init.constant_(layer.weight, 1)
                layer.bias.data.zero_()

    def forward(self, input_tensor):
        """Run ``input_tensor`` through the sequential trunk."""
        return self.model(input_tensor)

In [None]:
class FootballMLP(BaseFeaturesExtractor):
    """Features extractor that passes observations through ``MlpClassifierModel``.

    The MLP is built from the module-level ``hparams`` dict, which must exist
    by the time this class is instantiated.
    """
    def __init__(self, observation_space, features_dim=None):
        """
        :param observation_space: observation space handed to ``BaseFeaturesExtractor``
        :param features_dim: width of the extracted features. When omitted it defaults
            to ``hparams['hidden_size']``, which is what the notebook's
            ``MlpClassifierModel`` outputs (its final classification layer is
            commented out). The previous default of 115 was the input width,
            not the output width.
        """
        if features_dim is None:
            features_dim = hparams['hidden_size']
        super().__init__(observation_space, features_dim)
        self.mlp = MlpClassifierModel(hparams, input_size=115, p_dropout=0.25, num_classes=19)

    def forward(self, input_tensor):
        """Return the MLP features for ``input_tensor``."""
        return self.mlp(input_tensor)

In [None]:
# Hyperparameters read by MlpClassifierModel (and by FootballMLP through this global).
hparams = {
    'hidden_size': 1024,
    'lr': 2e-3,
    'lr_decay_rate': 0.25,
    'batch_size': 256,
    'activation': 'GELU',
}
# hparams['activation'] = 'ReLU'
# model = MLPModel(hparams).to('cuda')
# model = MlpClassifierModel(hparams).to('cuda')

## DDQN Model

In [None]:
from torch.nn import functional as F

class DDQN(DQN):
    """DQN whose ``train`` step decouples action selection from value estimation.

    The online Q-network picks the next action and the target Q-network
    supplies the value of that action when the TD target is built.
    """
    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        """Run ``gradient_steps`` optimisation steps on batches drawn from the replay buffer.

        :param gradient_steps: number of optimizer updates to perform
        :param batch_size: number of transitions sampled per update
        """
        # Train mode affects batch norm / dropout layers.
        self.policy.set_training_mode(True)
        # Apply the learning-rate schedule before optimising.
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for _ in range(gradient_steps):
            batch = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

            # The TD target is a constant for the optimizer: no gradient flows through it.
            with th.no_grad():
                target_net_q = self.q_net_target(batch.next_observations)
                online_net_q = self.q_net(batch.next_observations)
                # Online network chooses the action ...
                greedy_actions = online_net_q.argmax(dim=1, keepdim=True)
                # ... and the target network evaluates it.
                next_q_values = target_net_q.gather(1, greedy_actions)
                # 1-step TD target
                target_q_values = batch.rewards + (1 - batch.dones) * self.gamma * next_q_values

            # Q-values of the actions that were actually taken in the sampled transitions.
            current_q_values = self.q_net(batch.observations).gather(1, batch.actions.long())

            # Both tensors must line up element-wise for the loss.
            assert current_q_values.shape == target_q_values.shape

            # Huber (smooth L1) loss between estimate and target.
            loss = F.smooth_l1_loss(current_q_values, target_q_values)
            losses.append(loss.item())

            # One optimizer step with gradient-norm clipping.
            self.policy.optimizer.zero_grad()
            loss.backward()
            th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
            self.policy.optimizer.step()

        # Bookkeeping and logging.
        self._n_updates += gradient_steps

        self.logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        self.logger.record("train/loss", np.mean(losses))

In [None]:
# GFootball scenario names, keyed by their position in this list.
scenarios = dict(enumerate([
    "academy_empty_goal_close",
    "academy_empty_goal",
    "academy_run_to_score",
    "academy_run_to_score_with_keeper",
    "academy_pass_and_shoot_with_keeper",
    "academy_run_pass_and_shoot_with_keeper",
    "academy_3_vs_1_with_keeper",
    "academy_corner",
    "academy_counterattack_easy",
    "academy_counterattack_hard",
    "academy_single_goal_versus_lazy",
    "11_vs_11_kaggle",
    "11_vs_11_stochastic",
    "11_vs_11_easy_stochastic",
    "11_vs_11_hard_stochastic",
]))

# Scenario used for the rest of the notebook.
scenario_name = scenarios[13]

### Environment creation and logging

In [None]:
from typing import Callable
from stable_baselines3.common.utils import set_random_seed
def make_env(config: dict, rank: int, log_save_dir: str, seed: int = 42) -> Callable:
    """
    Build a factory for one monitored ``FootballGym``, for use in a vectorized env.

    :param config: (dict) configuration passed to ``FootballGym``
    :param rank: (int) index of the env; names its monitor log file and offsets its seed
    :param log_save_dir: (str) directory in which the ``Monitor`` log file is written
    :param seed: (int) base RNG seed; the created env is seeded with ``seed + rank``
    :return: (Callable) zero-argument function that creates the env
    """
    # Runs once, when the factory is created (not when the env is built).
    set_random_seed(seed)

    def _init() -> gym.Env:
        monitored_env = Monitor(
            FootballGym(config),
            os.path.join(log_save_dir, str(rank)),
            allow_early_resets=True,
        )
        monitored_env.seed(seed + rank)
        return monitored_env

    return _init

In [None]:
# Create the vectorized training environment and the directory that receives the Monitor logs.

timestamp = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
print(timestamp)

n_envs = 8
# FootballGym reads the reward spec from the "rewards" key. The earlier config used the
# key 'reward' (value 'baal, checkpoint'), which FootballGym never looks up, so training
# always ran with the default 'scoring' reward. That effective value is now explicit;
# use "scoring,checkpoints" (see the hint inside FootballGym) to add checkpoint rewards.
config = {"env_name": scenario_name, "rewards": "scoring"}
log_save_dir = os.path.join("../logs/dqn_logs", timestamp)
print(f"Log dir: {log_save_dir}")
# makedirs also creates missing parent directories and tolerates an existing one.
os.makedirs(log_save_dir, exist_ok=True)
train_env = SubprocVecEnv([make_env(config, rank=i, log_save_dir=log_save_dir) for i in range(n_envs)])

### Initializing the Vanilla DQN model

In [None]:
# # This initialization concentrates most of the layers into the feature extractor
# # and then leaves only a single layer for the prediction part

# policy_kwargs = dict(features_extractor_class=FootballMLP,
#                      features_extractor_kwargs=dict(features_dim=1024),
#                     net_arch = [],
#                     )
# model_name = "dqn"
# model = DQN(policy="MlpPolicy", 
#             env=train_env, 
#             policy_kwargs=policy_kwargs, 
#             verbose=1,
#             exploration_initial_eps=0.00,
#             exploration_final_eps=0.0,
#             target_update_interval=15000,
# #             learning_rate=hparams['lr'],
# #             batch_size=hparams['batch_size'],
#             seed=42,
#             tensorboard_log='tb_logs_DQN',
#             learning_starts=100000,
#            )
# model.policy

In [None]:
# # Configuring the DQN net like SB3's network structure
# # Only one layer for the feature extraction and the rest for the Q value computation
# from stable_baselines3 import DQN

# policy_kwargs = dict(
#     net_arch = [1024, 1024, 1024],
#     activation_fn = torch.nn.GELU
# )
# model_name = "dqn"
# model = DQN(policy="MlpPolicy", 
#             env=train_env, 
#             policy_kwargs=policy_kwargs, 
#             verbose=1,
#             exploration_initial_eps=0.00,
#             exploration_final_eps=0.0,
#             target_update_interval=15000,
# #             learning_rate=hparams['lr'],
# #             batch_size=hparams['batch_size'],
#             seed=42,
#             tensorboard_log='tb_logs_DQN',
#             learning_starts=100000,
#            )
# model.policy
           

### Double DQN

In [None]:
# DDQN

# Policy network settings forwarded to the "MlpPolicy".
policy_kwargs = {
    "net_arch": [1024, 1024, 1024],
    "activation_fn": torch.nn.ReLU,
}

# policy_kwargs = dict(features_extractor_class=FootballMLP,
#                      features_extractor_kwargs=dict(features_dim=1024),
#                     net_arch = [],
#                     )
model_name = "ddqn"
# Collected in a dict so each setting sits on its own line; disabled options stay as comments.
ddqn_kwargs = dict(
    policy="MlpPolicy",
    env=train_env,
    policy_kwargs=policy_kwargs,
    verbose=1,
    exploration_initial_eps=0.00,
    exploration_final_eps=0.00,
    target_update_interval=150000,
    # learning_rate=0.0000001,
    # batch_size=hparams['batch_size'],
    seed=42,
    tensorboard_log='tb_logs_DDQN',
    train_freq=3002,
    # learning_starts=100000,
)
model = DDQN(**ddqn_kwargs)

# With cartpole initialization
# model = DDQN(policy="MlpPolicy", 
#             env=train_env, 
#             learning_rate=2.3e-3,
#             batch_size=64,
#             buffer_size=100000,
#             learning_starts=1000,
#             gamma=0.99,
#             target_update_interval=10,
#             train_freq=256,
#             gradient_steps=128,
#             exploration_fraction=0.16,
#             exploration_final_eps=0.04,
#             policy_kwargs=policy_kwargs,
#              tensorboard_log='tb_logs_DDQN',
#              seed=42,
#              verbose=1,
#            )
model.policy
           

In [None]:
model.policy.state_dict()

In [None]:
# checkpoint_path = "/media/ssk/DATA/GRP_code/gr_football_analytics/notebooks/lightning_logs/version_27/checkpoints/epoch=208-step=681548.ckpt"
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
checkpoint_path = "/media/ssk/DATA/GRP_code/gr_football_analytics/notebooks/lightning_logs/version_38/checkpoints/epoch=146-step=479366.ckpt"
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a CPU-only machine.
checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
# Show only the top-level keys instead of dumping every weight tensor into the cell output.
list(checkpoint_dict.keys())

In [None]:
# Transfer the imitation-learning (IL) checkpoint weights into both Q-networks.
# Checkpoint tensors are paired with Q-network entries purely by position in the two
# state dicts, so the checkpoint's layer order has to match the Q-network's.
il_weights = checkpoint_dict['state_dict']
sd_dqn_model = model.policy.state_dict()
for prefix, net in (('q_net.', model.policy.q_net), ('q_net_target.', model.policy.q_net_target)):
    dqn_keys = list(net.state_dict().keys())
    if len(dqn_keys) != len(il_weights):
        # zip() stops at the shorter sequence; report that instead of hiding it.
        print(f"WARNING: checkpoint has {len(il_weights)} tensors but {prefix}* has {len(dqn_keys)}; "
              "only the leading positions are copied.")
    for mlp_key, dqn_key in zip(il_weights.keys(), dqn_keys):
        source = il_weights[mlp_key]
        target_key = prefix + dqn_key
        # Fail early with the offending pair rather than deep inside load_state_dict.
        if source.shape != sd_dqn_model[target_key].shape:
            raise ValueError(
                f"Shape mismatch: checkpoint '{mlp_key}' {tuple(source.shape)} vs "
                f"policy '{target_key}' {tuple(sd_dqn_model[target_key].shape)}"
            )
        sd_dqn_model[target_key] = source

model.policy.load_state_dict(sd_dqn_model)

# Check the model's weights after loading the weights from IL agent
model.policy.state_dict()

In [None]:
# for idx, param in enumerate(model.policy.parameters()):
# #     if param.shape == torch.Size([19, 1024]) or param.shape == torch.Size([19]):
#     if param.shape == torch.Size([19]):
#         print(param.shape, param.requires_grad)
#         param.requires_grad = True
#     else:
#         param.requires_grad = False
#         #     print(param.shape)

In [None]:
# for idx, param in enumerate(model.policy.parameters()):
# #     param.requires_grad = True
#     print(param.shape, param.requires_grad)

---
# Training
> [Stable-Baselines3: Examples](https://stable-baselines3.readthedocs.io/en/master/guide/examples.html)<br/>
> [Stable-Baselines3: Callbacks](https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html)

In [None]:
from tqdm.notebook import tqdm
class ProgressBar(BaseCallback):
    """Callback that shows a tqdm progress bar advancing by one per callback step."""
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.pbar = None

    def _on_training_start(self):
        """Create the bar, sized from the requested timesteps and the number of parallel envs.

        The total is taken from ``total_timesteps`` and the training env only, so the
        callback no longer depends on a notebook-level ``n_steps`` variable being defined.
        """
        total_timesteps = self.locals['total_timesteps']
        # Each callback step advances every parallel env by one timestep.
        n_parallel = getattr(self.training_env, 'num_envs', 1)
        self.pbar = tqdm(total=int(np.ceil(total_timesteps / n_parallel)))

    def _on_rollout_start(self):
        self.pbar.refresh()

    def _on_step(self):
        self.pbar.update(1)
        # Returning True tells the training loop to continue.
        return True

    def _on_rollout_end(self):
        self.pbar.refresh()

    def _on_training_end(self):
        self.pbar.close()
        self.pbar = None

progressbar = ProgressBar()

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# callback = EvalCallback(train_env, log_path = log_save_dir, deterministic=True)

In [None]:
# Training budget: total_epochs x n_steps env steps in each of the n_envs workers.
total_epochs = 30
n_steps = 3002
total_timesteps = total_epochs * n_envs * n_steps
model.learn(
    total_timesteps=total_timesteps,
    callback=progressbar,
    log_interval=8,
    tb_log_name='ddqn_scoring_il_init_30_epochs_train_freq_3002_added_KL_loss_epsilon_0',
)

# Save under ../models/<model_name>/ with the env count and run timestamp in the file name.
saved_model_name = f"{model_name}_gfootball_{n_envs}_{timestamp}"
model.save(f"../models/{model_name}/{saved_model_name}")

### Visualization with Tensorboard

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir tb_logs_DDQN

In [None]:
# for idx, param in enumerate(model.policy.parameters()):
#     if param.shape == torch.Size([19, 1024]) or param.shape == torch.Size([19]):
#         print(param.shape, param.requires_grad)
#         param.requires_grad = True
#     else:
#         param.requires_grad = False
#         #     print(param.shape)

In [None]:
# for idx, param in enumerate(model.policy.parameters()):
# #     param.requires_grad = True
#     print(param.shape, param.requires_grad)

In [None]:
# a = torch.randn([2, 3])
# a.shape == torch.Size([2, 3])

### Visualizing the results

In [None]:
# Plot the rewards per timestep and per episode

# Newer Matplotlib releases ship the seaborn styles under 'seaborn-v0_8-*' names;
# use whichever name this installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use([grid_style])
results_plotter.plot_results([log_save_dir], total_timesteps, results_plotter.X_TIMESTEPS, "GFootball Timesteps")
# plt.savefig('../figures/dqn/rewards_per_timestamp_dqn_with_my_policy.png')
results_plotter.plot_results([log_save_dir], total_timesteps, results_plotter.X_EPISODES, "GFootball Episodes")
# plt.savefig('../figures/dqn/rewards_per_episode_dqn_with_my_policy.png')

In [None]:
# Plot the episodic reward with line

# Load the monitor logs and arrange them as x (timesteps) / y (episode reward) for plotting.
x, y = results_plotter.ts2xy(results_plotter.load_results(log_save_dir), 'timesteps')
fig, ax = plt.subplots(figsize=(20, 16))
ax.plot(x, y)
ax.set_ylim([-10, 10])
ax.set_xlabel('Timesteps')
ax.set_ylabel('Episode Rewards')
fig.savefig('../figures/dqn/episode_rewards_50_epochs_with_IL_diff_net.jpg')

In [None]:
# Plot the rolling mean reward per environment

# Newer Matplotlib releases ship the seaborn styles under 'seaborn-v0_8-*' names;
# use whichever name this installation provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use([grid_style])
log_files = [os.path.join(log_save_dir, f"{i}.monitor.csv") for i in range(n_envs)]

# Two plots per row. np.ceil returns a float, but plt.subplot needs an integer row count.
nrows = int(np.ceil(n_envs / 2))
fig = plt.figure(figsize=(8, 2 * nrows))
for i, log_file in enumerate(log_files):
    if os.path.isfile(log_file):
        # The first line of a monitor CSV is a header comment, hence skiprows=1.
        df = pd.read_csv(log_file, skiprows=1)
        ax = plt.subplot(nrows, 2, i+1, label=log_file)
        df['r'].rolling(window=5).mean().plot(ax=ax, title=f"Rewards: Env {i}")
# Lay out once, after every subplot exists.
plt.tight_layout()
plt.show()

In [None]:
# Plot the mean episodic reward

df = pd.read_csv('data_for_figures/run-ddqn_3-tag-rollout_ep_rew_mean.csv')
# DataFrame.plot opens a figure of its own unless it is given an Axes, which left the
# 10x8 figure created here blank. Pass the Axes so the curve is drawn on this figure.
fig, ax = plt.subplots(figsize=(10, 8))
df.plot(x='Step', y='Value', ax=ax)
# plt.savefig('../figures/dqn/mean_ep_reward_with_IL_50_epochs_diff_net.jpg')

### Other visualizations

In [None]:
# x, y = results_plotter.ts2xy(results_plotter.load_results('../logs/dqn_logs/10-05-2022-11-15-40'), 'timesteps')  # Organising the logged results in to a clean format for plotting.
# fig = plt.figure(figsize=(20, 16))
# plt.plot(x,y)
# plt.ylim([-10, 10])
# plt.xlabel('Timesteps')
# plt.ylabel('Episode Rewards')
# # plt.savefig('../figures/dqn/episode_rewards_50_epochs_without_IL.jpg')

In [None]:
# plt.style.use(['seaborn-whitegrid'])
# log_files = ['../logs/dqn_logs/10-05-2022-11-15-40', '../logs/dqn_logs/10-05-2022-12-15-33']

# nrows = 1
# fig = plt.figure(figsize=(8, 2))
# for i, log_file in enumerate(log_files):
# #     if os.path.isfile(log_file):
#     x, y = results_plotter.ts2xy(results_plotter.load_results(log_file), 'timesteps')  # Organising the logged results in to a clean format for plotting.
    
# #         df = pd.read_csv(log_file, skiprows=1)
#     plt.subplot(nrows, 2, i+1, label='log_file', title="hello")
#     plt.plot(x,y)
# #         df['r'].rolling(window=5).mean().plot(title=f"Rewards: Env {i}")
#     plt.tight_layout()
# plt.show()

In [None]:
# plt.style.use(['seaborn-whitegrid'])
# log_files = ['../logs/dqn_logs/10-05-2022-11-15-40', '../logs/dqn_logs/10-05-2022-12-15-33']

# nrows = 1
# fig = plt.figure(figsize=(16, 4))
# #     if os.path.isfile(log_file):
# x_with_il, y_with_il = results_plotter.ts2xy(results_plotter.load_results(log_files[0]), 'timesteps')  # Organising the logged results in to a clean format for plotting.
# x_wo_il, y_wo_il = results_plotter.ts2xy(results_plotter.load_results(log_files[1]), 'timesteps')  # Organising the logged results in to a clean format for plotting.
    
# #         df = pd.read_csv(log_file, skiprows=1)
# plt.subplot(nrows, 2, 1, label='log_file', title=f"With IL initialization after 50 epochs")
# plt.plot(x_with_il, y_with_il)
# plt.subplot(nrows, 2, 2, label='log_file', title=f"Without IL initialization after 50 epochs")
# plt.plot(x_wo_il, y_wo_il)
# #         df['r'].rolling(window=5).mean().plot(title=f"Rewards: Env {i}")
# plt.tight_layout()
# # plt.show()
# plt.savefig('../figures/dqn/compariosn_of_initialization.jpg')

In [None]:
# plt.style.use(['seaborn-whitegrid'])
# log_files = ['data_for_figures/run-DQN_22-tag-rollout_ep_rew_mean.csv', 'data_for_figures/run-DQN_23-tag-rollout_ep_rew_mean.csv']
# names = ['with IL', 'without IL']
# nrows = 1
# fig = plt.figure(figsize=(16, 4 * nrows))

# for i, log_file in enumerate(log_files):
#     df = pd.read_csv(log_file)
#     plt.subplot(nrows, 2, i+1, label='log_file', title=names[i])
#     plt.plot(df['Step'], df['Value'])
# #     df.plot(x='Step', y='Value', title=f"Hello")
# #     plt.tight_layout()
# # plt.show()
# plt.savefig('../figures/dqn/comparison_mean_ep_rew_with_and_wo_IL_init.jpg')

---
# Agent
> [Stable-Baselines: Exporting Models](https://stable-baselines.readthedocs.io/en/master/guide/export.html)<br/>
> [Stable-Baselines: Converting a Model into PyTorch](https://github.com/hill-a/stable-baselines/issues/372)<br/>
> [Connect4: Make Submission with Stable-Baselines3](https://www.kaggle.com/toshikazuwatanabe/connect4-make-submission-with-stable-baselines3)

In [None]:
# from stable_baselines3 import DQN
# model = DQN.load("dqn_gfootball")
# test_env = FootballGym({"env_name":scenario_name}, render=False)
# obs = test_env.reset()
# done = False
# rewards = []
# goals_scored, goals_conceded = 0, 0
# while not done:
# #     action, state = model.predict(obs, deterministic=True)
#     action, state = model.predict(obs, deterministic=True)
#     obs, reward, done, info = test_env.step(action)
#     if reward < 0:
#         goals_conceded += reward
#     elif reward > 0:
#         goals_scored += reward
#     rewards.append(reward)
# #     print(f"{Action(action).name.ljust(16,' ')}\t{round(reward,2)}\t{info}\t{done}")
#     print(f"{action_set[action].ljust(16,' ')}\t{round(reward,2)}\t{info}\t{done}")
# print(sum(rewards))

In [None]:
# def play_match(test_env):
#     obs = test_env.reset()
#     env_steps = 0
#     match_reward = 0
#     done = False
#     my_agent = 0
#     ai_agent = 0
# #     model.eval()
#     while not done:
#         obs = torch.tensor(obs).to('cuda')
#         obs = obs.reshape((1, obs.shape[0])).to('cuda')
#         outputs = model(obs)
#         _, action = torch.max(outputs.data, 1)
#         obs, reward, done, info = test_env.step(action.item())
# #         print(type(obs))
# #         match_reward += reward
#         if reward > 0:
#             my_agent += 1
#         elif reward < 0:
#             ai_agent += 1
# #             print(f"Step: {str(env_steps).ljust(10, ' ')}\t{str(action_set[action.item()]).ljust(10, ' ')}\t{round(reward,2)}\t{info}")
#         env_steps += 1
#         if (env_steps+1) % 3001  == 0:
# #             print(f"Match reward: {match_reward}")
#             return my_agent, ai_agent
    

In [None]:
# from stable_baselines3 import DQN
# model = DQN.load("dqn_gfootball")
# test_env = FootballGym({"env_name":scenario_name}, render=False)
# obs = test_env.reset()
# done = False
# rewards = []
# while not done:
#     action, state = model.predict(obs, deterministic=True)
#     obs, reward, done, info = test_env.step(action)
#     rewards.append(reward)
#     print(f"{Action(action).name.ljust(16,' ')}\t{round(reward,2)}\t{info}\t{done}")
    
#     if done:
#         ep_rew = sum(rewards)
#         ep_len = len(rewards)
#         print(ep_rew, ep_len)

In [None]:
%%writefile submission.py
import base64
import pickle
import zlib
import numpy as np
import torch as th
from torch import nn, tensor
from collections import deque
from gfootball.env import observation_preprocessing

# Placeholder assignment: the notebook's export cell rewrites this file and substitutes
# the token on the right-hand side with the encoded weights (a bytes literal).
state_dict = _STATE_DICT_

# Undo the export encoding: base64 -> zlib -> pickle.
state_dict = pickle.loads(zlib.decompress(base64.b64decode(state_dict)))

def conv3x3(in_channels, out_channels, stride=1):
    """Return a biased 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=True)
    return nn.Conv2d(in_channels, out_channels, **conv_kwargs)

class ResidualBlock(nn.Module):
    """Two ReLU -> 3x3 conv stages whose output is added to the block's input."""
    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.relu = nn.ReLU()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.conv2 = conv3x3(out_channels, out_channels, stride)

    def forward(self, x):
        """Return ``conv2(relu(conv1(relu(x)))) + x``."""
        out = self.conv1(self.relu(x))
        out = self.conv2(self.relu(out))
        # In-place add of the skip connection.
        out += x
        return out
    
class PyTorchCnnPolicy(nn.Module):
    """Stand-alone copy of the trained policy's CNN, linear and action layers.

    Weights are loaded from the module-level ``state_dict`` when the policy is
    constructed. ``forward`` returns the index of the highest-scoring action.
    """
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            conv3x3(in_channels=16, out_channels=32),
            nn.MaxPool2d(kernel_size=3, stride=2, dilation=1, ceil_mode=False),
            ResidualBlock(in_channels=32, out_channels=32),
            ResidualBlock(in_channels=32, out_channels=32),
            nn.ReLU(),
            nn.Flatten(),
        )
        self.linear = nn.Sequential(
          nn.Linear(in_features=52640, out_features=256, bias=True),
          nn.ReLU(),
        )
        # Kept as a Sequential so the "action_net.0.*" keys of the exported weights still
        # load. The trailing ReLU was removed: it clamped every non-positive logit to 0,
        # so whenever all logits were <= 0 the argmax collapsed to action 0 instead of
        # the action with the largest logit.
        self.action_net = nn.Sequential(
          nn.Linear(in_features=256, out_features=19, bias=True),
        )
        self.out_activ = nn.Softmax(dim=1)
        self.load_state_dict(state_dict)

    def forward(self, x):
        """Return the chosen action index (int) for a batch holding one stacked observation.

        :param x: array-like with the channel axis last; it is scaled by 1/255 and
            permuted to channels-first before entering the CNN
        """
        # Inference only: no gradients are needed.
        with th.no_grad():
            x = tensor(x).float() / 255.0  # normalize
            x = x.permute(0, 3, 1, 2).contiguous()  # 1 x channels x height x width
            logits = self.action_net(self.linear(self.cnn(x)))
            probs = self.out_activ(logits)
        return int(probs.argmax())
    
# Rolling window holding the four most recent preprocessed frames.
obs_stack = deque([], maxlen=4)
def transform_obs(raw_obs):
    """Preprocess the first player's raw observation and stack it with the previous frames.

    The frame produced by ``generate_smm`` is pushed onto ``obs_stack``; on the
    very first call the stack is filled with four copies of it. The stacked
    frames are concatenated along the last axis and returned.
    """
    global obs_stack
    frame = observation_preprocessing.generate_smm([raw_obs['players_raw'][0]])
    if obs_stack:
        obs_stack.append(frame)
    else:
        obs_stack.extend([frame] * 4)
    return np.concatenate(list(obs_stack), axis=-1)

# Build the policy once at import time: float32 weights, on CPU, in eval mode.
policy = PyTorchCnnPolicy().float().to('cpu').eval()
def agent(raw_obs):
    """Entry point: return a one-element list holding the action chosen for ``raw_obs``."""
    return [policy(transform_obs(raw_obs))]

In [None]:
model = PPO.load("ppo_gfootball")
_state_dict = model.policy.to('cpu').state_dict()

# Layers copied from the policy's features extractor; each contributes a weight and a bias.
extractor_layers = (
    "cnn.0",
    "cnn.2.conv1",
    "cnn.2.conv2",
    "cnn.3.conv1",
    "cnn.3.conv2",
    "linear.0",
)
tensor_kinds = ("weight", "bias")
state_dict = {
    f"{layer}.{kind}": _state_dict[f"features_extractor.{layer}.{kind}"]
    for layer in extractor_layers
    for kind in tensor_kinds
}
# The action head is stored under "action_net.0.*" in the submission's policy.
state_dict.update({f"action_net.0.{kind}": _state_dict[f"action_net.{kind}"] for kind in tensor_kinds})

# Encode the weights as text (pickle -> zlib -> base64) and splice them into submission.py.
state_dict = base64.b64encode(zlib.compress(pickle.dumps(state_dict)))
with open('submission.py', 'r') as file:
    src = file.read()
src = src.replace("_STATE_DICT_", f"{state_dict}")
with open('submission.py', 'w') as file:
    file.write(src)

---
# Testing

In [None]:
# NOTE(review): `make` has no active import in this notebook — the
# `from kaggle_environments import make` line in the imports cell is commented out,
# so it must be re-enabled before this cell can run.
kaggle_env = make("football", debug = False,
                  configuration={"scenario_name": scenario_name, 
                                 "running_in_notebook": True,
                                 "save_video": False})

In [None]:
output = kaggle_env.run(["submission.py", "do_nothing"])

In [None]:
# The last entry of the run holds one record per agent.
final_left, final_right = output[-1][0], output[-1][1]
scores = final_left["observation"]["players_raw"][0]["score"]
print("Scores  {0} : {1}".format(*scores))
print("Rewards {0} : {1}".format(final_left["reward"], final_right["reward"]))

In [None]:
# NOTE(review): `visualize` has no active import in this notebook — the
# `from visualizer import visualize` line in the imports cell is commented out.
viz = visualize(output)

> Modified [Human Readable Visualization](https://www.kaggle.com/jaronmichal/human-readable-visualization)

In [None]:
HTML(viz.to_html5_video())

---
# Checkpoints

0. [academy_empty_goal_close @ 800K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=45569809#Test-Agent) (Nature CNN)<br/>
1. [academy_empty_goal @ 800K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=45639135#Test-Agent) (Nature CNN)<br/>
2. [academy_run_to_score @ 800K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=45941674#Test-Agent) (Nature CNN)<br/>
3. [academy_run_to_score_with_keeper @ 800K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=45703399#Test-Agent) (Nature CNN)<br/>
4. [academy_pass_and_shoot_with_keeper @ 800K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=45716494#Test-Agent) (Nature CNN)<br/>
5. [academy_run_pass_and_shoot_with_keeper @ 1.6M steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=46590578#Testing) (Nature CNN)<br/>
6. [academy_3_vs_1_with_keeper @ 500K steps](https://www.kaggle.com/kwabenantim/gfootball-stable-baselines3?scriptVersionId=46843278#Testing) (GFootball CNN)