In [1]:
import sys
import os
import base64
import pickle
import zlib
import gym
import numpy as np
import pandas as pd
import torch as th
from torch import nn, tensor
from collections import deque
from gym.spaces import Box, Discrete
from gfootball.env import create_environment, observation_preprocessing, wrappers
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env.base_vec_env import VecEnv
from stable_baselines3.common.policies import BasePolicy, register_policy
from IPython.display import HTML
import time
from datetime import date
from matplotlib import pyplot as plt
from stable_baselines3 import DQN
import torch
%matplotlib inline

In [2]:
# Seed PyTorch's global RNG once. Re-seeding with torch.initial_seed() right after
# would only set the same value (42) again, so a single call is sufficient.
torch.manual_seed(42)

<torch._C.Generator at 0x7ff76d719b70>

### Football Gym

In [3]:
class FootballGym(gym.Env):
    """gym.Env wrapper around an environment built by gfootball's ``create_environment``.

    The wrapped environment uses the ``simple115v2`` representation, is not
    frame-stacked and controls one left-side player. The action and
    observation spaces are taken directly from the wrapped environment.

    :param config: optional dict; its ``"env_name"`` and ``"rewards"`` entries,
        when present, override the defaults used here
    :param render: forwarded to ``create_environment``
    :param rewards: reward specification forwarded to ``create_environment``
        (unless overridden through ``config``)
    """
    spec = None
    metadata = None
#     metadata = {'render.modes': ['human']}
    def __init__(self, config=None, render=False, rewards='scoring'):
        super(FootballGym, self).__init__()
        env_name = "academy_empty_goal_close"
        if config is not None:
            # Entries in config take precedence over the defaults/arguments.
            env_name = config.get("env_name", env_name)
            rewards = config.get("rewards", rewards)
        self.env = create_environment(
            env_name=env_name,
            stacked=False,
            representation="simple115v2",
            rewards = rewards,
            write_goal_dumps=False,
            write_full_episode_dumps=False,
            render=render,
            write_video=False,
            dump_frequency=1,
            logdir=".",
            extra_players=None,
            number_of_left_players_agent_controls=1,
            number_of_right_players_agent_controls=0)
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_range = (-1, 1)
        # Only cleared in reset(); nothing in this class appends to it.
        self.obs_stack = deque([], maxlen=4)

    def reset(self):
        """Reset the wrapped environment and return its first observation."""
        self.obs_stack.clear()
        obs = self.env.reset()
        return obs

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``.

        The action is passed to the wrapped environment as a one-element list
        and the reward is converted to a plain ``float``.
        """
        obs, reward, done, info = self.env.step([action])
        return obs, float(reward), done, info

    def close(self):
        """Close the wrapped environment so its resources are released with this wrapper."""
        self.env.close()

In [4]:
# Scenario names keyed by their position in the tuple below (0, 1, 2, ...).
scenarios = dict(enumerate((
    "academy_empty_goal_close",
    "academy_empty_goal",
    "academy_run_to_score",
    "academy_run_to_score_with_keeper",
    "academy_pass_and_shoot_with_keeper",
    "academy_run_pass_and_shoot_with_keeper",
    "academy_3_vs_1_with_keeper",
    "academy_corner",
    "academy_counterattack_easy",
    "academy_counterattack_hard",
    "academy_single_goal_versus_lazy",
    "11_vs_11_kaggle",
    "11_vs_11_stochastic",
    "11_vs_11_easy_stochastic",
    "11_vs_11_hard_stochastic",
)))

# Scenario used by the rest of the notebook.
scenario_name = scenarios[13]

### Environment creation and logging

In [5]:
from typing import Callable
from stable_baselines3.common.utils import set_random_seed
def make_env(config: dict, rank: int, log_save_dir: str, seed: int = 42) -> Callable[[], gym.Env]:
    """
    Build a factory that creates one Monitor-wrapped FootballGym environment.

    ``set_random_seed(seed)`` is called immediately, in the process that calls
    ``make_env``; the environment itself is only created when the returned
    callable is invoked.

    :param config: (dict) configuration passed to ``FootballGym``
    :param rank: (int) index of the environment; used as the Monitor log name and as the seed offset
    :param log_save_dir: (str) directory that receives the Monitor log of this environment
    :param seed: (int) the initial seed for RNG; the environment is seeded with ``seed + rank``
    :return: (Callable) zero-argument function returning the wrapped environment
    """
    def _init() -> gym.Env:
        env = FootballGym(config, rewards='scoring')
#         env = FootballGym(config, rewards='scoring,checkpoints')
        log_file = os.path.join(log_save_dir, str(rank))
        env = Monitor(env, log_file, allow_early_resets=True)
        # Offset by rank so each environment gets a different seed.
        env.seed(seed + rank)
        return env
    set_random_seed(seed)
    return _init

In [6]:
# Create the vectorized training environment and the directory for its Monitor logs

timestamp = time.strftime('%d-%m-%Y-%H-%M-%S', time.localtime())
print(timestamp)

n_envs = 8
config={"env_name":scenario_name}
log_save_dir = os.path.join("../logs/ppo_logs", timestamp)
print(f"Log dir: {log_save_dir}")
# makedirs instead of mkdir: also creates "../logs/ppo_logs" when it is missing,
# and does not fail if the directory already exists.
os.makedirs(log_save_dir, exist_ok=True)
train_env = SubprocVecEnv([make_env(config, rank=i, log_save_dir=log_save_dir) for i in range(n_envs)])

27-05-2022-02-25-04
Log dir: ../logs/ppo_logs/27-05-2022-02-25-04


### Initializing the PPO model

In [7]:
# PPO

# Network settings handed to the policy: three hidden layers of 1024 units with ReLU.
policy_kwargs = {
    "net_arch": [1024, 1024, 1024],
    "activation_fn": torch.nn.ReLU,
}

# policy_kwargs = dict(features_extractor_class=FootballMLP,
#                      features_extractor_kwargs=dict(features_dim=1024),
#                     net_arch = [],
#                     )
model_name = "ppo"
model = PPO(
    policy="MlpPolicy",
    env=train_env,
    policy_kwargs=policy_kwargs,
    # optimisation settings
    learning_rate=0.00008,
    n_steps=128,
    gamma=0.99,
    clip_range=0.27,
    ent_coef=0.01,
    max_grad_norm=0.5,
    # bookkeeping
    seed=0,
    verbose=1,
    tensorboard_log='../logs/tb_logs_PPO',
)
# Display the resulting policy network as the cell output.
model.policy
           

Using cuda device
If you don't want to use dropout, just comment line 198 in torch_layers.py file
If you don't want to use dropout, just comment line 198 in torch_layers.py file
If you don't want to use dropout, just comment line 198 in torch_layers.py file


ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential(
      (0): Linear(in_features=115, out_features=1024, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.25, inplace=False)
      (3): Linear(in_features=1024, out_features=1024, bias=True)
      (4): ReLU()
      (5): Dropout(p=0.25, inplace=False)
      (6): Linear(in_features=1024, out_features=1024, bias=True)
      (7): ReLU()
      (8): Dropout(p=0.25, inplace=False)
    )
    (policy_net): Sequential()
    (value_net): Sequential()
  )
  (action_net): Linear(in_features=1024, out_features=19, bias=True)
  (value_net): Linear(in_features=1024, out_features=1, bias=True)
)

In [8]:
# Show the policy's parameters as created by the PPO constructor, i.e. before
# any checkpoint weights are copied into them further down.
model.policy.state_dict()

OrderedDict([('mlp_extractor.shared_net.0.weight',
              tensor([[-0.0007,  0.0500, -0.0767,  ..., -0.0567,  0.0846,  0.0639],
                      [-0.0786, -0.0232,  0.0042,  ..., -0.0265, -0.0313, -0.0138],
                      [ 0.0010,  0.0769,  0.0116,  ...,  0.0766,  0.0523, -0.0561],
                      ...,
                      [ 0.0268,  0.0916,  0.0925,  ...,  0.0530,  0.0869, -0.0264],
                      [-0.0562,  0.0565,  0.0287,  ..., -0.0781,  0.0359, -0.0436],
                      [-0.0794,  0.0102, -0.0926,  ..., -0.0125,  0.0526, -0.0026]],
                     device='cuda:0')),
             ('mlp_extractor.shared_net.0.bias',
              tensor([-0.0840, -0.0386, -0.0518,  ..., -0.0873, -0.0291,  0.0105],
                     device='cuda:0')),
             ('mlp_extractor.shared_net.3.weight',
              tensor([[-0.0037,  0.0212,  0.0036,  ...,  0.0208, -0.0275,  0.0147],
                      [ 0.0037, -0.0189,  0.0156,  ..., -0.0189,  0.02

#### Loading IL agent's weights
---
Download the checkpoints from here: [IL agent checkpoints](https://drive.google.com/drive/folders/1QwyPsWdGfJMhjEcBIhNot15iij_VRx_U?usp=sharing)

Then set `checkpoint_path` in the next cell to the location of the downloaded checkpoint.

In [9]:
# IL agent with BN
# checkpoint_path = "../il_agent_checkpoints/epoch=208-step=681548.ckpt"

# IL agent without BN
checkpoint_path = "../il_agent_checkpoints/epoch=146-step=479366.ckpt"
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location places the tensors on the PPO model's device, so a checkpoint that was
# saved on a GPU can also be loaded on a machine without one.
checkpoint_dict = torch.load(checkpoint_path, map_location=model.device)
checkpoint_dict

{'epoch': 147,
 'global_step': 479367,
 'pytorch-lightning_version': '1.5.10',
 'state_dict': OrderedDict([('model.0.weight',
               tensor([[-1.2712,  1.7235, -0.8170,  ..., -0.7283, -1.0528, -0.3971],
                       [ 0.2426, -0.4846, -0.1319,  ..., -0.3745,  0.1083, -0.1955],
                       [ 0.1865, -0.4384,  0.2724,  ...,  0.0148, -0.0946, -0.1312],
                       ...,
                       [ 0.9357, -2.8718, -1.0750,  ..., -1.5358,  0.2322, -0.8907],
                       [ 2.0421, -4.0784, -1.6789,  ..., -0.9545,  0.5744,  0.2149],
                       [ 1.1143,  1.9087,  0.0565,  ..., -0.2459, -0.7759, -0.8163]],
                      device='cuda:0')),
              ('model.0.bias',
               tensor([-0.6341, -0.3114, -0.1605,  ..., -0.9393, -2.3791, -0.4585],
                      device='cuda:0')),
              ('model.3.weight',
               tensor([[-1.1699,  0.0614, -0.0255,  ..., -0.2109, -3.2698, -1.0542],
                    

In [10]:
# Copy the IL agent's MLP weights into the PPO policy.
sd_ppo_model = model.policy.state_dict()
il_state_dict = checkpoint_dict['state_dict']

ppo_shared_net_keys = [key for key in sd_ppo_model.keys() if 'mlp_extractor.shared_net' in key]
mlp_net_keys = ['model.0.weight', 'model.0.bias', 'model.3.weight', 'model.3.bias', 'model.6.weight', 'model.6.bias']

# zip() would silently drop unmatched keys and leave those layers at their
# random initialisation, so a length mismatch is reported explicitly.
if len(mlp_net_keys) != len(ppo_shared_net_keys):
    raise ValueError(
        f"Cannot map IL weights onto the PPO shared net: {len(mlp_net_keys)} IL keys "
        f"vs {len(ppo_shared_net_keys)} PPO keys ({ppo_shared_net_keys})"
    )

# IL checkpoint key -> PPO policy key. The pairing is positional, so it relies on
# both key lists being in the same layer order.
mlp_keys_toppo_keys_dict = dict(zip(mlp_net_keys, ppo_shared_net_keys))
# The IL output layer initialises the PPO action head.
mlp_keys_toppo_keys_dict['model.9.weight'] = 'action_net.weight'
mlp_keys_toppo_keys_dict['model.9.bias'] = 'action_net.bias'
# The value head is deliberately left as initialised by PPO:
# sd_ppo_model['value_net.weight'] = checkpoint_dict['state_dict']['model.9.weight']
# sd_ppo_model['value_net.bias'] = checkpoint_dict['state_dict']['model.9.bias']

for mlp_key, ppo_key in mlp_keys_toppo_keys_dict.items():
    sd_ppo_model[ppo_key] = il_state_dict[mlp_key]

model.policy.load_state_dict(sd_ppo_model)

# Check the model's weights after loading the weights from IL agent
model.policy.state_dict()

OrderedDict([('mlp_extractor.shared_net.0.weight',
              tensor([[-1.2712,  1.7235, -0.8170,  ..., -0.7283, -1.0528, -0.3971],
                      [ 0.2426, -0.4846, -0.1319,  ..., -0.3745,  0.1083, -0.1955],
                      [ 0.1865, -0.4384,  0.2724,  ...,  0.0148, -0.0946, -0.1312],
                      ...,
                      [ 0.9357, -2.8718, -1.0750,  ..., -1.5358,  0.2322, -0.8907],
                      [ 2.0421, -4.0784, -1.6789,  ..., -0.9545,  0.5744,  0.2149],
                      [ 1.1143,  1.9087,  0.0565,  ..., -0.2459, -0.7759, -0.8163]],
                     device='cuda:0')),
             ('mlp_extractor.shared_net.0.bias',
              tensor([-0.6341, -0.3114, -0.1605,  ..., -0.9393, -2.3791, -0.4585],
                     device='cuda:0')),
             ('mlp_extractor.shared_net.3.weight',
              tensor([[-1.1699,  0.0614, -0.0255,  ..., -0.2109, -3.2698, -1.0542],
                      [ 0.0258, -0.0823, -0.1066,  ..., -0.0416, -0.02

##### Freezing the layers

In [11]:
# for idx, param in enumerate(model.policy.parameters()):
# #     if param.shape == torch.Size([19, 1024]) or param.shape == torch.Size([19]):
#     if param.shape == torch.Size([19]):
#         print(param.shape, param.requires_grad)
#         param.requires_grad = True
#     else:
#         param.requires_grad = False
#         #     print(param.shape)

In [12]:
# for idx, param in enumerate(model.policy.parameters()):
# #     param.requires_grad = True
#     print(param.shape, param.requires_grad)

# Training

In [13]:
from tqdm.notebook import tqdm
class ProgressBar(BaseCallback):
    """Callback that displays a tqdm progress bar while ``model.learn`` is running.

    The bar is created when training starts, advanced by one in every
    ``_on_step`` call and closed when training ends.
    """
    def __init__(self, verbose=0):
        super(ProgressBar, self).__init__(verbose)
        self.pbar = None

    def _on_training_start(self):
        """Create the bar, sized from the requested timesteps and the number of envs."""
        total_timesteps = self.locals['total_timesteps']
        num_envs = self.training_env.num_envs
        print(f"total_timesteps: {total_timesteps}, num_envs: {num_envs}")
        # Assumes one _on_step call per step of the vectorized env, each of which
        # accounts for num_envs timesteps; rounded up with integer arithmetic.
        # NOTE(review): an algorithm that always finishes its last rollout may
        # call _on_step a few more times than this total.
        total = (total_timesteps + num_envs - 1) // num_envs
        self.pbar = tqdm(total=total)

    def _on_rollout_start(self):
        self.pbar.refresh()

    def _on_step(self):
        """Advance the bar by one; returning True lets training continue."""
        self.pbar.update(1)
        return True

    def _on_rollout_end(self):
        self.pbar.refresh()

    def _on_training_end(self):
        """Close the bar and drop the reference to it."""
        self.pbar.close()
        self.pbar = None

progressbar = ProgressBar()

In [14]:
# Training budget: n_steps timesteps per environment for each epoch.
total_epochs = 1
n_steps = 3002
total_timesteps = total_epochs * n_envs * n_steps

model.learn(
    total_timesteps=total_timesteps,
    callback=progressbar,
    log_interval=3,
    tb_log_name='ppo',
)

# Save the trained model under ../models/<model_name>/, tagged with env count and timestamp.
saved_model_name = f"{model_name}_gfootball_{n_envs}_{timestamp}"
model.save(f"../models/{model_name}/{saved_model_name}")

Logging to ../logs/tb_logs_PPO/ppo_3
self.locals['total_timesteps']:24016, n_steps: 3002


  0%|          | 0/3002 [00:00<?, ?it/s]

----------------------------------------
| time/                   |            |
|    fps                  | 359        |
|    iterations           | 3          |
|    time_elapsed         | 8          |
|    total_timesteps      | 3072       |
| train/                  |            |
|    approx_kl            | 0.24612623 |
|    clip_fraction        | 0.564      |
|    clip_range           | 0.27       |
|    entropy_loss         | -1.68      |
|    explained_variance   | -0.0795    |
|    learning_rate        | 8e-05      |
|    loss                 | 0.127      |
|    n_updates            | 20         |
|    policy_gradient_loss | 0.0872     |
|    value_loss           | 0.135      |
----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 348        |
|    iterations           | 6          |
|    time_elapsed         | 17         |
|    total_timesteps      | 6144       |
| train/        

### Visualization with Tensorboard

In [15]:
%load_ext tensorboard

In [16]:
%tensorboard --logdir ../logs/tb_logs_PPO

### Visualizing the results

In [None]:
# Plot the rewards per timestep and per episode

plt.style.use(['seaborn-whitegrid'])
# One results_plotter figure per x-axis type, drawn in this order.
for x_axis, plot_title in (
    (results_plotter.X_TIMESTEPS, "GFootball Timesteps"),
    (results_plotter.X_EPISODES, "GFootball Episodes"),
):
    results_plotter.plot_results([log_save_dir], total_timesteps, x_axis, plot_title)
# To save a figure, call plt.savefig right after the corresponding plot_results call, e.g.:
# plt.savefig('../figures/dqn/rewards_per_timestamp_dqn_with_my_policy.png')
# plt.savefig('../figures/dqn/rewards_per_episode_dqn_with_my_policy.png')

In [None]:
# Plot the episodic reward with line

# Load the Monitor logs and convert them to x/y arrays indexed by timesteps.
monitor_results = results_plotter.load_results(log_save_dir)
x, y = results_plotter.ts2xy(monitor_results, 'timesteps')

fig, ax = plt.subplots(figsize=(20, 16))
ax.plot(x, y)
ax.set_ylim([-10, 10])
ax.set_xlabel('Timesteps')
ax.set_ylabel('Episode Rewards')

In [None]:
# Plot the rolling mean reward per environment

plt.style.use(['seaborn-whitegrid'])
log_files = [os.path.join(log_save_dir, f"{i}.monitor.csv") for i in range(n_envs)]

# Two plots per row. Integer ceil division: the subplot grid dimensions must be
# ints, whereas np.ceil returns a float.
nrows = (n_envs + 1) // 2
fig = plt.figure(figsize=(8, 2 * nrows))
for i, log_file in enumerate(log_files):
    if os.path.isfile(log_file):
        # skiprows=1: presumably the first line of a monitor file is a metadata header.
        df = pd.read_csv(log_file, skiprows=1)
        ax = fig.add_subplot(nrows, 2, i + 1, label=log_file)
        df['r'].rolling(window=5).mean().plot(ax=ax, title=f"Rewards: Env {i}")
# Lay out once, after all subplots exist, instead of on every iteration.
fig.tight_layout()
plt.show()

In [None]:
# Plot the mean episodic reward
# Download the CSV from tensorboard and put the path here
df = pd.read_csv('data_for_figures/...')
fig, ax = plt.subplots(figsize=(10, 8))
# Pass ax explicitly: without it DataFrame.plot opens its own figure, which leaves
# the 10x8 figure created above empty and ignores its size.
df.plot(x='Step', y='Value', ax=ax)
# plt.savefig('../figures/dqn/mean_ep_reward_with_IL_50_epochs_diff_net.jpg')