In [72]:
google_colab = False
if google_colab:
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)
    base_path = '/content/drive/MyDrive/MasterThesis/'
else:
    base_path = '../.'

In [73]:
if google_colab:
    !pip install socceraction matplotsoccer
    !pip install pandas
    !pip install tables == 3.5.1
    !pip install kaleido

In [74]:

import os
import warnings
import json
import pandas as pd
import torch
import numpy as np
from enum import Enum
from pathlib import Path

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

from tqdm.notebook import tqdm

pd.set_option('display.max_rows', 60)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as labs
import socceraction.vaep.formula as vaepformula

if google_colab:
    % load_ext autoreload
    % autoreload 2
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))


class dotdict(dict):
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [75]:
if google_colab:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else:
    device = "cpu"
seed = 42


def seed_new_generator():
    return torch.Generator().manual_seed(seed)


os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True
    print(torch.cuda.current_device())

0


In [76]:
if google_colab:
    os.system(f'git clone https://github.com/statsbomb/open-data {base_path}')

In [77]:
if google_colab:
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)

    from psutil import virtual_memory

    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('Not using a high-RAM runtime')
    else:
        print('You are using a high-RAM runtime!')

In [78]:
class StatsBombDataset(Enum):
    BARCELONA = 1
    WOMEN = 2


class PlayerPosition(Enum):
    DEFENDER = 0
    MIDFIELDER = 1
    ATTACKER = 2

In [79]:
class SubstitutionType(str, Enum):
    DEFENSIVE = 'defensive'
    NEUTRAL = 'neutral'
    OFFENSIVE = 'offensive'


class SubstitutionResult(str, Enum):
    GOOD = 'good'
    BAD = 'bad'


class SubstitutionStrategy(str, Enum):
    AGGRESSIVE_1 = 'aggressive_1'
    RESERVED_1 = 'reserved_1'
    AGGRESSIVE_2 = 'aggressive_2'
    RESERVED_2 = 'reserved_2'

In [80]:
dataset_version = StatsBombDataset.BARCELONA

if dataset_version == StatsBombDataset.BARCELONA:
    datafolder = Path(base_path) / "data-barcelona2/"
elif dataset_version == StatsBombDataset.WOMEN:
    datafolder = Path(base_path) / "data-women2/"

archive_path = "./"

spadl_h5 = datafolder / "spadl-statsbomb.h5"
print(os.path.exists(spadl_h5))
features_h5 = datafolder / "features.h5"
labels_h5 = datafolder / "labels.h5"
predictions_h5 = datafolder / "predictions.h5"
vaep_values_h5 = datafolder / "vaep_values.h5"

neural_features_h5 = datafolder / "neural_features.h5"
neural_predictions_h5 = datafolder / "neural_predictions.h5"
substitutions_h5 = datafolder / "substitutions.h5"

player_features_h5 = datafolder / "player_features.h5"
player_substitutions_h5 = datafolder / "player_substitutions.h5"
player_list_per_season_json = datafolder / "player_list_per_season.json"
player_list_name_per_season_json = datafolder / "player_list_name_per_season.json"
cumsum_current_player_stats_npy = datafolder / "cumsum_current_player_stats.npy"
current_player_stats_npy = datafolder / "cumulated_player_stats.npy"
context_game_stats_npy = datafolder / "context_game_stats.npy"
cumsum_history_player_stats_npy = datafolder / "cumsum_history_player_stats.npy"
history_player_stats_npy = datafolder / "history_player_stats.npy"
player_lineups_npy = datafolder / "player_lineups"
game_substitutions_json = datafolder / "game_substitutions.json"
home_player_lineups_npy = datafolder / "home_player_lineups"
away_player_lineups_npy = datafolder / "away_player_lineups"
home_game_substitutions_npy = datafolder / "home_game_substitutions"
away_game_substitutions_npy = datafolder / "away_game_substitutions"

first_half_stats = datafolder / "first_half_stats"
first_half_substitutions = datafolder / "first_half_substitutions"

SBL = StatsBombLoader(root=Path(base_path) / 'data', getter='local')

# Create data folder if it doesn't exist
os.makedirs(datafolder, exist_ok=True)
assert os.path.exists(datafolder)

True


In [81]:
if os.path.exists(spadl_h5):
    games = pd.read_hdf(spadl_h5, "games")
    players = pd.read_hdf(spadl_h5, "player_games")
else:
    ## Change to select competitions
    competitions = SBL.competitions()
    if dataset_version == StatsBombDataset.BARCELONA:
        selected_competitions = competitions[competitions.competition_name == 'La Liga']
    elif dataset_version == StatsBombDataset.WOMEN:
        selected_competitions = competitions[competitions.competition_name == "FA Women's Super League"]
    games = pd.concat([
        SBL.games(row.competition_id, row.season_id)
        for row in selected_competitions.itertuples()
    ]).reset_index()
    ## Change to select games
    # games = games.head(1)
    teams, players = [], []
    actions = {}
    events = {}
    game_map = {}
    game_i = 0
    for game in tqdm(list(games.itertuples()), desc="Loading game data"):
        game_map[game.game_id] = game_i
        teams.append(SBL.teams(game.game_id))
        players.append(SBL.players(game.game_id))
        events[game.game_id] = SBL.events(game.game_id)
        actions[game.game_id] = spadl.statsbomb.convert_to_actions(events[game.game_id], game.home_team_id)
        game_i += 1
    teams = pd.concat(teams).drop_duplicates(subset="team_id")
    players = pd.concat(players)

    with pd.HDFStore(spadl_h5) as store:
        store["competitions"] = selected_competitions
        store["games"] = games
        store["teams"] = teams
        store["players"] = players[['player_id', 'player_name', 'nickname']].drop_duplicates(subset='player_id')
        store["player_games"] = players[
            ['player_id', 'game_id', 'team_id', 'is_starter', 'starting_position_id', 'starting_position_name',
             'minutes_played']]
        for game_id in actions.keys():
            store[f'actions/game_{game_id}'] = actions[game_id]

        f_games = (store["games"]
                   .merge(store["competitions"], how='left')
                   .merge(store["teams"].add_prefix('home_'), how='left')
                   .merge(store["teams"].add_prefix('away_'), how='left')
                   )

        game_id = f_games.game_id.values[0]
        f_actions = (
            store[f"actions/game_{game_id}"]
            .merge(spadl.actiontypes_df(), how='left')
            .merge(spadl.results_df(), how='left')
            .merge(spadl.bodyparts_df(), how='left')
            .merge(store["players"], how='left')
            .merge(store["teams"], how='left')
        )

        f_actions["player_name"] = f_actions[["nickname", "player_name"]].apply(lambda x: x[0] if x[0] else x[1],
                                                                                axis=1)
        del f_actions['nickname']


In [82]:
nb_actions_before = 10
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta
]
yfns = [labs.scores,
        labs.concedes,
        labs.goal_from_shot]

if not os.path.exists(features_h5):
    for game in tqdm(games.itertuples(), desc=f"Generating and storing features in {features_h5}"):
        actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
        gamestates = fs.gamestates(spadl.add_names(actions), nb_actions_before)
        gamestates = fs.play_left_to_right(gamestates, game.home_team_id)

        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
        X.to_hdf(features_h5, f"game_{game.game_id}")

if not os.path.exists(labels_h5):
    for game in tqdm(games.itertuples(), desc=f"Generating and storing features in {labels_h5}"):
        actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
        Y = pd.concat([fn(spadl.add_names(actions)) for fn in yfns], axis=1)
        Y.to_hdf(labels_h5, f"game_{game.game_id}")


In [83]:
import xgboost
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

games = games.sample(frac=1)
nb_games = len(games.index)
train_games = games
test_games = games


def getXY(games, Xcols):
    X = []
    for game_id in tqdm(games.game_id, desc="Selecting features"):
        Xi = pd.read_hdf(features_h5, f"game_{game_id}")
        X.append(Xi[Xcols])
    X = pd.concat(X).reset_index(drop=True)

    Y = []
    for game_id in tqdm(games.game_id, desc="Selecting labels"):
        Yi = pd.read_hdf(labels_h5, f"game_{game_id}")
        Y.append(Yi[Ycols])
    Y = pd.concat(Y).reset_index(drop=True)
    return X, Y


def evaluate(y, y_hat):
    print(f'Number of datapoints : {len(y)}')
    p = sum(y) / len(y)
    base = [p] * len(y)
    brier = brier_score_loss(y, y_hat)
    print(f"  Brier score: %.5f (BSS %.5f)" % (brier, 1 - (brier / brier_score_loss(y, base))))
    ll = log_loss(y, y_hat)
    print(f"  log loss score: %.5f (%.5f)" % (ll, ll / log_loss(y, base)))
    print(f"  ROC AUC: %.5f" % roc_auc_score(y, y_hat))


def save_predictions(model):
    A = []
    for game_id in tqdm(test_games.game_id, "Loading game ids"):
        Ai = pd.read_hdf(spadl_h5, f"actions/game_{game_id}")
        A.append(Ai[["game_id"]])

    A = pd.concat(A)
    A = A.reset_index(drop=True)
    grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby('game_id')
    for k, df in tqdm(grouped_predictions, desc="Saving predictions per game"):
        df = df.reset_index(drop=True)
        df[Y_hat.columns].to_hdf(predictions_h5, predict_path + f"game_{int(k)}")


def evaluate_baseline(games):
    Y_hat = []
    Y = []
    for game in tqdm(games.itertuples(), desc="Evaluating baseline"):
        game_labels = pd.read_hdf(labels_h5, f'game_{game.game_id}')
        Y_hat.append(game_labels[["baseline_substitution"]])
        Y.append(game_labels[["substitution"]])
    Y_hat = pd.concat(Y_hat).reset_index(drop=True)['baseline_substitution'].apply(lambda x: 1 if x == 100 else 0)
    Y = pd.concat(Y).reset_index(drop=True)['substitution'].apply(lambda x: 1 if x else 0)
    Y_hat_baseline = np.full(len(Y), Y_hat.value_counts()[1] / Y_hat.value_counts()[0])

    evaluate(Y, Y_hat)
    evaluate(Y, Y_hat_baseline)


if not os.path.exists(predictions_h5):
    Xcols = fs.feature_column_names(xfns, nb_actions_before)
    Ycols = ["scores", "concedes"]

    Xtrain, Ytrain = getXY(train_games, Xcols)
    Xtest, Ytest = Xtrain, Ytrain
    Y_hat = pd.DataFrame()
    models = {}
    for col in Ytrain.columns:
        if google_colab:
            model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1, tree_method='gpu_hist')
        else:
            model = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1)
        model.fit(Xtrain, Ytrain[col])
        models[col] = model
    for game_id in tqdm(games.game_id, "Loading features to "):
        Xi = pd.read_hdf(features_h5, f"game_{game_id}")
        Xi = Xi[Xcols].reset_index(drop=True)
        Yi = {"scores": [], "concedes": []}
        for col in Ycols:
            Yi[col] = models[col].predict_proba(Xi)[:, 1]
        Yi = pd.DataFrame(Yi, index=Xi.index)
        Yi.to_hdf(predictions_h5, f"goals/game_{game_id}")

    # evaluate_baseline(games)

    #Xtest, Ytest = Xtrain, Ytrain
    #Y_hat = train_model_XGBC()
    #save_predictions(Y_hat)


In [84]:
def compute_vaep_values(games):
    for game in tqdm(games.itertuples(), desc='Computing VAEP values'):
        predictions = pd.read_hdf(predictions_h5, f"goals/game_{game.game_id}")
        actions = pd.read_hdf(spadl_h5, f"actions/game_{game.game_id}")
        players = pd.read_hdf(spadl_h5, "players")
        teams = pd.read_hdf(spadl_h5, "teams")
        actions = (
            spadl.add_names(actions)
            .merge(players, how="left")
            .merge(teams, how="left")
            .sort_values(["game_id", "period_id", "action_id"])
            .reset_index(drop=True)
        )
        vaep_values = vaepformula.value(actions, predictions.scores, predictions.concedes)
        vaep_values.to_hdf(vaep_values_h5, f'game_{game.game_id}')


if not os.path.exists(vaep_values_h5):
    compute_vaep_values(games)

In [85]:
from sklearn.preprocessing import LabelEncoder


def normalize_columns(df, columns):
    df_min_max_scaled = df.copy()
    for column in columns:
        df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (
                df_min_max_scaled[column].max() - df_min_max_scaled[column].min())
    return df_min_max_scaled


def from_bool_to_integer_columns(df, columns):
    for column in columns:
        df[column] = df[column].apply(lambda x: 1 if x else 0)
    return df


def encode_team_ids(df, column_name):
    list_team_ids = df[column_name].unique()
    df[column_name] = df[column_name].astype('category').cat.codes
    return df


def extract_features(games):
    neural_features = []
    for game in tqdm(games.itertuples(), "Extracting features for neural network"):
        game_actions = pd.read_hdf(spadl_h5, f'actions/game_{game.game_id}')
        vaep_values = pd.read_hdf(vaep_values_h5, f'game_{game.game_id}')['vaep_value']
        game_labels = pd.read_hdf(labels_h5, f'game_{game.game_id}')
        features = pd.read_hdf(features_h5, f'game_{game.game_id}')
        extracted_real_features = features[features_real_columns]
        extracted_discrete_features = features[features_discrete_columns]
        extracted_time_features = features[time_column]
        extracted_actions = game_actions[actions_columns]
        game_features = pd.concat(
            [extracted_real_features,
             vaep_values,
             extracted_discrete_features, extracted_actions,
             extracted_time_features],
            axis=1)
        game_features['goalscore_diff'] = game_features['goalscore_diff']
        game_features['period_id_a0'] = game_features['period_id_a0'] - 1
        game_features['game_id'] = game.game_id
        neural_features.append(game_features)
    return pd.concat(neural_features, axis=0).reset_index(drop=True)


def create_neural_features(games):
    raw_neural_features = extract_features(games)
    normalized_features = normalize_columns(raw_neural_features, features_real_columns)
    # formatted_labels_features = from_bool_to_integer_columns(normalized_features, ['substitution'])
    finalized_features = encode_team_ids(normalized_features, 'team_id')
    return finalized_features


def save_neural_features_for_each_games(neural_features, games):
    games.to_hdf(neural_features_h5, 'game_list')
    neural_features.to_hdf(neural_features_h5, 'all_neural_features')
    for game in tqdm(games.itertuples(), "Saving neural features for each games"):
        game_data = neural_features[neural_features['game_id'] == game.game_id].reset_index(drop=True)
        game_data.to_hdf(neural_features_h5, f'neural_features/game_{game.game_id}')


features_real_columns = ['start_x_a0',
                         'start_y_a0',
                         'end_x_a0',
                         'end_y_a0',
                         'dx_a0',
                         'dy_a0',
                         'movement_a0',
                         'start_dist_to_goal_a0',
                         'start_angle_to_goal_a0',
                         'end_dist_to_goal_a0',
                         'end_angle_to_goal_a0',
                         ]
features_discrete_columns = [
    'goalscore_diff',
    'goalscore_team',
    'goalscore_opponent',
    'period_id_a0',
]
actions_columns = ['type_id',
                   'result_id',
                   'bodypart_id',
                   'team_id']
time_column = 'time_seconds_overall_a0'

label_encoders = {}
#Count the number of different discrete features to prepare Embedded layers
discrete_features_names = np.concatenate((features_discrete_columns, actions_columns))
discrete_features_value_counts = {}
features_column_index = {}
real_features_names = np.concatenate((features_real_columns, ['vaep_value']))

if not os.path.exists(neural_features_h5):
    print("test")
    neural_features = create_neural_features(games)
else:
    neural_features = pd.read_hdf(neural_features_h5, 'all_neural_features')

for feature_name in discrete_features_names:
    lb_make = LabelEncoder()
    if not os.path.exists(neural_features_h5):
        neural_features[feature_name] = lb_make.fit_transform(neural_features[feature_name])
    else:
        neural_features[feature_name] = lb_make.fit(neural_features[feature_name])
    label_encoders[feature_name] = lb_make
    discrete_features_value_counts[feature_name] = len(lb_make.classes_)
    features_column_index[feature_name] = neural_features.columns.get_loc(feature_name)
for feature_name in real_features_names:
    features_column_index[feature_name] = neural_features.columns.get_loc(feature_name)
features_column_index[time_column] = neural_features.columns.get_loc(time_column)
if not os.path.exists(neural_features_h5):
    save_neural_features_for_each_games(neural_features, games)
del neural_features

In [86]:
# PLAYER POSITION SUMMARY
substitution_event_id = 19
starting_xi_event_id = 35
tactical_shift_event_id = 36


def get_position_from_position_id(id):
    if id <= 6:
        return PlayerPosition.DEFENDER
    elif id > 6 and id <= 21:
        return PlayerPosition.MIDFIELDER
    else:
        return PlayerPosition.ATTACKER


def get_substitute_position(game, player_id, team_id):
    game_events = SBL.events(game.game_id)
    team_events = game_events[game_events.team_id == team_id]
    sub_events = team_events[team_events.type_id == substitution_event_id]
    last_tactical_shift = \
        team_events[
            (team_events.type_id == tactical_shift_event_id) | (team_events.type_id == starting_xi_event_id)].iloc[
            -1]
    position = None
    for player in last_tactical_shift['extra']['tactics']['lineup']:
        if player['player']['id'] == player_id:
            position = get_position_from_position_id(player['position']['id'])
    if position is None:
        for sub_event in sub_events.itertuples():
            extra = sub_event.extra
            if extra['substitution']['replacement']['id'] == player_id:
                position = get_position_from_position_id(sub_event.position_id)
    return position


def get_player_position_in_career(games):
    player_map = {}
    for game in tqdm(games.itertuples()):
        players = SBL.players(game.game_id)
        for player in players.itertuples():
            player_id = player.player_id
            team_id = player.team_id
            if player_id not in player_map.keys():
                player_map[player_id] = {
                    PlayerPosition.DEFENDER: 0,
                    PlayerPosition.MIDFIELDER: 0,
                    PlayerPosition.ATTACKER: 0
                }
            if player.is_starter:
                player_position_id = player.starting_position_id
                player_position_category = get_position_from_position_id(player_position_id)
                player_map[player_id][player_position_category] += 1
            else:
                player_position_category = get_substitute_position(game, player_id, team_id)
                player_map[player_id][player_position_category] += 1
    player_positions = {}
    for player_id, player_info in player_map.items():
        player_positions[player_id] = max(player_info, key=player_info.get)
    return player_positions


player_positions = get_player_position_in_career(games)

0it [00:00, ?it/s]

In [87]:
max_minutes = 105
barcelona_id = 217
shot_action_type_id = 11
pass_action_type_id = 0
carry_action_type_id = 21
shot_event_type_id = 16
pass_event_type_id = 30
carry_event_type_id = 43

player_stats_num_features = 6
context_stats_game_num_features = 9



def get_position_from_player_id(id):
    return player_positions[id].value


if not os.path.exists(cumsum_current_player_stats_npy):
    goal_difference_stats = np.zeros((len(games), max_minutes + 1, 1))
    if dataset_version == StatsBombDataset.BARCELONA:
        player_stats_game = np.zeros((len(games), max_minutes + 1, 3, player_stats_num_features))
        context_stats_game = np.zeros((len(games), max_minutes + 1, context_stats_game_num_features))
    else:
        player_stats_game = np.zeros((len(games) * 2, max_minutes + 1, 3, player_stats_num_features))
        context_stats_game = np.zeros((len(games) * 2, max_minutes + 1, context_stats_game_num_features))

    game_i = 0
    season_i = 0
    game_map = {}
    seasons = pd.read_hdf(spadl_h5, "competitions")
    sorted_seasons = seasons.sort_values('season_name', ascending=True)
    game_id_list = []
    for season in seasons.itertuples():
        season_games = games[games.season_id == season.season_id]
        sorted_season_games = season_games.sort_values('game_day', ascending=True)
        for game in tqdm(sorted_season_games.itertuples()):
            game_map[game_i] = game.game_id
            game_vaep = pd.read_hdf(vaep_values_h5, f'game_{game.game_id}')
            game_actions = pd.read_hdf(spadl_h5, f'actions/game_{game.game_id}')
            game_actions['vaep_value'] = game_vaep['vaep_value']
            game_events = SBL.events(game.game_id)
            game_neural = pd.read_hdf(neural_features_h5, f'neural_features/game_{game.game_id}')
            game_neural = game_neural.drop(['team_id', 'type_id', 'game_id', 'vaep_value'], axis=1)
            game_actions = pd.concat([game_actions, game_neural], axis=1)
            game_actions = game_actions.merge(game_events[['event_id', 'minute']], left_on='original_event_id',
                                              right_on='event_id')
            game_actions['gap_difference'] = game_actions['end_dist_to_goal_a0'] - game_actions['start_dist_to_goal_a0']
            game_actions['goal_difference'] = game_actions['goalscore_team'] - game_actions['goalscore_opponent']
            game_actions['player_position'] = game_actions['player_id'].apply(get_position_from_player_id)
            if dataset_version == StatsBombDataset.BARCELONA:
                team_actions = game_actions[game_actions['team_id'] == barcelona_id]
                grouped_actions = team_actions.groupby(['minute', 'player_position'])
                grouped_action_types = team_actions.groupby(['minute', 'player_position', 'type_id'])
                for (min, player_position), player_actions in grouped_actions:
                    player_stats_game[game_i][min][player_position][0] = player_actions.sum()[
                        'vaep_value']
                    player_stats_game[game_i][min][player_position][4] = player_actions.sum()[
                        'movement_a0']
                    player_stats_game[game_i][min][player_position][5] = player_actions.sum()[
                        'gap_difference']
                for (min, player_position, type_id), player_actions in grouped_action_types:
                    if type_id == shot_action_type_id:
                        player_stats_game[game_i][min][player_position][1] = len(player_actions.index)
                    elif type_id == pass_action_type_id:
                        player_stats_game[game_i][min][player_position][2] = len(player_actions.index)
                    elif type_id == carry_action_type_id:
                        player_stats_game[game_i][min][player_position][3] = len(player_actions.index)

                context_actions = game_actions[game_actions['team_id'] != barcelona_id]
                grouped_context_actions = context_actions.groupby(['minute'])
                grouped_context_action_types = context_actions.groupby(['minute', 'type_id'])
                for min, actions in grouped_context_actions:
                    context_stats_game[game_i][min][0] = actions.sum()['vaep_value']
                    context_stats_game[game_i][min][4] = actions.sum()['movement_a0']
                    context_stats_game[game_i][min][5] = actions.sum()['gap_difference']
                    context_stats_game[game_i][min][6] = actions.iloc[-1]['goal_difference']
                    context_stats_game[game_i][min][7] = actions.iloc[-1]['goalscore_team']
                    context_stats_game[game_i][min][8] = actions.iloc[-1]['goalscore_opponent']
                for (min, type_id), actions in grouped_context_action_types:
                    if type_id == shot_action_type_id:
                        context_stats_game[game_i][min][1] = len(actions.index)
                    elif type_id == pass_action_type_id:
                        context_stats_game[game_i][min][2] = len(actions.index)
                    elif type_id == carry_action_type_id:
                        context_stats_game[game_i][min][3] = len(actions.index)
            else:
                game_actions['player_position'] = game_actions['player_id'].apply(get_position_from_player_id)
                home_actions = game_actions[game_actions['team_id'] == game.home_team_id]
                away_actions = game_actions[game_actions['team_id'] == game.away_team_id]

                # minutes = team_actions['minute'].unique()

                grouped_home_actions = home_actions.groupby(['minute', 'player_position'])
                grouped_away_actions = away_actions.groupby(['minute', 'player_position'])
                grouped_home_action_types = home_actions.groupby(['minute', 'player_position', 'type_id'])
                grouped_away_action_types = away_actions.groupby(['minute', 'player_position', 'type_id'])

                for (min, player_position), player_actions in grouped_home_actions:
                    player_stats_game[game_i][min][player_position][0] = player_actions.sum()[
                        'vaep_value']
                    player_stats_game[game_i][min][player_position][4] = player_actions.sum()[
                        'movement_a0']
                    player_stats_game[game_i][min][player_position][5] = player_actions.sum()[
                        'gap_difference']
                for (min, player_position, type_id), player_actions in grouped_home_action_types:
                    if type_id == shot_action_type_id:
                        player_stats_game[game_i][min][player_position][1] = len(player_actions.index)
                    elif type_id == pass_action_type_id:
                        player_stats_game[game_i][min][player_position][2] = len(player_actions.index)
                    elif type_id == carry_action_type_id:
                        player_stats_game[game_i][min][player_position][3] = len(player_actions.index)

                for (min, player_position), player_actions in grouped_away_actions:
                    player_stats_game[len(games) + game_i][min][player_position][0] = player_actions.sum()['vaep_value']
                    player_stats_game[len(games) + game_i][min][player_position][4] = player_actions.sum()[
                        'movement_a0']
                    player_stats_game[len(games) + game_i][min][player_position][5] = player_actions.sum()[
                        'gap_difference']
                for (min, player_id, type_id), player_actions in grouped_away_action_types:
                    if type_id == shot_action_type_id:
                        player_stats_game[len(games) + game_i][min][player_position][1] = len(
                            player_actions.index)
                    elif type_id == pass_action_type_id:
                        player_stats_game[len(games) + game_i][min][player_position][2] = len(
                            player_actions.index)
                    elif type_id == carry_action_type_id:
                        player_stats_game[len(games) + game_i][min][player_position][3] = len(
                            player_actions.index)

                home_context_actions = away_actions
                away_context_actions = home_actions

                grouped_home_context_actions = home_context_actions.groupby(['minute'])
                grouped_away_context_actions = away_context_actions.groupby(['minute'])
                grouped_home_context_action_types = home_context_actions.groupby(['minute', 'type_id'])
                grouped_away_context_action_types = away_context_actions.groupby(['minute', 'type_id'])

                for min, actions in grouped_home_context_actions:
                    context_stats_game[game_i][min][0] = actions.sum()['vaep_value']
                    context_stats_game[game_i][min][4] = actions.sum()['movement_a0']
                    context_stats_game[game_i][min][5] = actions.sum()['gap_difference']
                    context_stats_game[game_i][min][6] = actions.iloc[-1]['goal_difference']
                    context_stats_game[game_i][min][7] = actions.iloc[-1]['goalscore_team']
                    context_stats_game[game_i][min][8] = actions.iloc[-1]['goalscore_opponent']
                for (min, type_id), actions in grouped_home_context_action_types:
                    if type_id == shot_action_type_id:
                        context_stats_game[game_i][min][1] = len(actions.index)
                    elif type_id == pass_action_type_id:
                        context_stats_game[game_i][min][2] = len(actions.index)
                    elif type_id == carry_action_type_id:
                        context_stats_game[game_i][min][3] = len(actions.index)

                for min, actions in grouped_away_context_actions:
                    context_stats_game[len(games) + game_i][min][0] = actions.sum()['vaep_value']
                    context_stats_game[len(games) + game_i][min][4] = actions.sum()['movement_a0']
                    context_stats_game[len(games) + game_i][min][5] = actions.sum()['gap_difference']
                    context_stats_game[len(games) + game_i][min][6] = actions.iloc[-1]['goal_difference']
                    context_stats_game[len(games) + game_i][min][7] = actions.iloc[-1]['goalscore_opponent']
                    context_stats_game[len(games) + game_i][min][8] = actions.iloc[-1]['goalscore_team']
                for (min, type_id), actions in grouped_away_context_action_types:
                    if type_id == shot_action_type_id:
                        context_stats_game[len(games) + game_i][min][1] = len(actions.index)
                    elif type_id == pass_action_type_id:
                        context_stats_game[len(games) + game_i][min][2] = len(actions.index)
                    elif type_id == carry_action_type_id:
                        context_stats_game[len(games) + game_i][min][3] = len(actions.index)

            game_i += 1
        season_i += 1

    current_player_stats = player_stats_game
    cumsum_current_player_stats = np.cumsum(current_player_stats, axis=1)

    with open(current_player_stats_npy, 'wb') as fp:
        np.save(fp, current_player_stats)
    with open(cumsum_current_player_stats_npy, 'wb') as fp:
        np.save(fp, cumsum_current_player_stats)
    with open(context_game_stats_npy, 'wb') as fp:
        np.save(fp, context_stats_game)
else :
    with open(current_player_stats_npy, 'rb') as fp:
        current_player_stats = np.load(fp)
    with open(cumsum_current_player_stats_npy, 'rb') as fp:
        cumsum_current_player_stats = np.load(fp)
    with open(context_game_stats_npy, 'rb') as fp:
        context_stats_game = np.load(fp)

In [88]:
# SUBSTITUTION LABELLING
import more_itertools as mit


def fix_event_game_time_seconds(events):
    events['time_seconds'] = events.minute * 60 + events.second
    first_half_events = events[events.period_id == 1]
    first_half_time = first_half_events.iloc[0].time_seconds
    second_half_events_mask = events.period_id == 2
    events.loc[second_half_events_mask, 'time_seconds'] += first_half_time
    return events


def group_subs_by_batches(sub_events):
    indices = sub_events.index.values
    grouped_subs = [list(group) for group in mit.consecutive_groups(indices)]
    grouped_sub_events = []
    for group in grouped_subs:
        group_event = []
        for sub in group:
            group_event.append(sub_events.loc[sub])
        grouped_sub_events.append(group_event)
    return grouped_sub_events


def get_substitution_category_with_player_positions(out_player_position, in_player_position):
    if out_player_position == PlayerPosition.DEFENDER:
        if in_player_position == PlayerPosition.DEFENDER:
            return SubstitutionType.NEUTRAL
        elif in_player_position == PlayerPosition.MIDFIELDER or in_player_position == PlayerPosition.ATTACKER:
            return SubstitutionType.OFFENSIVE
    elif out_player_position == PlayerPosition.MIDFIELDER:
        if in_player_position == PlayerPosition.DEFENDER:
            return SubstitutionType.DEFENSIVE
        elif in_player_position == PlayerPosition.MIDFIELDER:
            return SubstitutionType.NEUTRAL
        elif in_player_position == PlayerPosition.ATTACKER:
            return SubstitutionType.OFFENSIVE
    elif out_player_position == PlayerPosition.ATTACKER:
        if in_player_position == PlayerPosition.DEFENDER or in_player_position == PlayerPosition.MIDFIELDER:
            return SubstitutionType.DEFENSIVE
        elif in_player_position == PlayerPosition.ATTACKER:
            return SubstitutionType.NEUTRAL


if not os.path.exists(game_substitutions_json):

    substitutions = []

    max_groups = 0

    game_substitutions = {}

    game_i = 0
    season_i = 0
    seasons = pd.read_hdf(spadl_h5, "competitions")
    sorted_seasons = seasons.sort_values('season_name', ascending=True)
    game_id_list = []
    for season in seasons.itertuples():
        season_games = games[games.season_id == season.season_id]
        sorted_season_games = season_games.sort_values('game_day', ascending=True)
        for game in tqdm(sorted_season_games.itertuples()):
            events = SBL.events(game.game_id)
            events = fix_event_game_time_seconds(events)
            events.to_hdf(spadl_h5, f'events/game_{game.game_id}')
            game_substitutions[str(game.game_id)] = {}
            for team_id in [game.home_team_id, game.away_team_id]:
                team_events = events[events.team_id == team_id]
                team_startingxi = team_events[team_events.type_id == starting_xi_event_id]
                team_sub = team_events[team_events.type_id == substitution_event_id]
                team_tactical_shift = team_events[team_events.type_id == tactical_shift_event_id]
                grouped_sub_events = group_subs_by_batches(team_sub)
                grouped_sub_categories = []
                for sub_events in grouped_sub_events:
                    subs_integer = 0
                    sub_minute = 0
                    for sub_event in sub_events:
                        sub_minute = sub_event.minute
                        out_player_id = int(sub_event.player_id)
                        in_player_id = int(sub_event.extra['substitution']['replacement']['id'])
                        out_player_position = player_positions[out_player_id]
                        in_player_position = player_positions[in_player_id]
                        sub_category = get_substitution_category_with_player_positions(out_player_position,
                                                                                       in_player_position)
                        if sub_category == SubstitutionType.OFFENSIVE:
                            subs_integer += 1
                        elif sub_category == SubstitutionType.DEFENSIVE:
                            subs_integer -= 1
                    if subs_integer > 0:
                        subs_category = SubstitutionType.OFFENSIVE
                    elif subs_integer < 0:
                        subs_category = SubstitutionType.DEFENSIVE
                    else:
                        subs_category = SubstitutionType.NEUTRAL
                    grouped_sub_categories.append({'minute': int(sub_minute), 'category': subs_category})
                game_substitutions[str(game.game_id)][int(team_id)] = grouped_sub_categories

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [89]:
nb_defender = 0
nb_midfielder = 0
nb_attacker = 0
for player_position in player_positions.values():
    if player_position == PlayerPosition.DEFENDER:
        nb_defender += 1
    elif player_position == PlayerPosition.MIDFIELDER:
        nb_midfielder += 1
    elif player_position == PlayerPosition.ATTACKER:
        nb_attacker += 1
print(nb_defender)
print(nb_midfielder)
print(nb_attacker)

165
197
46


In [90]:
nb_neutral = 0
nb_offensive = 0
nb_defensive = 0
for game_id, teams in game_substitutions.items():
    for team_id, team_subs in teams.items():
        for team_sub in team_subs:
            if team_sub['category'] == SubstitutionType.OFFENSIVE:
                nb_offensive += 1
            elif team_sub['category'] == SubstitutionType.DEFENSIVE:
                nb_defensive += 1
            else:
                nb_neutral += 1
print(nb_defensive)
print(nb_neutral)
print(nb_offensive)

352
902
295


In [91]:
game_i = 0
game_map = {}
seasons = pd.read_hdf(spadl_h5, "competitions")
sorted_seasons = seasons.sort_values('season_name', ascending=True)
for season in seasons.itertuples():
    season_games = games[games.season_id == season.season_id]
    sorted_season_games = season_games.sort_values('game_day', ascending=True)
    for game in tqdm(sorted_season_games.itertuples()):
        game_map[game_i] = game.game_id
        game_i += 1

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

1 = GLOBAL
T1 = CURRENT SUBSTITUTION TIME
T2 = END OF GAME

SubstitutionStrategy 1 AGGRESSIVE
OFFENSIVE - GOOD = HOME_GOAL(T2) - HOME_GOAL(T1) > 0
NEUTRAL - GOOD = HOME_GOAL(T2) - HOME_GOAL(T1) > 0 AND AWAY_GOAL(T2) - AWAY_GOAL(T1) == 0
DEFENSIVE - GOOD = AWAY_GOAL(T2) - AWAY_GOAL(T1) == 0

SubstitutionStrategy 2 RESERVED
OFFENSIVE - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) > 0
NEUTRAL - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) >= 0
DEFENSIVE - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) >= 0

2 = LOCAL
T1 = CURRENT SUBSTITUTION TIME
T2 = NEXT CHANGE OF SCORE OR END OF MATCH
SubstitutionStrategy 1 AGGRESSIVE
OFFENSIVE - GOOD = HOME_GOAL(T2) - HOME_GOAL(T1) > 0
NEUTRAL - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) > 0
DEFENSIVE - GOOD = AWAY_GOAL(T2) - AWAY_GOAL(T1) == 0

SubstitutionStrategy 2 RESERVED
OFFENSIVE - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) > 0
NEUTRAL - GOOD = SCORE_DIFF(T2) - SCORE_DIFF(T1) >= 0
DEFENSIVE - GOOD = AWAY_GOAL(T2) - AWAY_GOAL(T1) == 0


In [92]:
import copy

goalscore_team_index = 7
goalscore_opponent_index = 8

labelled_game_substitutions = copy.deepcopy(game_substitutions)

if not os.path.exists(game_substitutions_json):
    for game_i, game_id in tqdm(game_map.items()):
        game_subs = labelled_game_substitutions[str(game_id)]
        game_events = SBL.events(game.game_id)
        sub_events = game_events[game_events.type_id == substitution_event_id]
        game_goals = np.maximum.accumulate(
            context_stats_game[game_i, :, [goalscore_team_index, goalscore_opponent_index]],
            axis=1)
        team_i = 0

        for team_id, subs in game_subs.items():
            team_goals = game_goals[team_i]
            opponent_goals = game_goals[1 - team_i]
            total_subs = len(subs) - 1
            for sub_strategy in SubstitutionStrategy:
                sub_i = 0
                for sub in subs:
                    sub_minute, sub_type = sub['minute'], sub['category']
                    if sub_i == total_subs:
                        team_goals_after = team_goals[-1]
                        opponent_after = opponent_goals[-1]
                    else :
                        if sub_strategy in [SubstitutionStrategy.RESERVED_2, SubstitutionStrategy.AGGRESSIVE_2]:
                            next_team_goal_change = np.where(np.diff(team_goals[sub_minute:],prepend=np.nan))[0]
                            next_opponent_goal_change = np.where(np.diff(opponent_goals[sub_minute:],prepend=np.nan))[0]
                            if len(next_team_goal_change) == 1 and len(next_opponent_goal_change) == 1 :
                                team_goals_after = team_goals[-1]
                                opponent_goals_after = opponent_goals[-1]
                            else :
                                if len(next_team_goal_change) == 1 :
                                    next_goal_minute = sub_minute + next_opponent_goal_change[1]
                                elif len(next_opponent_goal_change) == 1 :
                                    next_goal_minute = sub_minute + next_team_goal_change[1]
                                else :
                                    next_goal_minute = sub_minute + min(next_opponent_goal_change[1], next_team_goal_change[1])
                                team_goals_after = team_goals[next_goal_minute]
                                opponent_after = opponent_goals[next_goal_minute]
                        else:
                            next_sub_minute = subs[sub_i + 1]['minute']
                            team_goals_after = team_goals[next_sub_minute]
                            opponent_after = opponent_goals[next_sub_minute]
                    team_goals_before = team_goals[sub_minute - 1]
                    opponent_before = opponent_goals[sub_minute - 1]
                    score_diff_before = team_goals_before - opponent_before
                    score_diff_after = team_goals_after - opponent_after

                    if sub_strategy == SubstitutionStrategy.AGGRESSIVE_1 :
                        if sub_type == SubstitutionType.OFFENSIVE:
                            if team_goals_after - team_goals_before > 0:
                                sub_result = SubstitutionResult.GOOD
                            else:
                                sub_result = SubstitutionResult.BAD
                        elif sub_type == SubstitutionType.NEUTRAL:
                            if team_goals_after - team_goals_before > 0 and opponent_after - opponent_before == 0:
                                sub_result = SubstitutionResult.GOOD
                            else:
                                sub_result = SubstitutionResult.BAD
                        else:
                            if opponent_after - opponent_before == 0:
                                sub_result = SubstitutionResult.GOOD
                            else:
                                sub_result = SubstitutionResult.BAD
                    elif sub_strategy == SubstitutionStrategy.RESERVED_1 :
                        if sub_type == SubstitutionType.OFFENSIVE:
                            if score_diff_after - score_diff_before > 0:
                                sub_result = SubstitutionResult.GOOD
                            else:
                                sub_result = SubstitutionResult.BAD
                        else:
                            if score_diff_after - score_diff_before >= 0:
                                sub_result = SubstitutionResult.GOOD
                            else:
                                sub_result = SubstitutionResult.BAD
                    elif sub_strategy == SubstitutionStrategy.AGGRESSIVE_2 :
                        if sub_type == SubstitutionType.OFFENSIVE:
                            if team_goals_after - team_goals_before > 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD
                        elif sub_type == SubstitutionType.NEUTRAL :
                            if score_diff_after - score_diff_before > 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD
                        else :
                            if opponent_after - opponent_before == 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD
                    elif sub_strategy == SubstitutionStrategy.RESERVED_2 :
                        if sub_type == SubstitutionType.OFFENSIVE:
                            if team_goals_after - team_goals_before > 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD
                        elif sub_type == SubstitutionType.NEUTRAL :
                            if score_diff_after - score_diff_before >= 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD
                        else :
                            if score_diff_after - score_diff_before >= 0 :
                                sub_result = SubstitutionResult.GOOD
                            else :
                                sub_result = SubstitutionResult.BAD

                    labelled_game_substitutions[str(game_id)][team_id][sub_i][sub_strategy] = sub_result
                    sub_i += 1
            team_i += 1

    with open(game_substitutions_json, "w") as f:
        json.dump(labelled_game_substitutions, f)
else:
    with open(game_substitutions_json, 'r') as fp:
        labelled_game_substitutions = json.load(fp)


  0%|          | 0/326 [00:00<?, ?it/s]

62
90
62
90
90
90
90
90
90
90
60
60
59
59
78
78
79
79
79
79
79
79
79
79
57
57
57
57
57
57
79
79
47
47
51
51
51
51
78
78
78
78
52
72
52
72
23
72
23
72
66
66
66
73
66
73
68
68
51
80
51
80
86
86
86
86
51
51
51
51
51
51
26
26
69
69
69
69
71
92
71
92
71
71
68
68
68
68
68
68
72
72
72
72
72
80
72
80
72
72
48
82
82
48
82
82
82
82
82
82
57
75
57
75
75
75
71
71
83
83
83
83
83
83
50
50
68
68
68
68
79
79
79
79
42
64
42
64
73
73
73
73
83
83
83
83
83
83
83
83
83
83
61
93
61
93
93
93
25
25
74
74
74
74
74
74
74
74
58
58
81
81
86
86
86
86
54
74
54
74
64
92
64
92
64
92
64
92
69
69
69
69
72
72
72
72
72
72
50
50
70
70
70
70
70
70
89
89
89
89
26
26
87
87
87
87
56
63
56
63
63
63
63
63
94
94
94
94
94
94
83
83
83
83
83
83
88
88
37
88
37
88
58
58
58
86
58
86
63
63
63
63
63
63
63
63
60
60
91
91
44
91
44
91
66
66
63
63
63
63
63
63
63
63
63
72
63
72
51
51
71
71
71
71
53
86
86
53
86
86
86
86
59
59
73
88
73
88
59
59
79
79
79
79
79
79
79
79
79
79
58
58
35
35
61
61
55
55
90
90
55
55
62
62
57
57
78
78
78
78
77
77
58
6

In [93]:
nb_aggressive_1_good_subs = 0
nb_aggressive_1_bad_subs = 0
nb_aggressive_2_good_subs = 0
nb_aggressive_2_bad_subs = 0


nb_reserved_1_good_subs = 0
nb_reserved_1_bad_subs = 0
nb_reserved_2_good_subs = 0
nb_reserved_2_bad_subs = 0

for game_id, game_subs in labelled_game_substitutions.items():
    for team_id, team_subs in game_subs.items():
        for sub in team_subs:
            minute = sub['minute']
            sub_type = sub['category'],
            aggressive_1_sub_result = sub[SubstitutionStrategy.AGGRESSIVE_1]
            reserved_1_sub_result = sub[SubstitutionStrategy.RESERVED_1]
            aggressive_2_sub_result = sub[SubstitutionStrategy.AGGRESSIVE_2]
            reserved_2_sub_result = sub[SubstitutionStrategy.RESERVED_2]
            if aggressive_1_sub_result == SubstitutionResult.GOOD:
                nb_aggressive_1_good_subs += 1
            elif aggressive_1_sub_result == SubstitutionResult.BAD:
                nb_aggressive_1_bad_subs += 1
            if aggressive_2_sub_result == SubstitutionResult.GOOD:
                nb_aggressive_2_good_subs += 1
            elif aggressive_2_sub_result == SubstitutionResult.BAD:
                nb_aggressive_2_bad_subs += 1
            if reserved_1_sub_result == SubstitutionResult.GOOD:
                nb_reserved_1_good_subs += 1
            elif reserved_1_sub_result == SubstitutionResult.BAD:
                nb_reserved_1_bad_subs += 1
            if reserved_2_sub_result == SubstitutionResult.GOOD:
                nb_reserved_2_good_subs += 1
            elif reserved_2_sub_result == SubstitutionResult.BAD:
                nb_reserved_2_bad_subs += 1

print(nb_aggressive_1_good_subs)
print(nb_aggressive_1_bad_subs)
print(nb_reserved_1_good_subs)
print(nb_reserved_1_bad_subs)

print(nb_aggressive_2_good_subs)
print(nb_aggressive_2_bad_subs)
print(nb_reserved_2_good_subs)
print(nb_reserved_2_bad_subs)

504
1045
1085
464
581
968
1019
530


In [94]:
if not os.path.exists(cumsum_history_player_stats_npy):
    i = 0
    history_player_stats = current_player_stats.copy()
    for season in seasons.itertuples():
        season_games = games[games.season_id == season.season_id]
        sorted_season_games = season_games.sort_values('game_day', ascending=True)

        season_first_game_index = i
        season_last_game_index = i + len(sorted_season_games)

        cumulated_player_stats_season_games = current_player_stats[season_first_game_index:season_last_game_index]

        for j in range(season_last_game_index - season_first_game_index):
            cumulated_before = cumulated_player_stats_season_games[0:j + 1]
            expected_before = np.sum(cumulated_before, axis=0) / (j + 1)
            history_player_stats[season_first_game_index + j] = expected_before

        i = season_last_game_index

    cumsum_history_player_stats = np.cumsum(history_player_stats, axis=1)

    with open(history_player_stats_npy, 'wb') as fp:
        np.save(fp, history_player_stats)

    with open(cumsum_history_player_stats_npy, 'wb') as fp:
        np.save(fp, cumsum_history_player_stats)
else :
    with open(history_player_stats_npy, 'rb') as fp:
        history_player_stats = np.load(fp)
    with open(history_player_stats_npy, 'rb') as fp:
        cumsum_history_player_stats = np.load(fp)