In [1]:
import pandas as pd
import numpy as np
import os
import tqdm

In [2]:
datafolder = "../data-fifa"
spadl_h5 = os.path.join(datafolder, "spadl-statsbomb.h5")
dfs = []
with pd.HDFStore(spadl_h5) as spadlstore:
    games = (
        spadlstore["games"]
        .merge(spadlstore["competitions"], how='left')
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left'))
    
    game_ids = games.game_id.unique()
    for game_id in tqdm.tqdm(game_ids):
        actions = spadlstore[f"actions/game_{game_id}"]
        actions = (
                actions.merge(spadlstore["actiontypes"], how="left")
                .merge(spadlstore["results"], how="left")
                .merge(spadlstore["bodyparts"], how="left")
                .merge(spadlstore["players"], how="left")
                .merge(spadlstore["teams"], how="left")
        )
        dfs.append(actions)

100%|██████████████████████████████████████| 64/64 [00:08<00:00,  7.94it/s]


In [3]:
valid_df = pd.concat(dfs, ignore_index=True) 
train_df = pd.read_csv("../xt_pre_data.csv")

In [4]:
train_df.head(5)

Unnamed: 0,period_id,time_minutes,player_id,team_id,type_name,start_x,start_y,end_x,end_y,result_name,...,player_name,jersey_number,team_name,country_name,extra,country_id,timestamp,type_id,result_id,bodypart_id
0,1,0.02,44120,13,pass,52.605,33.796,44.1,34.748,success,...,Pierre-Emerick Aubameyang,14,Arsenal,Arsenal,{},13,00:00:01.200000,0.0,1.0,0
1,1,0.02,89401,13,dribble,44.1,34.748,44.52,33.116,success,...,Granit Xhaka,34,Arsenal,Arsenal,{},13,00:00:01.200000,21.0,1.0,0
2,1,0.05,328983,13,dribble,39.585,24.82,41.58,23.256,success,...,Matteo Guendouzi,29,Arsenal,Arsenal,{},13,00:00:03,21.0,1.0,0
3,1,0.05,89401,13,pass,44.52,33.116,39.585,24.82,success,...,Granit Xhaka,34,Arsenal,Arsenal,{},13,00:00:03,0.0,1.0,0
4,1,0.08,239872,13,dribble,49.665,21.76,45.57,20.876,success,...,Nicolas Pépé,19,Arsenal,Arsenal,{},13,00:00:04.800000,21.0,1.0,0


## 10 features of the GAM 

* distance to goal at end of game state
* angle to goal at end of game state
* x-coordinate at end of game state
* was last action successful
* was last action foul
* goalscore difference at end of gs
* forward movement of the last action
* forward movt of the second last action
* forward movt of the third last action
* time b/w the last and third last action




In [5]:
def get_angle(val):
    x1, y1, = 105, 68/2
    x2, y2 = val
    
    dx = x2 - x1
    dy = y2 - y1
    result = np.arctan2(dy, dx)
    return result if result>=0 else result + 2*np.pi

def get_features_and_labels(df):
    """Create the 10 features and the targets and return them
    
    Parameters:
    ------------
    
    df: the SPADL format dataframe
    
    Returns:
    ------------
    Xs:pd.DataFrame, ys:pd.DataFrame
    
    Xs: the Dataframe containing the 10 features
    ys: the two targets - scoring and conceding
    """
    
    features = ["dist_to_goal", "angle_to_goal", "end_x", "last_action_succ",
                "last_action_foul", "goal_diff", "forward_movt_l", "forward_movt_l2",
                "forward_movt_l3", "time_del_l1_l3"]

    labels = ["scoring", "conceding"]
    
    df["dist_to_goal"] = ((105 - df["end_x"])**2 + ((68/2) - df["end_y"])**2)**0.5
    df['angle_to_goal'] = df[['end_x', 'end_y']].apply(get_angle, axis=1)
    df[["last_action", "last_result_name"]] = df[["type_name", "result_name"]].shift(1)
    df["last_action_succ"] = np.where(df["last_result_name"] == "success", 1, 0)
    df["last_action_foul"] = np.where(df["last_action"]=="foul", 1, 0)
    
    #######create features and labels game-by-game
    groups_list = []

    for _, group in df.groupby("game_id"):
        group = group.reset_index(drop=True)
        team_1, team_2 = sorted(group["team_name"].unique())
        group["team_1"] = team_1
        group["team_2"] = team_2

        group["team_1_goals"] = 0
        group["team_2_goals"] = 0

        ##scores
        goal_indices = group.query("type_name==['shot', 'shot_freekick', 'shot_penalty'] & result_name==['success', 'owngoal']").index

        for idx in goal_indices:

            if group.loc[idx, "result_name"] == "success":
                if group.loc[idx, "team_name"] == team_1:           
                    group.loc[idx+1:, "team_1_goals"]+=1

                elif group.loc[idx, "team_name"] == team_2:
                    group.loc[idx+1:, "team_2_goals"]+=1

            ####own goals means other team's goal count increases
            elif group.loc[idx, "result_name"] == "owngoal":
                if group.loc[idx, "team_name"] == team_1:           
                    group.loc[idx+1:, "team_2_goals"]+=1

                elif group.loc[idx, "team_name"] == team_2:
                    group.loc[idx+1:, "team_1_goals"]+=1 

        ## goal difference            
        group["t1_goal_diff"] = group["team_1_goals"] - group["team_2_goals"]
        group["t2_goal_diff"] = -group["t1_goal_diff"]
        group["goal_diff"] = np.where(group["team_name"]==group["team_1"], group["t1_goal_diff"], group["t2_goal_diff"])            

        ##forward movemements    
        group["forward_movt"] = group["end_x"] - group["start_x"]
        group["forward_movt_l"] = group["forward_movt"].shift(1) ##forward movement of last action
        group["forward_movt_l2"] = group["forward_movt"].shift(2) ##forward movement of second last action
        group["forward_movt_l3"] = group["forward_movt"].shift(3) ##forward movement of third last action

        ##time passed b/w actions
        group["time_l"] = group["time_seconds"].shift(1) ##time of the last action
        group["time_l3"] = group["time_seconds"].shift(3) ##time of the last action
        group["time_del_l1_l3"] = group["time_l"] - group["time_l3"] ##time passed b/w last and third last action


        ###targets
        team_1_goal_idx = group.query("((team_name == @team_1) &\
                                               (type_name==['shot', 'shot_freekick', 'shot_penalty']) &\
                                               (result_name=='success')) |\
                                               ((team_name != @team_1) & (result_name=='owngoal'))").index

        team_2_goal_idx = group.query("((team_name == @team_2) &\
                                               (type_name==['shot', 'shot_freekick', 'shot_penalty']) &\
                                               (result_name=='success')) |\
                                               ((team_name != @team_2) & (result_name=='owngoal'))").index

        ##create labels
        team_1_scoring = np.zeros(len(group), dtype=int)
        team_2_scoring = np.zeros(len(group), dtype=int)

        for i in team_1_goal_idx:
            team_1_scoring[i-10:i]=1

        for i in team_2_goal_idx:
             team_2_scoring[i-10:i]=1 

        group["team_1_scoring"] = team_1_scoring
        group["team_2_scoring"] = team_2_scoring

        group["team_2_conceding"] = group["team_1_scoring"]
        group["team_1_conceding"] = group["team_2_scoring"]

        group["scoring"] = np.where(group["team_name"]==team_1, group["team_1_scoring"], group["team_2_scoring"])
        group["conceding"] = np.where(group["team_name"]==team_1, group["team_1_conceding"], group["team_2_conceding"])

        groups_list.append(group)   

    new_df = pd.concat(groups_list, ignore_index=True) 
    new_df = new_df[features + labels].dropna(subset=features)
    
    Xs = new_df[features]
    ys = new_df[labels]
    
    return Xs, ys

In [6]:
train_Xs, train_ys = get_features_and_labels(train_df)
valid_Xs, valid_ys = get_features_and_labels(valid_df)

In [7]:
print("Training Data:", train_Xs.shape, train_ys.shape)
print("Validation Data", valid_Xs.shape, valid_ys.shape)

Training Data: (502896, 10) (502896, 2)
Validation Data (128745, 10) (128745, 2)


In [8]:
train_ys["scoring"].value_counts(normalize=True)

0    0.990787
1    0.009213
Name: scoring, dtype: float64

In [9]:
valid_ys["scoring"].value_counts(normalize=True)

0    0.989157
1    0.010843
Name: scoring, dtype: float64

In [11]:
train_Xs.to_csv("../preprocessed_data/train_Xs.csv", index=False)
train_ys.to_csv("../preprocessed_data/train_ys.csv", index=False)

valid_Xs.to_csv("../preprocessed_data/valid_Xs.csv", index=False)
valid_ys.to_csv("../preprocessed_data/valid_ys.csv", index=False)

In [12]:
train_df.head()

Unnamed: 0,period_id,time_minutes,player_id,team_id,type_name,start_x,start_y,end_x,end_y,result_name,...,timestamp,type_id,result_id,bodypart_id,dist_to_goal,angle_to_goal,last_action,last_result_name,last_action_succ,last_action_foul
0,1,0.02,44120,13,pass,52.605,33.796,44.1,34.748,success,...,00:00:01.200000,0.0,1.0,0,60.904593,3.129311,,,0,0
1,1,0.02,89401,13,dribble,44.1,34.748,44.52,33.116,success,...,00:00:01.200000,21.0,1.0,0,60.48646,3.156208,pass,success,1,0
2,1,0.05,328983,13,dribble,39.585,24.82,41.58,23.256,success,...,00:00:03,21.0,1.0,0,64.323634,3.30941,dribble,success,1,0
3,1,0.05,89401,13,pass,44.52,33.116,39.585,24.82,success,...,00:00:03,0.0,1.0,0,66.055996,3.281017,dribble,success,1,0
4,1,0.08,239872,13,dribble,49.665,21.76,45.57,20.876,success,...,00:00:04.800000,21.0,1.0,0,60.861846,3.358936,pass,success,1,0


In [14]:
train_df["game_id"].nunique()

288