In [2]:
from re import match
import pandas as pd
import numpy as np
import socceraction
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.xthreat as xthreat
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab
import socceraction.vaep.formula as vaepformula
import matplotsoccer as mps
import matplotlib.pyplot as plt
import jellyfish
from tqdm import tqdm
import xgboost
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss
import warnings


warnings.filterwarnings('ignore')

In [3]:
# Competition and season identifiers used to select the Euro 2020 data.
COMPETITION_IDX: int = 55
SEASON_IDX: int = 43

In [4]:
# Create the StatsBomb loader once; the helper functions in this notebook read
# it through the notebook-level name `SBL`.
SBL = StatsBombLoader()
# Competitions table returned by the loader.
df_competitions = SBL.competitions()

In [5]:
# Look up the games of one competition season (Euro 2020 has 51 games).

def get_games_idx(competition_id, season_id):
    """Return the game ids and the home/away team ids of a competition season.

    Parameters
    ----------
    competition_id : int
        Competition identifier forwarded to ``SBL.games``.
    season_id : int
        Season identifier forwarded to ``SBL.games``.

    Returns
    -------
    games_idx : list
        Values of the ``game_id`` column, in the order returned by the loader.
    games_dict : dict
        ``{game_id: {"home_team_id": ..., "away_team_id": ...}}``.
    """
    # Forward the arguments; the previous version ignored them and always
    # requested competition 55 / season 43.
    df_games = SBL.games(competition_id=competition_id, season_id=season_id)
    games_idx = df_games["game_id"].tolist()
    games_dict = {
        game_id: {"home_team_id": home_team_id, "away_team_id": away_team_id}
        for game_id, home_team_id, away_team_id in zip(
            df_games["game_id"], df_games["home_team_id"], df_games["away_team_id"]
        )
    }
    return games_idx, games_dict

In [6]:
# Players listed by the loader for a single game, reduced to id, name and team.
def get_all_players_single_game(game_id):
    """Return the ``player_id``, ``player_name`` and ``team_id`` columns of ``SBL.players(game_id)``."""
    wanted_columns = ["player_id", "player_name", "team_id"]
    return SBL.players(game_id)[wanted_columns]

# Collect the players of every game of a competition season.
def get_all_players(competition_id, season_id):
    """Return the distinct (player_id, player_name, team_id) rows of a competition season.

    Parameters
    ----------
    competition_id, season_id : int
        Forwarded to ``get_games_idx`` to obtain the list of games.

    Returns
    -------
    pandas.DataFrame
        Per-game player tables concatenated, with exact duplicate rows removed
        and a contiguous 0..n-1 index.
    """
    games_idx, _ = get_games_idx(competition_id, season_id)
    players_df_list = []
    pbar = tqdm(games_idx)
    for game_idx in pbar:
        pbar.set_description(f"Extracting Players Data for Game ID: {game_idx}")
        players_df_list.append(get_all_players_single_game(game_id=game_idx))

    # De-duplicate first and renumber afterwards: the previous order
    # (reset_index, then drop_duplicates) left gaps in the returned index.
    return pd.concat(players_df_list).drop_duplicates().reset_index(drop=True)

# Split the raw events (passes, shots, etc.) of one game by team. Key - team ID, value - dataframe with that team's events
def get_game_events(game_id):
    """Return ``{team_id: events DataFrame}`` built from ``SBL.events(game_id)``."""
    events_by_team = {}
    for team_id, team_events in SBL.events(game_id).groupby("team_id"):
        events_by_team[team_id] = team_events
    return events_by_team

# Convert the events of every game to the socceraction SPADL format. Key - game ID, value - dataframe with actions
def get_games_actions_spadl_format(games_dict):
    """Build the SPADL action table of every game in ``games_dict``.

    Parameters
    ----------
    games_dict : dict
        ``{game_id: {"home_team_id": ..., ...}}`` as returned by ``get_games_idx``.

    Returns
    -------
    dict
        ``{game_id: DataFrame}`` where each frame is the output of
        ``spadl.statsbomb.convert_to_actions`` passed through
        ``spadl.play_left_to_right`` with the game's home team id.
    """
    games_actions_dict = {}
    pbar = tqdm(games_dict.items())
    for game_idx, team_dict in pbar:
        pbar.set_description(f"Calculating Actions Data for Game ID: {game_idx}")
        home_team_id = team_dict["home_team_id"]

        # Load the events once per game; the previous version requested the
        # same events twice, doubling the loading work.
        df_events = SBL.events(game_idx)
        df_actions = spadl.statsbomb.convert_to_actions(df_events, home_team_id)
        games_actions_dict[game_idx] = spadl.play_left_to_right(df_actions, home_team_id)

    return games_actions_dict

In [7]:
# Fit an Expected Threat (xT) model
def build_xt_model(df_actions, l=16, w=12):
    """Create ``xthreat.ExpectedThreat(l=l, w=w)``, fit it on ``df_actions`` and return it."""
    fitted_model = xthreat.ExpectedThreat(l=l, w=w)
    fitted_model.fit(df_actions)
    return fitted_model

# Predict xT values with a fitted model
def predict_xt_model(df_actions, xt_model):
    """Score the successful move actions of ``df_actions`` with ``xt_model``.

    Parameters
    ----------
    df_actions : pandas.DataFrame
        Action table passed to ``xthreat.get_successful_move_actions``.
    xt_model
        Object exposing ``predict(actions)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_id``, ``start_x``, ``start_y``, ``end_x``, ``end_y``
        and ``xT_value`` for the selected actions.
    """
    # Work on an explicit copy: the helper presumably returns a filtered
    # selection of df_actions, and adding a column to such a selection is a
    # chained assignment (SettingWithCopyWarning, hidden by the global filter).
    mov_actions = xthreat.get_successful_move_actions(df_actions).copy()
    mov_actions["xT_value"] = xt_model.predict(mov_actions)
    return mov_actions[["player_id", "start_x", "start_y", "end_x", "end_y", "xT_value"]]

# Total xT of every player over all games of the competition (summed, not averaged)
def build_cumulative_players_xt(players_df, games_actions_dict, l=16, w=12):
    """Fit an xT model on all games and return the summed xT per player.

    Parameters
    ----------
    players_df : pandas.DataFrame
        Player table with a ``player_id`` column; merged onto the result.
    games_actions_dict : dict
        ``{game_id: actions DataFrame}``.
    l, w : int
        Forwarded to ``build_xt_model``.

    Returns
    -------
    pandas.DataFrame
        One row per player present in both tables (default inner merge), with
        ``xT_value`` plus the columns of ``players_df``, sorted by ``xT_value``
        in descending order.
    """
    df_actions_cumulative = pd.concat(list(games_actions_dict.values()))
    xTModel = build_xt_model(df_actions_cumulative, l=l, w=w)
    xt_df = predict_xt_model(df_actions_cumulative, xTModel)

    # A single aggregation is enough; the previous version summed the same
    # frame twice and carried an unused list.
    players_xt_df = xt_df.groupby("player_id")["xT_value"].sum().reset_index(drop=False)
    players_xt_df = pd.merge(left=players_xt_df, right=players_df, on="player_id")
    return players_xt_df.sort_values("xT_value", ascending=False)
            

In [8]:
#xT computation
# Each step consumes the result of the previous one:
# games -> players -> SPADL actions -> per-player xT totals.
games_idx, games_dict = get_games_idx(competition_id=COMPETITION_IDX, season_id=SEASON_IDX)
players_df = get_all_players(competition_id=COMPETITION_IDX, season_id=SEASON_IDX)
games_actions_dict = get_games_actions_spadl_format(games_dict=games_dict)
players_xt_df = build_cumulative_players_xt(
    players_df=players_df,
    games_actions_dict=games_actions_dict, 
    l=16, 
    w=12
)

Extracting Players Data for Game ID: 3788744: 100%|██████████| 51/51 [01:29<00:00,  1.75s/it]
Calculating Actions Data for Game ID: 3788744: 100%|██████████| 51/51 [00:32<00:00,  1.56it/s]


# iterations:  55


In [9]:
# players_xt_df is sorted by xT_value descending, so head(10) is the top ten;
# they are displayed here in ascending order.
players_xt_df.head(10).sort_values("xT_value", ascending=True)

Unnamed: 0,player_id,xT_value,player_name,team_id
26,3311.0,0.947641,Daley Blind,941
192,7037.0,0.978972,Lorenzo Insigne,914
163,6399.0,1.039155,Gareth Frank Bale,907
133,5579.0,1.06895,Joshua Kimmich,770
33,3382.0,1.115606,Luke Shaw,768
20,3233.0,1.119845,Raheem Sterling,768
66,4353.0,1.235106,Aymeric Laporte,772
360,16554.0,1.38448,Joakim Mæhle,776
363,16570.0,1.4666,Vladimír Coufal,912
99,5211.0,1.761272,Jordi Alba Ramos,772


# Computation of Features Needed for VAEP calculation

In [10]:
# Similar to the socceraction notebooks: create features and labels for further calculation
def compute_features_and_labels(games_dict, games_actions_dict):
    """Compute the feature and label tables of every game.

    Parameters
    ----------
    games_dict : dict
        ``{game_id: {"home_team_id": ..., ...}}``.
    games_actions_dict : dict
        ``{game_id: actions DataFrame}``.

    Returns
    -------
    dict
        ``{game_id: {"features": DataFrame, "labels": DataFrame}}`` where the
        features are the column-wise concatenation of the ``fs`` transformers
        below (three-action game states) and the labels are the concatenation
        of ``lab.scores``, ``lab.concedes`` and ``lab.goal_from_shot``.
    """
    xfns = [
        fs.actiontype,
        fs.actiontype_onehot,
        fs.bodypart,
        fs.bodypart_onehot,
        fs.result,
        fs.result_onehot,
        fs.goalscore,
        fs.startlocation,
        fs.endlocation,
        fs.movement,
        fs.space_delta,
        fs.startpolar,
        fs.endpolar,
        fs.team,
        fs.time,
        fs.time_delta,
    ]
    yfns = [
        lab.scores,
        lab.concedes,
        lab.goal_from_shot,
    ]

    features_labels_dict = {}

    pbar = tqdm(games_dict.items())
    for game_idx, games_data in pbar:
        pbar.set_description(f"Calculating Features and Labels for Game ID: {game_idx}")

        # Resolve the action names once per game; previously add_names was
        # re-run for the game states and again for every label function.
        named_actions = spadl.add_names(games_actions_dict[game_idx])

        # Labels are computed before the game states, in case the game-state
        # step modifies the frame it is given.
        Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)

        gamestates = fs.gamestates(named_actions, 3)
        # NOTE(review): games_actions_dict is filled by
        # get_games_actions_spadl_format, which already applies
        # spadl.play_left_to_right; the states are mirrored a second time
        # here. Confirm that this double flip is intended.
        gamestates = fs.play_left_to_right(gamestates, games_data["home_team_id"])
        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)

        features_labels_dict[game_idx] = {"features": X, "labels": Y}

    return features_labels_dict

In [11]:
# Features and labels for every game, keyed by game id.
features_labels_dict = compute_features_and_labels(games_dict, games_actions_dict)

Calculating Features and Labels for Game ID: 3788744: 100%|██████████| 51/51 [00:15<00:00,  3.30it/s]


## Estimate scoring and conceding probabilities

In [12]:
# Similar to the socceraction notebooks: stack the per-game features and labels into X and Y for model training
def getXY(games_dict, features_labels_dict):
    """Return ``(X, Y)``: selected feature columns and the scores/concedes labels of all games.

    The feature columns are the names produced by ``fs.feature_column_names``
    for the transformers below with one action per state. Games are stacked in
    the iteration order of ``games_dict`` and both frames get a fresh 0..n-1
    index.
    """
    xfns = [
        fs.actiontype,
        fs.actiontype_onehot,
        fs.bodypart_onehot,
        fs.result,
        fs.result_onehot,
        fs.goalscore,
        fs.startlocation,
        fs.endlocation,
        fs.movement,
        fs.space_delta,
        fs.startpolar,
        fs.endpolar,
        fs.team,
        fs.time_delta,
    ]
    nb_prev_actions = 1

    Xcols = fs.feature_column_names(xfns, nb_prev_actions)
    Ycols = ["scores", "concedes"]
    feature_frames, label_frames = [], []

    pbar = tqdm(games_dict.items())
    for game_idx, _ in pbar:
        pbar.set_description(f"Processing Scoring and Conceding Features and Labels for Game ID: {game_idx}")
        per_game = features_labels_dict[game_idx]
        feature_frames.append(per_game["features"][Xcols])
        label_frames.append(per_game["labels"][Ycols])

    X = pd.concat(feature_frames).reset_index(drop=True)
    Y = pd.concat(label_frames).reset_index(drop=True)
    return X, Y

In [13]:
# Stacked training features (X) and scores/concedes labels (Y) for all games.
X, Y = getXY(games_dict, features_labels_dict)

Processing Scoring and Conceding Features and Labels for Game ID: 3788744: 100%|██████████| 51/51 [00:00<00:00, 110.33it/s]


## Train a model for Scoring and Conceding Probability Estimation

In [14]:
# Similar to the socceraction notebooks: train one XGBoost classifier per label column (scoring / conceding)
def train_model_scoring_conceding(features, labels):
    """Fit one ``XGBClassifier`` per column of ``labels`` and return ``{column: model}``."""

    def _fit_for(target):
        # Same hyper-parameters for every label column.
        classifier = xgboost.XGBClassifier(n_estimators=50, max_depth=3, n_jobs=-3, verbosity=1)
        classifier.fit(features, target)
        return classifier

    return {col: _fit_for(labels[col]) for col in labels.columns}

In [15]:
# Similar to the socceraction notebooks: predict scoring and conceding probabilities from the pre-computed features
def predict_scoring_conceding_probas(models, features, labels, games_actions_dict):
    """Return ``{game_id: DataFrame}`` with the positive-class probability per label column.

    ``models[col].predict_proba(features)`` is evaluated for every column of
    ``labels``; the second entry of each probability row is kept. The values
    are attached row by row to the stacked ``game_id`` column of
    ``games_actions_dict`` (re-indexed 0..n-1) and the result is split by game.
    """
    probabilities = {
        col: [row[1] for row in models[col].predict_proba(features)]
        for col in labels.columns
    }

    stacked_actions = pd.concat(list(games_actions_dict.values()))
    pred_df = stacked_actions[["game_id"]].reset_index(drop=True)
    for col, values in probabilities.items():
        pred_df[col] = values

    return {game_id: game_pred for game_id, game_pred in pred_df.groupby("game_id")}

In [16]:
# Train on X/Y, then predict on the same X (in-sample probabilities), grouped per game.
scoring_conceding_models = train_model_scoring_conceding(features=X, labels=Y)
grouped_pred = predict_scoring_conceding_probas(scoring_conceding_models, X, Y, games_actions_dict)

# VAEP Calculation

In [17]:
# Similar to the socceraction notebooks: calculate VAEP values from the predicted probabilities
def calculate_vaep_values(games_actions_dict, grouped_pred):
    """Sum the offensive, defensive and total VAEP values per player.

    Parameters
    ----------
    games_actions_dict : dict
        ``{game_id: actions DataFrame}``.
    grouped_pred : dict
        ``{game_id: DataFrame}`` with ``scores`` and ``concedes`` columns, one
        row per action of that game, in the same row order as the actions.

    Returns
    -------
    pandas.DataFrame
        One row per ``player_id`` with the summed columns returned by
        ``vaepformula.value``.

    Raises
    ------
    ValueError
        If a game has a different number of actions and predictions.
    """
    # Give every action a unique 0..N-1 label explicitly instead of relying on
    # whatever index add_names happens to return.
    stacked_actions = pd.concat(list(games_actions_dict.values()), ignore_index=True)
    actions_cumulative = spadl.add_names(stacked_actions).reset_index(drop=True)

    vaep_list = []
    for game_id, action_df in actions_cumulative.groupby("game_id"):
        game_pred = grouped_pred[game_id]
        if len(game_pred) != len(action_df):
            raise ValueError(
                f"Game {game_id}: {len(action_df)} actions but {len(game_pred)} predictions"
            )
        # Align predictions to actions by position; mismatched labels would
        # otherwise be aligned by index and silently produce NaN values.
        game_pred = game_pred.copy()
        game_pred.index = action_df.index

        vaep_values = vaepformula.value(action_df, game_pred["scores"], game_pred["concedes"])
        vaep_list.append(pd.concat([action_df[["player_id"]], vaep_values], axis=1))

    # Rows with any missing value (e.g. no player_id) are excluded from the sums.
    vaep_df = pd.concat(vaep_list).dropna().reset_index(drop=True)
    return vaep_df.groupby("player_id").sum().reset_index(drop=False)

In [18]:
# Per-player VAEP totals.
vaep_df = calculate_vaep_values(games_actions_dict, grouped_pred)

In [19]:
# Attach the VAEP totals to the xT table. The join key is listed once (it was
# duplicated before); the default inner join keeps players present in both.
players_xt_vaep = pd.merge(
    left=players_xt_df,
    right=vaep_df,
    on="player_id",
)

In [20]:
# First 40 rows of the merged table, displayed by ascending defensive_value.
players_xt_vaep.head(40).sort_values("defensive_value", ascending=True)

Unnamed: 0,player_id,xT_value,player_name,team_id,offensive_value,defensive_value,vaep_value
10,16532.0,0.93247,Daniel Olmo Carvajal,772,0.505328,-1.535873,-1.030545
14,7173.0,0.851108,Leonardo Bonucci,914,0.90999,-1.052405,-0.142414
29,5207.0,0.752445,Cristiano Ronaldo dos Santos Aveiro,780,2.201384,-0.806136,1.395249
2,16554.0,1.38448,Joakim Mæhle,776,3.005048,-0.508523,2.496526
38,8220.0,0.631284,David Olatukunbo Alaba,915,0.384988,-0.448035,-0.063047
39,8779.0,0.628274,Stefan Lainer,915,1.581466,-0.407063,1.174403
7,6399.0,1.039155,Gareth Frank Bale,907,-0.480698,-0.314538,-0.795235
33,2988.0,0.690735,Memphis Depay,941,0.781956,-0.27711,0.504846
4,3233.0,1.119845,Raheem Sterling,768,2.677634,-0.276107,2.401527
11,24443.0,0.867408,Mikkel Damsgaard,776,1.69157,-0.250202,1.441368


# xG Calculation

In [21]:
# Similar to the socceraction notebooks: create X and Y data for training the xG model
def create_features_labels_xg(feature_names, shots_actions):
    """Return ``(shots_actions[feature_names], shots_actions["result_success_a0"])``."""
    features = shots_actions[feature_names]
    labels = shots_actions["result_success_a0"]
    return features, labels

In [22]:
# Train the xG model with XGBoost
def train_model_xg(features, labels):
    """Fit a default ``xgboost.XGBClassifier`` on ``features`` / ``labels`` and return it."""
    classifier = xgboost.XGBClassifier()
    classifier.fit(features, labels)
    return classifier

# Predict xG values with a trained model
def predict_xg(model, features, labels):
    """Return a list with the second entry of every row of ``model.predict_proba(features)``.

    ``labels`` is accepted but not used.
    """
    positive_probabilities = []
    for class_probabilities in model.predict_proba(features):
        positive_probabilities.append(class_probabilities[1])
    return positive_probabilities

In [23]:
# Similar to the socceraction notebooks: select the shot rows and feature columns for the xG model
def calculate_xg_features(games_actions_dict, features_labels_dict):
    """Select the feature rows that belong to shot actions.

    Parameters
    ----------
    games_actions_dict : dict
        ``{game_id: actions DataFrame}``.
    features_labels_dict : dict
        ``{game_id: {"features": DataFrame, ...}}``; must iterate in the same
        game order as ``games_actions_dict`` so that rows line up by position.

    Returns
    -------
    shots_actions : pandas.DataFrame
        Rows of the stacked features whose action type name contains "shot".
    feature_names : list of str
        Feature columns to train on.
    actions_cumulative : pandas.DataFrame
        All actions stacked, with names added.
    shot_idx : list of int
        Row positions (0-based) of the shot actions in ``actions_cumulative``.
    """
    xfns = [
        fs.actiontype_onehot,
        fs.bodypart_onehot,
        fs.startlocation,
        fs.movement,
        fs.space_delta,
        fs.startpolar,
        fs.team,
    ]
    nb_prev_actions = 2
    feature_names = fs.feature_column_names(xfns, nb_prev_actions)
    # Drop the action-type indicators of the current action (a0) ...
    feature_names = [name for name in feature_names if not match('type_[a-z_]+_a0', name)]
    # ... and its movement columns.
    for dropped in ("dx_a0", "dy_a0", "movement_a0"):
        feature_names.remove(dropped)

    actions_cumulative = spadl.add_names(pd.concat(list(games_actions_dict.values())))
    features_cumulative = pd.concat([entry["features"] for entry in features_labels_dict.values()])

    # Select shots by row POSITION. The previous version collected index
    # labels via iterrows() and passed them to .iloc, which picks wrong rows
    # whenever the stacked index is not exactly 0..N-1. A label-based boolean
    # mask is avoided because the stacked frames may repeat index labels.
    is_shot = actions_cumulative["type_name"].str.contains("shot", na=False, regex=False).to_numpy()
    shot_idx = np.flatnonzero(is_shot).tolist()
    shots_actions = features_cumulative.iloc[shot_idx]
    return shots_actions, feature_names, actions_cumulative, shot_idx

# Create a dataframe with players and their summed xG values
def get_players_xg(actions_cumulative, pred_xg, shot_idx=None):
    """Sum the predicted xG per player.

    Parameters
    ----------
    actions_cumulative : pandas.DataFrame
        All actions; only the ``player_id`` column is read. Not modified.
    pred_xg : sequence of float
        One xG prediction per shot, in the order of ``shot_idx``.
    shot_idx : sequence of int, optional
        Row positions of the shots in ``actions_cumulative``. When omitted,
        the notebook-level variable ``shot_idx`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``player_id`` and ``xG_value`` (0 for players without shots).
    """
    if shot_idx is None:
        # Backward compatibility with existing two-argument calls.
        shot_idx = globals()["shot_idx"]

    # Copy so the caller's frame is untouched, and start from a float column
    # so that writing probabilities does not change the column dtype.
    xg_raw = actions_cumulative[["player_id"]].copy()
    xg_raw["xG_value"] = 0.0
    xg_raw.iloc[shot_idx, xg_raw.columns.get_loc("xG_value")] = np.asarray(pred_xg, dtype=float)
    return xg_raw.groupby("player_id").sum().reset_index(drop=False)

In [24]:
#Run xG calculation
# The model is trained and evaluated on the same shots (in-sample predictions).
# Note: get_players_xg reads the notebook-level `shot_idx` assigned on the first line below.
shots_actions, feature_names, actions_cumulative, shot_idx = calculate_xg_features(games_actions_dict, features_labels_dict)
X_xg, Y_xg = create_features_labels_xg(feature_names, shots_actions)
xGmodel = train_model_xg(features=X_xg, labels=Y_xg)
pred_xg = predict_xg(xGmodel, X_xg, Y_xg)
xg_df = get_players_xg(actions_cumulative, pred_xg)

In [25]:
# Merge the xT, VAEP and xG results into a single data frame. The join key is
# listed once (it was duplicated before).
players_xt_vaep_xg = pd.merge(
    left=players_xt_vaep,
    right=xg_df,
    on="player_id",
)

In [26]:
# Preview of the combined xT / VAEP / xG table.
players_xt_vaep_xg.head()

Unnamed: 0,player_id,xT_value,player_name,team_id,offensive_value,defensive_value,vaep_value,xG_value
0,5211.0,1.761272,Jordi Alba Ramos,772,2.099831,-0.079838,2.019993,0.053367
1,16570.0,1.4666,Vladimír Coufal,912,1.772318,0.310737,2.083055,0.021104
2,16554.0,1.38448,Joakim Mæhle,776,3.005048,-0.508523,2.496526,1.838846
3,4353.0,1.235106,Aymeric Laporte,772,1.261502,0.306275,1.567777,0.935935
4,3233.0,1.119845,Raheem Sterling,768,2.677634,-0.276107,2.401527,3.037921


# xGA Calculation

# Merging the calculated data frames with Transfermarkt market values

In [27]:
def read_and_prepare_tm_data(path, season=2021):
    """Load the market-value CSV and keep one complete row per player for one season.

    Parameters
    ----------
    path : str
        CSV file with at least the columns ``player_name``, ``season``
        (text of the form ``"<start>/<end>"``), ``market_value_eur``, ``dob``
        and ``position_code``.
    season : int, default 2021
        Value of the part after the ``/`` in ``season`` to keep.

    Returns
    -------
    pandas.DataFrame
        The five columns above, restricted to ``season``, without rows that
        have missing values, one row per ``player_name`` (first occurrence),
        with a contiguous 0..n-1 index.
    """
    kept_columns = ["player_name", "season", "market_value_eur", "dob", "position_code"]

    tm_raw = pd.read_csv(path)
    season_end = tm_raw["season"].str.split("/").str[1].astype(int)

    # Chained, non-inplace operations: the previous version mutated a filtered
    # slice in place and reset the index before dropping duplicates, which
    # left gaps in the returned index.
    return (
        tm_raw.assign(season=season_end)
        .loc[lambda frame: frame["season"] == season, kept_columns]
        .dropna()
        .drop_duplicates(subset=["player_name"])
        .reset_index(drop=True)
    )
    

In [28]:
def string_similarity(a, b):
    """Return the Jaro similarity of strings ``a`` and ``b`` as computed by jellyfish.

    ``jaro_distance`` is the deprecated name of this function and is absent
    from newer jellyfish releases, so ``jaro_similarity`` is preferred and the
    old name is only used when the new one does not exist.
    """
    jaro = getattr(jellyfish, "jaro_similarity", None)
    if jaro is None:
        jaro = jellyfish.jaro_distance
    return jaro(a, b)

In [29]:
def get_similarity_idx(player_names_sb, player_names_tm, threshold=0.78):
    """Match every StatsBomb player name to its most similar Transfermarkt name.

    Parameters
    ----------
    player_names_sb : list of str
        Names to match.
    player_names_tm : list of str
        Candidate names; must not be empty.
    threshold : float, default 0.78
        Minimum ``string_similarity`` score for a match to be accepted.

    Returns
    -------
    player_names_mapping : dict
        ``{sb_name: best_tm_name}`` for names whose best score is at least
        ``threshold``.
    players_to_drop : dict
        ``{sb_name: best_tm_name}`` for names whose best score is below
        ``threshold`` (the rejected candidate is kept for inspection).
    """
    players_to_drop = {}
    player_names_mapping = {}
    for pn_sb in tqdm(player_names_sb):
        similarity_idx = [string_similarity(pn_sb, pn_tm) for pn_tm in player_names_tm]
        matching_idx = int(np.argmax(similarity_idx))
        best_match = player_names_tm[matching_idx]
        if similarity_idx[matching_idx] < threshold:
            players_to_drop[pn_sb] = best_match
        else:
            player_names_mapping[pn_sb] = best_match
    return player_names_mapping, players_to_drop
            

In [32]:
def merge_tm_detadata(sb_data, tm_data, player_names_mapping, players_to_drop, reference_date="2021-06-01"):
    """Attach market value and age from the Transfermarkt table to the player metrics.

    Parameters
    ----------
    sb_data : pandas.DataFrame
        Player metrics with a ``player_name`` column. Not modified.
    tm_data : pandas.DataFrame
        Table with ``player_name``, ``market_value_eur`` and ``dob`` columns.
    player_names_mapping : dict
        ``{lower-cased sb name: lower-cased tm name}``.
    players_to_drop : dict
        Lower-cased sb names (keys) to exclude.
    reference_date : str, default "2021-06-01"
        Date at which the age is computed.

    Returns
    -------
    pandas.DataFrame
        Columns of ``sb_data`` (its ``player_name`` appears as
        ``player_name_x`` because both inputs carry that column), plus
        ``market_value_eur`` and ``age`` in whole years (days // 365).
        Players without a Transfermarkt match keep missing values.

    Raises
    ------
    KeyError
        If a kept player name is missing from ``player_names_mapping``.
    """
    lowered_names = sb_data["player_name"].str.lower()
    keep = ~lowered_names.isin(list(players_to_drop))

    # Built without in-place mutation of a filtered slice.
    sb_data_clean = (
        sb_data[keep]
        .assign(player_name_tm=lowered_names[keep].map(lambda name: player_names_mapping[name]))
        .drop_duplicates(subset=["player_name", "player_name_tm"])
        .dropna()
    )

    # The mapping values are lower-cased, so join on a lower-cased key rather
    # than on the raw Transfermarkt name (identical when the names are already
    # lower-case; fixes the join when they are not).
    tm_lookup = tm_data[["player_name", "market_value_eur", "dob"]].copy()
    tm_lookup["_tm_key"] = tm_lookup["player_name"].str.lower()
    tm_lookup = tm_lookup.drop_duplicates(subset="_tm_key")

    data_upd = pd.merge(
        left=sb_data_clean,
        right=tm_lookup,
        left_on="player_name_tm",
        right_on="_tm_key",
        how="left",
    )
    data_upd["dob"] = pd.to_datetime(data_upd["dob"])
    # Vectorised so that an unmatched player (missing dob) yields a missing
    # age instead of raising on int(NaN).
    data_upd["age"] = (pd.to_datetime(reference_date) - data_upd["dob"]).dt.days // 365

    return data_upd.drop(["player_name_tm", "player_name_y", "_tm_key", "dob"], axis=1)

In [33]:
# Load the Transfermarkt valuations, fuzzy-match the lower-cased player names
# of both sources, then merge market value and age onto the metrics table.
tm_df = read_and_prepare_tm_data(path="tm_player_valuations_all_1617-2122_latest.csv")
player_names_sb = players_xt_vaep_xg["player_name"].str.lower().tolist()
player_names_tm = tm_df["player_name"].str.lower().tolist()

player_names_mapping, players_to_drop = get_similarity_idx(
    player_names_sb, 
    player_names_tm
)
data_final = merge_tm_detadata(players_xt_vaep_xg, tm_df, player_names_mapping, players_to_drop)

100%|██████████| 485/485 [00:06<00:00, 73.20it/s]


In [34]:
# NOTE(review): repeats the merge_tm_detadata call that ends the previous cell,
# with identical arguments.
data_final = merge_tm_detadata(players_xt_vaep_xg, tm_df, player_names_mapping, players_to_drop)

In [35]:
# Preview of the final table: metrics plus market value and age.
data_final.head()

Unnamed: 0,player_id,xT_value,player_name_x,team_id,offensive_value,defensive_value,vaep_value,xG_value,market_value_eur,age
0,5211.0,1.761272,Jordi Alba Ramos,772,2.099831,-0.079838,2.019993,0.053367,30000000,32
1,16570.0,1.4666,Vladimír Coufal,912,1.772318,0.310737,2.083055,0.021104,8000000,28
2,16554.0,1.38448,Joakim Mæhle,776,3.005048,-0.508523,2.496526,1.838846,10000000,24
3,4353.0,1.235106,Aymeric Laporte,772,1.261502,0.306275,1.567777,0.935935,60000000,27
4,3233.0,1.119845,Raheem Sterling,768,2.677634,-0.276107,2.401527,3.037921,110000000,26
