In [None]:
import soccerdata as sd
import socceraction.spadl as spadl

# Get the data

In [None]:
# WhoScored scraper restricted to the 2018 World Cup.
ws = sd.WhoScored(
    leagues="INT-World Cup",
    seasons=2018,
)

In [None]:
# Read the schedule exposed by the scraper and preview the first rows.
# NOTE(review): the name says "epl" but `ws` is configured for "INT-World Cup".
epl_schedule = ws.read_schedule()
epl_schedule.head()

In [None]:
# Ask the scraper for a loader object (instead of a plain frame) for one match.
loader = ws.read_events(
    match_id=1249919,
    output_fmt="loader",
)
# Game metadata of the same competition and season, as exposed by that loader.
df_games = loader.games(
    competition_id="INT-World Cup",
    season_id="2018",
)

In [None]:
# Events of match 1249919 as returned by the loader.
events = loader.events(game_id=1249919)

In [None]:

# `loader.events` returns provider events, while the rest of this notebook
# (action_id, result_name, VAEP features/labels) works on SPADL actions.
# Convert to SPADL first, then attach the readable type/result/bodypart names.
home_team_id = df_games.loc[df_games["game_id"] == 1249919, "home_team_id"].iloc[0]
actions = spadl.add_names(spadl.opta.convert_to_actions(events, home_team_id))

In [None]:
# Preview the first actions.
actions.head()

# Plot the events

In [None]:
import matplotsoccer

# Select the 5 actions preceding the 2-0
# Row position of the shot of interest (hard-coded for this match).
shot = 1700
# Window of five rows ending at the shot; copied so columns can be added safely.
a = actions[shot-4:shot+1].copy()

# Print the game date and timestamp of the goal
# g = game.iloc[0]
# minute = int((a.period_id.values[0]-1) * 45 + a.time_seconds.values[0] // 60)
# game_info = f"{g.game_date} {g.home_team_name} {g.home_score}-{g.away_score} {g.away_team_name} {minute + 1}'"
# print(game_info)

# Plot the actions
def nice_time(row):
    """Format an action's match clock as ``"<minute>m<second>s"``.

    Parameters
    ----------
    row : object
        Anything exposing ``period_id`` and ``time_seconds`` attributes (e.g. a
        DataFrame row); ``time_seconds`` is taken as seconds since the start of
        the period.

    Returns
    -------
    str
        Elapsed match time, e.g. ``"46m1s"``.
    """
    # Kick-off minute of each period. Extra-time halves last 15 minutes, so the
    # plain ``(period - 1) * 45`` rule overshoots from period 4 onwards; it is
    # kept only as a fallback for period ids not listed here.
    period_start = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}
    offset = period_start.get(row.period_id, (row.period_id - 1) * 45)
    minute = int(offset + row.time_seconds // 60)
    second = int(row.time_seconds % 60)
    return f"{minute}m{second}s"

a["nice_time"] = a.apply(nice_time, axis=1)

# Use the readable name columns when present; otherwise fall back to the id
# columns so the plot still renders on frames that only carry ids.
player_col = "player" if "player" in a.columns else "player_id"
team_col = "team" if "team" in a.columns else "team_id"
labels = a[["nice_time", "type_name", player_col, team_col]]

ax = matplotsoccer.actions(
    location=a[["start_x", "start_y", "end_x", "end_y"]],
    action_type=a.type_name,
    team=a[team_col],
    result=a.result_name == "success",
    label=labels,
    labeltitle=["time", "actiontype", "player", "team"],
    zoom=False,
    figsize=6,
)

# Compute Features and Labels

In [None]:
import os
import warnings
import tqdm
import pandas as pd
# Silence pandas PerformanceWarning for the rest of the notebook.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Enable IPython's autoreload extension in mode 2 (reload modules before running code).
%load_ext autoreload
%autoreload 2
import socceraction.spadl as spadl
import socceraction.vaep.features as fs
import socceraction.vaep.labels as lab

# Compute features

In [None]:
# Configure file and folder names
# Folder holding the HDF5 stores, and the feature/label store paths inside it.
datafolder = "../data"
features_h5, labels_h5 = (
    os.path.join(datafolder, store_name)
    for store_name in ("features.h5", "labels.h5")
)

In [None]:
# Feature transformers applied to every game state.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    fs.time,
    fs.time_delta
]

# The store cannot be created inside a folder that does not exist yet.
os.makedirs(datafolder, exist_ok=True)

# Look the home team up in the game metadata rather than hard-coding its id.
home_team_id = df_games.loc[df_games["game_id"] == 1249919, "home_team_id"].iloc[0]

with pd.HDFStore(features_h5) as featurestore:
    # Game states made of the current action plus its predecessors (3 actions in total).
    gamestates = fs.gamestates(spadl.add_names(actions), 3)
    gamestates = fs.play_left_to_right(gamestates, home_team_id)

    X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
    featurestore.put("game_1249919", X, format='table')

In [None]:
# Label functions: one output column each.
yfns = [
    lab.scores,
    lab.concedes,
    lab.goal_from_shot,
]

with pd.HDFStore(labels_h5) as labelstore:
    # Every label function receives a freshly named copy of the actions.
    label_frames = [label_fn(spadl.add_names(actions)) for label_fn in yfns]
    Y = pd.concat(label_frames, axis=1)
    labelstore.put(f"game_1249919", Y, format='table')

# Train Model

In [None]:
# Match used for training, evaluation and rating below.
game_id = 1249919

In [None]:
# 1. Select feature set X
# Feature transformers selected for training; the commented-out entries are not selected.
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    #fs.bodypart,
    fs.bodypart_onehot,
    fs.result,
    fs.result_onehot,
    fs.goalscore,
    fs.startlocation,
    fs.endlocation,
    fs.movement,
    fs.space_delta,
    fs.startpolar,
    fs.endpolar,
    fs.team,
    #fs.time,
    fs.time_delta,
    #fs.actiontype_result_onehot
]
# Passed to fs.feature_column_names together with xfns.
nb_prev_actions = 1

# Names of the feature columns to read back from the feature store.
Xcols = fs.feature_column_names(xfns, nb_prev_actions)

def getXY(game_id, Xcols):
    """Load the stored features and labels of one or several games.

    Parameters
    ----------
    game_id : scalar or list-like
        A single game id, or a collection of game ids. Each id must have a
        ``game_<id>`` table in both ``features_h5`` and ``labels_h5``.
    Xcols : list of str
        Feature columns to keep from the feature tables.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` with the selected feature columns and ``Y`` with the ``scores``
        and ``concedes`` labels, both re-indexed from 0 in game order.
    """
    # Accept a lone id as well as a collection, without shadowing the argument.
    game_ids = list(game_id) if pd.api.types.is_list_like(game_id) else [game_id]

    # 1. Select the feature columns
    X = pd.concat(
        [
            pd.read_hdf(features_h5, f"game_{gid}")[Xcols]
            for gid in tqdm.tqdm(game_ids, desc="Selecting features")
        ]
    ).reset_index(drop=True)

    # 2. Select label Y
    Ycols = ["scores", "concedes"]
    Y = pd.concat(
        [
            pd.read_hdf(labels_h5, f"game_{gid}")[Ycols]
            for gid in tqdm.tqdm(game_ids, desc="Selecting label")
        ]
    ).reset_index(drop=True)
    return X, Y

# Load the training matrices and list their columns.
X, Y = getXY(game_id, Xcols)
for frame_label, frame in (("X:", X), ("Y:", Y)):
    print(frame_label, list(frame.columns))

In [None]:
# Train a model
# 3. train classifiers F(X) = Y
import xgboost

# Predictions are filled in by the evaluation cell; one classifier per label column.
Y_hat = pd.DataFrame()
models = {}
xgb_params = dict(
    n_estimators=50,
    max_depth=3,
    n_jobs=-3,
    verbosity=1,
    enable_categorical=True,
)
for col in Y.columns.tolist():
    model = xgboost.XGBClassifier(**xgb_params)
    model.fit(X, Y[col])
    models[col] = model

# Evaluate the model

In [None]:
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss

# NOTE(review): the evaluation set is the same X, Y the models were fitted on,
# so the scores printed below are in-sample.
testX, testY = X, Y

def evaluate(y, y_hat):
    """Print Brier score, log loss and ROC AUC of ``y_hat`` against ``y``.

    For the Brier score and the log loss, the value in parentheses is the score
    divided by the score of a constant prediction equal to the mean of ``y``.
    """
    positive_rate = sum(y) / len(y)
    baseline = [positive_rate] * len(y)

    brier = brier_score_loss(y, y_hat)
    brier_ratio = brier / brier_score_loss(y, baseline)
    print(f"  Brier score: {brier:.5f} ({brier_ratio:.5f})")

    ll = log_loss(y, y_hat)
    ll_ratio = ll / log_loss(y, baseline)
    print(f"  log loss score: {ll:.5f} ({ll_ratio:.5f})")

    auc = roc_auc_score(y, y_hat)
    print(f"  ROC AUC: {auc:.5f}")

for col in testY.columns:
    # Keep the second entry (index 1) of every predict_proba row.
    class_probabilities = models[col].predict_proba(testX)
    Y_hat[col] = [row_probs[1] for row_probs in class_probabilities]
    print(f"### Y: {col} ###")
    evaluate(testY[col], Y_hat[col])

# Save Predictions

In [None]:
# HDF5 store receiving the per-game predictions.
predictions_h5 = os.path.join(datafolder, "predictions.h5")

In [None]:
# get rows with game id per action
# One "game_id" row per action, in action order, to key the predictions by game.
A = pd.concat(
    [
        actions.loc[actions["game_id"] == gid, ["game_id"]]
        for gid in tqdm.tqdm([game_id], "Loading game ids")
    ]
).reset_index(drop=True)

# Put the game ids next to the predictions and write one table per game.
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
with pd.HDFStore(predictions_h5) as predictionstore:
    for k, df in tqdm.tqdm(grouped_predictions, desc="Saving predictions per game"):
        df = df.reset_index(drop=True)
        predictionstore.put(f"game_{int(k)}", df[Y_hat.columns])

In [None]:
# Player and team tables of the selected game, as exposed by the loader.
players = loader.players(game_id=game_id)
teams = loader.teams(game_id=game_id)

In [None]:
# Display the team table.
teams

In [None]:
# List the columns available on the actions frame.
actions.columns

# Compute VAEP Players

In [None]:
import socceraction.vaep.formula as vaepformula

In [None]:
# Rate every action: join player/team info, read the stored predictions and
# turn them into VAEP values. `playersR` is built from `A` in a later cell, so
# nothing here may depend on it.
A = []
for game in tqdm.tqdm([game_id], desc="Rating actions"):
    game_actions = (
        actions[actions.game_id == game]
        .merge(players, how="left")
        .merge(teams, how="left")
        .sort_values(["game_id", "period_id", "action_id"])
        .reset_index(drop=True)
    )
    preds = pd.read_hdf(predictions_h5, f"game_{game}")
    values = vaepformula.value(game_actions, preds.scores, preds.concedes)
    # Column-wise concat relies on all three frames sharing the same 0..n-1 index.
    A.append(pd.concat([game_actions, preds, values], axis=1))
A = pd.concat(A).sort_values(["game_id", "period_id", "time_seconds"]).reset_index(drop=True)
A.columns

In [None]:
# Helper column so that summing it yields the number of actions per player.
A["count"] = 1

# Per-player totals of the VAEP values and action counts; the name is taken
# from the player's first row.
value_cols = ["vaep_value", "offensive_value", "defensive_value", "count"]
aggregations = {value_col: "sum" for value_col in value_cols}
aggregations["player_name"] = "first"

playersR = (
    A[["player_id", "player_name", *value_cols]]
    .groupby(["player_id"])
    .agg(aggregations)
    .reset_index()
)

# Fix the column order, make the ids integers and show the ten best players.
playersR = playersR[["player_id", "player_name", *value_cols]]
playersR['player_id'] = playersR['player_id'].astype(int)
playersR.sort_values("vaep_value", ascending=False).head(10)

In [None]:
# Normalize for minutes played
# Total minutes played per player.
mp = players[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()

stats = playersR.merge(mp)
# Keep players with at least one full game (90 minutes inclusive); copy so the
# rating columns are added to an independent frame rather than a filtered view.
stats = stats[stats.minutes_played >= 90].copy()
# Ratings are the summed values rescaled to a per-90-minutes basis.
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
stats["offensive_rating"] = stats.offensive_value * 90 / stats.minutes_played
stats["defensive_rating"] = stats.defensive_value * 90 / stats.minutes_played
stats.sort_values("vaep_rating", ascending=False)[:10]

In [None]:
# List the columns available on the games frame.
df_games.columns

In [None]:
# team_id -> team_name lookup. Mapping through it leaves NaN for a team that is
# missing from `teams` instead of failing on an empty selection, and avoids
# re-filtering `teams` for every game row.
team_names = teams.drop_duplicates("team_id").set_index("team_id")["team_name"]
df_games['home_team_name'] = df_games['home_team_id'].map(team_names)
df_games['away_team_name'] = df_games['away_team_id'].map(team_names)

### (optional) inspect Portugal's top 10 most valuable non-shot actions

In [None]:
import matplotsoccer

# Most valuable actions first, restricted to Portugal and excluding shots.
sorted_A = (
    A.sort_values("vaep_value", ascending=False)
    .loc[lambda frame: frame.team_name == "Portugal"]  # view only actions from Portugal
    .loc[lambda frame: ~frame.type_name.str.contains("shot")]  # eliminate shots
)

def get_time(period_id,time_seconds):
    """Format a match clock as ``"<minute>m<second>s"``.

    Parameters
    ----------
    period_id : int or float
        Period of the action (1 and 2 are the regular halves).
    time_seconds : int or float
        Seconds elapsed since the start of that period.

    Returns
    -------
    str
        Elapsed match time, e.g. ``"46m1s"``.
    """
    # Kick-off minute of each period. Extra-time halves last 15 minutes, so the
    # plain ``(period - 1) * 45`` rule overshoots from period 4 onwards; it is
    # kept only as a fallback for period ids not listed here.
    period_start = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}
    offset = period_start.get(period_id, (period_id - 1) * 45)
    m = int(offset + time_seconds // 60)
    s = int(time_seconds % 60)
    return f"{m}m{s}s"

# Kick-off minute of each period (extra-time halves are 15 minutes long).
period_start = {1: 0, 2: 45, 3: 90, 4: 105, 5: 120}

# head(10) yields at most ten rows, so fewer qualifying actions do not break the loop.
for row in sorted_A.head(10).itertuples():
    i = row.Index
    # Context window: up to three actions before and one after the rated action.
    # The start is clamped at 0 because a negative start counts from the end.
    a = A[max(i - 3, 0) : i + 2].copy()

    g = list(df_games[df_games.game_id == a.game_id.values[0]].itertuples())[0]
    game_info = f"{g.game_date} {g.home_team_name} {g.home_score}-{g.away_score} {g.away_team_name}"
    minute = int(period_start.get(row.period_id, (row.period_id - 1) * 45) + row.time_seconds // 60)
    print(f"{game_info} {minute}' {row.type_name} {row.player_name}")

    # Pre-format the numeric columns shown in the plot's label table.
    a["scores"] = a.scores.apply(lambda x : "%.3f" % x )
    a["vaep_value"] = a.vaep_value.apply(lambda x : "%.3f" % x )
    a["time"] = a[["period_id", "time_seconds"]].apply(lambda x: get_time(*x),axis=1)
    cols = ["time", "type_name", "player_name", "team_name", "scores", "vaep_value"]
    matplotsoccer.actions(a[["start_x", "start_y", "end_x",  "end_y"]],
                a.type_name,
                team=a.team_name,
                result = a.result_name == "success",
                label=a[cols],
                labeltitle = cols,
                zoom=False)