In [1]:
import pandas as pd
from pandas import json_normalize
from utils import read_json
import socceraction.vaep as vaep
import numpy as np
import pandas as pd
import json
import warnings
import os

from processing.eventing import StatsBombBasicPassingNetwork, StatsBombValuePassingNetwork

pd.set_option("display.max_columns", None)

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# match_id = 68352
match_id = 69225

lineups = read_json(f"data/eventing/lineups/{match_id}.json")

# team_name = lineups[0]["team_name"]
team_name = "Real Madrid"

In [2]:
names_dict = {
    player["player_name"]: player["player_nickname"]
    for team in lineups
    for player in team["lineup"]
}
names_dict

{'Gnégnéri Yaya Touré': 'Yaya Touré',
 'Daniel Alves da Silva': 'Dani Alves',
 'Sergio Busquets i Burgos': 'Sergio Busquets',
 'Lionel Andrés Messi Cuccittini': 'Lionel Messi',
 'Rafael Márquez Álvarez': 'Rafael Márquez',
 'Eiður Smári Guðjohnsen': 'Eiður Guðjohnsen',
 'Thierry Henry': None,
 'Aliaksandr Hleb': 'Aleksandr Hleb',
 "Samuel Eto''o Fils": "Samuel Eto'o",
 'Carles Puyol i Saforcada': 'Carles Puyol',
 'Xavier Hernández Creus': 'Xavi',
 'Seydou Kéita': 'Seydou Kéita',
 'Víctor Valdés Arribas': 'Víctor Valdés',
 'Eric-Sylvain Bilal Abidal': 'Éric Abidal',
 'Wesley Sneijder': 'W. Sneijder',
 'Sergio Ramos García': 'Sergio Ramos',
 'Gonzalo Gerardo Higuaín': 'Gonzalo Higuaín',
 'Francisco Javier García Fernández': 'Javi García',
 'Iker Casillas Fernández': 'Iker Casillas',
 'Miguel Ángel Salgado Fernández': 'Míchel Salgado',
 'Raúl González Blanco': 'Raúl',
 'Fabio Cannavaro': None,
 'Rafael van der Vaart': 'Rafael van der Vaart',
 'Christoph Metzelder': None,
 'Royston Ricky Dr

In [3]:
events = read_json(f"data/eventing/events/{match_id}.json")
df_events = json_normalize(events, sep="_").assign(match_id=match_id)

In [4]:
# We select all successful passes done by the selected team before the minute
# of the first substitution or red card.

first_red_card_minute = df_events[df_events.foul_committed_card_name.isin(["Second Yellow", "Red Card"])].minute.min()
first_substitution_minute = df_events[df_events.type_name == "Substitution"].minute.min()
max_minute = df_events.minute.max()

num_minutes = min(first_substitution_minute, first_red_card_minute, max_minute)

- pass_end_location -> avg distance from opp goal
- pass_type_name -> no of each
- pass_through_ball -> %
- pass completion rate
- avg no of passes
- no of passes received by most advanced player
- avg position distance (exclude goalkeeper)
- Average Number of Unique Passing Links per Player
- Betti Number 0

In [5]:
# Taking into account all legitimate passes. Don't care about the same playing eleven on the field.

df_passes = df_events[
    (df_events.type_name == "Pass")
    & (df_events.pass_outcome_name != "Injury Clearance")
    & (df_events.team_name == team_name)
    # & (df_events.minute < num_minutes)
].copy()

In [6]:
# Function to convert location data to points
def _statsbomb_to_point(location, max_width=120, max_height=80):
    """
    Convert a point's coordinates from a StatsBomb's range to 0-1 range.
    """
    return location[0] / max_width, 1 - (location[1] / max_height)

# Euclidean distance function
def _calculate_distance(point1, point2):
    return np.sqrt((point2[0] - point1[0]) ** 2 + (point2[1] - point1[1]) ** 2)

In [7]:
# avg_dist_opp_goal -> avg distance from opp goal - pass_end_location

def calculate_average_distance_from_goal(df, fixed_point):
    """
    Calculate the average distance from a fixed point in the DataFrame.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the 'location' column.
    - fixed_point (tuple): A tuple (x, y) representing the fixed point.

    Returns:
    - float: The average distance from the fixed point to all points in the 'location' column.
    """

    # Apply transformations and calculate distances
    distances = df["location"].apply(
        lambda x: _calculate_distance(fixed_point, _statsbomb_to_point(x))
    )

    return distances.mean()


goal = (1, 0.5)
average_distance = calculate_average_distance_from_goal(df_passes, goal)
print("Average Distance:", average_distance)

Average Distance: 0.6497966942958443


In [8]:
# pass_type_name -> no of each

def count_pass_type(df, dropna=True, normalize=False):
    """
    Count the occurrences of each unique pass type in a DataFrame.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.

    Returns:
    - pd.Series: A Series with the count of each unique pass type.
    """
    pass_type_counts = df.pass_type_name.value_counts(normalize=normalize, dropna=dropna)
    return pass_type_counts

In [9]:
count_pass_type(df_passes)

pass_type_name
Recovery        52
Throw-in        21
Free Kick       16
Goal Kick       10
Interception     6
Corner           4
Kick Off         3
Name: count, dtype: int64

In [10]:
def calculate_through_ball_percentage(df, dropna=True):
    """
    Calculate the percentage of through balls in a given DataFrame column.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.
    - column_name (str): Name of the column indicating whether a pass is a through ball.

    Returns:
    - float: The percentage of through balls relative to all entries in the column, excluding NA values.
    """

    # Calculate the number of through balls
    through_balls_count = df.pass_through_ball.sum()  # True values are counted as 1, False as 0

    if dropna:
        # Calculate the total number of non-NA entries
        total_passes = df.pass_through_ball.count()
    else:
        # Calculate the total number of entries
        total_passes = df.pass_through_ball.shape[0]

    # Calculate the percentage of through balls
    if total_passes > 0:
        through_ball_percentage = (through_balls_count / total_passes) * 100
        return through_ball_percentage
    else:
        return 0

through_ball_percentage = calculate_through_ball_percentage(df_passes)
print("Percentage of Through Balls:", through_ball_percentage)

Percentage of Through Balls: 100.0


In [11]:
# pass completion rate

def calculate_pass_completion_rate(df):
    """
    Calculate the pass completion rate from the pass outcome column.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.
    - outcome_column (str): Name of the column indicating the outcome of a pass.

    Returns:
    - float: The pass completion rate as a percentage.
    """
    # Count the number of successful passes (NaN in the outcome column)
    successful_passes_count = df.pass_outcome_name.isna().sum()
    # Count the total number of passes
    total_passes = len(df.pass_outcome_name) 

    # Calculate the pass completion rate
    if total_passes > 0:
        completion_rate = (successful_passes_count / total_passes) * 100
        return completion_rate
    else:
        return 0


pass_completion_rate = calculate_pass_completion_rate(df_passes)
print("Pass Completion Rate:", pass_completion_rate)

Pass Completion Rate: 63.54838709677419


In [12]:
# no of passes received by most advanced player

def calculate_avg_passes_per_player(df):
    """
    Calculate the average number of passes per player.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.

    Returns:
    - float: Average number of passes per player.
    """
    
    # Total number of passes
    total_passes = df.shape[0]
    # Number of unique players
    num_unique_players = df.player_id.nunique()

    # Calculate the average number of passes per player
    if num_unique_players > 0:
        avg_passes_per_player = total_passes / num_unique_players
        return avg_passes_per_player
    else:
        return 0 

avg_passes_per_player = calculate_avg_passes_per_player(df_passes)
print("Average Passes Per Player:", avg_passes_per_player)

Average Passes Per Player: 22.142857142857142


In [13]:
# number of passes received by the most advanced player

def calculate_passes_to_most_advanced_player(df):
    """
    Calculate the number of passes received by the most advanced player based on the median of their pass origin positions.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.

    Returns:
    - int: Number of passes received by the most advanced player.
    """

    # Calculate each player's average pass origin's coordinates
    df["origin_pos_x"] = df.location.apply(lambda x: _statsbomb_to_point(x)[0])
    df["origin_pos_y"] = df.location.apply(lambda x: _statsbomb_to_point(x)[1])

    # Determine the median position for each player
    player_position = df.groupby("player_name").agg({"origin_pos_x": "median", "origin_pos_y": "median"})

    players_distances = player_position.apply(
        lambda x: _calculate_distance((0, 0.5), (x["origin_pos_x"], x["origin_pos_y"])), axis=1)

    # Identify the most advanced player
    most_advanced_player = players_distances.idxmax()
    # Count the number of passes received by this player
    passes_received = df[df["pass_recipient_name"] == most_advanced_player].shape[0]

    return most_advanced_player, passes_received

# Example usage:
# Assuming df_passes is your DataFrame and it includes necessary columns

most_advanced_player, number_of_passes_received = (
    calculate_passes_to_most_advanced_player(df_passes)
)
print("Most Advanced Player:", most_advanced_player)
print("Number of Passes Received by the Most Advanced Player:", number_of_passes_received)

Most Advanced Player: Miguel Palanca Fernández
Number of Passes Received by the Most Advanced Player: 19


In [14]:
# avg position distance (exclude goalkeeper)

def calculate_avg_x_position_excluding_goalkeepers(df):
    """
    Calculate the average x position for all players excluding goalkeepers.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the player data with positions.
    - position_column (str): Column name where player roles or positions are defined.
    - x_position_column (str): Column name where x positions are stored.

    Returns:
    - float: Average x position for non-goalkeepers.
    """

    # Filter out goalkeepers
    non_goalkeepers = df[df["position_name"] != "Goalkeeper"]

    # Calculate the average x position for non-goalkeepers
    non_goalkeepers["origin_pos_x"] = df.location.apply(lambda x: _statsbomb_to_point(x)[0])
    average_x_position = non_goalkeepers["origin_pos_x"].mean()

    return average_x_position


avg_x_position_non_gk = calculate_avg_x_position_excluding_goalkeepers(df_passes)
print("Average X Position (excluding goalkeepers):", avg_x_position_non_gk)

Average X Position (excluding goalkeepers): 0.45876325088339226


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_goalkeepers["origin_pos_x"] = df.location.apply(lambda x: _statsbomb_to_point(x)[0])


In [15]:
# Average Number of Unique Passing Links per Player

def average_unique_passing_links(df):
    """
    Calculate the average number of unique passing links per player.

    Parameters:
    - df (pd.DataFrame): DataFrame containing the pass data.
    
    Returns:
    - float: Average number of unique passing links per player.
    """

    # Group by player and list unique recipients
    unique_links_per_player = df.groupby("player_name")["pass_recipient_name"]\
        .agg(lambda x: x.dropna().nunique())

    # Calculate the average number of unique links
    average_links = unique_links_per_player.mean()

    return average_links

avg_links = average_unique_passing_links(df_passes)
print("Average Number of Unique Passing Links per Player:", avg_links)

Average Number of Unique Passing Links per Player: 7.071428571428571


In [16]:
def get_name_or_default(name):
    if not name or pd.isna(name):  # Checks if name is None or NaN
        return None
    # Safely gets the value from the dictionary or returns the name itself
    return names_dict.get(name, name) if names_dict.get(name) is not None else name


# If available, use player's nickname instead of full name to optimize space in plot
# df_new["pass_recipient_name"] = df_new.pass_recipient_name.apply(lambda x: None if not x else names_dict[x] if names_dict[x] else x)
df_passes["pass_recipient_name"] = df_passes["pass_recipient_name"].apply(
    get_name_or_default
)
df_passes["player_name"] = df_passes.player_name.apply(
    lambda x: names_dict[x] if names_dict[x] else x
)

In [17]:
predictions_h5 = os.path.join("data/eventing", "predictions.h5")

spadl_h5 = os.path.join("data/eventing", "spadl-statsbomb.h5")

actiontypes = pd.read_hdf(spadl_h5, "actiontypes")
bodyparts = pd.read_hdf(spadl_h5, "bodyparts")
results = pd.read_hdf(spadl_h5, "results")
players = pd.read_hdf(spadl_h5, "players")
teams = pd.read_hdf(spadl_h5, "teams")
actions = pd.read_hdf(spadl_h5, "actions/game_{0}".format(match_id))

actions = (
    actions.merge(actiontypes, how="left")
    .merge(results, how="left")
    .merge(bodyparts, how="left")
    .merge(players, how="left")
    .merge(teams, how="left")
)

In [18]:
preds = pd.read_hdf(predictions_h5, "game_{0}".format(match_id))
values = vaep.value(actions, preds.scores, preds.concedes)

df_vaep = pd.concat([actions, preds, values], axis=1)
# df_vaep["player_name"] = df_vaep.apply(lambda x: x["player_nickname"] if x["player_nickname"] else x["player_name"], axis=1)
df_vaep["player_name"] = df_vaep.player_name.apply(lambda x: names_dict[x] if names_dict[x] else x)

  prev_scores[penalty_idx] = 0.792453


In [19]:
def calculate_team_pass_value(df_passes, df_vaep):
    # Merge df_passes and df_vaep on timestamp and player_name
    df_result = pd.merge(df_passes[["timestamp", "player_name", "pass_recipient_name"]], df_vaep,
        on=["timestamp", "player_name"], how="left")

    # Filter out negative actions
    df_result["vaep_value"] = df_result.vaep_value.apply(lambda x: x if x >= 0 else None)

    # Group by player_name and calculate mean vaep_value
    player_pass_value = df_result.groupby("player_name").agg(pass_value=("vaep_value", "mean"))

    return player_pass_value.mean().values[0]

team_pass_value = calculate_team_pass_value(df_passes, df_vaep)
print("Average Team VAEP Value:", team_pass_value)

Average Team VAEP Value: 0.006071046009353653
