In [1]:
from src.data_loader import load_match_data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [2]:
AVAILABLE_MATCHES = {1886347: "Match 1886347", 1899585: "Match 1899585", 1925299: "Match 1925299", 1953632: "Match 1953632", 1996435: "Match 1996435", 2006229: "Match 2006229", 2011166: "Match 2011166", 2013725: "Match 2013725", 2015213: "Match 2015213", 2017461: "Match 2017461" }

In [None]:
event_data, enriched_tracking_data, synced = load_match_data(1886347, minutes=90)

In [None]:
all_events = []
all_tracking = []
all_synced = []

for match_id in AVAILABLE_MATCHES:
    print(f"Loading match {match_id}...")

    event_data, enriched_tracking_data, synced = load_match_data(
        match_id,
        minutes=90
    )

    # Add match_id for traceability
    event_data["match_id"] = match_id
    enriched_tracking_data["match_id"] = match_id
    synced["match_id"] = match_id

    all_events.append(event_data)
    all_tracking.append(enriched_tracking_data)
    all_synced.append(synced)

# Concatenate
events_df = pd.concat(all_events, ignore_index=True)
tracking_df = pd.concat(all_tracking, ignore_index=True)
synced_df = pd.concat(all_synced, ignore_index=True)

print(
    events_df.shape,
    tracking_df.shape,
    synced_df.shape)

Loading match 1886347...
Loading match 1899585...


  event_data = pd.read_csv(de_url)


Loading match 1925299...
Loading match 1953632...
Loading match 1996435...
Loading match 2006229...
Loading match 2011166...


In [None]:
synced_df.head()

(118272, 32)

In [4]:
synced.shape

(127358, 329)

In [6]:
synced.event_type.value_counts()

event_type
passing_option        7590
player_possession     3014
on_ball_engagement    2816
off_ball_run          1562
Name: count, dtype: int64

In [5]:
event_data.event_type.value_counts()

event_type
passing_option        2544
player_possession      999
on_ball_engagement     937
off_ball_run           599
Name: count, dtype: int64

In [12]:
list(synced.columns)

['x',
 'y',
 'player_id_tracking',
 'is_detected',
 'frame',
 'timestamp',
 'period_tracking',
 'possession_player_id',
 'possession_group',
 'ball_x',
 'ball_y',
 'ball_z',
 'is_detected_ball',
 'match_id_tracking',
 'start_time',
 'end_time',
 'match_name',
 'date_time',
 'home_team.name',
 'away_team.name',
 'id',
 'short_name',
 'number',
 'team_id_tracking',
 'team_name',
 'player_role.position_group',
 'total_time',
 'player_role.name',
 'player_role.acronym',
 'is_gk',
 'direction_player_1st_half',
 'direction_player_2nd_half',
 'event_id',
 'index',
 'match_id_event',
 'frame_start',
 'frame_end',
 'frame_physical_start',
 'time_start',
 'time_end',
 'minute_start',
 'second_start',
 'duration',
 'period_event',
 'attacking_side_id',
 'attacking_side',
 'event_type_id',
 'event_type',
 'event_subtype_id',
 'event_subtype',
 'player_id_event',
 'player_name',
 'player_position_id',
 'player_position',
 'player_in_possession_id',
 'player_in_possession_name',
 'player_in_possessi

In [14]:
synced.xthreat.describe()

count    9152.000000
mean        0.008337
std         0.031411
min         0.000000
25%         0.000300
50%         0.001400
75%         0.004900
max         0.347100
Name: xthreat, dtype: float64

In [11]:
synced.describe()

Unnamed: 0,x,y,player_id_tracking,timestamp,possession_player_id,ball_x,ball_y,ball_z,match_id_tracking,id,...,simultaneous_defensive_engagement_same_target_rank,affected_line_break_id,affected_line_breaking_passing_option_xthreat,affected_line_breaking_passing_option_run_subtype_id,xloss_player_possession_start,xloss_player_possession_end,xloss_player_possession_max,xshot_player_possession_start,xshot_player_possession_end,xshot_player_possession_max
count,127358.0,127358.0,127358.0,127358,32956.0,127358.0,127358.0,127358.0,127358.0,127358.0,...,1056.0,242.0,242.0,44.0,2794.0,2794.0,2794.0,2794.0,2794.0,2794.0
mean,3.708387,0.716677,314883.045455,2025-12-27 00:04:46.314129920,249393.026035,2.323574,0.822076,0.577017,1886347.0,314883.045455,...,1.5625,1.363636,0.002545,2.0,0.237772,0.214724,0.27311,0.016472,0.019787,0.024157
min,-54.79,-36.57,14736.0,2025-12-27 00:00:00,14736.0,-55.8,-37.2,-0.08,1886347.0,14736.0,...,1.0,1.0,0.0002,2.0,0.005,0.002,0.005,0.0,0.0,0.0
25%,-11.36,-10.82,50978.0,2025-12-27 00:02:15.500000,50983.0,-17.81,-24.47,0.17,1886347.0,50978.0,...,1.0,1.0,0.0003,2.0,0.138,0.073,0.186,0.0,0.0,0.0
50%,4.95,0.52,92605.5,2025-12-27 00:04:40.600000,51649.0,7.43,3.61,0.25,1886347.0,92605.5,...,2.0,1.0,0.0018,2.0,0.274,0.231,0.293,0.0,0.0,0.001
75%,20.5,12.8,735574.0,2025-12-27 00:07:19.300000,285188.0,21.56,22.51,0.6,1886347.0,735574.0,...,2.0,2.0,0.0058,2.0,0.316,0.341,0.357,0.002,0.001,0.004
max,51.72,36.96,966120.0,2025-12-27 00:10:00,966120.0,51.21,36.4,9.36,1886347.0,966120.0,...,3.0,2.0,0.0065,2.0,0.475,0.479,0.515,0.879,0.896,0.916
std,22.574811,16.627007,353124.635633,,312758.475704,24.592173,24.059575,0.878539,0.0,353124.635633,...,0.574229,0.482043,0.002316,0.0,0.114116,0.139312,0.121682,0.103567,0.111323,0.114869


In [7]:
possessions = synced[synced['event_type'] == 'player_possession'].copy()
passing_options = synced[synced['event_type'] == 'passing_option'].copy()

# Link passing options to their parent possession using event_id
if 'associated_player_possession_event_id' in passing_options.columns:
    passing_options['parent_possession_id'] = passing_options['associated_player_possession_event_id']
    
    # Aggregate runs per possession
    run_summary = passing_options.groupby('parent_possession_id').agg({
        'dangerous': ['sum', 'max'],
        'xthreat': ['mean', 'max', 'sum'],
        'targeted': 'sum',
        'player_name': 'count'  # number of runs
    }).reset_index()
    
    run_summary.columns = ['possession_id', 'n_dangerous_runs', 'any_dangerous_run',
                           'avg_xthreat', 'max_xthreat', 'total_xthreat', 
                           'n_targeted_runs', 'n_total_runs']
    
    # Merge back to possessions
    possessions_with_runs = possessions.merge(run_summary, 
                                               left_on='event_id', 
                                               right_on='possession_id', 
                                               how='left')
    
    possessions_with_runs['n_untargeted_dangerous'] = (possessions_with_runs['n_dangerous_runs'] - 
                                                         possessions_with_runs['n_targeted_runs'].fillna(0))
    
    print(f"Successfully linked {len(run_summary)} possessions to their runs")
else:
    print("Cannot find linking column - checking alternatives...")
    print(passing_options.columns[passing_options.columns.str.contains('event_id')].tolist())

print("\n" + "="*80)
print("POSSESSION OUTCOMES BY RUN CHARACTERISTICS")
print("="*80)

poss_with_runs_data = possessions_with_runs[possessions_with_runs['n_total_runs'] > 0]

print(f"\nPossessions with at least 1 dangerous run: {(poss_with_runs_data['any_dangerous_run'] > 0).sum()}")
print(f"Possessions with untargeted dangerous runs: {(poss_with_runs_data['n_untargeted_dangerous'] > 0).sum()}")

# Analyze outcome by run quality
outcome_cols = ['pass_outcome', 'lead_to_shot', 'lead_to_goal', 'xthreat', 'forward_momentum']
outcome_cols = [c for c in outcome_cols if c in possessions_with_runs.columns]

if outcome_cols:
    print("\n--- Comparing possessions WITH vs WITHOUT dangerous runs ---")
    
    has_dangerous = poss_with_runs_data[poss_with_runs_data['any_dangerous_run'] > 0]
    no_dangerous = poss_with_runs_data[poss_with_runs_data['any_dangerous_run'] == 0]
    
    for col in outcome_cols:
        if possessions_with_runs[col].dtype in ['float64', 'int64']:
            print(f"\n{col}:")
            print(f"  With dangerous run: {has_dangerous[col].mean():.3f}")
            print(f"  Without dangerous run: {no_dangerous[col].mean():.3f}")
        elif col == 'pass_outcome':
            print(f"\n{col} distribution:")
            print("With dangerous runs:")
            print(has_dangerous[col].value_counts(normalize=True).head(3))

print("\n" + "="*80)
print("UNTARGETED RUN VALUE HYPOTHESIS")
print("="*80)

# Compare possessions where dangerous runs were ignored vs targeted
if len(poss_with_runs_data) > 0:
    targeted_dangerous = poss_with_runs_data[
        (poss_with_runs_data['n_dangerous_runs'] > 0) & 
        (poss_with_runs_data['n_targeted_runs'] > 0)
    ]
    
    ignored_dangerous = poss_with_runs_data[
        (poss_with_runs_data['n_dangerous_runs'] > 0) & 
        (poss_with_runs_data['n_targeted_runs'] == 0)
    ]
    
    print(f"\nPossessions where dangerous run WAS targeted: {len(targeted_dangerous)}")
    print(f"Possessions where dangerous run was IGNORED: {len(ignored_dangerous)}")
    
    if len(targeted_dangerous) > 0 and len(ignored_dangerous) > 0:
        print("\nDid ignoring the dangerous run hurt the outcome?")
        
        for col in ['xthreat', 'pass_outcome', 'lead_to_shot']:
            if col in possessions_with_runs.columns:
                if col in ['xthreat']:
                    t_val = targeted_dangerous[col].mean()
                    i_val = ignored_dangerous[col].mean()
                    print(f"  {col}: Targeted={t_val:.3f}, Ignored={i_val:.3f}, Diff={t_val-i_val:.3f}")

print("\n" + "="*80)
print("DEFENSIVE IMPACT - DID RUNS CREATE SPACE?")
print("="*80)

# Check if untargeted runs affected defensive positioning
defensive_cols = ['n_opponents_ahead_start', 'n_opponents_ahead_end', 
                  'separation_start', 'separation_end', 'separation_gain']
defensive_cols = [c for c in defensive_cols if c in possessions_with_runs.columns]

if defensive_cols and len(ignored_dangerous) > 0:
    print("\nDefensive metrics when dangerous runs were ignored:")
    for col in defensive_cols:
        print(f"  {col}: {ignored_dangerous[col].mean():.2f}")
    
    print("\nDid the actual pass benefit from the decoy run?")
    if 'separation_gain' in possessions_with_runs.columns:
        print(f"  Avg separation gained: {ignored_dangerous['separation_gain'].mean():.2f}")

print("\n" + "="*80)
print("NEXT: BUILD RUN VALUE ADDED METRIC")
print("="*80)
print("Components to consider:")
print("1. Direct value: xthreat if targeted")
print("2. Indirect value: space created for actual pass")
print("3. Defensive disruption: opponents pulled out of position")
print("4. Timing value: simultaneous runs creating overloads")

Successfully linked 126 possessions to their runs

POSSESSION OUTCOMES BY RUN CHARACTERISTICS

Possessions with at least 1 dangerous run: 220
Possessions with untargeted dangerous runs: 176

--- Comparing possessions WITH vs WITHOUT dangerous runs ---

pass_outcome distribution:
With dangerous runs:
pass_outcome
successful      0.714286
unsuccessful    0.285714
Name: proportion, dtype: float64

xthreat:
  With dangerous run: nan
  Without dangerous run: nan

UNTARGETED RUN VALUE HYPOTHESIS

Possessions where dangerous run WAS targeted: 154
Possessions where dangerous run was IGNORED: 66

Did ignoring the dangerous run hurt the outcome?
  xthreat: Targeted=nan, Ignored=nan, Diff=nan

DEFENSIVE IMPACT - DID RUNS CREATE SPACE?

Defensive metrics when dangerous runs were ignored:
  n_opponents_ahead_start: 4.67
  n_opponents_ahead_end: 5.00
  separation_start: 5.89
  separation_end: 2.95
  separation_gain: -2.95

Did the actual pass benefit from the decoy run?
  Avg separation gained: -2.9

In [10]:
possessions = synced[synced['event_type'] == 'player_possession'].copy()
passing_options = synced[synced['event_type'] == 'passing_option'].copy()

print("="*80)
print("RUN VALUE ADDED (RVA) METRIC DEVELOPMENT")
print("="*80)

# 1. CREATE BASELINE: Possessions WITHOUT runs
poss_no_runs = possessions[possessions['n_off_ball_runs'] == 0].copy()
poss_with_runs = possessions[possessions['n_off_ball_runs'] > 0].copy()

print(f"\nPossessions without runs: {len(poss_no_runs)}")
print(f"Possessions with runs: {len(poss_with_runs)}")

# 2. CONTROL VARIABLES: Match on similar starting conditions
control_vars = ['third_start', 'n_opponents_ahead_start', 'game_state']
control_vars = [c for c in control_vars if c in possessions.columns]

print(f"\nControl variables available: {control_vars}")

# 3. OUTCOME METRICS
outcomes = {
    'pass_success': lambda x: (x['pass_outcome'] == 'successful').astype(float) if 'pass_outcome' in x.columns else None,
    'progression': lambda x: x['delta_to_last_defensive_line_gain'] if 'delta_to_last_defensive_line_gain' in x.columns else None,
    'separation_gained': lambda x: x['separation_gain'] if 'separation_gain' in x.columns else None,
    'lead_to_shot': lambda x: x['lead_to_shot'].fillna(0) if 'lead_to_shot' in x.columns else None,
}

print("\n" + "="*80)
print("IMPACT ANALYSIS: WITH RUNS vs WITHOUT RUNS")
print("="*80)

results = {}
for metric_name, metric_func in outcomes.items():
    with_runs_metric = metric_func(poss_with_runs)
    without_runs_metric = metric_func(poss_no_runs)
    
    if with_runs_metric is not None and without_runs_metric is not None:
        with_runs_metric = with_runs_metric.dropna()
        without_runs_metric = without_runs_metric.dropna()
        
        if len(with_runs_metric) > 0 and len(without_runs_metric) > 0:
            mean_with = with_runs_metric.mean()
            mean_without = without_runs_metric.mean()
            diff = mean_with - mean_without
            
            # Statistical test
            if len(with_runs_metric) > 20 and len(without_runs_metric) > 20:
                t_stat, p_val = stats.ttest_ind(with_runs_metric, without_runs_metric)
                sig = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            else:
                p_val, sig = None, ""
            
            results[metric_name] = {
                'with_runs': mean_with,
                'without_runs': mean_without,
                'difference': diff,
                'p_value': p_val
            }
            
            print(f"\n{metric_name.upper()}:")
            print(f"  With runs: {mean_with:.3f}")
            print(f"  Without runs: {mean_without:.3f}")
            print(f"  Difference: {diff:+.3f} {sig}")
            if p_val:
                print(f"  p-value: {p_val:.4f}")

print("\n" + "="*80)
print("RUN CHARACTERISTICS & VALUE")
print("="*80)

# Merge run details to possessions
passing_options['parent_possession_id'] = passing_options['associated_player_possession_event_id']

run_features = passing_options.groupby('parent_possession_id').agg({
    'dangerous': 'sum',
    'xthreat': ['mean', 'max'],
    'n_simultaneous_runs': 'max',
    'distance_to_player_in_possession_start': 'mean',
    'player_name': 'count'
}).reset_index()

run_features.columns = ['possession_id', 'n_dangerous', 'avg_xthreat_runs', 
                        'max_xthreat_run', 'max_simultaneous', 'avg_distance', 'n_runs']

poss_enriched = poss_with_runs.merge(run_features, left_on='event_id', 
                                      right_on='possession_id', how='left')

# Analyze which run characteristics correlate with better outcomes
print("\nRun characteristics that improve outcomes:")

for outcome_name, outcome_func in outcomes.items():
    outcome_vals = outcome_func(poss_enriched)
    if outcome_vals is not None:
        outcome_vals = outcome_vals.dropna()
        
        print(f"\n{outcome_name.upper()}:")
        for feature in ['n_dangerous', 'max_simultaneous', 'n_runs']:
            if feature in poss_enriched.columns:
                corr = poss_enriched[feature].corr(outcome_vals)
                if not np.isnan(corr):
                    print(f"  Correlation with {feature}: {corr:.3f}")

print("\n" + "="*80)
print("UNTARGETED RUN SPECIFIC VALUE")
print("="*80)

# Focus on runs that created value despite not being targeted
untargeted_runs = passing_options[passing_options['targeted'] == 0].copy()
untargeted_dangerous = untargeted_runs[untargeted_runs['dangerous'] == 1].copy()

print(f"\nTotal untargeted runs: {len(untargeted_runs)}")
print(f"Untargeted dangerous runs: {len(untargeted_dangerous)} ({len(untargeted_dangerous)/len(untargeted_runs)*100:.1f}%)")

# Did these possessions succeed?
untargeted_poss_ids = untargeted_dangerous['parent_possession_id'].unique()
untargeted_possessions = possessions[possessions['event_id'].isin(untargeted_poss_ids)]

if len(untargeted_possessions) > 0:
    success_rate = (untargeted_possessions['pass_outcome'] == 'successful').mean()
    print(f"\nSuccess rate when dangerous run ignored: {success_rate:.1%}")
    
    if 'separation_gain' in untargeted_possessions.columns:
        avg_sep_gain = untargeted_possessions['separation_gain'].mean()
        print(f"Avg separation gain: {avg_sep_gain:.2f}m")

print("\n" + "="*80)
print("RUN VALUE ADDED (RVA) FORMULA")
print("="*80)

# Build composite metric
print("\nProposed RVA components:")
print("1. Direct Threat Value (if targeted): xthreat * xpass_completion")
print("2. Decoy Value (if untargeted): improvement in actual pass quality")
print("3. Disruption Value: opponents pulled out of position")
print("4. Overload Value: simultaneous runs creating numerical advantage")

# Calculate RVA for each run
passing_options['direct_value'] = (passing_options['targeted'] * 
                                   passing_options['xthreat'] * 
                                   passing_options['xpass_completion']).fillna(0)

# For untargeted runs, estimate decoy value
passing_options_merged = passing_options.merge(
    possessions[['event_id', 'pass_outcome', 'separation_gain']],
    left_on='parent_possession_id',
    right_on='event_id',
    how='left',
    suffixes=('', '_poss')
)

passing_options_merged['decoy_value'] = np.where(
    (passing_options_merged['targeted'] == 0) & (passing_options_merged['dangerous'] == 1),
    passing_options_merged['xthreat'] * 0.3,  # Weight for creating space
    0
)

passing_options_merged['disruption_value'] = np.where(
    passing_options_merged['dangerous'] == 1,
    passing_options_merged['xthreat'] * 0.2,  # Pulls defenders
    0
)

passing_options_merged['overload_value'] = pd.Series(
    np.where(
        passing_options_merged['n_simultaneous_runs'] > 1,
        passing_options_merged['xthreat'] * 0.15 * passing_options_merged['n_simultaneous_runs'],
        0
    ),
    index=passing_options_merged.index
).fillna(0)

passing_options_merged['RVA'] = (passing_options_merged['direct_value'] + 
                                 passing_options_merged['decoy_value'] + 
                                 passing_options_merged['disruption_value'] + 
                                 passing_options_merged['overload_value'])

print("\n" + "="*80)
print("RVA SUMMARY STATISTICS")
print("="*80)

print(f"\nAverage RVA per run: {passing_options_merged['RVA'].mean():.4f}")
print(f"Average RVA (targeted): {passing_options_merged[passing_options_merged['targeted']==1]['RVA'].mean():.4f}")
print(f"Average RVA (untargeted): {passing_options_merged[passing_options_merged['targeted']==0]['RVA'].mean():.4f}")
print(f"Average RVA (untargeted dangerous): {passing_options_merged[(passing_options_merged['targeted']==0) & (passing_options_merged['dangerous']==1)]['RVA'].mean():.4f}")

# Top value creators
print("\n" + "="*80)
print("TOP RUN VALUE CREATORS (by total RVA)")
print("="*80)

player_rva = passing_options_merged.groupby('player_name').agg({
    'RVA': ['sum', 'mean', 'count'],
    'targeted': 'sum',
    'dangerous': 'sum'
}).round(4)

player_rva.columns = ['total_RVA', 'avg_RVA', 'n_runs', 'n_targeted', 'n_dangerous']
player_rva = player_rva.sort_values('total_RVA', ascending=False)
print(player_rva.head(10))

print("\n" + "="*80)
print("KEY INSIGHTS")
print("="*80)
print("✓ Quantified value of untargeted runs")
print("✓ Developed RVA metric crediting all runs")
print("✓ Identified top value creators beyond assists")
print("\nNext: Visualizations and validation")

RUN VALUE ADDED (RVA) METRIC DEVELOPMENT

Possessions without runs: 1958
Possessions with runs: 1056

Control variables available: ['third_start', 'n_opponents_ahead_start', 'game_state']

IMPACT ANALYSIS: WITH RUNS vs WITHOUT RUNS

PASS_SUCCESS:
  With runs: 0.750
  Without runs: 0.764
  Difference: -0.014 
  p-value: 0.3898

PROGRESSION:
  With runs: -1.256
  Without runs: -0.116
  Difference: -1.140 ***
  p-value: 0.0000

SEPARATION_GAINED:
  With runs: -2.452
  Without runs: -1.362
  Difference: -1.090 ***
  p-value: 0.0000

LEAD_TO_SHOT:
  With runs: 0.125
  Without runs: 0.034
  Difference: +0.091 ***
  p-value: 0.0000

RUN CHARACTERISTICS & VALUE

Run characteristics that improve outcomes:

PASS_SUCCESS:
  Correlation with n_dangerous: -0.258
  Correlation with n_runs: 0.258

PROGRESSION:
  Correlation with n_dangerous: 0.024
  Correlation with n_runs: -0.418

SEPARATION_GAINED:
  Correlation with n_dangerous: -0.063
  Correlation with n_runs: -0.304

LEAD_TO_SHOT:
  Correlation

In [None]:
enriched_tracking_data[enriched_tracking_data[]]