In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.signal import savgol_filter

In [9]:
df_home=pd.read_pickle("DATASETS/Cleaned/Home_Tracking_Clean.pkl")
df_away=pd.read_pickle("DATASETS/Cleaned/Away_Tracking_Clean.pkl")
df_events=pd.read_pickle("DATASETS/Cleaned/Event_Tracking_Clean.pkl")

In [11]:
df_home

Unnamed: 0,Period,Frame,Time [s],Home_11_X,Home_11_Y,Home_1_X,Home_1_Y,Home_2_X,Home_2_Y,Home_3_X,...,Home_10_X,Home_10_Y,Home_12_X,Home_12_Y,Home_13_X,Home_13_Y,Home_14_X,Home_14_Y,Ball_X,Ball_Y
0,1,1,0.04,-52.41390,-1.19816,-18.21960,10.41896,-17.11395,-0.77316,-20.02665,...,5.50515,-4.57708,,,,,,,-4.75440,-7.67788
1,1,2,0.08,-52.39920,-1.19816,-18.21960,10.41896,-17.11395,-0.77316,-20.02665,...,5.50515,-4.57708,,,,,,,-0.37275,-6.35392
2,1,3,0.12,-52.38030,-1.19816,-18.21960,10.41896,-17.11395,-0.77316,-20.02665,...,5.50515,-4.57708,,,,,,,3.90180,-5.06192
3,1,4,0.16,-52.37295,-1.19816,-18.24690,10.41556,-17.12865,-0.68816,-20.00880,...,5.49780,-4.54716,,,,,,,5.61330,-5.28292
4,1,5,0.20,-52.36455,-1.19816,-18.27315,10.38292,-17.15280,-0.66776,-20.00460,...,5.46210,-4.54852,,,,,,,5.78760,-6.41240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145001,2,145002,5800.08,42.64470,-3.15860,,,-16.63515,9.92664,-19.71060,...,,,-20.9538,12.45148,-17.17380,10.46520,-16.63515,9.92664,,
145002,2,145003,5800.12,42.57750,-3.15860,,,-16.66665,9.91644,-19.70850,...,,,-20.9538,12.45148,-17.16120,10.47064,-16.66665,9.91644,,
145003,2,145004,5800.16,42.51660,-3.15860,,,-16.70655,9.90692,-19.72530,...,,,-20.9538,12.45148,-17.18115,10.44888,-16.70655,9.90692,,
145004,2,145005,5800.20,42.47880,-3.15792,,,-16.74120,9.89536,-19.77045,...,,,-20.9538,12.45148,-17.20425,10.41556,-16.74120,9.89536,,


In [50]:
def get_team_physicals_surgical(df, team_label='Home'):
    summary_data = []
    DT = 0.04 
    
    x_cols = [c for c in df.columns if c.lower().endswith('_x') and team_label.lower() in c.lower()]
    
    for x_col in x_cols:
        y_col = x_col[:-1] + ('Y' if x_col.endswith('_X') else 'y')
        if 'ball' in x_col.lower(): continue
            
        player_id = x_col.replace('_X','').replace('_x','').replace(f'{team_label}_','')
        
        # 1. Displacement
        dist_m = np.sqrt(df[x_col].diff()**2 + df[y_col].diff()**2)
        raw_speed_kmh = (dist_m / DT) * 3.6
        
        # 2. STATISTICAL CLEANING
        # We calculate the 95th percentile. Anything way above the 95th 
        # is usually a sensor jump.
        q95 = raw_speed_kmh.quantile(0.95)
        # If a spike is 50% faster than their 95th percentile, it's fake.
        clean_speed = raw_speed_kmh.clip(upper=q95 * 1.5)
        
        # 3. Aggressive Median + SavGol
        # Median filter (size 31) covers 1.2 seconds to ensure no flickers remain
        smooth_median = clean_speed.rolling(window=31, center=True).median().fillna(0)
        final_speed_series = savgol_filter(smooth_median, 21, 3)
        
        # 4. The "Natural" Top Speed
        # We take the 99th percentile of the smoothed curve to avoid 
        # picking up a single remaining "peak" pixel.
        top_speed = np.percentile(final_speed_series, 99)
        
        summary_data.append({
            'Full ID': f"{team_label}_{player_id}",
            'Total Distance (km)': np.round(dist_m.sum() / 1000, 2),
            'Top Speed (km/h)': np.round(top_speed, 2)
        })

    return pd.DataFrame(summary_data)

In [51]:
home_stats = get_team_physicals_surgical(df_home, 'Home')
away_stats = get_team_physicals_surgical(df_away, 'Away')

if home_stats.empty:
    print("Check df_home: No 'Home' columns with '_x' found.")
if away_stats.empty:
    print("Check df_away: No 'Away' columns with '_x' found.")

if not home_stats.empty or not away_stats.empty:
    all_stats = pd.concat([home_stats, away_stats]).sort_values('Top Speed (km/h)', ascending=False)
    print("Success! Leaderboard:")
   

Success! Leaderboard:


In [52]:
all_stats

Unnamed: 0,Full ID,Total Distance (km),Top Speed (km/h)
8,Home_8,11.3,25.33
5,Home_5,11.16,24.71
9,Away_23,10.36,24.22
9,Home_9,10.26,24.16
10,Home_10,8.5,22.93
4,Home_4,10.32,22.72
4,Away_18,11.34,22.63
6,Away_20,10.41,22.58
8,Away_22,7.99,22.39
7,Away_21,11.04,22.03


In [55]:
def prepare_player_data(df, team_label='Home'):
    # This prepares the columns needed for zone analysis
    DT = 0.04
    x_cols = [c for c in df.columns if c.lower().endswith('_x') and team_label.lower() in c.lower()]
    
    for x_col in x_cols:
        y_col = x_col[:-1] + ('Y' if x_col.endswith('_X') else 'y')
        player_id = x_col.replace('_X','').replace('_x','').replace(f'{team_label}_','')
        
        # Calculate distance and speed
        dist_m = np.sqrt(df[x_col].diff()**2 + df[y_col].diff()**2).fillna(0)
        
        # Clean the speed (using the median filter we liked earlier)
        raw_speed_kmh = (dist_m / DT) * 3.6
        smooth_speed = raw_speed_kmh.rolling(window=15, center=True).median().fillna(0)
        
        # Save columns back to the dataframe
        df[f'{team_label}_{player_id}_dist'] = dist_m
        df[f'{team_label}_{player_id}_speed'] = smooth_speed
    
    return df

# Run this first!
df_home = prepare_player_data(df_home, 'Home')
df_away = prepare_player_data(df_away, 'Away')

In [59]:
def get_distance_percentages(df, player_id, team_label='Home'):
    # Get the speed and distance columns we prepared
    speed = df[f'{team_label}_{player_id}_speed']
    dist = df[f'{team_label}_{player_id}_dist']
    
    total_dist = dist.sum()
    
    # If the player didn't move at all, return zeros to avoid math errors
    if total_dist == 0:
        return {zone: 0.0 for zone in ['Walking', 'Jogging', 'Running', 'HSR', 'Sprinting']}
    
    zones = {
        'Walking': (0, 7),
        'Jogging': (7, 15),
        'Running': (15, 20),
        'HSR': (20, 25),
        'Sprinting': (25, 100)
    }
    
    zone_percentages = {}
    for zone, (low, high) in zones.items():
        mask = (speed >= low) & (speed < high)
        zone_dist = dist[mask].sum()
        
        # Calculate percentage
        percentage = (zone_dist / total_dist) * 100
        zone_percentages[zone] = np.round(percentage, 2)
        
    return zone_percentages

In [60]:
zone_stats = get_distance_percentages(df_home, '11')

In [61]:
zone_stats

{'Walking': np.float64(88.91),
 'Jogging': np.float64(9.84),
 'Running': np.float64(1.0),
 'HSR': np.float64(0.15),
 'Sprinting': np.float64(0.09)}