In [1]:
import pandas as pd
import numpy as np

# Frame-level player tracking data
df = pd.read_csv('/home/pshmo/summer_milestone_2-1/datasets/nfl-playing-surface-analytics/PlayerTrackData.csv')

# Injury records, plus the play list that is used to repair them
injury_record = pd.read_csv('/home/pshmo/summer_milestone_2-1/datasets/nfl-playing-surface-analytics/InjuryRecord.csv')
playlist = pd.read_csv('/home/pshmo/summer_milestone_2-1/datasets/nfl-playing-surface-analytics/PlayList.csv')

# Last PlayKey listed for every GameID (relies on the row order of PlayList.csv)
last_plays = playlist.groupby('GameID')['PlayKey'].last()

# Injury rows without a PlayKey are assigned the last play of their game
missing_play = injury_record['PlayKey'].isnull()
injury_record.loc[missing_play, 'PlayKey'] = injury_record.loc[missing_play, 'GameID'].map(last_plays).values

# One row per PlayKey: groupby().first() keeps the first non-null value of each
# column, drops rows whose PlayKey is still missing, and sorts by PlayKey
injury_record = injury_record.groupby('PlayKey').first().reset_index()


In [4]:
import warnings
import numpy as np
import pandas as pd
# Install a global filter that hides pandas' SettingWithCopyWarning from here on.
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def build_vectorized_motion_features(df):
    """
    Build one row of aggregated motion features per play, using vectorized
    pandas operations over the whole tracking table at once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame-level tracking rows. The columns read are ``PlayKey``, ``time``,
        ``x``, ``y``, ``s``, ``o``, ``dir`` and ``dis``. The caller's frame is
        not modified: the initial sort/reset_index produces a new frame and all
        helper columns are added to that one.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``PlayKey``. Columns: total_distance, avg_distance_per_frame,
        distance_std, max_speed, avg_speed, speed_std, max_acceleration,
        avg_acceleration, acceleration_std, max_jerk, avg_jerk, jerk_95th,
        total_direction_change, max_angular_velocity, avg_angular_velocity,
        x_range, y_range, orientation_std, direction_std, rapid_decelerations,
        rapid_accelerations, sharp_turns, high_jerk_events, speed_95th,
        displacement, play_duration, max_deceleration, field_coverage,
        time_above_avg_speed, movement_efficiency, speed_cv, acceleration_cv.
        Remaining NaN values are replaced with 0.

    Notes
    -----
    ``acceleration`` is the magnitude of the (ax, ay) vector and is therefore
    never negative. Braking is measured separately from the signed rate of
    change of ``speed``; ``rapid_decelerations`` and ``max_deceleration`` are
    derived from that signed quantity. All thresholds (3.0, 45) are expressed
    in whatever units the input columns use.
    """
    print("Starting vectorized feature calculation...")

    # Sort by PlayKey and time to ensure proper order (also yields a new frame)
    df = df.sort_values(['PlayKey', 'time']).reset_index(drop=True)

    # Time step within each play; the first frame of a play gets 0
    df['time_diff'] = df.groupby('PlayKey')['time'].diff().fillna(0)

    # Velocities from position changes. Division by a zero time step gives
    # NaN/inf here; those are zeroed in the clean-up loop below.
    df['vx'] = df.groupby('PlayKey')['x'].diff() / df['time_diff']
    df['vy'] = df.groupby('PlayKey')['y'].diff() / df['time_diff']
    df['speed_calc'] = np.sqrt(df['vx']**2 + df['vy']**2)

    # Use provided speed where available, calculated as backup
    df['speed'] = df['s'].fillna(df['speed_calc'])

    # Acceleration vector and its (non-negative) magnitude
    df['ax'] = df.groupby('PlayKey')['vx'].diff() / df['time_diff']
    df['ay'] = df.groupby('PlayKey')['vy'].diff() / df['time_diff']
    df['acceleration'] = np.sqrt(df['ax']**2 + df['ay']**2)

    # Replace infinite and missing values with 0
    for col in ['vx', 'vy', 'speed_calc', 'speed', 'ax', 'ay', 'acceleration']:
        df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Signed rate of change of speed: negative while the player slows down.
    # The magnitude in `acceleration` can never be negative, so it cannot be
    # used to detect deceleration.
    df['speed_change_rate'] = df.groupby('PlayKey')['speed'].diff() / df['time_diff']
    df['speed_change_rate'] = df['speed_change_rate'].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Jerk: rate of change of the acceleration magnitude
    df['jerk'] = df.groupby('PlayKey')['acceleration'].diff() / df['time_diff']
    df['jerk'] = df['jerk'].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Frame-to-frame changes of movement direction and orientation
    df['dir_change'] = df.groupby('PlayKey')['dir'].diff()
    df['orient_change'] = df.groupby('PlayKey')['o'].diff()

    # Handle angle wraparound: map differences into [-180, 180]
    for col in ['dir_change', 'orient_change']:
        df[col] = np.where(df[col] > 180, df[col] - 360, df[col])
        df[col] = np.where(df[col] < -180, df[col] + 360, df[col])

    # Angular velocity of the movement direction
    df['angular_vel'] = df['dir_change'] / df['time_diff']
    df['angular_vel'] = df['angular_vel'].replace([np.inf, -np.inf], np.nan).fillna(0)

    print("Calculated derivatives, now aggregating features...")

    # Per-frame boolean flags; summed per play further down
    df['rapid_decel'] = df['speed_change_rate'] < -3.0
    # NOTE: based on the acceleration magnitude, so this also counts frames
    # with strong braking or turning, not only speeding up.
    df['rapid_accel'] = df['acceleration'] > 3.0
    df['above_avg_speed'] = df['speed'] > df.groupby('PlayKey')['speed'].transform('mean')
    df['sharp_turn'] = df['angular_vel'].abs() > 45

    # Per-play quantiles, broadcast back to every frame of the play
    df['speed_95th'] = df.groupby('PlayKey')['speed'].transform(lambda x: x.quantile(0.95))
    df['jerk_90th'] = df.groupby('PlayKey')['jerk'].transform(lambda x: x.abs().quantile(0.9))
    df['high_jerk'] = df['jerk'].abs() > df['jerk_90th']

    # First/last position and time span of each play
    first_last = df.groupby('PlayKey').agg({
        'x': ['first', 'last'],
        'y': ['first', 'last'],
        'time': ['min', 'max']
    }).round(6)

    first_last.columns = ['x_first', 'x_last', 'y_first', 'y_last', 'time_min', 'time_max']
    first_last['displacement'] = np.sqrt(
        (first_last['x_last'] - first_last['x_first'])**2 +
        (first_last['y_last'] - first_last['y_first'])**2
    )
    first_last['play_duration'] = first_last['time_max'] - first_last['time_min']

    print("Aggregating final features...")

    # Aggregate all features in a single groupby pass
    agg_dict = {
        # Basic aggregations
        'dis': ['sum', 'mean', 'std'],
        'speed': ['max', 'mean', 'std'],
        'acceleration': ['max', 'mean', 'std'],
        'speed_change_rate': ['min'],  # most negative value = strongest braking
        'jerk': [lambda x: x.abs().max(), lambda x: x.abs().mean(), lambda x: x.abs().quantile(0.95)],
        'dir_change': [lambda x: x.abs().sum()],
        'angular_vel': [lambda x: x.abs().max(), lambda x: x.abs().mean()],
        'x': [lambda x: x.max() - x.min()],
        'y': [lambda x: x.max() - x.min()],
        'o': ['std'],
        'dir': ['std'],

        # Boolean aggregations (counts)
        'rapid_decel': 'sum',
        'rapid_accel': 'sum',
        'above_avg_speed': ['sum', 'count'],  # sum for count above avg, count for total
        'sharp_turn': 'sum',
        'high_jerk': 'sum',

        # For unique values
        'speed_95th': 'first',  # These are constant within each play
    }

    features = df.groupby('PlayKey').agg(agg_dict).round(6)

    # Flatten the (column, function) MultiIndex into "column_function" names
    features.columns = ['_'.join(col) if col[1] else col[0] for col in features.columns]

    # Rename columns to the final feature names. pandas labels several lambdas
    # on one column as <lambda_0>, <lambda_1>, ... and a single one as <lambda>.
    rename_dict = {
        'dis_sum': 'total_distance',
        'dis_mean': 'avg_distance_per_frame',
        'dis_std': 'distance_std',
        'speed_max': 'max_speed',
        'speed_mean': 'avg_speed',
        'speed_std': 'speed_std',
        'acceleration_max': 'max_acceleration',
        'acceleration_mean': 'avg_acceleration',
        'acceleration_std': 'acceleration_std',
        'speed_change_rate_min': 'min_acceleration',
        'jerk_<lambda_0>': 'max_jerk',
        'jerk_<lambda_1>': 'avg_jerk',
        'jerk_<lambda_2>': 'jerk_95th',
        'dir_change_<lambda>': 'total_direction_change',
        'angular_vel_<lambda_0>': 'max_angular_velocity',
        'angular_vel_<lambda_1>': 'avg_angular_velocity',
        'x_<lambda>': 'x_range',
        'y_<lambda>': 'y_range',
        'o_std': 'orientation_std',
        'dir_std': 'direction_std',
        'rapid_decel_sum': 'rapid_decelerations',
        'rapid_accel_sum': 'rapid_accelerations',
        'above_avg_speed_sum': 'time_above_avg_speed_count',
        'above_avg_speed_count': 'total_frames',
        'sharp_turn_sum': 'sharp_turns',
        'high_jerk_sum': 'high_jerk_events',
        'speed_95th_first': 'speed_95th',
    }

    features = features.rename(columns=rename_dict)

    # Add back the first/last position features
    features = features.join(first_last[['displacement', 'play_duration']])

    # Derived features
    # min_acceleration holds the most negative signed speed change of the play;
    # its negation, floored at 0, is the strongest deceleration.
    features['max_deceleration'] = np.maximum(0, -features['min_acceleration'])
    features['field_coverage'] = features['x_range'] * features['y_range']
    features['time_above_avg_speed'] = features['time_above_avg_speed_count'] / features['total_frames']
    features['movement_efficiency'] = np.where(
        features['total_distance'] > 0,
        features['displacement'] / features['total_distance'],
        0
    )
    # Small epsilon avoids division by zero for plays with zero mean
    features['speed_cv'] = features['speed_std'] / (features['avg_speed'] + 1e-6)
    features['acceleration_cv'] = features['acceleration_std'] / (features['avg_acceleration'] + 1e-6)

    # Drop intermediate columns that only served the derived features
    features = features.drop(['min_acceleration', 'time_above_avg_speed_count', 'total_frames'], axis=1)

    # Handle any remaining NaN values (e.g. std of a single-frame play)
    features = features.fillna(0)

    print(f"Completed feature calculation for {len(features)} plays")
    return features

def process_motion_chunk_v2(chunk_df):
    """
    Turn frame-level tracking rows into one wide row per play.

    Each play's ``time`` range is cut into 10 equal-width bins labelled
    ``slice_0`` .. ``slice_9``. The mean of dis, x, y, o, dir and s is taken
    per bin, giving columns named ``<metric>_slice_<i>``. Gaps are then
    linearly interpolated across the slices of each metric.

    The input frame is copied first, so the caller's data is left untouched.
    Returns a DataFrame indexed by ``PlayKey``.
    """
    metrics = ['dis', 'x', 'y', 'o', 'dir', 's']
    slice_labels = [f'slice_{i}' for i in range(10)]

    def _bin_times(times):
        # Equal-width binning of one play's timestamps
        return pd.cut(times, bins=10, labels=slice_labels, duplicates='drop')

    # Work on a private copy so the added helper columns never leak out
    frames = chunk_df.copy()

    # Time is already in seconds and is used directly as the relative time
    frames['relative_time'] = frames['time']
    frames['time_decile'] = frames.groupby(['PlayKey'])['relative_time'].transform(_bin_times)

    # Long -> wide: one column per (metric, time slice), averaged within the slice
    wide = frames.pivot_table(
        index=['PlayKey'],
        columns='time_decile',
        values=metrics,
        aggfunc='mean',
        observed=True
    )
    wide.columns = [f'{metric}_{time}' for metric, time in wide.columns]

    # Fill gaps by interpolating along the slice axis, one metric at a time
    for metric in metrics:
        prefix = f'{metric}_slice_'
        slice_cols = [name for name in wide.columns if name.startswith(prefix)]
        if not slice_cols:
            continue
        wide[slice_cols] = wide[slice_cols].interpolate(axis=1, method='linear')

    return wide

def process_playertrack_data_fast(df, chunk_size=5000):
    """
    Build the wide time-slice table for every play in ``df``.

    The unique ``PlayKey`` values are walked in groups of ``chunk_size``; the
    rows of each group go through ``process_motion_chunk_v2`` and the
    per-group results are stacked row-wise. Progress is printed per group.
    """
    play_keys = df['PlayKey'].unique()
    n_plays = len(play_keys)

    print(f"Processing {n_plays} plays in chunks of {chunk_size}")

    wide_parts = []
    for start in range(0, n_plays, chunk_size):
        stop = start + chunk_size

        # Rows belonging to this group of plays
        in_chunk = df['PlayKey'].isin(play_keys[start:stop])
        wide_parts.append(process_motion_chunk_v2(df[in_chunk]))

        print(f"Processed {min(stop, n_plays)} of {n_plays} plays")

    # Stack the per-group tables into one
    return pd.concat(wide_parts, axis=0)

# Usage:
#
# Aggregated per-play features (this call is live; it prints progress messages):
motion_features = build_vectorized_motion_features(df)
#
# Time-slice features per play (disabled; uncomment to run):
# motion_wide = process_playertrack_data_fast(df, chunk_size=5000)
#
# Combine both on their shared PlayKey index (disabled; needs motion_wide):
# combined_features = motion_wide.join(motion_features, how='inner')

Starting vectorized feature calculation...
Calculated derivatives, now aggregating features...
Aggregating final features...
Completed feature calculation for 266960 plays


In [None]:
# Attach play-level context (player, game, roster position, field type, play
# type) to the motion features. PlayList.csv is already loaded as `playlist`
# in the first cell, so it is reused here. Re-reading it into `df` would
# overwrite the tracking data that build_vectorized_motion_features(df) needs,
# breaking any re-run of that cell.
playlist_short = playlist[['PlayerKey', 'GameID', 'PlayKey', 'RosterPosition', 'PlayerDay', 'PlayerGame', 'FieldType', 'PlayType']].copy()

# motion_features is indexed by PlayKey; reset_index() turns the join key into
# an explicit first column before the left join.
df_final = motion_features.reset_index().merge(playlist_short, on='PlayKey', how='left')

In [10]:
df_final.head()

Unnamed: 0,PlayKey,total_distance,avg_distance_per_frame,distance_std,max_speed,avg_speed,speed_std,max_acceleration,avg_acceleration,acceleration_std,...,movement_efficiency,speed_cv,acceleration_cv,PlayerKey,GameID,RosterPosition,PlayerDay,PlayerGame,FieldType,PlayType
0,26624-1-1,16.83,0.056288,0.079067,2.94,0.535753,0.608942,22.090722,1.355953,2.012758,...,0.351677,1.136607,1.484385,26624,26624-1,Quarterback,1,1,Synthetic,Pass
1,26624-1-10,17.07,0.061848,0.080321,3.18,0.576304,0.642887,23.259407,1.418299,1.892383,...,0.034384,1.115533,1.334261,26624,26624-1,Quarterback,1,1,Synthetic,Pass
2,26624-1-11,31.2,0.084553,0.079344,2.94,0.795772,0.738191,10.0,1.50173,1.332937,...,0.206218,0.92764,0.8876,26624,26624-1,Quarterback,1,1,Synthetic,Rush
3,26624-1-12,14.7,0.041176,0.059135,1.83,0.349692,0.428258,24.020824,1.413075,2.030767,...,0.056989,1.224668,1.437125,26624,26624-1,Quarterback,1,1,Synthetic,Pass
4,26624-1-13,13.26,0.051797,0.063257,1.49,0.427109,0.404819,23.086793,2.157759,3.031911,...,0.085489,0.94781,1.40512,26624,26624-1,Quarterback,1,1,Synthetic,Pass


In [11]:
# Plays without a recorded play type get the explicit label 'Unknown'
df_final = df_final.fillna({'PlayType': 'Unknown'})
# Per-column count of NaNs still present after the play list merge
print(df_final.isna().sum())

PlayKey                   0
total_distance            0
avg_distance_per_frame    0
distance_std              0
max_speed                 0
avg_speed                 0
speed_std                 0
max_acceleration          0
avg_acceleration          0
acceleration_std          0
max_jerk                  0
avg_jerk                  0
jerk_95th                 0
total_direction_change    0
max_angular_velocity      0
avg_angular_velocity      0
x_range                   0
y_range                   0
orientation_std           0
direction_std             0
rapid_decelerations       0
rapid_accelerations       0
sharp_turns               0
high_jerk_events          0
speed_95th                0
displacement              0
play_duration             0
max_deceleration          0
field_coverage            0
time_above_avg_speed      0
movement_efficiency       0
speed_cv                  0
acceleration_cv           0
PlayerKey                 0
GameID                    0
RosterPosition      

In [12]:
# Left-join the injury records onto the plays, then remove the key columns
# duplicated by the merge (suffixed _x / _y) together with 'Surface'
redundant_cols = ['PlayerKey_x', 'GameID_x', 'PlayerKey_y', 'GameID_y', 'Surface']
df_final = (
    df_final
    .merge(injury_record, on='PlayKey', how='left')
    .drop(columns=redundant_cols)
)

In [13]:
df_final

Unnamed: 0,PlayKey,total_distance,avg_distance_per_frame,distance_std,max_speed,avg_speed,speed_std,max_acceleration,avg_acceleration,acceleration_std,...,RosterPosition,PlayerDay,PlayerGame,FieldType,PlayType,BodyPart,DM_M1,DM_M7,DM_M28,DM_M42
0,26624-1-1,16.83,0.056288,0.079067,2.94,0.535753,0.608942,22.090722,1.355953,2.012758,...,Quarterback,1,1,Synthetic,Pass,,,,,
1,26624-1-10,17.07,0.061848,0.080321,3.18,0.576304,0.642887,23.259407,1.418299,1.892383,...,Quarterback,1,1,Synthetic,Pass,,,,,
2,26624-1-11,31.20,0.084553,0.079344,2.94,0.795772,0.738191,10.000000,1.501730,1.332937,...,Quarterback,1,1,Synthetic,Rush,,,,,
3,26624-1-12,14.70,0.041176,0.059135,1.83,0.349692,0.428258,24.020824,1.413075,2.030767,...,Quarterback,1,1,Synthetic,Pass,,,,,
4,26624-1-13,13.26,0.051797,0.063257,1.49,0.427109,0.404819,23.086793,2.157759,3.031911,...,Quarterback,1,1,Synthetic,Pass,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266955,47888-9-54,48.70,0.214537,0.121069,4.60,2.143744,1.200130,6.708204,1.689295,1.278145,...,Cornerback,71,9,Synthetic,Pass,,,,,
266956,47888-9-6,83.64,0.216124,0.195476,8.40,2.152041,1.954992,7.071068,1.509792,1.189161,...,Cornerback,71,9,Synthetic,Pass,,,,,
266957,47888-9-7,60.26,0.180419,0.196470,6.59,1.791317,1.968776,7.211103,1.585019,1.361082,...,Cornerback,71,9,Synthetic,Rush,,,,,
266958,47888-9-8,67.35,0.216559,0.193257,7.06,2.153923,1.925948,7.280110,1.748389,1.338410,...,Cornerback,71,9,Synthetic,Pass,,,,,


In [14]:
injury_cols = [['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']]
# Plays without a matching injury record have NaN in these columns; set them to 0.
# Columns that are absent from df_final are skipped.
present_injury_cols = [name for name in injury_cols[0] if name in df_final.columns]
if present_injury_cols:
    df_final[present_injury_cols] = df_final[present_injury_cols].fillna(0)

In [15]:
# Target column: a copy of DM_M1. Plays without an injury record get the
# BodyPart label 'Non-Injury'.
# NOTE(review): the BodyPart dummies and the DM_* columns come from the injury
# record itself - presumably they must be excluded from model inputs; confirm.
df_final = df_final.assign(
    injury=df_final['DM_M1'],
    BodyPart=df_final['BodyPart'].fillna('Non-Injury'),
)

# One-hot encode the categorical columns, dropping the first level of each
one_hot_cols = ['RosterPosition', 'FieldType', 'PlayType', 'BodyPart']
df_final = pd.get_dummies(df_final, columns=one_hot_cols, drop_first=True)

# Store every boolean column as 0/1 integers
bool_cols = df_final.select_dtypes(include='bool').columns
df_final = df_final.astype({name: int for name in bool_cols})

df_final.head()  # Display the final processed DataFrame

Unnamed: 0,PlayKey,total_distance,avg_distance_per_frame,distance_std,max_speed,avg_speed,speed_std,max_acceleration,avg_acceleration,acceleration_std,...,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,PlayType_Unknown,BodyPart_Foot,BodyPart_Heel,BodyPart_Knee,BodyPart_Non-Injury,BodyPart_Toes
0,26624-1-1,16.83,0.056288,0.079067,2.94,0.535753,0.608942,22.090722,1.355953,2.012758,...,0,0,0,0,0,0,0,0,1,0
1,26624-1-10,17.07,0.061848,0.080321,3.18,0.576304,0.642887,23.259407,1.418299,1.892383,...,0,0,0,0,0,0,0,0,1,0
2,26624-1-11,31.2,0.084553,0.079344,2.94,0.795772,0.738191,10.0,1.50173,1.332937,...,0,0,0,1,0,0,0,0,1,0
3,26624-1-12,14.7,0.041176,0.059135,1.83,0.349692,0.428258,24.020824,1.413075,2.030767,...,0,0,0,0,0,0,0,0,1,0
4,26624-1-13,13.26,0.051797,0.063257,1.49,0.427109,0.404819,23.086793,2.157759,3.031911,...,0,0,0,0,0,0,0,0,1,0


In [17]:
# Write the processed feature table to CSV; index=False leaves the row index out of the file.
df_final.to_csv('/home/pshmo/summer_milestone_2-1/datasets/nfl-playing-surface-analytics/processed_motion_features.csv', index=False)