In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def process_motion_chunk_v2(chunk_df):
    # Make an explicit copy to avoid warnings
    chunk_df = chunk_df.copy()
    
    # Time is already in seconds, just use directly as relative_time
    chunk_df['relative_time'] = chunk_df['time']
    
    # Create time deciles based on relative time
    chunk_df['time_decile'] = chunk_df.groupby(['PlayKey'])['relative_time'].transform(
        lambda x: pd.cut(x, bins=10, labels=[f'slice_{i}' for i in range(60)])
    )
    
    # Pivot to wide - using more metrics now
    motion_wide = chunk_df.pivot_table(
        index=['PlayKey'], 
        columns='time_decile',
        values=['dis', 'x', 'y', 'o', 'dir', 's'],  # Added dir and s
        aggfunc='mean',
        observed=True
    )
    
    # Flatten columns
    motion_wide.columns = [f'{metric}_{time}' for metric, time in motion_wide.columns]
    
    # Fill missing values by interpolating across time slices
    for metric in ['dis', 'x', 'y', 'o', 'dir', 's']:
        metric_cols = [f'{metric}_slice_{i}' for i in range(10)]
        motion_wide[metric_cols] = motion_wide[metric_cols].interpolate(axis=1, method='linear')
    
    return motion_wide

# Process in chunks - but now by PlayKey groups since no gamekey
def process_playertrack_data(df, chunk_size=1000):
    motion_wide_list = []
    unique_plays = df['PlayKey'].unique()
    
    for i in range(0, len(unique_plays), chunk_size):
        play_chunk = unique_plays[i:i+chunk_size]
        chunk_df = df[df['PlayKey'].isin(play_chunk)]
        
        motion_wide_chunk = process_motion_chunk_v2(chunk_df)
        motion_wide_list.append(motion_wide_chunk)
        
        print(f"Processed {i+len(play_chunk)} of {len(unique_plays)} plays")
    
    # Combine all chunks
    motion_wide = pd.concat(motion_wide_list, axis=0)
    return motion_wide

# Usage:
# motion_wide = process_playertrack_data(df)

In [10]:
df = pd.read_csv('datasets/nfl-playing-surface-analytics/PlayerTrackData.csv')

In [20]:
playlist = pd.read_csv('datasets/nfl-playing-surface-analytics/PlayList.csv')

In [4]:
df = pd.read_csv('datasets/nfl-playing-surface-analytics/PlayerTrackData.csv')
motion_wide = process_playertrack_data(df)

# merge in categorical variables from PlayList
df = pd.read_csv('datasets/nfl-playing-surface-analytics/PlayList.csv')  # Load your data here
playlist_short = df.copy()
playlist_short = playlist_short[['PlayerKey', 'GameID', 'PlayKey', 'RosterPosition', 'PlayerDay', 'PlayerGame', 'FieldType', 'PlayType']]
df_final = motion_wide.merge(playlist_short, on='PlayKey', how='left')
df_final['PlayType'] = df_final['PlayType'].fillna('Unknown') 
print(df_final.isna().sum())  # Check for any remaining NaNs from the first merge

# clean injury record data
injury_record = pd.read_csv('datasets/nfl-playing-surface-analytics/InjuryRecord.csv')
print(injury_record['PlayerKey'].value_counts().head(10))  # Check top 10 players with most injuries

# fix missing PlayKeys and drop duplicates
playlist = pd.read_csv('datasets/nfl-playing-surface-analytics/PlayList.csv')
last_plays = playlist.groupby('GameID')['PlayKey'].last() # get each players last play in each game
injury_record.loc[injury_record['PlayKey'].isnull(), 'PlayKey'] = injury_record['GameID'].map(last_plays)[injury_record['PlayKey'].isnull()].values  # fill in missing PlayKeys with last play of the game
injury_record = injury_record.groupby('PlayKey').first().reset_index()  # keep only the first record for each PlayKey to drop duplicates
print(injury_record.isna().sum())  # Check for any remaining NaNs after processing
print(injury_record['PlayerKey'].value_counts().head(10))  # Check top 10 players with most injuries after processing

# merge target
df_final = df_final.merge(injury_record, on='PlayKey', how='left')
df_final = df_final.drop(columns=['PlayerKey_x', 'GameID_x', 'PlayerKey_y', 'GameID_y', 'Surface'])  # Drop redundant columns

injury_cols = [['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']]
for col in injury_cols[0]:
    if col in df_final.columns:
        df_final[col] = df_final[col].fillna(0)  # Fill NaNs with 0 for injury columns

# create binary target 
df_final['injury'] = df_final['DM_M1']
df_final['BodyPart'] = df_final['BodyPart'].fillna('Non-Injury')  # Fill NaNs in BodyPart with 'Non-Injury'
# Drop the 'Surface' column as it is not needed
one_hot_cols = ['RosterPosition', 'FieldType', 'PlayType', 'BodyPart']
df_final = pd.get_dummies(df_final, columns=one_hot_cols, drop_first=True)  # One-hot encode categorical variables
# Convert all boolean columns in df_final to 0/1 integers
bool_cols = df_final.select_dtypes(include='bool').columns
df_final[bool_cols] = df_final[bool_cols].astype(int)
df_final.head()  # Display the final processed DataFrame

Processed 1000 of 266960 plays
Processed 2000 of 266960 plays
Processed 3000 of 266960 plays
Processed 4000 of 266960 plays
Processed 5000 of 266960 plays
Processed 6000 of 266960 plays
Processed 7000 of 266960 plays
Processed 8000 of 266960 plays
Processed 9000 of 266960 plays
Processed 10000 of 266960 plays
Processed 11000 of 266960 plays
Processed 12000 of 266960 plays
Processed 13000 of 266960 plays
Processed 14000 of 266960 plays
Processed 15000 of 266960 plays
Processed 16000 of 266960 plays
Processed 17000 of 266960 plays
Processed 18000 of 266960 plays
Processed 19000 of 266960 plays
Processed 20000 of 266960 plays
Processed 21000 of 266960 plays
Processed 22000 of 266960 plays
Processed 23000 of 266960 plays
Processed 24000 of 266960 plays
Processed 25000 of 266960 plays
Processed 26000 of 266960 plays
Processed 27000 of 266960 plays
Processed 28000 of 266960 plays
Processed 29000 of 266960 plays
Processed 30000 of 266960 plays
Processed 31000 of 266960 plays
Processed 32000 o

Unnamed: 0,PlayKey,dir_slice_0,dir_slice_1,dir_slice_2,dir_slice_3,dir_slice_4,dir_slice_5,dir_slice_6,dir_slice_7,dir_slice_8,...,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,PlayType_Unknown,BodyPart_Foot,BodyPart_Heel,BodyPart_Knee,BodyPart_Non-Injury,BodyPart_Toes
0,26624-1-1,191.485333,224.039667,276.865,270.013667,233.042667,102.27,116.333667,173.968,192.327,...,0,0,0,0,0,0,0,0,1,0
1,26624-1-10,321.335,223.575357,258.757407,289.139286,113.188148,50.242143,101.534444,217.098929,199.318519,...,0,0,0,0,0,0,0,0,1,0
2,26624-1-11,260.44027,137.356757,291.104595,188.302973,88.25027,128.555556,262.251892,300.738919,210.378919,...,0,0,0,1,0,0,0,0,1,0
3,26624-1-12,110.672778,110.170833,200.163429,66.518611,216.207222,248.813429,117.101944,214.901429,233.532778,...,0,0,0,0,0,0,0,0,1,0
4,26624-1-13,125.639231,232.7,38.018,184.952692,181.7476,199.4592,155.309231,149.863077,232.2016,...,0,0,0,0,0,0,0,0,1,0


In [10]:
injury_features = ['BodyPart_Foot', 'BodyPart_Heel', 'BodyPart_Knee', 'BodyPart_Non-Injury', 'BodyPart_Toes', 'DM_M1', 'DM_M7', 'DM_M28', 'DM_M28', 'DM_M42']
target = 'injury'

X = df_final.set_index('PlayKey').drop(columns=[target] + injury_features, errors='ignore')
y = df_final[target]

In [11]:
X.head()

Unnamed: 0_level_0,dir_slice_0,dir_slice_1,dir_slice_2,dir_slice_3,dir_slice_4,dir_slice_5,dir_slice_6,dir_slice_7,dir_slice_8,dir_slice_9,...,PlayType_Field Goal,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,PlayType_Unknown
PlayKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26624-1-1,191.485333,224.039667,276.865,270.013667,233.042667,102.27,116.333667,173.968,192.327,78.342667,...,0,0,0,0,1,0,0,0,0,0
26624-1-10,321.335,223.575357,258.757407,289.139286,113.188148,50.242143,101.534444,217.098929,199.318519,216.5775,...,0,0,0,0,1,0,0,0,0,0
26624-1-11,260.44027,137.356757,291.104595,188.302973,88.25027,128.555556,262.251892,300.738919,210.378919,147.169459,...,0,0,0,0,0,0,0,0,1,0
26624-1-12,110.672778,110.170833,200.163429,66.518611,216.207222,248.813429,117.101944,214.901429,233.532778,228.200278,...,0,0,0,0,1,0,0,0,0,0
26624-1-13,125.639231,232.7,38.018,184.952692,181.7476,199.4592,155.309231,149.863077,232.2016,267.487692,...,0,0,0,0,1,0,0,0,0,0


In [19]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Let's be more conservative with the ratios
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_scaled, y)
print(f"Original: {y.value_counts()}")
print(f"Resampled: {pd.Series(y_resampled).value_counts()}")

# Cross-validation before train/test split
knn = KNeighborsClassifier(n_neighbors=5)

# Use stratified k-fold to maintain class balance
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Test multiple metrics
cv_accuracy = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='accuracy')
cv_f1 = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='f1')
cv_precision = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='precision')
cv_recall = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='recall')

print(f"\nCross-Validation Results:")
print(f"Accuracy: {cv_accuracy.mean():.3f} (+/- {cv_accuracy.std() * 2:.3f})")
print(f"F1: {cv_f1.mean():.3f} (+/- {cv_f1.std() * 2:.3f})")
print(f"Precision: {cv_precision.mean():.3f} (+/- {cv_precision.std() * 2:.3f})")
print(f"Recall: {cv_recall.mean():.3f} (+/- {cv_recall.std() * 2:.3f})")

# Then your original train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(f"\nTest Set Results:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Original: injury
0.0    266856
1.0       104
Name: count, dtype: int64
Resampled: injury
0.0    10400
1.0     1040
Name: count, dtype: int64

Cross-Validation Results:
Accuracy: 0.946 (+/- 0.008)
F1: 0.770 (+/- 0.027)
Precision: 0.627 (+/- 0.036)
Recall: 0.998 (+/- 0.008)

Test Set Results:
              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      2080
         1.0       0.64      1.00      0.78       208

    accuracy                           0.95      2288
   macro avg       0.82      0.97      0.88      2288
weighted avg       0.97      0.95      0.95      2288

[[1965  115]
 [   0  208]]


In [14]:
# Verify no leakage - check if any plays appear in both train/test
# (if you have play-level features, this could be an issue)
print("Unique plays in dataset:", len(motion_wide))
print("Train + Test samples:", len(X_train) + len(X_test))

Unique plays in dataset: 266960
Train + Test samples: 11440


In [17]:
len(X.columns)  # Check number of features

83

In [25]:
# First, split the ORIGINAL data before any resampling
X_original_train, X_holdout, y_original_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Original holdout distribution:")
print(f"Holdout set: {pd.Series(y_holdout).value_counts()}")
print(f"Training set: {pd.Series(y_original_train).value_counts()}")

# SAVE HERE - before scaling/resampling
# Create proper column names
columns = X.columns

# Save original splits
pd.DataFrame(X, columns=columns).to_csv('scripts/play_surface/motion_features_X_full.csv', index=False)
pd.DataFrame({'injured': y}).to_csv('scripts/play_surface/motion_labels_y_full.csv', index=False)

pd.DataFrame(X_original_train, columns=columns).to_csv('scripts/play_surface/motion_train_X_original.csv', index=False)
pd.DataFrame({'injured': y_original_train}).to_csv('scripts/play_surface/motion_train_y_original.csv', index=False)

pd.DataFrame(X_holdout, columns=columns).to_csv('scripts/play_surface/motion_holdout_X.csv', index=False)
pd.DataFrame({'injured': y_holdout}).to_csv('scripts/play_surface/motion_holdout_y.csv', index=False)

# Now do your resampling ONLY on the training portion
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_original_train)

# Resample only the training data
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_train_scaled, y_original_train)

# SAVE RESAMPLED DATA TOO
pd.DataFrame(X_resampled, columns=columns).to_csv('scripts/play_surface/motion_train_X_resampled.csv', index=False)
pd.DataFrame({'injured': y_resampled}).to_csv('scripts/play_surface/motion_train_y_resampled.csv', index=False)

# Save the fitted scaler for future use
import joblib
joblib.dump(scaler, 'scripts/play_surface/motion_scaler.pkl')

print("All datasets saved!")

Original holdout distribution:
Holdout set: injury
0.0    53371
1.0       21
Name: count, dtype: int64
Training set: injury
0.0    213485
1.0        83
Name: count, dtype: int64
All datasets saved!


In [27]:
X_holdout

Unnamed: 0_level_0,dir_slice_0,dir_slice_1,dir_slice_2,dir_slice_3,dir_slice_4,dir_slice_5,dir_slice_6,dir_slice_7,dir_slice_8,dir_slice_9,...,PlayType_Field Goal,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,PlayType_Unknown
PlayKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36579-8-47,125.528966,139.956786,161.723571,175.533793,296.777500,157.096429,271.934138,240.735000,322.887500,32.023793,...,0,0,0,0,0,0,0,0,1,0
42470-30-12,129.090000,140.989231,161.496538,189.543846,135.063846,114.064615,80.696923,55.519615,304.504615,279.191852,...,0,0,0,0,0,0,0,0,1,0
43050-4-34,125.448235,191.211250,121.751875,90.850000,67.932500,57.811250,144.433125,227.800625,256.821250,250.055294,...,0,0,0,0,0,0,0,0,1,0
41577-2-32,132.567600,286.798800,273.736400,195.459600,313.210800,33.344583,155.729600,131.439600,231.295200,145.236800,...,0,0,0,0,1,0,0,0,0,0
27363-18-46,115.553137,261.774400,127.685400,270.365400,155.982000,170.474600,201.399200,200.004000,157.373600,124.317400,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44489-1-35,295.815652,67.890870,93.645000,95.233913,204.564783,231.998636,219.329130,169.903636,65.027826,112.426957,...,0,0,0,0,1,0,0,0,0,0
43229-14-58,179.867347,90.577917,197.731837,236.442708,210.117143,266.513958,57.809375,149.932245,91.967292,108.659388,...,0,0,0,0,1,0,0,0,0,0
46119-12-58,71.703333,48.931923,146.567308,240.770385,175.925385,78.595000,209.868462,221.832308,151.906923,202.268077,...,0,0,0,0,0,0,0,0,1,0
44527-26-54,211.615429,173.869429,130.110588,156.131143,149.821143,158.240588,229.678286,190.538529,48.080857,103.495429,...,0,0,0,0,0,0,0,0,1,0


In [26]:
X_holdout_scaled

array([[-0.70986705, -0.493822  , -0.22332528, ..., -0.09661734,
         1.37220165, -0.03710297],
       [-0.66321927, -0.48090315, -0.22617616, ..., -0.09661734,
         1.37220165, -0.03710297],
       [-0.71092458,  0.14751833, -0.72525465, ..., -0.09661734,
         1.37220165, -0.03710297],
       ...,
       [-1.41495603, -1.63280406, -0.4136443 , ..., -0.09661734,
         1.37220165, -0.03710297],
       [ 0.41782275, -0.06947759, -0.62029329, ..., -0.09661734,
         1.37220165, -0.03710297],
       [ 0.62225074, -0.47895794,  0.07625035, ..., -0.09661734,
        -0.72875587, -0.03710297]])

In [20]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_resampled, y_resampled)

X_holdout_scaled = scaler.transform(X_holdout)
y_holdout_pred = knn.predict(X_holdout_scaled)
print(f"\nHoldout Set Results:")
print(classification_report(y_holdout, y_holdout_pred))
print(confusion_matrix(y_holdout, y_holdout_pred))


Holdout Set Results:
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97     53371
         1.0       0.01      1.00      0.01        21

    accuracy                           0.95     53392
   macro avg       0.50      0.97      0.49     53392
weighted avg       1.00      0.95      0.97     53392

[[50586  2785]
 [    0    21]]


In [23]:
X_holdout.head()

Unnamed: 0_level_0,dir_slice_0,dir_slice_1,dir_slice_2,dir_slice_3,dir_slice_4,dir_slice_5,dir_slice_6,dir_slice_7,dir_slice_8,dir_slice_9,...,PlayType_Field Goal,PlayType_Kickoff,PlayType_Kickoff Not Returned,PlayType_Kickoff Returned,PlayType_Pass,PlayType_Punt,PlayType_Punt Not Returned,PlayType_Punt Returned,PlayType_Rush,PlayType_Unknown
PlayKey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36579-8-47,125.528966,139.956786,161.723571,175.533793,296.7775,157.096429,271.934138,240.735,322.8875,32.023793,...,0,0,0,0,0,0,0,0,1,0
42470-30-12,129.09,140.989231,161.496538,189.543846,135.063846,114.064615,80.696923,55.519615,304.504615,279.191852,...,0,0,0,0,0,0,0,0,1,0
43050-4-34,125.448235,191.21125,121.751875,90.85,67.9325,57.81125,144.433125,227.800625,256.82125,250.055294,...,0,0,0,0,0,0,0,0,1,0
41577-2-32,132.5676,286.7988,273.7364,195.4596,313.2108,33.344583,155.7296,131.4396,231.2952,145.2368,...,0,0,0,0,1,0,0,0,0,0
27363-18-46,115.553137,261.7744,127.6854,270.3654,155.982,170.4746,201.3992,200.004,157.3736,124.3174,...,0,0,0,0,1,0,0,0,0,0


In [28]:
X_holdout.columns

Index(['dir_slice_0', 'dir_slice_1', 'dir_slice_2', 'dir_slice_3',
       'dir_slice_4', 'dir_slice_5', 'dir_slice_6', 'dir_slice_7',
       'dir_slice_8', 'dir_slice_9', 'dis_slice_0', 'dis_slice_1',
       'dis_slice_2', 'dis_slice_3', 'dis_slice_4', 'dis_slice_5',
       'dis_slice_6', 'dis_slice_7', 'dis_slice_8', 'dis_slice_9', 'o_slice_0',
       'o_slice_1', 'o_slice_2', 'o_slice_3', 'o_slice_4', 'o_slice_5',
       'o_slice_6', 'o_slice_7', 'o_slice_8', 'o_slice_9', 's_slice_0',
       's_slice_1', 's_slice_2', 's_slice_3', 's_slice_4', 's_slice_5',
       's_slice_6', 's_slice_7', 's_slice_8', 's_slice_9', 'x_slice_0',
       'x_slice_1', 'x_slice_2', 'x_slice_3', 'x_slice_4', 'x_slice_5',
       'x_slice_6', 'x_slice_7', 'x_slice_8', 'x_slice_9', 'y_slice_0',
       'y_slice_1', 'y_slice_2', 'y_slice_3', 'y_slice_4', 'y_slice_5',
       'y_slice_6', 'y_slice_7', 'y_slice_8', 'y_slice_9', 'PlayerDay',
       'PlayerGame', 'RosterPosition_Defensive Lineman',
       'RosterPosit