In [1]:
import numpy as np
import pandas as pd

def load_and_normalize(path):
    """Load a CSV, strip and lowercase its column names."""
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    return df


# NGS chunks for 2016 & 2017
ngs_paths = [
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk13-17.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk13-17.csv',
]
df = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)

ndtypes = {'gamekey': 'int16',         
           'playid': 'int16',         
           'gsisid': 'float32',        
           'time': 'str',         
           'x': 'float32',         
           'y': 'float32',         
           'dis': 'float32',
           'o': 'float32',
           'event': 'str'}

df = df.astype(ndtypes)
df.dropna(subset='gsisid', inplace=True)
#df['gsisid'] = df['gsisid'].fillna(-1)

import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def process_motion_chunk(chunk_df):
    # Make an explicit copy to avoid warnings
    chunk_df = chunk_df.copy()
    
    # Convert to datetime
    chunk_df['time'] = pd.to_datetime(chunk_df['time'])
    
    # Calculate relative time from play start
    chunk_df['relative_time'] = chunk_df.groupby(['season_year', 'gamekey', 'playid', 'gsisid'])['time'].transform(
        lambda x: (x - x.min()).dt.total_seconds()
    )
    
    # Create time deciles
    chunk_df['time_decile'] = chunk_df.groupby(['season_year', 'gamekey', 'playid', 'gsisid'])['relative_time'].transform(
        lambda x: pd.cut(x, bins=10, labels=[f'slice_{i}' for i in range(10)])
    )
    
    # Pivot to wide
    motion_wide = chunk_df.pivot_table(
        index=['season_year', 'gamekey', 'playid', 'gsisid'], 
        columns='time_decile',
        values=['dis', 'x', 'y', 'o'],
        aggfunc='mean',
        observed=True
    )
    
    # Flatten columns
    motion_wide.columns = [f'{metric}_{time}' for metric, time in motion_wide.columns]
    
    # Fill missing values by interpolating across time slices
    for metric in ['dis', 'x', 'y', 'o']:
        metric_cols = [f'{metric}_slice_{i}' for i in range(10)]
        motion_wide[metric_cols] = motion_wide[metric_cols].interpolate(axis=1, method='linear')
    
    return motion_wide

# Process in chunks
motion_wide_list = []
unique_games = df['gamekey'].unique()

for i in range(0, len(unique_games), 10):  # Process 10 games at a time
    game_chunk = unique_games[i:i+10]
    chunk_df = df[df['gamekey'].isin(game_chunk)]
    
    motion_wide_chunk = process_motion_chunk(chunk_df)
    motion_wide_list.append(motion_wide_chunk)
    
    print(f"Processed {i+len(game_chunk)} of {len(unique_games)} games")

# Combine all chunks
motion_wide = pd.concat(motion_wide_list, axis=0)

print(len(motion_wide), motion_wide.shape)
motion_wide.reset_index(inplace=True)
revs = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/video_review.csv')
df_final = motion_wide.merge(revs, on=['season_year', 'gamekey', 'playid', 'gsisid'], how='left')
df_final['concussed'] = df_final['player_activity_derived'].notnull().astype(int)
df_final.dropna(subset=['dis_slice_0'], inplace=True)

  df = pd.read_csv(path)


Processed 10 of 632 games
Processed 20 of 632 games
Processed 30 of 632 games
Processed 40 of 632 games
Processed 50 of 632 games
Processed 60 of 632 games
Processed 70 of 632 games
Processed 80 of 632 games
Processed 90 of 632 games
Processed 100 of 632 games
Processed 110 of 632 games
Processed 120 of 632 games
Processed 130 of 632 games
Processed 140 of 632 games
Processed 150 of 632 games
Processed 160 of 632 games
Processed 170 of 632 games


  chunk_df['time'] = pd.to_datetime(chunk_df['time'])


Processed 180 of 632 games
Processed 190 of 632 games
Processed 200 of 632 games
Processed 210 of 632 games
Processed 220 of 632 games
Processed 230 of 632 games
Processed 240 of 632 games
Processed 250 of 632 games
Processed 260 of 632 games
Processed 270 of 632 games
Processed 280 of 632 games
Processed 290 of 632 games
Processed 300 of 632 games
Processed 310 of 632 games
Processed 320 of 632 games
Processed 330 of 632 games
Processed 340 of 632 games
Processed 350 of 632 games
Processed 360 of 632 games
Processed 370 of 632 games
Processed 380 of 632 games
Processed 390 of 632 games
Processed 400 of 632 games
Processed 410 of 632 games
Processed 420 of 632 games
Processed 430 of 632 games
Processed 440 of 632 games
Processed 450 of 632 games
Processed 460 of 632 games
Processed 470 of 632 games
Processed 480 of 632 games
Processed 490 of 632 games
Processed 500 of 632 games
Processed 510 of 632 games
Processed 520 of 632 games
Processed 530 of 632 games
Processed 540 of 632 games
P

In [2]:
df_final.head()

Unnamed: 0,season_year,gamekey,playid,gsisid,dis_slice_0,dis_slice_1,dis_slice_2,dis_slice_3,dis_slice_4,dis_slice_5,...,y_slice_7,y_slice_8,y_slice_9,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire,concussed
0,2016,6,3236,26957.0,0.065161,0.048667,0.037,0.035667,0.046667,0.120667,...,27.274332,33.087334,35.971668,,,,,,,0
1,2016,6,3236,28932.0,0.07129,0.062333,0.028,0.048667,0.058,0.192,...,25.376667,31.458334,22.678667,,,,,,,0
2,2016,6,3236,28943.0,0.026774,0.058667,0.029333,0.050667,0.061,0.166333,...,32.705334,32.312332,32.005333,,,,,,,0
3,2016,6,3236,29414.0,0.247419,0.114333,0.026333,0.054333,0.048667,0.216667,...,22.553667,28.905001,30.914667,,,,,,,0
4,2016,6,3236,29963.0,0.15,0.17,0.185,0.2,0.18,0.195,...,-0.195,-0.31,-0.46,,,,,,,0


In [3]:
concussion_features = ['primary_partner_gsisid', 'primary_partner_activity_derived', 'player_activity_derived', 'turnover_related', 'primary_impact_type', 'friendly_fire']
target = 'concussed'

X = df_final.set_index(['season_year', 'gamekey', 'playid', 'gsisid']).drop(columns=concussion_features + [target])
y = df_final[target]

In [4]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Let's be more conservative with the ratios
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_scaled, y)
print(f"Original: {y.value_counts()}")
print(f"Resampled: {pd.Series(y_resampled).value_counts()}")

# Cross-validation before train/test split
knn = KNeighborsClassifier(n_neighbors=5)

# Use stratified k-fold to maintain class balance
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Test multiple metrics
cv_accuracy = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='accuracy')
cv_f1 = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='f1')
cv_precision = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='precision')
cv_recall = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='recall')

print(f"\nCross-Validation Results:")
print(f"Accuracy: {cv_accuracy.mean():.3f} (+/- {cv_accuracy.std() * 2:.3f})")
print(f"F1: {cv_f1.mean():.3f} (+/- {cv_f1.std() * 2:.3f})")
print(f"Precision: {cv_precision.mean():.3f} (+/- {cv_precision.std() * 2:.3f})")
print(f"Recall: {cv_recall.mean():.3f} (+/- {cv_recall.std() * 2:.3f})")

# Then your original train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(f"\nTest Set Results:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Original: concussed
0    272716
1        32
Name: count, dtype: int64
Resampled: concussed
0    3200
1     320
Name: count, dtype: int64

Cross-Validation Results:
Accuracy: 0.960 (+/- 0.013)
F1: 0.819 (+/- 0.046)
Precision: 0.694 (+/- 0.067)
Recall: 1.000 (+/- 0.000)

Test Set Results:
              precision    recall  f1-score   support

           0       1.00      0.95      0.98       640
           1       0.69      1.00      0.82        64

    accuracy                           0.96       704
   macro avg       0.84      0.98      0.90       704
weighted avg       0.97      0.96      0.96       704

[[611  29]
 [  0  64]]


In [5]:
# Verify no leakage - check if any plays appear in both train/test
# (if you have play-level features, this could be an issue)
print("Unique plays in dataset:", len(motion_wide))
print("Train + Test samples:", len(X_train) + len(X_test))

Unique plays in dataset: 274208
Train + Test samples: 3520


In [6]:
X.columns

Index(['dis_slice_0', 'dis_slice_1', 'dis_slice_2', 'dis_slice_3',
       'dis_slice_4', 'dis_slice_5', 'dis_slice_6', 'dis_slice_7',
       'dis_slice_8', 'dis_slice_9', 'o_slice_0', 'o_slice_1', 'o_slice_2',
       'o_slice_3', 'o_slice_4', 'o_slice_5', 'o_slice_6', 'o_slice_7',
       'o_slice_8', 'o_slice_9', 'x_slice_0', 'x_slice_1', 'x_slice_2',
       'x_slice_3', 'x_slice_4', 'x_slice_5', 'x_slice_6', 'x_slice_7',
       'x_slice_8', 'x_slice_9', 'y_slice_0', 'y_slice_1', 'y_slice_2',
       'y_slice_3', 'y_slice_4', 'y_slice_5', 'y_slice_6', 'y_slice_7',
       'y_slice_8', 'y_slice_9'],
      dtype='object')

In [7]:
from sklearn.model_selection import train_test_split

# First, split the ORIGINAL data before any resampling
X_original_train, X_holdout, y_original_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Original holdout distribution:")
print(f"Holdout set: {pd.Series(y_holdout).value_counts()}")
print(f"Training set: {pd.Series(y_original_train).value_counts()}")

# SAVE HERE - before scaling/resampling
# Create proper column names
columns = []
for metric in ['dis', 'x', 'y', 'o']:
    for i in range(10):  # or 20 if you used 20 slices
        columns.append(f'{metric}_slice_{i}')

# Save original splits
pd.DataFrame(X, columns=columns).to_csv('motion_features_X_full.csv', index=False)
pd.DataFrame({'concussed': y}).to_csv('motion_labels_y_full.csv', index=False)

pd.DataFrame(X_original_train, columns=columns).to_csv('motion_train_X_original.csv', index=False)
pd.DataFrame({'concussed': y_original_train}).to_csv('motion_train_y_original.csv', index=False)

pd.DataFrame(X_holdout, columns=columns).to_csv('motion_holdout_X.csv', index=False)
pd.DataFrame({'concussed': y_holdout}).to_csv('motion_holdout_y.csv', index=False)

# Now do your resampling ONLY on the training portion
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_original_train)

# Resample only the training data
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_train_scaled, y_original_train)

# SAVE RESAMPLED DATA TOO
pd.DataFrame(X_resampled, columns=columns).to_csv('motion_train_X_resampled.csv', index=False)
pd.DataFrame({'concussed': y_resampled}).to_csv('motion_train_y_resampled.csv', index=False)

# Save the fitted scaler for future use
import joblib
joblib.dump(scaler, 'motion_scaler.pkl')

print("All datasets saved!")

Original holdout distribution:
Holdout set: concussed
0    54544
1        6
Name: count, dtype: int64
Training set: concussed
0    218172
1        26
Name: count, dtype: int64
All datasets saved!


In [10]:
# train again on our properly scaled/resampled data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_resampled, y_resampled)


In [11]:
# Check if probability scores are more useful
X_holdout_scaled = scaler.fit_transform(X_holdout)  
y_proba_holdout = knn.predict_proba(X_holdout_scaled)[:, 1]
print(f"Probability scores for the 6 actual concussions:")
print(y_proba_holdout[y_holdout == 1])

Probability scores for the 6 actual concussions:
[0.6 0.  0.6 0.  0.  0.4]


In [12]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Get probabilities for entire holdout set
y_proba_holdout = knn.predict_proba(X_holdout_scaled)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_holdout, y_proba_holdout)

# Try different thresholds
thresholds_to_try = [0.1, 0.2, 0.3, 0.4, 0.5]

print("Threshold | Precision | Recall | False Positives | True Positives")
print("-" * 65)

for thresh in thresholds_to_try:
    y_pred_thresh = (y_proba_holdout >= thresh).astype(int)
    
    # Calculate metrics manually
    tp = np.sum((y_holdout == 1) & (y_pred_thresh == 1))
    fp = np.sum((y_holdout == 0) & (y_pred_thresh == 1))
    fn = np.sum((y_holdout == 1) & (y_pred_thresh == 0))
    
    precision_val = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall_val = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    print(f"{thresh:8.1f} | {precision_val:9.3f} | {recall_val:6.3f} | {fp:15d} | {tp:14d}")

Threshold | Precision | Recall | False Positives | True Positives
-----------------------------------------------------------------
     0.1 |     0.001 |  0.500 |            5406 |              3
     0.2 |     0.001 |  0.500 |            5406 |              3
     0.3 |     0.001 |  0.500 |            3497 |              3
     0.4 |     0.001 |  0.500 |            3497 |              3
     0.5 |     0.001 |  0.333 |            2161 |              2
