In [1]:
import numpy as np
import pandas as pd

def load_and_normalize(path):
    """Load a CSV, strip and lowercase its column names."""
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    return df


# NGS chunks for 2016 & 2017
ngs_paths = [
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk13-17.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk13-17.csv',
]
df = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)

ndtypes = {'gamekey': 'int16',         
           'playid': 'int16',         
           'gsisid': 'float32',        
           'time': 'str',         
           'x': 'float32',         
           'y': 'float32',         
           'dis': 'float32',
           'o': 'float32',
           'event': 'str'}

df = df.astype(ndtypes)
df.dropna(subset='gsisid', inplace=True)
#df['gsisid'] = df['gsisid'].fillna(-1)

import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def process_motion_chunk(chunk_df):
    # Make an explicit copy to avoid warnings
    chunk_df = chunk_df.copy()
    
    # Convert to datetime
    chunk_df['time'] = pd.to_datetime(chunk_df['time'])
    
    # Calculate relative time from play start
    chunk_df['relative_time'] = chunk_df.groupby(['season_year', 'gamekey', 'playid', 'gsisid'])['time'].transform(
        lambda x: (x - x.min()).dt.total_seconds()
    )
    
    # Create time deciles
    chunk_df['time_decile'] = chunk_df.groupby(['season_year', 'gamekey', 'playid', 'gsisid'])['relative_time'].transform(
        lambda x: pd.cut(x, bins=60, labels=[f'slice_{i}' for i in range(60)])
    )
    
    # Pivot to wide
    motion_wide = chunk_df.pivot_table(
        index=['season_year', 'gamekey', 'playid', 'gsisid'], 
        columns='time_decile',
        values=['dis', 'x', 'y', 'o'],
        aggfunc='mean',
        observed=True
    )
    
    # Flatten columns
    motion_wide.columns = [f'{metric}_{time}' for metric, time in motion_wide.columns]
    
    # Fill missing values by interpolating across time slices
    for metric in ['dis', 'x', 'y', 'o']:
        metric_cols = [f'{metric}_slice_{i}' for i in range(60)]
        motion_wide[metric_cols] = motion_wide[metric_cols].interpolate(axis=1, method='linear')
    
    return motion_wide

# Process in chunks
motion_wide_list = []
unique_games = df['gamekey'].unique()

for i in range(0, len(unique_games), 10):  # Process 10 games at a time
    game_chunk = unique_games[i:i+10]
    chunk_df = df[df['gamekey'].isin(game_chunk)]
    
    motion_wide_chunk = process_motion_chunk(chunk_df)
    motion_wide_list.append(motion_wide_chunk)
    
    print(f"Processed {i+len(game_chunk)} of {len(unique_games)} games")

# Combine all chunks
motion_wide = pd.concat(motion_wide_list, axis=0)

print(len(motion_wide), motion_wide.shape)
motion_wide.reset_index(inplace=True)
revs = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/video_review.csv')
df_final = motion_wide.merge(revs, on=['season_year', 'gamekey', 'playid', 'gsisid'], how='left')
df_final['concussed'] = df_final['player_activity_derived'].notnull().astype(int)
df_final.dropna(subset=['dis_slice_0'], inplace=True)

  df = pd.read_csv(path)


Processed 10 of 632 games
Processed 20 of 632 games
Processed 30 of 632 games
Processed 40 of 632 games
Processed 50 of 632 games
Processed 60 of 632 games
Processed 70 of 632 games
Processed 80 of 632 games
Processed 90 of 632 games
Processed 100 of 632 games
Processed 110 of 632 games
Processed 120 of 632 games
Processed 130 of 632 games
Processed 140 of 632 games
Processed 150 of 632 games
Processed 160 of 632 games
Processed 170 of 632 games


  chunk_df['time'] = pd.to_datetime(chunk_df['time'])


Processed 180 of 632 games
Processed 190 of 632 games
Processed 200 of 632 games
Processed 210 of 632 games
Processed 220 of 632 games
Processed 230 of 632 games
Processed 240 of 632 games
Processed 250 of 632 games
Processed 260 of 632 games
Processed 270 of 632 games
Processed 280 of 632 games
Processed 290 of 632 games
Processed 300 of 632 games
Processed 310 of 632 games
Processed 320 of 632 games
Processed 330 of 632 games
Processed 340 of 632 games
Processed 350 of 632 games
Processed 360 of 632 games
Processed 370 of 632 games
Processed 380 of 632 games
Processed 390 of 632 games
Processed 400 of 632 games
Processed 410 of 632 games
Processed 420 of 632 games
Processed 430 of 632 games
Processed 440 of 632 games
Processed 450 of 632 games
Processed 460 of 632 games
Processed 470 of 632 games
Processed 480 of 632 games
Processed 490 of 632 games
Processed 500 of 632 games
Processed 510 of 632 games
Processed 520 of 632 games
Processed 530 of 632 games
Processed 540 of 632 games
P

  motion_wide.reset_index(inplace=True)
  motion_wide.reset_index(inplace=True)
  motion_wide.reset_index(inplace=True)
  motion_wide.reset_index(inplace=True)


In [2]:
df_final.head()

Unnamed: 0,season_year,gamekey,playid,gsisid,dis_slice_0,dis_slice_1,dis_slice_2,dis_slice_3,dis_slice_4,dis_slice_5,...,y_slice_57,y_slice_58,y_slice_59,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire,concussed
0,2016,6,3236,26957.0,0.103333,0.068,0.066,0.056,0.068,0.022,...,36.258003,35.993999,35.484001,,,,,,,0
1,2016,6,3236,28932.0,0.108333,0.096,0.082,0.064,0.036,0.034,...,21.26,18.789999,16.49,,,,,,,0
2,2016,6,3236,28943.0,0.03,0.014,0.018,0.01,0.024,0.064,...,31.932001,31.614002,31.241999,,,,,,,0
3,2016,6,3236,29414.0,0.301667,0.294,0.272,0.236,0.212,0.158,...,30.91,30.794001,30.689999,,,,,,,0
4,2016,6,3236,29963.0,0.15,0.152222,0.154444,0.156667,0.158889,0.161111,...,-0.43,-0.445,-0.46,,,,,,,0


In [3]:
concussion_features = ['primary_partner_gsisid', 'primary_partner_activity_derived', 'player_activity_derived', 'turnover_related', 'primary_impact_type', 'friendly_fire']
target = 'concussed'

X = df_final.set_index(['season_year', 'gamekey', 'playid', 'gsisid']).drop(columns=concussion_features + [target])
y = df_final[target]

In [4]:
X.isna().sum()

dis_slice_0    0
dis_slice_1    0
dis_slice_2    0
dis_slice_3    0
dis_slice_4    0
              ..
y_slice_55     0
y_slice_56     0
y_slice_57     0
y_slice_58     0
y_slice_59     0
Length: 240, dtype: int64

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Let's be more conservative with the ratios
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_scaled, y)
print(f"Original: {y.value_counts()}")
print(f"Resampled: {pd.Series(y_resampled).value_counts()}")

# Cross-validation before train/test split
knn = KNeighborsClassifier(n_neighbors=5)

# Use stratified k-fold to maintain class balance
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Test multiple metrics
cv_accuracy = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='accuracy')
cv_f1 = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='f1')
cv_precision = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='precision')
cv_recall = cross_val_score(knn, X_resampled, y_resampled, cv=cv, scoring='recall')

print(f"\nCross-Validation Results:")
print(f"Accuracy: {cv_accuracy.mean():.3f} (+/- {cv_accuracy.std() * 2:.3f})")
print(f"F1: {cv_f1.mean():.3f} (+/- {cv_f1.std() * 2:.3f})")
print(f"Precision: {cv_precision.mean():.3f} (+/- {cv_precision.std() * 2:.3f})")
print(f"Recall: {cv_recall.mean():.3f} (+/- {cv_recall.std() * 2:.3f})")

# Then your original train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(f"\nTest Set Results:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Original: concussed
0    272716
1        32
Name: count, dtype: int64
Resampled: concussed
0    3200
1     320
Name: count, dtype: int64

Cross-Validation Results:
Accuracy: 0.952 (+/- 0.012)
F1: 0.793 (+/- 0.041)
Precision: 0.657 (+/- 0.056)
Recall: 1.000 (+/- 0.000)

Test Set Results:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       640
           1       0.62      1.00      0.76        64

    accuracy                           0.94       704
   macro avg       0.81      0.97      0.86       704
weighted avg       0.97      0.94      0.95       704

[[600  40]
 [  0  64]]


In [6]:
# Verify no leakage - check if any plays appear in both train/test
# (if you have play-level features, this could be an issue)
print("Unique plays in dataset:", len(motion_wide))
print("Train + Test samples:", len(X_train) + len(X_test))

Unique plays in dataset: 274208
Train + Test samples: 3520


In [7]:
X.columns

Index(['dis_slice_0', 'dis_slice_1', 'dis_slice_2', 'dis_slice_3',
       'dis_slice_4', 'dis_slice_5', 'dis_slice_6', 'dis_slice_7',
       'dis_slice_8', 'dis_slice_9',
       ...
       'y_slice_50', 'y_slice_51', 'y_slice_52', 'y_slice_53', 'y_slice_54',
       'y_slice_55', 'y_slice_56', 'y_slice_57', 'y_slice_58', 'y_slice_59'],
      dtype='object', length=240)

In [8]:
from sklearn.model_selection import train_test_split

# First, split the ORIGINAL data before any resampling
X_original_train, X_holdout, y_original_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Original holdout distribution:")
print(f"Holdout set: {pd.Series(y_holdout).value_counts()}")
print(f"Training set: {pd.Series(y_original_train).value_counts()}")

# SAVE HERE - before scaling/resampling
# Create proper column names
columns = []
for metric in ['dis', 'x', 'y', 'o']:
    for i in range(60):  # or 20 if you used 20 slices
        columns.append(f'{metric}_slice_{i}')

# Save original splits
pd.DataFrame(X, columns=columns).to_csv('scripts/punt_analytics/motion_features_X_full.csv', index=False)
pd.DataFrame({'concussed': y}).to_csv('scripts/punt_analytics/motion_labels_y_full.csv', index=False)

pd.DataFrame(X_original_train, columns=columns).to_csv('scripts/punt_analytics/motion_train_X_original.csv', index=False)
pd.DataFrame({'concussed': y_original_train}).to_csv('scripts/punt_analytics/motion_train_y_original.csv', index=False)

pd.DataFrame(X_holdout, columns=columns).to_csv('scripts/punt_analytics/motion_holdout_X.csv', index=False)
pd.DataFrame({'concussed': y_holdout}).to_csv('scripts/punt_analytics/motion_holdout_y.csv', index=False)

# Now do your resampling ONLY on the training portion
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_original_train)

# Resample only the training data
undersample = RandomUnderSampler(sampling_strategy=0.01)
oversample = SMOTE(sampling_strategy=0.1)

pipeline = ImbPipeline([
    ('undersample', undersample),
    ('oversample', oversample)
])

X_resampled, y_resampled = pipeline.fit_resample(X_train_scaled, y_original_train)

# SAVE RESAMPLED DATA TOO
pd.DataFrame(X_resampled, columns=columns).to_csv('scripts/punt_analytics/motion_train_X_resampled.csv', index=False)
pd.DataFrame({'concussed': y_resampled}).to_csv('scripts/punt_analytics/motion_train_y_resampled.csv', index=False)

# Save the fitted scaler for future use
import joblib
joblib.dump(scaler, 'scripts/punt_analytics/motion_scaler.pkl')

print("All datasets saved!")

Original holdout distribution:
Holdout set: concussed
0    54544
1        6
Name: count, dtype: int64
Training set: concussed
0    218172
1        26
Name: count, dtype: int64
All datasets saved!


In [9]:
# train again on our properly scaled/resampled data
# Then your original train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f"\nTest Set Results after resampling:")
print(classification_report(y_test, y_pred))



Test Set Results after resampling:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       520
           1       0.76      1.00      0.87        52

    accuracy                           0.97       572
   macro avg       0.88      0.98      0.93       572
weighted avg       0.98      0.97      0.97       572



In [10]:
# Check if probability scores are more useful
X_holdout_scaled = scaler.transform(X_holdout)  
y_proba_holdout = knn.predict_proba(X_holdout_scaled)[:, 1]
print(f"Probability scores for the 6 actual concussions:")
print(y_proba_holdout[y_holdout == 1])

Probability scores for the 6 actual concussions:
[0.6 0.  0.6 0.  0.  0.6]


In [12]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Get probabilities for entire holdout set
y_proba_holdout = knn.predict_proba(X_holdout_scaled)[:, 1]

# Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_holdout, y_proba_holdout)

# Try different thresholds
thresholds_to_try = [0.1, 0.2, 0.3, 0.4, 0.5]

print("Threshold | Precision | Recall | False Positives | True Positives")
print("-" * 65)

for thresh in thresholds_to_try:
    y_pred_thresh = (y_proba_holdout >= thresh).astype(int)
    
    # Calculate metrics manually
    tp = np.sum((y_holdout == 1) & (y_pred_thresh == 1))
    fp = np.sum((y_holdout == 0) & (y_pred_thresh == 1))
    fn = np.sum((y_holdout == 1) & (y_pred_thresh == 0))
    
    precision_val = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall_val = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    print(f"{thresh:8.1f} | {precision_val:9.3f} | {recall_val:6.3f} | {fp:15d} | {tp:14d}")

Threshold | Precision | Recall | False Positives | True Positives
-----------------------------------------------------------------
     0.1 |     0.001 |  0.500 |            5874 |              3
     0.2 |     0.001 |  0.500 |            5874 |              3
     0.3 |     0.001 |  0.500 |            3708 |              3
     0.4 |     0.001 |  0.500 |            3708 |              3
     0.5 |     0.001 |  0.500 |            2175 |              3


In [13]:
df2 = pd.read_csv('/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv')

In [15]:
df_final.head()

Unnamed: 0,season_year,gamekey,playid,gsisid,dis_slice_0,dis_slice_1,dis_slice_2,dis_slice_3,dis_slice_4,dis_slice_5,...,y_slice_57,y_slice_58,y_slice_59,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire,concussed
0,2016,6,3236,26957.0,0.103333,0.068,0.066,0.056,0.068,0.022,...,36.258003,35.993999,35.484001,,,,,,,0
1,2016,6,3236,28932.0,0.108333,0.096,0.082,0.064,0.036,0.034,...,21.26,18.789999,16.49,,,,,,,0
2,2016,6,3236,28943.0,0.03,0.014,0.018,0.01,0.024,0.064,...,31.932001,31.614002,31.241999,,,,,,,0
3,2016,6,3236,29414.0,0.301667,0.294,0.272,0.236,0.212,0.158,...,30.91,30.794001,30.689999,,,,,,,0
4,2016,6,3236,29963.0,0.15,0.152222,0.154444,0.156667,0.158889,0.161111,...,-0.43,-0.445,-0.46,,,,,,,0


In [14]:
df2.head()

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Role
0,2017,414,188,33704,PDL2
1,2017,414,1107,33704,PDL2
2,2017,424,1113,33704,PDR3
3,2017,424,1454,33704,PLR2
4,2017,424,644,33704,PRG


In [16]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146573 entries, 0 to 146572
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Season_Year  146573 non-null  int64 
 1   GameKey      146573 non-null  int64 
 2   PlayID       146573 non-null  int64 
 3   GSISID       146573 non-null  int64 
 4   Role         146573 non-null  object
dtypes: int64(4), object(1)
memory usage: 5.6+ MB


In [24]:
player_play_counts = df2.groupby(['GSISID']).agg({'Season_Year': 'nunique', 'GameKey': 'nunique', 'PlayID': 'count', 'Role' : 'nunique'}).reset_index()

In [25]:
player_play_counts

Unnamed: 0,GSISID,Season_Year,GameKey,PlayID,Role
0,19714,2,41,229,1
1,20578,1,1,2,1
2,20712,2,4,10,5
3,21140,2,14,19,3
4,21146,1,3,3,2
...,...,...,...,...,...
2421,34038,1,1,3,2
2422,34042,1,1,2,2
2423,34046,1,2,9,5
2424,34048,1,1,1,1


In [23]:
len(player_play_counts.loc[player_play_counts['PlayID'] > 100])

556

In [30]:
concussed_ids = df_final.query('concussed == 1')['gsisid'].to_list()

In [31]:
concussed_ids[:10]  # Display first 10 concussed player IDs

[31023.0,
 32410.0,
 28128.0,
 28987.0,
 27595.0,
 32214.0,
 28620.0,
 23742.0,
 32120.0,
 27654.0]

In [33]:
player_play_counts.query('GSISID in @concussed_ids').sort_values(by='PlayID', ascending=False)

Unnamed: 0,GSISID,Season_Year,GameKey,PlayID,Role
486,29492,2,34,286,8
1267,31950,2,34,284,7
336,28128,2,38,278,24
203,27654,2,31,268,19
405,28987,2,41,262,10
558,29793,2,36,246,13
1690,32783,2,33,239,18
1469,32403,2,29,230,20
1291,32007,2,34,221,8
28,23742,2,42,212,1


In [34]:
concussion_movement = df_final.query('gsisid in @concussed_ids').copy()

In [36]:
len(concussion_movement)

5055

In [37]:
player1 = concussion_movement.query('gsisid == 29492').copy()

In [48]:
player1.query('concussed == 1')  # How many concussions for player 29492

Unnamed: 0,season_year,gamekey,playid,gsisid,dis_slice_0,dis_slice_1,dis_slice_2,dis_slice_3,dis_slice_4,dis_slice_5,...,y_slice_57,y_slice_58,y_slice_59,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire,concussed
183845,2017,473,2072,29492.0,0.13,0.1,0.081667,0.058,0.06,0.03,...,20.998333,21.048334,21.233332,Tackling,No,Helmet-to-body,33445,Tackling,Yes,1


In [39]:
player1.to_csv('scripts/punt_analytics/player_29492_movement.csv', index=False)

In [44]:
player1_raw = df.query('gsisid == 29492').copy()  # Assuming you want the raw data for player 29492

In [42]:
player1_raw.head(1000).to_csv('scripts/punt_analytics/player_29492_raw.csv', index=False)

In [46]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

def extract_enhanced_play_features(df):
    """
    Extract comprehensive features from raw NFL tracking data
    Designed to work with your exact data format
    """
    
    def process_single_play(play_data):
        """Process one play for one player"""
        if len(play_data) < 5:
            return None
            
        play_data = play_data.sort_values('time').reset_index(drop=True)
        
        # Convert time to seconds from start
        play_data['time'] = pd.to_datetime(play_data['time'])
        play_data['seconds'] = (play_data['time'] - play_data['time'].min()).dt.total_seconds()
        play_data['dt'] = play_data['seconds'].diff().fillna(0.1)
        
        # Calculate velocities from position changes
        play_data['vx'] = play_data['x'].diff() / play_data['dt']
        play_data['vy'] = play_data['y'].diff() / play_data['dt']
        play_data['speed_calc'] = np.sqrt(play_data['vx']**2 + play_data['vy']**2)
        
        # Use provided speed where available, calculated as backup
        play_data['speed'] = play_data['dis'].fillna(play_data['speed_calc'])
        
        # Calculate accelerations
        play_data['ax'] = play_data['vx'].diff() / play_data['dt']
        play_data['ay'] = play_data['vy'].diff() / play_data['dt']
        play_data['acceleration'] = np.sqrt(play_data['ax']**2 + play_data['ay']**2)
        
        # Jerk (rate of acceleration change)
        play_data['jerk'] = play_data['acceleration'].diff() / play_data['dt']
        
        # Direction changes
        play_data['dir_change'] = play_data['dir'].diff()
        play_data['orient_change'] = play_data['o'].diff()
        
        # Handle angle wraparound
        for col in ['dir_change', 'orient_change']:
            play_data[col] = np.where(play_data[col] > 180, play_data[col] - 360, play_data[col])
            play_data[col] = np.where(play_data[col] < -180, play_data[col] + 360, play_data[col])
        
        # Angular velocities
        play_data['angular_vel'] = play_data['dir_change'] / play_data['dt']
        
        # Key events timing
        events = play_data[play_data['event'].notna() & (play_data['event'] != 'nan')]['event'].tolist()
        snap_time = None
        if 'ball_snap' in events:
            snap_time = play_data[play_data['event'] == 'ball_snap']['seconds'].iloc[0]
        
        # Aggregate features
        features = {
            # Identifiers
            'season_year': play_data['season_year'].iloc[0],
            'gamekey': play_data['gamekey'].iloc[0], 
            'playid': play_data['playid'].iloc[0],
            'gsisid': play_data['gsisid'].iloc[0],
            
            # Basic metrics
            'play_duration': play_data['seconds'].max(),
            'total_distance': play_data['speed'].sum() * 0.1,  # Approximate with 10Hz
            'displacement': np.sqrt((play_data['x'].iloc[-1] - play_data['x'].iloc[0])**2 + 
                                  (play_data['y'].iloc[-1] - play_data['y'].iloc[0])**2),
            
            # Speed characteristics
            'max_speed': play_data['speed'].max(),
            'avg_speed': play_data['speed'].mean(),
            'speed_std': play_data['speed'].std(),
            'speed_95th': play_data['speed'].quantile(0.95),
            'time_above_avg_speed': (play_data['speed'] > play_data['speed'].mean()).sum() / len(play_data),
            
            # Acceleration patterns
            'max_acceleration': play_data['acceleration'].max(),
            'avg_acceleration': play_data['acceleration'].mean(),
            'acceleration_std': play_data['acceleration'].std(),
            'max_deceleration': -play_data['acceleration'].min(),  # Most negative as positive
            'rapid_decelerations': (play_data['acceleration'] < -3.0).sum(),
            'rapid_accelerations': (play_data['acceleration'] > 3.0).sum(),
            
            # Jerk and smoothness
            'max_jerk': play_data['jerk'].abs().max(),
            'avg_jerk': play_data['jerk'].abs().mean(),
            'jerk_95th': play_data['jerk'].abs().quantile(0.95),
            'high_jerk_events': (play_data['jerk'].abs() > play_data['jerk'].abs().quantile(0.9)).sum(),
            
            # Direction and agility
            'total_direction_change': play_data['dir_change'].abs().sum(),
            'max_angular_velocity': play_data['angular_vel'].abs().max(),
            'avg_angular_velocity': play_data['angular_vel'].abs().mean(),
            'sharp_turns': (play_data['angular_vel'].abs() > 45).sum(),  # >45 deg/sec
            
            # Movement variability
            'speed_cv': play_data['speed'].std() / (play_data['speed'].mean() + 1e-6),
            'acceleration_cv': play_data['acceleration'].std() / (play_data['acceleration'].mean() + 1e-6),
            
            # Event-based features
            'has_snap': 1 if 'ball_snap' in events else 0,
            'has_punt': 1 if 'punt' in events else 0,
            'num_events': len(events),
            
            # Position characteristics
            'x_range': play_data['x'].max() - play_data['x'].min(),
            'y_range': play_data['y'].max() - play_data['y'].min(),
            'field_coverage': (play_data['x'].max() - play_data['x'].min()) * (play_data['y'].max() - play_data['y'].min()),
        }
        
        # Movement efficiency
        if features['total_distance'] > 0:
            features['movement_efficiency'] = features['displacement'] / features['total_distance']
        else:
            features['movement_efficiency'] = 0
            
        return features
    
    # Process all plays
    all_features = []
    
    for (season, game, play, player), play_data in df.groupby(['season_year', 'gamekey', 'playid', 'gsisid']):
        features = process_single_play(play_data)
        if features is not None:
            all_features.append(features)
    
    return pd.DataFrame(all_features)

def build_individualized_model(features_df, min_plays=15):
    """
    Build anomaly detection model for each player individually
    """
    results = []
    
    for player_id in features_df['gsisid'].unique():
        player_data = features_df[features_df['gsisid'] == player_id].copy()
        
        if len(player_data) < min_plays:
            # Not enough data for reliable baseline
            player_data['anomaly_score'] = 0
            player_data['is_anomaly'] = False
            player_data['baseline_plays'] = len(player_data)
            results.append(player_data)
            continue
        
        # Features for anomaly detection (exclude identifiers)
        feature_cols = [col for col in player_data.columns 
                       if col not in ['season_year', 'gamekey', 'playid', 'gsisid']]
        
        # Handle missing values
        X = player_data[feature_cols].fillna(player_data[feature_cols].median())
        
        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Fit isolation forest
        iso_forest = IsolationForest(
            contamination=0.1,  # Expect 10% of plays to be anomalous
            random_state=42,
            n_estimators=100
        )
        
        anomaly_labels = iso_forest.fit_predict(X_scaled)
        anomaly_scores = iso_forest.decision_function(X_scaled)
        
        # Add results
        player_data['anomaly_score'] = anomaly_scores
        player_data['is_anomaly'] = anomaly_labels == -1
        player_data['baseline_plays'] = len(player_data)
        
        # Identify top risk factors for anomalous plays
        if anomaly_labels.sum() < 0:  # If any anomalies detected
            feature_importance = np.abs(X_scaled[anomaly_labels == -1].mean(axis=0) - 
                                      X_scaled[anomaly_labels == 1].mean(axis=0))
            top_features = np.argsort(feature_importance)[-3:][::-1]  # Top 3
            risk_factors = [feature_cols[i] for i in top_features]
        else:
            risk_factors = []
        
        player_data['top_risk_factors'] = str(risk_factors)
        results.append(player_data)
    
    return pd.concat(results, ignore_index=True)

def evaluate_injury_prediction(results_df, injury_labels_df):
    """
    Evaluate how well anomaly detection predicts injuries
    """
    # Merge with injury labels
    merged = results_df.merge(
        injury_labels_df[['season_year', 'gamekey', 'playid', 'gsisid', 'concussed']], 
        on=['season_year', 'gamekey', 'playid', 'gsisid'], 
        how='left'
    )
    merged['concussed'] = merged['concussed'].fillna(0)
    
    # Only evaluate players with sufficient baseline
    reliable_data = merged[merged['baseline_plays'] >= 15]
    
    if len(reliable_data) == 0:
        print("No players have sufficient baseline data (15+ plays)")
        return None
    
    print(f"Evaluation on {len(reliable_data)} plays from players with 15+ baseline plays")
    print(f"Injury rate: {reliable_data['concussed'].mean():.4f}")
    print(f"Anomaly rate: {reliable_data['is_anomaly'].mean():.4f}")
    
    # Basic classification metrics
    if reliable_data['concussed'].sum() > 0:
        print("\nClassification Report:")
        print(classification_report(reliable_data['concussed'], reliable_data['is_anomaly']))
        
        # Precision-Recall analysis
        precision, recall, thresholds = precision_recall_curve(
            reliable_data['concussed'], reliable_data['anomaly_score']
        )
        
        print(f"\nBest threshold analysis:")
        best_f1_idx = np.argmax(2 * precision * recall / (precision + recall + 1e-6))
        best_threshold = thresholds[best_f1_idx]
        
        predictions_best = (reliable_data['anomaly_score'] >= best_threshold).astype(int)
        print(f"Best threshold: {best_threshold:.3f}")
        print(classification_report(reliable_data['concussed'], predictions_best))
        
        # ROC-AUC if possible
        if len(np.unique(reliable_data['concussed'])) > 1:
            auc = roc_auc_score(reliable_data['concussed'], reliable_data['anomaly_score'])
            print(f"ROC-AUC: {auc:.3f}")
    
    return reliable_data

# Main pipeline function
def run_injury_prediction_pipeline(tracking_data_path, injury_labels_path):
    """
    Complete pipeline from raw data to injury prediction evaluation
    """
    print("Loading data...")
    tracking_df = pd.read_csv(tracking_data_path)
    injury_df = pd.read_csv(injury_labels_path)
    
    print(f"Loaded {len(tracking_df)} tracking records for {tracking_df['gsisid'].nunique()} players")
    print(f"Loaded {len(injury_df)} injury labels")
    
    print("\nExtracting play-level features...")
    features_df = extract_enhanced_play_features(tracking_df)
    print(f"Extracted features for {len(features_df)} plays")
    
    print("\nBuilding individualized anomaly models...")
    results_df = build_individualized_model(features_df)
    
    print("\nEvaluating injury prediction performance...")
    evaluation_results = evaluate_injury_prediction(results_df, injury_df)
    
    return results_df, evaluation_results

# Quick test function for your sample data
def test_on_sample_data():
    """
    Test the pipeline on your player 29492 sample
    """
    # Load your sample file
    #df = pd.read_csv('player_29492_raw.csv')
    df = player1_raw.copy()  # Use the player 29492 raw data
    
    print(f"Sample data: {len(df)} records across {df['playid'].nunique()} plays")
    
    # Extract features
    features = extract_enhanced_play_features(df)
    print(f"Extracted {len(features)} play-level feature sets")
    print(f"Features per play: {len(features.columns) - 4}")  # Minus identifiers
    
    # Show sample features
    print("\nSample features for first play:")
    first_play_features = features.iloc[0]
    for key, value in first_play_features.items():
        if isinstance(value, (int, float)):
            print(f"  {key}: {value:.3f}")
        else:
            print(f"  {key}: {value}")
    
    # Build anomaly model (if enough plays)
    if len(features) >= 15:
        results = build_individualized_model(features)
        print(f"\nAnomaly detection results:")
        print(f"  Anomalous plays: {results['is_anomaly'].sum()}/{len(results)}")
        print(f"  Anomaly scores range: {results['anomaly_score'].min():.3f} to {results['anomaly_score'].max():.3f}")
        
        if results['is_anomaly'].sum() > 0:
            anomalous_plays = results[results['is_anomaly']]
            print(f"  Anomalous play characteristics:")
            print(f"    Avg max speed: {anomalous_plays['max_speed'].mean():.3f}")
            print(f"    Avg max acceleration: {anomalous_plays['max_acceleration'].mean():.3f}")
            print(f"    Avg max jerk: {anomalous_plays['max_jerk'].mean():.3f}")
            
            # Show detailed anomalous plays for lookup
            print(f"\nDetailed anomalous plays:")
            print(f"{'Season':<6} {'Game':<8} {'Play':<8} {'GSISID':<8} {'Score':<8} {'Max Speed':<10} {'Max Accel':<10} {'Max Jerk':<10}")
            print("-" * 80)
            
            for _, play in anomalous_plays.iterrows():
                print(f"{play['season_year']:<6.0f} {play['gamekey']:<8.0f} {play['playid']:<8.0f} {play['gsisid']:<8.0f} "
                      f"{play['anomaly_score']:<8.3f} {play['max_speed']:<10.3f} {play['max_acceleration']:<10.3f} {play['max_jerk']:<10.3f}")
            
            # Also save to CSV for easy lookup
            anomaly_lookup = anomalous_plays[['season_year', 'gamekey', 'playid', 'gsisid', 'anomaly_score', 
                                            'max_speed', 'max_acceleration', 'max_jerk', 'top_risk_factors']].copy()
            anomaly_lookup.to_csv('anomalous_plays_lookup.csv', index=False)
            print(f"\nSaved detailed anomalous plays to 'anomalous_plays_lookup.csv'")
            
    else:
        print(f"Need at least 15 plays for anomaly detection, only have {len(features)}")
    
    return features

# Example usage
if __name__ == "__main__":
    # Test on your sample data
    sample_features = test_on_sample_data()
    
    # For full pipeline (when you have complete data):
    # results, evaluation = run_injury_prediction_pipeline(
    #     'your_full_tracking_data.csv', 
    #     'your_injury_labels.csv'
    # )

Sample data: 103390 records across 279 plays
Extracted 288 play-level feature sets
Features per play: 31

Sample features for first play:
  season_year: 2016.000
  gamekey: 13.000
  playid: 2034.000
  gsisid: 29492.000
  play_duration: 37.400
  total_distance: 8.748
  displacement: 51.990
  max_speed: 1.050
  avg_speed: 0.277
  speed_std: 0.336
  speed_95th: 1.022
  time_above_avg_speed: 0.294
  max_acceleration: 20.396
  avg_acceleration: 4.397
  acceleration_std: 3.587
  max_deceleration: -0.000
  rapid_decelerations: 0.000
  rapid_accelerations: 176.000
  max_jerk: 163.957
  avg_jerk: 25.591
  jerk_95th: 83.841
  high_jerk_events: 32.000
  total_direction_change: 2733.360
  max_angular_velocity: 1125.900
  avg_angular_velocity: 83.131
  sharp_turns: 147.000
  speed_cv: 1.215
  acceleration_cv: 0.816
  has_snap: 1.000
  has_punt: 1.000
  num_events: 8.000
  x_range: 57.390
  y_range: 23.560
  field_coverage: 1352.108
  movement_efficiency: 5.943

Anomaly detection results:
  Anomalou

In [49]:
player1.query('concussed == 1')  # How many concussions for player 29492

Unnamed: 0,season_year,gamekey,playid,gsisid,dis_slice_0,dis_slice_1,dis_slice_2,dis_slice_3,dis_slice_4,dis_slice_5,...,y_slice_57,y_slice_58,y_slice_59,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire,concussed
183845,2017,473,2072,29492.0,0.13,0.1,0.081667,0.058,0.06,0.03,...,20.998333,21.048334,21.233332,Tackling,No,Helmet-to-body,33445,Tackling,Yes,1


In [50]:
concussions = df_final.query('concussed == 1')[['season_year', 'gamekey', 'playid', 'gsisid']].copy()

In [56]:
concussions

Unnamed: 0,season_year,gamekey,playid,gsisid
984,2016,29,538,31023.0
43873,2016,144,2342,32410.0
45407,2016,149,3663,28128.0
59696,2016,218,3468,28987.0
63375,2016,189,3509,27595.0
70803,2016,231,1976,32214.0
71934,2016,234,3278,28620.0
92916,2016,274,3609,23742.0
93459,2016,280,2918,32120.0
93546,2016,280,3746,27654.0


In [52]:
concussions.to_csv('scripts/punt_analytics/concussions.csv', index=False)

In [53]:
test2 = df.query('gsisid in @concussed_ids').copy()

In [54]:
test2.head()

Unnamed: 0,season_year,gamekey,playid,gsisid,time,x,y,dis,o,dir,event
7176,2016,16,254,30171.0,2016-08-11 23:14:50.300,75.5,22.209999,0.03,347.980011,109.949997,
7185,2016,16,254,30171.0,2016-08-11 23:14:50.700,75.839996,21.879999,0.17,343.390015,100.349998,
7196,2016,16,254,30171.0,2016-08-11 23:14:50.900,76.110001,21.549999,0.26,336.880005,102.300003,
7214,2016,16,254,30171.0,2016-08-11 23:14:50.400,75.540001,22.190001,0.05,348.559998,103.279999,
7225,2016,16,254,30171.0,2016-08-11 23:14:50.600,75.730003,22.0,0.15,346.420013,98.290001,


In [57]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

def load_all_concussion_players(tracking_data_path, concussion_events_path):
    """
    Load tracking data for all players who had concussions
    """
    # Load concussion events
    concussions_df = concussions
    concussion_players = concussions_df['gsisid'].unique()

    print(f"Found {len(concussion_players)} unique players with concussions")
    print(f"Total concussion events: {len(concussions)}")
    
    # Load tracking data
    tracking_df = df
    
    # Filter to only players who had concussions
    concussion_tracking = tracking_df[tracking_df['gsisid'].isin(concussion_players)]
    
    print(f"Loaded tracking data for concussion players: {len(concussion_tracking)} records")
    print(f"Covers {concussion_tracking['gsisid'].nunique()} players across {concussion_tracking['playid'].nunique()} plays")
    
    return concussion_tracking, concussions

def extract_enhanced_play_features_v2(df):
    """
    Enhanced version that handles potential data quality issues better
    """
    
    def process_single_play(play_data):
        """Process one play for one player with better error handling"""
        if len(play_data) < 5:
            return None
            
        play_data = play_data.sort_values('time').reset_index(drop=True)
        
        try:
            # Convert time to seconds from start
            play_data['time'] = pd.to_datetime(play_data['time'])
            play_data['seconds'] = (play_data['time'] - play_data['time'].min()).dt.total_seconds()
            play_data['dt'] = play_data['seconds'].diff().fillna(0.1)
            
            # Handle edge case where dt might be 0
            play_data['dt'] = play_data['dt'].replace(0, 0.1)
            
            # Calculate velocities from position changes
            play_data['vx'] = play_data['x'].diff() / play_data['dt']
            play_data['vy'] = play_data['y'].diff() / play_data['dt']
            play_data['speed_calc'] = np.sqrt(play_data['vx']**2 + play_data['vy']**2)
            
            # Use provided speed where available, calculated as backup
            play_data['speed'] = play_data['dis'].fillna(play_data['speed_calc'])
            
            # Calculate accelerations
            play_data['ax'] = play_data['vx'].diff() / play_data['dt']
            play_data['ay'] = play_data['vy'].diff() / play_data['dt']
            play_data['acceleration'] = np.sqrt(play_data['ax']**2 + play_data['ay']**2)
            
            # Handle infinite values
            for col in ['vx', 'vy', 'speed_calc', 'ax', 'ay', 'acceleration']:
                play_data[col] = play_data[col].replace([np.inf, -np.inf], np.nan)
                play_data[col] = play_data[col].fillna(0)
            
            # Jerk (rate of acceleration change)
            play_data['jerk'] = play_data['acceleration'].diff() / play_data['dt']
            play_data['jerk'] = play_data['jerk'].replace([np.inf, -np.inf], np.nan).fillna(0)
            
            # Direction changes
            play_data['dir_change'] = play_data['dir'].diff()
            play_data['orient_change'] = play_data['o'].diff()
            
            # Handle angle wraparound
            for col in ['dir_change', 'orient_change']:
                play_data[col] = np.where(play_data[col] > 180, play_data[col] - 360, play_data[col])
                play_data[col] = np.where(play_data[col] < -180, play_data[col] + 360, play_data[col])
            
            # Angular velocities
            play_data['angular_vel'] = play_data['dir_change'] / play_data['dt']
            play_data['angular_vel'] = play_data['angular_vel'].replace([np.inf, -np.inf], np.nan).fillna(0)
            
            # Key events
            events = play_data[play_data['event'].notna() & (play_data['event'] != 'nan')]['event'].tolist()
            
            # Aggregate features with error handling
            features = {
                # Identifiers
                'season_year': play_data['season_year'].iloc[0],
                'gamekey': play_data['gamekey'].iloc[0], 
                'playid': play_data['playid'].iloc[0],
                'gsisid': play_data['gsisid'].iloc[0],
                
                # Basic metrics
                'play_duration': play_data['seconds'].max(),
                'total_distance': max(0, play_data['speed'].sum() * 0.1),
                'displacement': np.sqrt((play_data['x'].iloc[-1] - play_data['x'].iloc[0])**2 + 
                                      (play_data['y'].iloc[-1] - play_data['y'].iloc[0])**2),
                
                # Speed characteristics
                'max_speed': play_data['speed'].max(),
                'avg_speed': play_data['speed'].mean(),
                'speed_std': play_data['speed'].std() if len(play_data) > 1 else 0,
                'speed_95th': play_data['speed'].quantile(0.95),
                'time_above_avg_speed': (play_data['speed'] > play_data['speed'].mean()).sum() / len(play_data),
                
                # Acceleration patterns
                'max_acceleration': play_data['acceleration'].max(),
                'avg_acceleration': play_data['acceleration'].mean(),
                'acceleration_std': play_data['acceleration'].std() if len(play_data) > 1 else 0,
                'max_deceleration': max(0, -play_data['acceleration'].min()),
                'rapid_decelerations': (play_data['acceleration'] < -3.0).sum(),
                'rapid_accelerations': (play_data['acceleration'] > 3.0).sum(),
                
                # Jerk and smoothness
                'max_jerk': play_data['jerk'].abs().max(),
                'avg_jerk': play_data['jerk'].abs().mean(),
                'jerk_95th': play_data['jerk'].abs().quantile(0.95),
                'high_jerk_events': (play_data['jerk'].abs() > play_data['jerk'].abs().quantile(0.9)).sum(),
                
                # Direction and agility
                'total_direction_change': play_data['dir_change'].abs().sum(),
                'max_angular_velocity': play_data['angular_vel'].abs().max(),
                'avg_angular_velocity': play_data['angular_vel'].abs().mean(),
                'sharp_turns': (play_data['angular_vel'].abs() > 45).sum(),
                
                # Movement variability
                'speed_cv': play_data['speed'].std() / (play_data['speed'].mean() + 1e-6),
                'acceleration_cv': play_data['acceleration'].std() / (play_data['acceleration'].mean() + 1e-6),
                
                # Event-based features
                'has_snap': 1 if 'ball_snap' in events else 0,
                'has_punt': 1 if 'punt' in events else 0,
                'num_events': len(events),
                
                # Position characteristics
                'x_range': play_data['x'].max() - play_data['x'].min(),
                'y_range': play_data['y'].max() - play_data['y'].min(),
                'field_coverage': (play_data['x'].max() - play_data['x'].min()) * (play_data['y'].max() - play_data['y'].min()),
            }
            
            # Movement efficiency
            if features['total_distance'] > 0:
                features['movement_efficiency'] = features['displacement'] / features['total_distance']
            else:
                features['movement_efficiency'] = 0
                
            return features
            
        except Exception as e:
            print(f"Error processing play {play_data['playid'].iloc[0]} for player {play_data['gsisid'].iloc[0]}: {e}")
            return None
    
    # Process all plays
    all_features = []
    
    for (season, game, play, player), play_data in df.groupby(['season_year', 'gamekey', 'playid', 'gsisid']):
        features = process_single_play(play_data)
        if features is not None:
            all_features.append(features)
    
    return pd.DataFrame(all_features)

def evaluate_concussion_detection_by_player(tracking_df, concussions_df, min_baseline_plays=15):
    """
    Test anomaly detection for each player who had a concussion
    """
    results_summary = []
    detailed_results = []
    
    for player_id in concussions_df['gsisid'].unique():
        print(f"\nProcessing Player {player_id}...")
        
        # Get this player's data
        player_tracking = tracking_df[tracking_df['gsisid'] == player_id]
        player_concussions = concussions_df[concussions_df['gsisid'] == player_id]
        
        # Extract features
        try:
            player_features = extract_enhanced_play_features_v2(player_tracking)
        except Exception as e:
            print(f"  Error extracting features: {e}")
            continue
            
        if len(player_features) == 0:
            print(f"  No valid features extracted")
            continue
            
        print(f"  Extracted {len(player_features)} plays")
        print(f"  Player had {len(player_concussions)} concussion events")
        
        # Check if we have enough plays for baseline
        if len(player_features) < min_baseline_plays:
            print(f"  Insufficient baseline data ({len(player_features)} < {min_baseline_plays})")
            
            # Still record basic info
            for _, concussion in player_concussions.iterrows():
                results_summary.append({
                    'player_id': player_id,
                    'season_year': concussion['season_year'],
                    'gamekey': concussion['gamekey'],
                    'playid': concussion['playid'],
                    'total_plays': len(player_features),
                    'baseline_sufficient': False,
                    'concussion_detected': False,
                    'anomaly_score': None,
                    'reason': 'insufficient_baseline'
                })
            continue
        
        # Build anomaly model
        feature_cols = [col for col in player_features.columns 
                       if col not in ['season_year', 'gamekey', 'playid', 'gsisid']]
        
        X = player_features[feature_cols].fillna(player_features[feature_cols].median())
        
        # Handle edge case where all features are the same
        if X.std().sum() == 0:
            print(f"  No variation in features - skipping anomaly detection")
            continue
            
        # Standardize and fit model
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        iso_forest = IsolationForest(contamination=0.1, random_state=42, n_estimators=100)
        anomaly_labels = iso_forest.fit_predict(X_scaled)
        anomaly_scores = iso_forest.decision_function(X_scaled)
        
        # Add anomaly results to features
        player_features['anomaly_score'] = anomaly_scores
        player_features['is_anomaly'] = anomaly_labels == -1
        
        print(f"  Detected {(anomaly_labels == -1).sum()} anomalous plays out of {len(player_features)}")
        
        # Check each concussion event
        for _, concussion in player_concussions.iterrows():
            # Find matching play in features
            matching_play = player_features[
                (player_features['season_year'] == concussion['season_year']) &
                (player_features['gamekey'] == concussion['gamekey']) &
                (player_features['playid'] == concussion['playid'])
            ]
            
            if len(matching_play) == 0:
                print(f"    Concussion play not found in features: {concussion['season_year']}-{concussion['gamekey']}-{concussion['playid']}")
                results_summary.append({
                    'player_id': player_id,
                    'season_year': concussion['season_year'],
                    'gamekey': concussion['gamekey'],
                    'playid': concussion['playid'],
                    'total_plays': len(player_features),
                    'baseline_sufficient': True,
                    'concussion_detected': False,
                    'anomaly_score': None,
                    'reason': 'play_not_found'
                })
            else:
                play_data = matching_play.iloc[0]
                detected = play_data['is_anomaly']
                score = play_data['anomaly_score']
                
                print(f"    Concussion play {concussion['playid']}: {'DETECTED' if detected else 'MISSED'} (score: {score:.3f})")
                
                results_summary.append({
                    'player_id': player_id,
                    'season_year': concussion['season_year'],
                    'gamekey': concussion['gamekey'],
                    'playid': concussion['playid'],
                    'total_plays': len(player_features),
                    'baseline_sufficient': True,
                    'concussion_detected': detected,
                    'anomaly_score': score,
                    'max_speed': play_data['max_speed'],
                    'max_acceleration': play_data['max_acceleration'],
                    'max_jerk': play_data['max_jerk'],
                    'reason': 'evaluated'
                })
        
        # Store detailed results for this player
        detailed_results.append({
            'player_id': player_id,
            'features': player_features,
            'concussions': player_concussions
        })
    
    return pd.DataFrame(results_summary), detailed_results

def analyze_results_by_position(results_df, position_mapping=None):
    """
    Analyze detection performance by player position if position data available
    """
    print("\n" + "="*60)
    print("CONCUSSION DETECTION SUMMARY")
    print("="*60)
    
    # Overall statistics
    total_concussions = len(results_df)
    evaluable_concussions = len(results_df[results_df['baseline_sufficient'] == True])
    detected_concussions = len(results_df[results_df['concussion_detected'] == True])
    
    print(f"Total concussion events: {total_concussions}")
    print(f"Evaluable (sufficient baseline): {evaluable_concussions}")
    print(f"Detected by anomaly model: {detected_concussions}")
    
    if evaluable_concussions > 0:
        detection_rate = detected_concussions / evaluable_concussions
        print(f"Detection rate: {detection_rate:.2%}")
        
        # Show breakdown by reason
        print(f"\nBreakdown by evaluation status:")
        status_counts = results_df['reason'].value_counts()
        for reason, count in status_counts.items():
            print(f"  {reason}: {count}")
        
        # Show anomaly scores for detected vs missed
        detected = results_df[results_df['concussion_detected'] == True]
        missed = results_df[(results_df['concussion_detected'] == False) & (results_df['baseline_sufficient'] == True)]
        
        if len(detected) > 0:
            print(f"\nDetected concussions:")
            print(f"  Avg anomaly score: {detected['anomaly_score'].mean():.3f}")
            print(f"  Avg max speed: {detected['max_speed'].mean():.3f}")
            print(f"  Avg max acceleration: {detected['max_acceleration'].mean():.3f}")
            print(f"  Avg max jerk: {detected['max_jerk'].mean():.3f}")
        
        if len(missed) > 0:
            print(f"\nMissed concussions:")
            print(f"  Avg anomaly score: {missed['anomaly_score'].mean():.3f}")
            print(f"  Avg max speed: {missed['max_speed'].mean():.3f}")
            print(f"  Avg max acceleration: {missed['max_acceleration'].mean():.3f}")
            print(f"  Avg max jerk: {missed['max_jerk'].mean():.3f}")
    
    # Position analysis if available
    if position_mapping is not None:
        print(f"\nPosition-based analysis:")
        results_with_pos = results_df.merge(position_mapping, left_on='player_id', right_on='gsisid', how='left')
        
        if 'position' in results_with_pos.columns:
            for position in results_with_pos['position'].unique():
                if pd.isna(position):
                    continue
                pos_data = results_with_pos[results_with_pos['position'] == position]
                pos_evaluable = pos_data[pos_data['baseline_sufficient'] == True]
                pos_detected = pos_data[pos_data['concussion_detected'] == True]
                
                if len(pos_evaluable) > 0:
                    pos_rate = len(pos_detected) / len(pos_evaluable)
                    print(f"  {position}: {len(pos_detected)}/{len(pos_evaluable)} detected ({pos_rate:.2%})")
    
    return results_df

# Main execution function
def run_comprehensive_concussion_evaluation(tracking_data_path, concussion_events_path, position_data_path=None):
    """
    Run complete evaluation across all concussion cases
    """
    print("Loading concussion player data...")
    tracking_df, concussions_df = load_all_concussion_players(tracking_data_path, concussion_events_path)
    
    # Load position data if available
    position_mapping = None
    if position_data_path:
        try:
            position_mapping = pd.read_csv(position_data_path)
            print(f"Loaded position data for {len(position_mapping)} players")
        except:
            print("Could not load position data - proceeding without position analysis")
    
    print("\nEvaluating anomaly detection for each concussion case...")
    results_df, detailed_results = evaluate_concussion_detection_by_player(tracking_df, concussions_df)
    
    print("\nAnalyzing results...")
    final_results = analyze_results_by_position(results_df, position_mapping)
    
    # Save results
    final_results.to_csv('concussion_detection_results.csv', index=False)
    print(f"\nSaved detailed results to 'concussion_detection_results.csv'")
    
    return final_results, detailed_results

# Example usage
if __name__ == "__main__":
    # Run the comprehensive evaluation
    results, details = run_comprehensive_concussion_evaluation(
        'your_full_tracking_data.csv',  # Path to complete tracking data
        'concussions.csv',              # Your concussion events file
        'player_positions.csv'          # Optional: player position mapping
    )

Loading concussion player data...
Found 31 unique players with concussions
Total concussion events: 32
Loaded tracking data for concussion players: 1603766 records
Covers 31 players across 2329 plays
Could not load position data - proceeding without position analysis

Evaluating anomaly detection for each concussion case...

Processing Player 31023.0...
  Extracted 88 plays
  Player had 1 concussion events
  Detected 9 anomalous plays out of 88
    Concussion play 538.0: DETECTED (score: -0.041)

Processing Player 32410.0...
  Extracted 210 plays
  Player had 1 concussion events
  Detected 21 anomalous plays out of 210
    Concussion play 2342.0: MISSED (score: 0.108)

Processing Player 28128.0...
  Extracted 270 plays
  Player had 1 concussion events
  Detected 27 anomalous plays out of 270
    Concussion play 3663.0: MISSED (score: 0.042)

Processing Player 28987.0...
  Extracted 292 plays
  Player had 2 concussion events
  Detected 30 anomalous plays out of 292
    Concussion play 3

In [60]:
test2.to_csv('scripts/punt_analytics/concussedplayer_movement.csv', index=False)

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

def analyze_detected_concussions(tracking_data_path, video_review_path, play_player_role_path, 
                                game_data_path, play_info_path):
    """
    Deep dive analysis of the 7 detected concussion cases
    """
    
    # The 7 detected concussion cases from your results
    detected_cases = [
        {'player_id': 31023.0, 'season_year': 2016, 'gamekey': 24, 'playid': 538.0, 'score': -0.041},
        {'player_id': 23742.0, 'season_year': 2016, 'gamekey': 31, 'playid': 3609.0, 'score': -0.161},
        {'player_id': 23564.0, 'season_year': 2016, 'gamekey': 29, 'playid': 2902.0, 'score': -0.033},
        {'player_id': 26035.0, 'season_year': 2016, 'gamekey': 15, 'playid': 3312.0, 'score': -0.080},
        {'player_id': 27060.0, 'season_year': 2016, 'gamekey': 34, 'playid': 1988.0, 'score': -0.086},
        {'player_id': 29793.0, 'season_year': 2016, 'gamekey': 14, 'playid': 978.0, 'score': -0.028},
        {'player_id': 30384.0, 'season_year': 2016, 'gamekey': 24, 'playid': 733.0, 'score': -0.134}
    ]
    
    print("Loading datasets...")
    
    # Load all datasets
    tracking_df = pd.read_csv(tracking_data_path)
    video_review = pd.read_csv(video_review_path)
    play_roles = pd.read_csv(play_player_role_path)
    game_data = pd.read_csv(game_data_path)
    play_info = pd.read_csv(play_info_path)
    
    print("Analyzing detected concussion cases...")
    
    analysis_results = []
    
    for case in detected_cases:
        print(f"\n{'='*60}")
        print(f"CASE: Player {case['player_id']:.0f} - Game {case['gamekey']} Play {case['playid']:.0f}")
        print(f"Anomaly Score: {case['score']:.3f}")
        print(f"{'='*60}")
        
        # Get video review details for this case
        video_details = video_review[
            (video_review['season_year'] == case['season_year']) &
            (video_review['gamekey'] == case['gamekey']) &
            (video_review['playid'] == case['playid']) &
            (video_review['gsisid'] == case['player_id'])
        ]
        
        if len(video_details) > 0:
            vd = video_details.iloc[0]
            print(f"INJURY DETAILS:")
            print(f"  Player Activity: {vd.get('player_activity_derived', 'N/A')}")
            print(f"  Impact Type: {vd.get('primary_impact_type', 'N/A')}")
            print(f"  Partner Activity: {vd.get('primary_partner_activity_derived', 'N/A')}")
            print(f"  Friendly Fire: {vd.get('friendly_fire', 'N/A')}")
            print(f"  Turnover Related: {vd.get('turnover_related', 'N/A')}")
            
            if pd.notna(vd.get('primary_partner_gsisid')):
                print(f"  Partner Player ID: {vd.get('primary_partner_gsisid')}")
        else:
            print("  No video review data found")
            vd = None
        
        # Get player role for this play
        player_role = play_roles[
            (play_roles['season_year'] == case['season_year']) &
            (play_roles['gamekey'] == case['gamekey']) &
            (play_roles['playid'] == case['playid']) &
            (play_roles['gsisid'] == case['player_id'])
        ]
        
        role = player_role['role'].iloc[0] if len(player_role) > 0 else 'Unknown'
        print(f"  Player Role: {role}")
        
        # Get play context
        play_context = play_info[
            (play_info['season_year'] == case['season_year']) &
            (play_info['gamekey'] == case['gamekey']) &
            (play_info['playid'] == case['playid'])
        ]
        
        if len(play_context) > 0:
            pc = play_context.iloc[0]
            print(f"PLAY CONTEXT:")
            print(f"  Quarter: {pc.get('quarter', 'N/A')}")
            print(f"  Game Clock: {pc.get('game_clock', 'N/A')}")
            print(f"  Yard Line: {pc.get('yardline', 'N/A')}")
            print(f"  Score: {pc.get('score_home_visiting', 'N/A')}")
            print(f"  Play Description: {pc.get('playdescription', 'N/A')[:100]}...")
        
        # Get game details
        game_details = game_data[
            (game_data['season_year'] == case['season_year']) &
            (game_data['gamekey'] == case['gamekey'])
        ]
        
        if len(game_details) > 0:
            gd = game_details.iloc[0]
            print(f"GAME CONDITIONS:")
            print(f"  Weather: {gd.get('gameweather', 'N/A')}")
            print(f"  Temperature: {gd.get('temperature', 'N/A')}")
            print(f"  Turf: {gd.get('turf', 'N/A')}")
            print(f"  Stadium Type: {gd.get('stadiumtype', 'N/A')}")
        
        # Get tracking data for this specific play
        play_tracking = tracking_df[
            (tracking_df['season_year'] == case['season_year']) &
            (tracking_df['gamekey'] == case['gamekey']) &
            (tracking_df['playid'] == case['playid']) &
            (tracking_df['gsisid'] == case['player_id'])
        ].copy()
        
        if len(play_tracking) > 0:
            # Analyze movement patterns
            play_tracking = play_tracking.sort_values('time')
            play_tracking['time'] = pd.to_datetime(play_tracking['time'])
            play_tracking['seconds'] = (play_tracking['time'] - play_tracking['time'].min()).dt.total_seconds()
            
            # Calculate velocities and accelerations
            play_tracking['dt'] = play_tracking['seconds'].diff().fillna(0.1)
            play_tracking['vx'] = play_tracking['x'].diff() / play_tracking['dt']
            play_tracking['vy'] = play_tracking['y'].diff() / play_tracking['dt']
            play_tracking['speed_calc'] = np.sqrt(play_tracking['vx']**2 + play_tracking['vy']**2)
            play_tracking['ax'] = play_tracking['vx'].diff() / play_tracking['dt']
            play_tracking['ay'] = play_tracking['vy'].diff() / play_tracking['dt']
            play_tracking['acceleration'] = np.sqrt(play_tracking['ax']**2 + play_tracking['ay']**2)
            
            # Clean infinite values
            for col in ['vx', 'vy', 'speed_calc', 'ax', 'ay', 'acceleration']:
                play_tracking[col] = play_tracking[col].replace([np.inf, -np.inf], np.nan).fillna(0)
            
            print(f"MOVEMENT ANALYSIS:")
            print(f"  Play Duration: {play_tracking['seconds'].max():.1f} seconds")
            print(f"  Max Speed: {play_tracking['dis'].max():.2f} yards/sec")
            print(f"  Max Acceleration: {play_tracking['acceleration'].max():.2f} yards/sec²")
            print(f"  Total Distance: {play_tracking['dis'].sum() * 0.1:.2f} yards")
            
            # Look for key events during the play
            events = play_tracking[play_tracking['event'].notna() & (play_tracking['event'] != 'nan')]
            if len(events) > 0:
                print(f"  Key Events:")
                for _, event in events.iterrows():
                    print(f"    {event['seconds']:.1f}s: {event['event']} (speed: {event['dis']:.2f})")
            
            # Find peak movement moments
            max_speed_time = play_tracking.loc[play_tracking['dis'].idxmax(), 'seconds']
            max_accel_time = play_tracking.loc[play_tracking['acceleration'].idxmax(), 'seconds']
            print(f"  Peak speed at: {max_speed_time:.1f}s")
            print(f"  Peak acceleration at: {max_accel_time:.1f}s")
        
        # Store for summary analysis
        analysis_results.append({
            'player_id': case['player_id'],
            'anomaly_score': case['score'],
            'role': role,
            'impact_type': vd.get('primary_impact_type') if vd is not None else None,
            'player_activity': vd.get('player_activity_derived') if vd is not None else None,
            'partner_activity': vd.get('primary_partner_activity_derived') if vd is not None else None,
            'friendly_fire': vd.get('friendly_fire') if vd is not None else None,
            'max_speed': play_tracking['dis'].max() if len(play_tracking) > 0 else None,
            'max_acceleration': play_tracking['acceleration'].max() if len(play_tracking) > 0 else None,
            'play_duration': play_tracking['seconds'].max() if len(play_tracking) > 0 else None
        })
    
    # Summary analysis
    print(f"\n{'='*60}")
    print("SUMMARY ANALYSIS OF DETECTED CASES")
    print(f"{'='*60}")
    
    results_df = pd.DataFrame(analysis_results)
    
    # Role distribution
    if 'role' in results_df.columns:
        print("Role Distribution:")
        role_counts = results_df['role'].value_counts()
        for role, count in role_counts.items():
            print(f"  {role}: {count}")
    
    # Impact type distribution
    if 'impact_type' in results_df.columns:
        print("\nImpact Type Distribution:")
        impact_counts = results_df['impact_type'].value_counts()
        for impact, count in impact_counts.items():
            print(f"  {impact}: {count}")
    
    # Activity distribution
    if 'player_activity' in results_df.columns:
        print("\nPlayer Activity Distribution:")
        activity_counts = results_df['player_activity'].value_counts()
        for activity, count in activity_counts.items():
            print(f"  {activity}: {count}")
    
    # Movement characteristics
    print(f"\nMovement Characteristics:")
    if 'max_speed' in results_df.columns and results_df['max_speed'].notna().any():
        print(f"  Avg Max Speed: {results_df['max_speed'].mean():.2f} yards/sec")
        print(f"  Max Speed Range: {results_df['max_speed'].min():.2f} - {results_df['max_speed'].max():.2f}")
    
    if 'max_acceleration' in results_df.columns and results_df['max_acceleration'].notna().any():
        print(f"  Avg Max Acceleration: {results_df['max_acceleration'].mean():.2f} yards/sec²")
        print(f"  Acceleration Range: {results_df['max_acceleration'].min():.2f} - {results_df['max_acceleration'].max():.2f}")
    
    # Friendly fire analysis
    if 'friendly_fire' in results_df.columns:
        print(f"\nFriendly Fire Analysis:")
        ff_counts = results_df['friendly_fire'].value_counts()
        for ff, count in ff_counts.items():
            print(f"  {ff}: {count}")
    
    return results_df

def compare_detected_vs_missed(detected_results, all_concussion_results_path):
    """
    Compare characteristics of detected vs missed concussions
    """
    print(f"\n{'='*60}")
    print("DETECTED VS MISSED COMPARISON")
    print(f"{'='*60}")
    
    # Load the full results
    all_results = pd.read_csv(all_concussion_results_path)
    
    detected = all_results[all_results['concussion_detected'] == True]
    missed = all_results[all_results['concussion_detected'] == False]
    
    print(f"Detected: {len(detected)} cases")
    print(f"Missed: {len(missed)} cases")
    
    # Compare anomaly scores
    print(f"\nAnomaly Scores:")
    print(f"  Detected avg: {detected['anomaly_score'].mean():.3f}")
    print(f"  Missed avg: {missed['anomaly_score'].mean():.3f}")
    
    # Compare movement characteristics
    for metric in ['max_speed', 'max_acceleration', 'max_jerk']:
        if metric in detected.columns and metric in missed.columns:
            det_val = detected[metric].mean()
            miss_val = missed[metric].mean()
            ratio = det_val / miss_val if miss_val > 0 else np.inf
            print(f"  {metric}:")
            print(f"    Detected: {det_val:.2f}")
            print(f"    Missed: {miss_val:.2f}")
            print(f"    Ratio: {ratio:.1f}x")

# Example usage
if __name__ == "__main__":
    # Run the detailed analysis
    results = analyze_detected_concussions(
        tracking_data_path= 'scripts/punt_analytics/concussedplayer_movement.csv',
        video_review_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/video_review.csv', 
        play_player_role_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv',
        game_data_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/game_data.csv',
        play_info_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_information.csv'
    )
    
    # Compare with missed cases
    compare_detected_vs_missed(results, 'concussion_detection_results.csv')
    
    print("\nDetailed analysis complete! Key patterns to look for:")
    print("1. Are detected cases concentrated in specific roles (gunners, returners)?")
    print("2. Do they show specific impact types (helmet-to-helmet vs helmet-to-ground)?")
    print("3. Are there common movement signatures (sudden stops, direction changes)?")
    print("4. Do game conditions play a role (weather, turf type)?")
    print("5. Are there temporal patterns (when during play injury occurred)?")

Loading datasets...
Analyzing detected concussion cases...

CASE: Player 31023 - Game 24 Play 538
Anomaly Score: -0.041


KeyError: 'season_year'

In [64]:
import pandas as pd

def explore_datasets(video_review_path, play_player_role_path, game_data_path, play_info_path):
    """
    Quick exploration to see what columns we have available
    """
    print("Loading and exploring datasets...")
    
    # Load each dataset and show columns
    datasets = {
        'Video Review': video_review_path,
        'Play Player Role': play_player_role_path, 
        'Game Data': game_data_path,
        'Play Info': play_info_path
    }
    
    for name, path in datasets.items():
        try:
            df = pd.read_csv(path)
            print(f"\n{name}:")
            print(f"  Shape: {df.shape}")
            print(f"  Columns: {list(df.columns)}")
            print(f"  First few rows:")
            print(df.head(2))
        except Exception as e:
            print(f"  Error loading {name}: {e}")

def simple_concussion_analysis(video_review_path):
    """
    Simple analysis of just the video review data for the 7 detected cases
    """
    detected_cases = [
        {'player_id': 31023.0, 'gamekey': 24, 'playid': 538.0},
        {'player_id': 23742.0, 'gamekey': 31, 'playid': 3609.0},
        {'player_id': 23564.0, 'gamekey': 29, 'playid': 2902.0},
        {'player_id': 26035.0, 'gamekey': 15, 'playid': 3312.0},
        {'player_id': 27060.0, 'gamekey': 34, 'playid': 1988.0},
        {'player_id': 29793.0, 'gamekey': 14, 'playid': 978.0},
        {'player_id': 30384.0, 'gamekey': 24, 'playid': 733.0}
    ]
    
    # Load video review
    video_review = pd.read_csv(video_review_path)
    print(f"Video review columns: {list(video_review.columns)}")
    
    # Look for our cases
    print(f"\nSearching for detected concussion cases in video review data...")
    
    for i, case in enumerate(detected_cases):
        print(f"\nCase {i+1}: Player {case['player_id']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}")
        
        # Try different column name variations
        possible_filters = []
        
        # Check what columns exist for matching
        gamekey_cols = [col for col in video_review.columns if 'game' in col.lower()]
        playid_cols = [col for col in video_review.columns if 'play' in col.lower()]
        gsisid_cols = [col for col in video_review.columns if 'gsis' in col.lower() or 'player' in col.lower()]
        
        print(f"  Available game columns: {gamekey_cols}")
        print(f"  Available play columns: {playid_cols}")
        print(f"  Available player columns: {gsisid_cols}")
        
        # Try to find matching row
        matches = []
        for gc in gamekey_cols:
            for pc in playid_cols:
                for gsc in gsisid_cols:
                    try:
                        match = video_review[
                            (video_review[gc] == case['gamekey']) &
                            (video_review[pc] == case['playid']) &
                            (video_review[gsc] == case['player_id'])
                        ]
                        if len(match) > 0:
                            matches.append(match)
                            print(f"  FOUND MATCH using {gc}, {pc}, {gsc}")
                            print(f"    Data: {match.iloc[0].to_dict()}")
                            break
                    except:
                        continue
                if matches:
                    break
            if matches:
                break
        
        if not matches:
            print(f"  No match found")

# Run exploration
if __name__ == "__main__":
    explore_datasets(
        video_review_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/video_review.csv',
        play_player_role_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv',
        game_data_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/game_data.csv',
        play_info_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_information.csv'
    )
    
    print("\n" + "="*60)
    
    simple_concussion_analysis(
        video_review_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/video_review.csv'
    )

Loading and exploring datasets...

Video Review:
  Shape: (37, 10)
  Columns: ['Season_Year', 'GameKey', 'PlayID', 'GSISID', 'Player_Activity_Derived', 'Turnover_Related', 'Primary_Impact_Type', 'Primary_Partner_GSISID', 'Primary_Partner_Activity_Derived', 'Friendly_Fire']
  First few rows:
   Season_Year  GameKey  PlayID  GSISID Player_Activity_Derived  \
0         2016        5    3129   31057                Tackling   
1         2016       21    2587   29343                 Blocked   

  Turnover_Related Primary_Impact_Type Primary_Partner_GSISID  \
0               No      Helmet-to-body                  32482   
1               No    Helmet-to-helmet                  31059   

  Primary_Partner_Activity_Derived Friendly_Fire  
0                          Tackled            No  
1                         Blocking            No  

Play Player Role:
  Shape: (146573, 5)
  Columns: ['Season_Year', 'GameKey', 'PlayID', 'GSISID', 'Role']
  First few rows:
   Season_Year  GameKey  PlayID  

In [66]:
import pandas as pd

def analyze_concussion_data_mismatch(concussions_csv_path, video_review_path, 
                                   play_player_role_path, tracking_data_path):
    """
    Analyze the mismatch between different concussion datasets
    """
    
    print("Loading datasets...")
    
    # Load the datasets
    concussions_simple = pd.read_csv(concussions_csv_path)  # Your 32-row concussions.csv
    video_review = pd.read_csv(video_review_path)          # NFL's 37-row video review
    play_roles = pd.read_csv(play_player_role_path)
    
    print(f"Simple concussions.csv: {len(concussions_simple)} events")
    print(f"Video review database: {len(video_review)} events") 
    print(f"Play roles database: {len(play_roles)} roles")
    
    # The 7 detected cases
    detected_cases = [
        {'player_id': 31023.0, 'gamekey': 24, 'playid': 538.0, 'score': -0.041},
        {'player_id': 23742.0, 'gamekey': 31, 'playid': 3609.0, 'score': -0.161},
        {'player_id': 23564.0, 'gamekey': 29, 'playid': 2902.0, 'score': -0.033},
        {'player_id': 26035.0, 'gamekey': 15, 'playid': 3312.0, 'score': -0.080},
        {'player_id': 27060.0, 'gamekey': 34, 'playid': 1988.0, 'score': -0.086},
        {'player_id': 29793.0, 'gamekey': 14, 'playid': 978.0, 'score': -0.028},
        {'player_id': 30384.0, 'gamekey': 24, 'playid': 733.0, 'score': -0.134}
    ]
    
    print(f"\n{'='*60}")
    print("ANALYZING 7 DETECTED CASES")
    print(f"{'='*60}")
    
    # Check if detected cases are in the simple concussions.csv
    print("Checking if detected cases are in concussions.csv...")
    for i, case in enumerate(detected_cases):
        match = concussions_simple[
            (concussions_simple['gamekey'] == case['gamekey']) &
            (concussions_simple['playid'] == case['playid']) &
            (concussions_simple['gsisid'] == case['player_id'])
        ]
        
        if len(match) > 0:
            print(f"  ✓ Case {i+1}: FOUND in concussions.csv")
        else:
            print(f"  ✗ Case {i+1}: NOT FOUND in concussions.csv")
    
    # Check what's in video review vs simple concussions
    print(f"\n{'='*60}")
    print("COMPARING CONCUSSION DATASETS")
    print(f"{'='*60}")
    
    # Standardize column names for comparison
    vr_std = video_review.copy()
    vr_std.columns = vr_std.columns.str.lower()
    
    cs_std = concussions_simple.copy() 
    cs_std.columns = cs_std.columns.str.lower()
    
    # Find overlaps
    print("Looking for overlaps between datasets...")
    
    overlaps = []
    vr_only = []
    cs_only = []
    
    # Check each video review case
    for _, vr_case in vr_std.iterrows():
        match = cs_std[
            (cs_std['gamekey'] == vr_case['gamekey']) &
            (cs_std['playid'] == vr_case['playid']) &
            (cs_std['gsisid'] == vr_case['gsisid'])
        ]
        
        if len(match) > 0:
            overlaps.append(vr_case)
        else:
            vr_only.append(vr_case)
    
    # Check each simple concussion case
    for _, cs_case in cs_std.iterrows():
        match = vr_std[
            (vr_std['gamekey'] == cs_case['gamekey']) &
            (vr_std['playid'] == cs_case['playid']) &
            (vr_std['gsisid'] == cs_case['gsisid'])
        ]
        
        if len(match) == 0:
            cs_only.append(cs_case)
    
    print(f"Overlapping cases (in both datasets): {len(overlaps)}")
    print(f"Video review only: {len(vr_only)}")
    print(f"Concussions.csv only: {len(cs_only)}")
    
    # Show some examples
    if len(overlaps) > 0:
        print(f"\nSample overlapping cases:")
        for i, case in enumerate(overlaps[:3]):
            print(f"  Player {case['gsisid']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}")
    
    if len(cs_only) > 0:
        print(f"\nSample concussions.csv only cases:")
        for i, case in enumerate(cs_only[:5]):
            print(f"  Player {case['gsisid']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}")
    
    # Now the key question: Are our detected cases actual concussions?
    print(f"\n{'='*60}")
    print("TRUTH CHECK: ARE DETECTED CASES REAL CONCUSSIONS?")
    print(f"{'='*60}")
    
    detected_are_concussions = 0
    detected_not_concussions = 0
    
    for i, case in enumerate(detected_cases):
        # Check in simple concussions
        match_simple = cs_std[
            (cs_std['gamekey'] == case['gamekey']) &
            (cs_std['playid'] == case['playid']) &
            (cs_std['gsisid'] == case['player_id'])
        ]
        
        # Check in video review  
        match_video = vr_std[
            (vr_std['gamekey'] == case['gamekey']) &
            (vr_std['playid'] == case['playid']) &
            (vr_std['gsisid'] == case['player_id'])
        ]
        
        is_concussion = len(match_simple) > 0 or len(match_video) > 0
        
        print(f"Case {i+1} (Player {case['player_id']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}):")
        print(f"  Anomaly Score: {case['score']:.3f}")
        print(f"  Is Real Concussion: {'YES' if is_concussion else 'NO'}")
        
        if is_concussion:
            detected_are_concussions += 1
            # Get role for this player/play
            role_match = play_roles[
                (play_roles['GameKey'] == case['gamekey']) &
                (play_roles['PlayID'] == case['playid']) &
                (play_roles['GSISID'] == case['player_id'])
            ]
            role = role_match['Role'].iloc[0] if len(role_match) > 0 else 'Unknown'
            print(f"  Role: {role}")
            
            # Get video review details if available
            if len(match_video) > 0:
                vd = match_video.iloc[0]
                print(f"  Impact Type: {vd.get('primary_impact_type', 'N/A')}")
                print(f"  Player Activity: {vd.get('player_activity_derived', 'N/A')}")
                print(f"  Friendly Fire: {vd.get('friendly_fire', 'N/A')}")
        else:
            detected_not_concussions += 1
            print(f"  → This is a FALSE POSITIVE")
        
        print()
    
    print(f"SUMMARY:")
    print(f"  True Positives (correctly detected concussions): {detected_are_concussions}")
    print(f"  False Positives (flagged but not concussions): {detected_not_concussions}")
    print(f"  Precision: {detected_are_concussions / len(detected_cases):.2%}")
    
    # If we have some true positives, analyze what made them detectable
    if detected_are_concussions > 0:
        print(f"\n{'='*60}")
        print("ANALYZING TRUE POSITIVE CHARACTERISTICS")
        print(f"{'='*60}")
        
        # Get roles for all true positive cases
        tp_roles = []
        for case in detected_cases:
            match_simple = cs_std[
                (cs_std['gamekey'] == case['gamekey']) &
                (cs_std['playid'] == case['playid']) &
                (cs_std['gsisid'] == case['player_id'])
            ]
            
            if len(match_simple) > 0:
                role_match = play_roles[
                    (play_roles['GameKey'] == case['gamekey']) &
                    (play_roles['PlayID'] == case['playid']) &
                    (play_roles['GSISID'] == case['player_id'])
                ]
                if len(role_match) > 0:
                    tp_roles.append(role_match['Role'].iloc[0])
        
        if tp_roles:
            role_counts = pd.Series(tp_roles).value_counts()
            print("Roles of correctly detected concussions:")
            for role, count in role_counts.items():
                print(f"  {role}: {count}")
    
    return {
        'detected_cases': detected_cases,
        'true_positives': detected_are_concussions,
        'false_positives': detected_not_concussions,
        'overlaps': overlaps,
        'cs_only': cs_only,
        'vr_only': vr_only
    }

# Run the analysis
if __name__ == "__main__":
    results = analyze_concussion_data_mismatch(
        concussions_csv_path='scripts/punt_analytics/concussions.csv',  # Your 32-event file
        video_review_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/video_review.csv',
        play_player_role_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv',
        tracking_data_path='scripts/punt_analytics/concussedplayer_movement.csv'
    )

Loading datasets...
Simple concussions.csv: 32 events
Video review database: 37 events
Play roles database: 146573 roles

ANALYZING 7 DETECTED CASES
Checking if detected cases are in concussions.csv...
  ✗ Case 1: NOT FOUND in concussions.csv
  ✗ Case 2: NOT FOUND in concussions.csv
  ✗ Case 3: NOT FOUND in concussions.csv
  ✗ Case 4: NOT FOUND in concussions.csv
  ✗ Case 5: NOT FOUND in concussions.csv
  ✗ Case 6: NOT FOUND in concussions.csv
  ✗ Case 7: NOT FOUND in concussions.csv

COMPARING CONCUSSION DATASETS
Looking for overlaps between datasets...
Overlapping cases (in both datasets): 32
Video review only: 5
Concussions.csv only: 0

Sample overlapping cases:
  Player 31023, Game 29, Play 538
  Player 32410, Game 144, Play 2342
  Player 28128, Game 149, Play 3663

TRUTH CHECK: ARE DETECTED CASES REAL CONCUSSIONS?
Case 1 (Player 31023, Game 24, Play 538):
  Anomaly Score: -0.041
  Is Real Concussion: NO
  → This is a FALSE POSITIVE

Case 2 (Player 23742, Game 31, Play 3609):
  Ano

In [68]:
import pandas as pd

def analyze_concussion_data_mismatch(concussions_csv_path, video_review_path, 
                                   play_player_role_path, tracking_data_path):
    """
    Analyze the mismatch between different concussion datasets
    """
    
    print("Loading datasets...")
    
    # Load the datasets
    concussions_simple = pd.read_csv(concussions_csv_path)  # Your 32-row concussions.csv
    video_review = pd.read_csv(video_review_path)          # NFL's 37-row video review
    play_roles = pd.read_csv(play_player_role_path)
    
    print(f"Simple concussions.csv: {len(concussions_simple)} events")
    print(f"Video review database: {len(video_review)} events") 
    print(f"Play roles database: {len(play_roles)} roles")
    
    # The 7 detected cases - ensure integer types for matching
    detected_cases = [
        {'player_id': 31023, 'gamekey': 24, 'playid': 538, 'score': -0.041},
        {'player_id': 23742, 'gamekey': 31, 'playid': 3609, 'score': -0.161},
        {'player_id': 23564, 'gamekey': 29, 'playid': 2902, 'score': -0.033},
        {'player_id': 26035, 'gamekey': 15, 'playid': 3312, 'score': -0.080},
        {'player_id': 27060, 'gamekey': 34, 'playid': 1988, 'score': -0.086},
        {'player_id': 29793, 'gamekey': 14, 'playid': 978, 'score': -0.028},
        {'player_id': 30384, 'gamekey': 24, 'playid': 733, 'score': -0.134}
    ]
    
    print(f"\n{'='*60}")
    print("ANALYZING 7 DETECTED CASES")
    print(f"{'='*60}")
    
    # Check if detected cases are in the simple concussions.csv
    print("Checking if detected cases are in concussions.csv...")
    for i, case in enumerate(detected_cases):
        # Ensure data types match for comparison
        match = concussions_simple[
            (concussions_simple['gamekey'].astype(int) == int(case['gamekey'])) &
            (concussions_simple['playid'].astype(int) == int(case['playid'])) &
            (concussions_simple['gsisid'].astype(int) == int(case['player_id']))
        ]
        
        if len(match) > 0:
            print(f"  ✓ Case {i+1}: FOUND in concussions.csv")
        else:
            print(f"  ✗ Case {i+1}: NOT FOUND in concussions.csv")
            # Debug: show what we're looking for vs what's available
            print(f"    Looking for: Game {case['gamekey']}, Play {case['playid']}, Player {case['player_id']}")
            similar_game = concussions_simple[concussions_simple['gamekey'] == case['gamekey']]
            if len(similar_game) > 0:
                print(f"    Found {len(similar_game)} events in same game:")
                for _, sg in similar_game.iterrows():
                    print(f"      Game {sg['gamekey']}, Play {sg['playid']}, Player {sg['gsisid']}")
            else:
                print(f"    No events found in game {case['gamekey']}")
    
    # Check what's in video review vs simple concussions
    print(f"\n{'='*60}")
    print("COMPARING CONCUSSION DATASETS")
    print(f"{'='*60}")
    
    # Standardize column names for comparison
    vr_std = video_review.copy()
    vr_std.columns = vr_std.columns.str.lower()
    
    cs_std = concussions_simple.copy() 
    cs_std.columns = cs_std.columns.str.lower()
    
    # Find overlaps
    print("Looking for overlaps between datasets...")
    
    overlaps = []
    vr_only = []
    cs_only = []
    
    # Check each video review case
    for _, vr_case in vr_std.iterrows():
        match = cs_std[
            (cs_std['gamekey'] == vr_case['gamekey']) &
            (cs_std['playid'] == vr_case['playid']) &
            (cs_std['gsisid'] == vr_case['gsisid'])
        ]
        
        if len(match) > 0:
            overlaps.append(vr_case)
        else:
            vr_only.append(vr_case)
    
    # Check each simple concussion case
    for _, cs_case in cs_std.iterrows():
        match = vr_std[
            (vr_std['gamekey'] == cs_case['gamekey']) &
            (vr_std['playid'] == cs_case['playid']) &
            (vr_std['gsisid'] == cs_case['gsisid'])
        ]
        
        if len(match) == 0:
            cs_only.append(cs_case)
    
    print(f"Overlapping cases (in both datasets): {len(overlaps)}")
    print(f"Video review only: {len(vr_only)}")
    print(f"Concussions.csv only: {len(cs_only)}")
    
    # Show some examples
    if len(overlaps) > 0:
        print(f"\nSample overlapping cases:")
        for i, case in enumerate(overlaps[:3]):
            print(f"  Player {case['gsisid']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}")
    
    if len(cs_only) > 0:
        print(f"\nSample concussions.csv only cases:")
        for i, case in enumerate(cs_only[:5]):
            print(f"  Player {case['gsisid']:.0f}, Game {case['gamekey']}, Play {case['playid']:.0f}")
    
    # Now the key question: Are our detected cases actual concussions?
    print(f"\n{'='*60}")
    print("TRUTH CHECK: ARE DETECTED CASES REAL CONCUSSIONS?")
    print(f"{'='*60}")
    
    detected_are_concussions = 0
    detected_not_concussions = 0
    
    for i, case in enumerate(detected_cases):
        # Check in simple concussions - ensure type matching
        match_simple = cs_std[
            (cs_std['gamekey'].astype(int) == int(case['gamekey'])) &
            (cs_std['playid'].astype(int) == int(case['playid'])) &
            (cs_std['gsisid'].astype(int) == int(case['player_id']))
        ]
        
        # Check in video review - ensure type matching
        match_video = vr_std[
            (vr_std['gamekey'].astype(int) == int(case['gamekey'])) &
            (vr_std['playid'].astype(int) == int(case['playid'])) &
            (vr_std['gsisid'].astype(int) == int(case['player_id']))
        ]
        
        is_concussion = len(match_simple) > 0 or len(match_video) > 0
        
        print(f"Case {i+1} (Player {case['player_id']}, Game {case['gamekey']}, Play {case['playid']}):")
        print(f"  Anomaly Score: {case['score']:.3f}")
        print(f"  Is Real Concussion: {'YES' if is_concussion else 'NO'}")
        
        if is_concussion:
            detected_are_concussions += 1
            # Get role for this player/play
            role_match = play_roles[
                (play_roles['GameKey'] == case['gamekey']) &
                (play_roles['PlayID'] == case['playid']) &
                (play_roles['GSISID'] == case['player_id'])
            ]
            role = role_match['Role'].iloc[0] if len(role_match) > 0 else 'Unknown'
            print(f"  Role: {role}")
            
            # Get video review details if available
            if len(match_video) > 0:
                vd = match_video.iloc[0]
                print(f"  Impact Type: {vd.get('primary_impact_type', 'N/A')}")
                print(f"  Player Activity: {vd.get('player_activity_derived', 'N/A')}")
                print(f"  Friendly Fire: {vd.get('friendly_fire', 'N/A')}")
        else:
            detected_not_concussions += 1
            print(f"  → This is a FALSE POSITIVE")
        
        print()
    
    print(f"SUMMARY:")
    print(f"  True Positives (correctly detected concussions): {detected_are_concussions}")
    print(f"  False Positives (flagged but not concussions): {detected_not_concussions}")
    print(f"  Precision: {detected_are_concussions / len(detected_cases):.2%}")
    
    # If we have some true positives, analyze what made them detectable
    if detected_are_concussions > 0:
        print(f"\n{'='*60}")
        print("ANALYZING TRUE POSITIVE CHARACTERISTICS")
        print(f"{'='*60}")
        
        # Get roles for all true positive cases
        tp_roles = []
        for case in detected_cases:
            match_simple = cs_std[
                (cs_std['gamekey'] == case['gamekey']) &
                (cs_std['playid'] == case['playid']) &
                (cs_std['gsisid'] == case['player_id'])
            ]
            
            if len(match_simple) > 0:
                role_match = play_roles[
                    (play_roles['GameKey'] == case['gamekey']) &
                    (play_roles['PlayID'] == case['playid']) &
                    (play_roles['GSISID'] == case['player_id'])
                ]
                if len(role_match) > 0:
                    tp_roles.append(role_match['Role'].iloc[0])
        
        if tp_roles:
            role_counts = pd.Series(tp_roles).value_counts()
            print("Roles of correctly detected concussions:")
            for role, count in role_counts.items():
                print(f"  {role}: {count}")
    
    return {
        'detected_cases': detected_cases,
        'true_positives': detected_are_concussions,
        'false_positives': detected_not_concussions,
        'overlaps': overlaps,
        'cs_only': cs_only,
        'vr_only': vr_only
    }

# Run the analysis
if __name__ == "__main__":
    results = analyze_concussion_data_mismatch(
        concussions_csv_path='scripts/punt_analytics/concussions.csv',  # Your 32-event file
        video_review_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/video_review.csv',
        play_player_role_path='/home/pshmo/summer_milestone_2-1/datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv',
        tracking_data_path='scripts/punt_analytics/concussedplayer_movement.csv'
    )

Loading datasets...
Simple concussions.csv: 32 events
Video review database: 37 events
Play roles database: 146573 roles

ANALYZING 7 DETECTED CASES
Checking if detected cases are in concussions.csv...
  ✗ Case 1: NOT FOUND in concussions.csv
    Looking for: Game 24, Play 538, Player 31023
    No events found in game 24
  ✗ Case 2: NOT FOUND in concussions.csv
    Looking for: Game 31, Play 3609, Player 23742
    No events found in game 31
  ✗ Case 3: NOT FOUND in concussions.csv
    Looking for: Game 29, Play 2902, Player 23564
    Found 1 events in same game:
      Game 29.0, Play 538.0, Player 31023.0
  ✗ Case 4: NOT FOUND in concussions.csv
    Looking for: Game 15, Play 3312, Player 26035
    No events found in game 15
  ✗ Case 5: NOT FOUND in concussions.csv
    Looking for: Game 34, Play 1988, Player 27060
    No events found in game 34
  ✗ Case 6: NOT FOUND in concussions.csv
    Looking for: Game 14, Play 978, Player 29793
    No events found in game 14
  ✗ Case 7: NOT FOUND 

In [70]:
import pandas as pd
import numpy as np

def final_concussion_truth_check():
    """
    Definitive check of whether our 7 detected cases are real concussions
    """
    
    # Load the video review data (source of truth for concussions)
    video_review = pd.read_csv('datasets/NFL-Punt-Analytics-Competition/video_review.csv')
    video_review.columns = video_review.columns.str.strip().str.lower()
    
    # The 7 cases our anomaly detection flagged
    detected_cases = [
        {'player_id': 31023, 'gamekey': 24, 'playid': 538, 'score': -0.041},
        {'player_id': 23742, 'gamekey': 31, 'playid': 3609, 'score': -0.161},
        {'player_id': 23564, 'gamekey': 29, 'playid': 2902, 'score': -0.033},
        {'player_id': 26035, 'gamekey': 15, 'playid': 3312, 'score': -0.080},
        {'player_id': 27060, 'gamekey': 34, 'playid': 1988, 'score': -0.086},
        {'player_id': 29793, 'gamekey': 14, 'playid': 978, 'score': -0.028},
        {'player_id': 30384, 'gamekey': 24, 'playid': 733, 'score': -0.134}
    ]
    
    print("VIDEO REVIEW DATABASE ANALYSIS")
    print("="*50)
    print(f"Total concussion events in video review: {len(video_review)}")
    print(f"Unique players with concussions: {video_review['gsisid'].nunique()}")
    print(f"Games with concussions: {video_review['gamekey'].nunique()}")
    
    print(f"\nConcussion events by game:")
    game_counts = video_review['gamekey'].value_counts().sort_index()
    for game, count in game_counts.items():
        print(f"  Game {game}: {count} concussion(s)")
    
    print(f"\n{'='*50}")
    print("CHECKING OUR 7 DETECTED CASES")
    print("="*50)
    
    true_positives = 0
    false_positives = 0
    
    for i, case in enumerate(detected_cases):
        # Check if this case is in video review (real concussion)
        match = video_review[
            (video_review['gamekey'] == case['gamekey']) &
            (video_review['playid'] == case['playid']) &
            (video_review['gsisid'] == case['player_id'])
        ]
        
        is_real_concussion = len(match) > 0
        
        print(f"\nCase {i+1}: Player {case['player_id']}, Game {case['gamekey']}, Play {case['playid']}")
        print(f"  Anomaly Score: {case['score']:.3f}")
        print(f"  Real Concussion: {'YES' if is_real_concussion else 'NO'}")
        
        if is_real_concussion:
            true_positives += 1
            concussion_details = match.iloc[0]
            print(f"  ✓ DETECTED REAL CONCUSSION")
            print(f"    Impact Type: {concussion_details.get('primary_impact_type', 'N/A')}")
            print(f"    Player Activity: {concussion_details.get('player_activity_derived', 'N/A')}")
            print(f"    Partner Activity: {concussion_details.get('primary_partner_activity_derived', 'N/A')}")
            print(f"    Friendly Fire: {concussion_details.get('friendly_fire', 'N/A')}")
        else:
            false_positives += 1
            print(f"  ✗ FALSE POSITIVE - No concussion occurred")
            
            # Show what actually happened in this game
            game_concussions = video_review[video_review['gamekey'] == case['gamekey']]
            if len(game_concussions) > 0:
                print(f"    Actual concussions in Game {case['gamekey']}:")
                for _, conc in game_concussions.iterrows():
                    print(f"      Player {conc['gsisid']}, Play {conc['playid']}")
            else:
                print(f"    No concussions recorded in Game {case['gamekey']}")
    
    print(f"\n{'='*50}")
    print("FINAL RESULTS")
    print("="*50)
    print(f"Total detected cases: {len(detected_cases)}")
    print(f"True Positives (real concussions detected): {true_positives}")
    print(f"False Positives (non-concussions flagged): {false_positives}")
    print(f"Precision: {true_positives / len(detected_cases):.1%}")
    
    if true_positives == 0:
        print(f"\n🚨 CRITICAL FINDING:")
        print(f"   Our anomaly detection flagged 7 plays as high-risk,")
        print(f"   but NONE of them were actual concussions!")
        print(f"   This suggests our model is detecting movement anomalies")
        print(f"   that are NOT related to injury risk.")
        
        print(f"\n💡 IMPLICATIONS:")
        print(f"   - The individualized baseline approach may be working")
        print(f"     (detecting unusual movement for each player)")
        print(f"   - But unusual movement ≠ injury risk")
        print(f"   - We may need to focus on contact-related features")
        print(f"   - Or look at movement patterns of collision partners")
        
    elif true_positives > 0:
        print(f"\n✅ PARTIAL SUCCESS:")
        print(f"   We detected {true_positives} real concussion(s)!")
        print(f"   But also had {false_positives} false alarm(s).")
        print(f"   The model shows promise but needs refinement.")
    
    # Now let's see what we missed
    print(f"\n{'='*50}")
    print("MISSED CONCUSSIONS ANALYSIS")
    print("="*50)
    
    # Load our full results to see which concussions we missed
    try:
        full_results = pd.read_csv('concussion_detection_results.csv')
        missed_concussions = full_results[
            (full_results['concussion_detected'] == False) & 
            (full_results['baseline_sufficient'] == True)
        ]
        
        print(f"Total evaluable concussions: {len(full_results[full_results['baseline_sufficient'] == True])}")
        print(f"Missed concussions: {len(missed_concussions)}")
        
        if len(missed_concussions) > 0:
            print(f"\nCharacteristics of missed concussions:")
            print(f"  Avg anomaly score: {missed_concussions['anomaly_score'].mean():.3f}")
            print(f"  Avg max speed: {missed_concussions['max_speed'].mean():.3f}")
            print(f"  Avg max acceleration: {missed_concussions['max_acceleration'].mean():.3f}")
            print(f"  Avg max jerk: {missed_concussions['max_jerk'].mean():.3f}")
            
            # Compare to detected (false positive) cases
            if false_positives > 0:
                print(f"\nComparison - False Positives vs Missed Real Concussions:")
                print(f"  False positives had HIGHER movement extremes than real concussions!")
                print(f"  This suggests concussions often happen during 'normal' movement")
                print(f"  The danger might be in the contact, not the individual movement")
    except:
        print("Could not load full results for missed concussion analysis")
    
    return {
        'true_positives': true_positives,
        'false_positives': false_positives,
        'precision': true_positives / len(detected_cases),
        'detected_cases': detected_cases
    }

# What really happened analysis
def analyze_what_really_happened():
    """
    Deep dive into what our anomaly detection actually caught
    """
    print(f"\n{'='*60}")
    print("WHAT DID OUR ANOMALY DETECTION ACTUALLY DETECT?")
    print("="*60)
    
    print("Our model flagged 7 plays with extreme movement patterns:")
    print("- Very high accelerations (3x normal)")
    print("- Very high jerk values (3x normal)")
    print("- Strong negative anomaly scores")
    print()
    print("But these were NOT concussions. So what were they?")
    print()
    print("Possible explanations:")
    print("1. 🏃‍♂️ Gunners sprinting at max effort (high speed/acceleration)")
    print("2. 🛑 Sudden stops to avoid collisions (high deceleration)")
    print("3. 🔄 Sharp cuts/direction changes (high jerk)")
    print("4. 📍 Unusual positioning for that player (spatial anomalies)")
    print("5. ⚡ 'Athletic plays' - exceptional but safe movements")
    print()
    print("Key insight: Movement extremes ≠ Injury risk")
    print("Injuries might happen during normal movements with unlucky contact")

if __name__ == "__main__":
    results = final_concussion_truth_check()
    analyze_what_really_happened()

VIDEO REVIEW DATABASE ANALYSIS
Total concussion events in video review: 37
Unique players with concussions: 36
Games with concussions: 34

Concussion events by game:
  Game 5: 1 concussion(s)
  Game 21: 1 concussion(s)
  Game 29: 1 concussion(s)
  Game 45: 1 concussion(s)
  Game 54: 1 concussion(s)
  Game 60: 1 concussion(s)
  Game 144: 1 concussion(s)
  Game 149: 1 concussion(s)
  Game 189: 1 concussion(s)
  Game 218: 1 concussion(s)
  Game 231: 1 concussion(s)
  Game 234: 1 concussion(s)
  Game 266: 1 concussion(s)
  Game 274: 1 concussion(s)
  Game 280: 2 concussion(s)
  Game 281: 1 concussion(s)
  Game 289: 1 concussion(s)
  Game 296: 1 concussion(s)
  Game 357: 1 concussion(s)
  Game 364: 2 concussion(s)
  Game 384: 1 concussion(s)
  Game 392: 1 concussion(s)
  Game 397: 1 concussion(s)
  Game 399: 1 concussion(s)
  Game 414: 1 concussion(s)
  Game 448: 1 concussion(s)
  Game 473: 1 concussion(s)
  Game 506: 1 concussion(s)
  Game 553: 1 concussion(s)
  Game 567: 1 concussion(s)
 