In [1]:
import pandas as pd
import numpy as np

def load_and_normalize(path, **read_csv_kwargs):
    """Load a CSV and normalize its column names.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything ``pd.read_csv`` accepts as its first argument.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``dtype=...`` or ``low_memory=False``). With no extra
        arguments the call is identical to ``pd.read_csv(path)``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every column label stripped of surrounding
        whitespace and lowercased.
    """
    df = pd.read_csv(path, **read_csv_kwargs)
    # Coerce labels to str first so the .str accessor also works when the
    # header is not textual (e.g. header=None yields integer labels).
    df.columns = df.columns.astype(str).str.strip().str.lower()
    return df

# 1. LOAD & NORMALIZE
# -------------------
# Single place to edit when the dataset moves: every file below is resolved
# against this directory.
DATA_DIR = '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/datasets/NFL-Punt-Analytics-Competition'

games            = load_and_normalize(f'{DATA_DIR}/game_data.csv')
play_info        = load_and_normalize(f'{DATA_DIR}/play_information.csv')
player_roles     = load_and_normalize(f'{DATA_DIR}/play_player_role_data.csv')
player_positions = load_and_normalize(f'{DATA_DIR}/player_punt_data.csv')
video_review     = load_and_normalize(f'{DATA_DIR}/video_review.csv')

# NGS chunks for 2016 & 2017.
# File names follow NGS-<season>-<part>.csv; the nesting order below yields
# the chunks in the order 2016 pre/post/reg..., then 2017 pre/post/reg...
NGS_SEASONS = (2016, 2017)
NGS_PARTS = ('pre', 'post', 'reg-wk1-6', 'reg-wk7-12', 'reg-wk13-17')
ngs_paths = [
    f'{DATA_DIR}/NGS-{season}-{part}.csv'
    for season in NGS_SEASONS
    for part in NGS_PARTS
]

# Stack all chunks into one frame with a fresh 0..n-1 index.
ngs = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)

  df = pd.read_csv(path)


In [2]:
# Preview the first five rows of the combined NGS frame.
ngs.head(n=5)

Unnamed: 0,season_year,gamekey,playid,gsisid,time,x,y,dis,o,dir,event
0,2016,6,3236,28943.0,2016-08-13 01:38:02.900,39.669998,30.690001,0.06,308.0,267.940002,
1,2016,6,3236,32366.0,2016-08-13 01:38:03.000,39.57,28.950001,0.02,24.4,284.350006,
2,2016,6,3236,31810.0,2016-08-13 01:38:03.000,39.740002,47.209999,0.0,15.76,15.42,
3,2016,6,3236,32331.0,2016-08-13 01:38:03.000,40.369999,29.969999,0.02,13.57,246.490005,
4,2016,6,3236,28932.0,2016-08-13 01:38:03.000,39.330002,28.02,0.09,324.890015,230.100006,


In [None]:
# Group the NGS rows by season, game, play and player. This only builds the
# GroupBy object; no aggregation is applied in this cell.
ngs_grouped = ngs.groupby(['season_year', 'gamekey', 'playid', 'gsisid'])

In [None]:
# 2. CLEAN & CAST
# ----------------

# Parse any ISO-style dates in games & play_info.
# The loop variable is deliberately not called `df`: at notebook scope that
# name is the merged base table built in the MERGE BASE TABLE cell, and
# re-running this cell would silently rebind it to one of the raw tables.
for table in (games, play_info):
    if 'game_date' in table.columns:
        table['game_date'] = pd.to_datetime(table['game_date'])

# Make sure keys are ints so the merge keys share one dtype across tables.
for table in (player_roles, player_positions, video_review):
    for col in ('gamekey', 'playid', 'gsisid'):
        if col in table.columns:
            # astype(int) raises if the column contains NaN or non-numeric text.
            table[col] = table[col].astype(int)

# Parse NGS timestamps if present
if 'time' in ngs.columns:
    ngs['time'] = pd.to_datetime(ngs['time'])

In [None]:
# 2. CLEAN & CAST
# ----------------
# NOTE(review): this cell repeats the CLEAN & CAST cell directly above it.
# Every step is idempotent (parsing parsed dates / casting ints again is a
# no-op), so running both is harmless; consider deleting one of them.

# Parse any ISO-style dates in games & play_info.
# The loop variable is deliberately not called `df`: at notebook scope that
# name is the merged base table built in the MERGE BASE TABLE cell, and
# re-running this cell would silently rebind it to one of the raw tables.
for table in (games, play_info):
    if 'game_date' in table.columns:
        table['game_date'] = pd.to_datetime(table['game_date'])

# Make sure keys are ints so the merge keys share one dtype across tables.
for table in (player_roles, player_positions, video_review):
    for col in ('gamekey', 'playid', 'gsisid'):
        if col in table.columns:
            # astype(int) raises if the column contains NaN or non-numeric text.
            table[col] = table[col].astype(int)

# Parse NGS timestamps if present
if 'time' in ngs.columns:
    ngs['time'] = pd.to_datetime(ngs['time'])

In [None]:
# 3. MERGE BASE TABLE
# -------------------

def _merge_new_columns(left, right, keys, validate=None):
    """Left-join ``right`` onto ``left`` without duplicating shared columns.

    Only the join keys plus the columns ``left`` does not already have are
    taken from ``right``. pandas therefore never has to disambiguate a shared
    name with ``_x`` / ``_y`` suffixes, and columns keep the plain names that
    later cells look up. For a column present on both sides, the value already
    in ``left`` is kept.

    Parameters
    ----------
    left, right : pandas.DataFrame
    keys : str or list of str
        Join key column name(s); must exist in both frames.
    validate : str, optional
        Forwarded to ``DataFrame.merge`` (e.g. ``'many_to_one'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per row of ``left`` (given a unique ``right``).
    """
    keys = [keys] if isinstance(keys, str) else list(keys)
    new_cols = [c for c in right.columns if c not in left.columns]
    return left.merge(right[keys + new_cols], on=keys, how='left', validate=validate)


review_keys = ['gamekey', 'playid', 'gsisid']
review_cols = [
    'player_activity_derived',
    'turnover_related',
    'primary_impact_type',
    'primary_partner_activity_derived',
    'friendly_fire',
]

# Start from each player's role in each play
df = player_roles.copy()

# Merge play-level data
df = _merge_new_columns(df, play_info, ['gamekey', 'playid'], validate='many_to_one')

# Merge game-level data (game_date is dropped up front, as before)
df = _merge_new_columns(
    df,
    games.drop(columns=['game_date'], errors='ignore'),
    'gamekey',
    validate='many_to_one',
)

# Merge typical football position.
# NOTE(review): if player_positions lacks any of these three columns the
# merge is skipped silently and 'position' never reaches the feature list.
# Confirm the table's schema.
if {'gamekey', 'gsisid', 'position'}.issubset(player_positions.columns):
    # De-duplicate on the join keys so a player listed more than once cannot
    # fan out rows of the base table (the first listed position is kept).
    positions = (
        player_positions[['gamekey', 'gsisid', 'position']]
        .drop_duplicates(subset=['gamekey', 'gsisid'])
    )
    df = df.merge(positions, on=['gamekey', 'gsisid'], how='left', validate='many_to_one')

# One row per reviewed (game, play, player); guards both merges below
# against row duplication.
video_unique = video_review.drop_duplicates(subset=review_keys)

# Select the video_review columns we want
video_feats = video_unique[review_keys + review_cols]

# Merge them in
df = df.merge(video_feats, on=review_keys, how='left', validate='many_to_one')

# Fill NaNs for non-injured rows.
# Note: this also fills values that are missing inside video_review itself.
for c in review_cols:
    df[c] = df[c].fillna('NoInjury')

# 4. BUILD TARGET - ANY INJURY
# ----------------------------
# A player-play is positive when it appears in video_review.
inj = video_unique[review_keys].assign(injury=1)

df = df.merge(inj, on=review_keys, how='left', validate='many_to_one')
df['injury'] = df['injury'].fillna(0).astype(int)

# Every merge above is many-to-one, so the base table must still have exactly
# one row per player role.
assert len(df) == len(player_roles), "merges changed the number of player-play rows"

In [None]:
# Preview the first five rows of the merged base table.
df.head(n=5)

In [None]:
# Preview the first five rows of the NGS frame.
ngs.head(n=5)

In [None]:
# Per-player, per-play movement summary derived from the NGS `dis` column.
ngs_keys = ['gamekey', 'playid', 'gsisid']
summary_cols = ['total_distance', 'max_step', 'mean_step', 'n_timestamps']

# Make the cell safe to re-run: summary columns left over from a previous
# execution would otherwise collide with the new ones and come out of the
# merge as *_x / *_y, leaving the plain names missing for later cells.
df = df.drop(columns=summary_cols, errors='ignore')

if set(ngs_keys + ['dis']).issubset(ngs.columns):
    # Count rows via 'time' when available, otherwise fall back to 'dis'.
    count_source = 'time' if 'time' in ngs.columns else 'dis'
    ngs_summary = (
        ngs
        .groupby(ngs_keys, as_index=False)
        .agg(
            total_distance=('dis', 'sum'),
            max_step=('dis', 'max'),
            mean_step=('dis', 'mean'),
            n_timestamps=(count_source, 'count'),
        )
    )
    # groupby output has one row per key combination, so this cannot fan out.
    df = df.merge(ngs_summary, on=ngs_keys, how='left', validate='many_to_one')
else:
    # No usable tracking data: provide the columns anyway, all zeros.
    df[summary_cols] = 0

In [None]:
# Fill any remaining nulls in the summary stats
for col in ['total_distance', 'max_step', 'mean_step', 'n_timestamps']:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Define feature columns
wanted_features = [
    'position', 'role', 'season_year', 'week',
    'player_activity_derived',
    'turnover_related',
    'primary_impact_type',
    'primary_partner_activity_derived',
    'friendly_fire',
    'total_distance', 'max_step', 'mean_step', 'n_timestamps'
]

# Only columns that actually exist in df can be used. Anything filtered out is
# reported so that a renamed or never-merged column does not vanish unnoticed.
feature_cols = [c for c in wanted_features if c in df.columns]
missing_features = [c for c in wanted_features if c not in df.columns]
if missing_features:
    print("Skipped feature columns not present in df:", missing_features)

In [None]:
# Target first: the binary injury flag straight from the base table.
y = df.loc[:, 'injury']

# Design matrix: one-hot encode the selected feature columns, dropping the
# first level of each encoded column.
X = pd.get_dummies(data=df[feature_cols], drop_first=True)

# Sanity report: matrix dimensions and share of positive rows.
print("▶️  Data prep complete!")
print("   X shape:", X.shape)
print("   Injury prevalence:", y.mean())

In [None]:
# Number of missing values per feature column.
X.isnull().sum()

In [None]:
# 1) Collapse detailed punt roles into coarse groups
mapping = {
  'PR':'Returner',
  'GL':'Gunner','GR':'Gunner',
  'PLW':'Wing','PLG':'Wing','PRG':'Wing','PLL':'Wing',
  # …etc…
}
# Roles missing from `mapping` would map to NaN. With drop_first=True below,
# their all-zero dummy row would be indistinguishable from the dropped
# baseline group, so they get an explicit 'Other' bucket instead.
df['role_group'] = df['role'].map(mapping).fillna('Other')

# 2) Drop all the old one-hot role_ columns.
# The 'role_' prefix also matches 'role_group_*' columns from an earlier run
# of this cell, so re-running does not stack duplicate columns.
old_roles = [c for c in X.columns if c.startswith('role_')]
X = X.drop(columns=old_roles)

# 3) Add one-hot encoding for the role groups (first level dropped)
role_group_dummies = pd.get_dummies(df['role_group'], prefix='role_group', drop_first=True)
# Both sides are re-indexed 0..n-1 so the concat aligns rows by position.
X = pd.concat([X.reset_index(drop=True), role_group_dummies.reset_index(drop=True)], axis=1)

print("New feature set:")
print(X.filter(like='role_group_').columns)

In [None]:
# Dummy columns derived from video_review fields are recognised by their
# column-name prefix and removed from the feature matrix.
leak_prefixes = (
    'player_activity_derived_',
    'turnover_related_',
    'primary_impact_type_',
    'primary_partner_activity_derived_',
    'friendly_fire_',
)
# str.startswith accepts a tuple and matches if any prefix matches.
leak_cols = [name for name in X.columns if name.startswith(leak_prefixes)]

X_nomap = X.drop(columns=leak_cols)

In [None]:
# Preview the first five rows of the leak-free feature matrix.
X_nomap.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Single place to edit when the output location changes.
OUTPUT_DIR = '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/scripts/punt_analytics'

# 80/20 split, stratified on the target so both parts keep the same injury
# rate; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_nomap, y, test_size=0.2, random_state=42, stratify=y
)

# Write each part to <OUTPUT_DIR>/<name>.csv without the index column.
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for split_name, split in splits.items():
    split.to_csv(f'{OUTPUT_DIR}/{split_name}.csv', index=False)