In [1]:
import pandas as pd
import numpy as np

def load_and_normalize(path):
    """Load a CSV, strip and lowercase its column names."""
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    return df

# 1. LOAD & NORMALIZE
# -------------------
games            = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/game_data.csv')
play_info        = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_information.csv')
player_roles     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv')
player_positions = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/player_punt_data.csv')
video_review     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/video_review.csv')

# NGS chunks for 2016 & 2017
ngs_paths = [
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk13-17.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk13-17.csv',
]
ngs = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)


# 2. CLEAN & CAST
# ----------------

# Parse any ISO‐style dates in games & play_info
for df in (games, play_info):
    if 'game_date' in df.columns:
        df['game_date'] = pd.to_datetime(df['game_date'])

# Make sure keys are ints
for df in (player_roles, player_positions, video_review):
    for col in ('gamekey', 'playid', 'gsisid'):
        if col in df.columns:
            df[col] = df[col].astype(int)

# Parse NGS timestamps if present
if 'time' in ngs.columns:
    ngs['time'] = pd.to_datetime(ngs['time'])


# 3. MERGE BASE TABLE
# -------------------

# Start from each player’s role in each play
df = player_roles.copy()

# Merge play-level data
df = df.merge(
    play_info,
    on=['gamekey','playid'],
    how='left',
    validate='many_to_one'
)

# Merge game-level data
df = df.merge(
    games.drop(columns=['game_date'], errors='ignore'),
    on='gamekey',
    how='left',
    validate='many_to_one'
)

# Merge typical football position
if {'gamekey','gsisid','position'}.issubset(player_positions.columns):
    df = df.merge(
        player_positions[['gamekey','gsisid','position']],
        on=['gamekey','gsisid'],
        how='left'
    )


# 4. ADD VIDEO REVIEW AS FEATURES
# --------------------------------

# Select the video_review columns we want
video_feats = video_review[[
    'gamekey','playid','gsisid',
    'player_activity_derived',
    'turnover_related',
    'primary_impact_type',
    'primary_partner_activity_derived',
    'friendly_fire'
]]

# Merge them in
df = df.merge(
    video_feats,
    on=['gamekey','playid','gsisid'],
    how='left'
)

# Fill NaNs for non-injured rows
for c in [
    'player_activity_derived',
    'turnover_related',
    'primary_impact_type',
    'primary_partner_activity_derived',
    'friendly_fire'
]:
    df[c] = df[c].fillna('NoInjury')


# 5. BUILD TARGET—ANY INJURY
# --------------------------

inj = video_review[['gamekey','playid','gsisid']].copy()
inj['injury'] = 1

df = df.merge(inj, on=['gamekey','playid','gsisid'], how='left')
df['injury'] = df['injury'].fillna(0).astype(int)


# 6. AGGREGATE NGS INTO SUMMARY FEATURES
# ---------------------------------------

if {'gamekey','playid','gsisid','dis'}.issubset(ngs.columns):
    ngs_summary = (
        ngs
        .groupby(['gamekey','playid','gsisid'], as_index=False)
        .agg(
            total_distance = ('dis','sum'),
            max_step       = ('dis','max'),
            mean_step      = ('dis','mean'),
            n_timestamps   = ('time' if 'time' in ngs.columns else 'dis','count')
        )
    )
    df = df.merge(ngs_summary, on=['gamekey','playid','gsisid'], how='left')
else:
    df[['total_distance','max_step','mean_step','n_timestamps']] = 0


# 7. FINAL PREP FOR EDA & MODELING
# ---------------------------------

# Fill any remaining nulls in the summary stats
for col in ['total_distance','max_step','mean_step','n_timestamps']:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Define feature columns
feature_cols = [
    'position', 'role', 'season_year', 'week',
    'player_activity_derived',
    'turnover_related',
    'primary_impact_type',
    'primary_partner_activity_derived',
    'friendly_fire',
    'total_distance','max_step','mean_step','n_timestamps'
]
feature_cols = [c for c in feature_cols if c in df.columns]

# One‑hot encode categoricals
X = pd.get_dummies(df[feature_cols], drop_first=True)
y = df['injury']

# Quick check
print("X shape:", X.shape)
print("Injury prevalence:", y.mean())


  df = pd.read_csv(path)


X shape: (146573, 73)
Injury prevalence: 0.0002524339407667169


In [1]:
import pandas as pd
import numpy as np

def load_and_normalize(path):
    """Load a CSV, strip and lowercase its column names."""
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    return df

# 1. LOAD & NORMALIZE
# -------------------
games            = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/game_data.csv')
play_info        = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_information.csv')
player_roles     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv')
player_positions = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/player_punt_data.csv')
video_review     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/video_review.csv')

# NGS chunks for 2016 & 2017
ngs_paths = [
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2016-reg-wk13-17.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-pre.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-post.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk1-6.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk7-12.csv',
    'datasets/NFL-Punt-Analytics-Competition/NGS-2017-reg-wk13-17.csv',
]
ngs = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)

  df = pd.read_csv(path)


In [1]:
import pandas as pd
import numpy as np

def load_and_normalize(path):
    """Load a CSV, strip and lowercase its column names."""
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip().str.lower()
    return df

# 1. LOAD & NORMALIZE
# -------------------
games            = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/game_data.csv')
play_info        = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_information.csv')
player_roles     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/play_player_role_data.csv')
player_positions = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/player_punt_data.csv')
video_review     = load_and_normalize('datasets/NFL-Punt-Analytics-Competition/video_review.csv')

In [24]:
play_info.query('season_year == 2016 & gamekey == 5')

Unnamed: 0,season_year,season_type,gamekey,game_date,week,playid,game_clock,yardline,quarter,play_type,poss_team,home_team_visit_team,score_home_visiting,playdescription
26,2016,Pre,5,08/11/2016,2,575,06:59,CHI 49,1,Punt,CHI,CHI-DEN,0 - 7,(6:59) (Punt formation) P.O'Donnell punts 31 y...
27,2016,Pre,5,08/11/2016,2,821,03:23,CHI 34,1,Punt,CHI,CHI-DEN,0 - 7,(3:23) (Punt formation) P.O'Donnell punts 46 y...
28,2016,Pre,5,08/11/2016,2,933,01:51,DEN 31,1,Punt,DEN,CHI-DEN,0 - 7,(1:51) (Punt formation) B.Colquitt punts 51 ya...
29,2016,Pre,5,08/11/2016,2,1044,00:06,CHI 12,1,Punt,CHI,CHI-DEN,0 - 7,(:06) (Punt formation) P.O'Donnell punts 54 ya...
30,2016,Pre,5,08/11/2016,2,1333,11:40,CHI 20,2,Punt,CHI,CHI-DEN,0 - 10,(11:40) (Punt formation) P.O'Donnell punts 52 ...
31,2016,Pre,5,08/11/2016,2,1440,10:17,DEN 41,2,Punt,DEN,CHI-DEN,0 - 10,(10:17) (Punt formation) R.Dixon punts 46 yard...
32,2016,Pre,5,08/11/2016,2,2046,01:57,CHI 16,2,Punt,CHI,CHI-DEN,0 - 19,(1:57) (Punt formation) P.O'Donnell punt is BL...
33,2016,Pre,5,08/11/2016,2,2564,13:24,CHI 28,3,Punt,CHI,CHI-DEN,0 - 20,(13:24) (Punt formation) P.O'Donnell punts 42 ...
34,2016,Pre,5,08/11/2016,2,2680,11:14,DEN 45,3,Punt,DEN,CHI-DEN,0 - 20,(11:14) (Punt formation) B.Colquitt punts 45 y...
35,2016,Pre,5,08/11/2016,2,3129,03:44,CHI 31,3,Punt,CHI,CHI-DEN,0 - 22,(3:44) (Punt formation) P.O'Donnell punts 58 y...


In [25]:
ngs_1 = pd.read_csv('datasets/NFL-Punt-Analytics-Competition/NGS-2016-pre.csv')

In [26]:
video_review.head()

Unnamed: 0,season_year,gamekey,playid,gsisid,player_activity_derived,turnover_related,primary_impact_type,primary_partner_gsisid,primary_partner_activity_derived,friendly_fire
0,2016,5,3129,31057,Tackling,No,Helmet-to-body,32482,Tackled,No
1,2016,21,2587,29343,Blocked,No,Helmet-to-helmet,31059,Blocking,No
2,2016,29,538,31023,Tackling,No,Helmet-to-body,31941,Tackled,No
3,2016,45,1212,33121,Tackling,No,Helmet-to-body,28249,Tackled,No
4,2016,54,1045,32444,Blocked,No,Helmet-to-body,31756,Blocked,Yes


In [27]:
ngs_1.head()

Unnamed: 0,Season_Year,GameKey,PlayID,GSISID,Time,x,y,dis,o,dir,Event
0,2016,6,3236,28943.0,2016-08-13 01:38:02.900,39.669998,30.690001,0.06,308.0,267.940002,
1,2016,6,3236,32366.0,2016-08-13 01:38:03.000,39.57,28.950001,0.02,24.4,284.350006,
2,2016,6,3236,31810.0,2016-08-13 01:38:03.000,39.740002,47.209999,0.0,15.76,15.42,
3,2016,6,3236,32331.0,2016-08-13 01:38:03.000,40.369999,29.969999,0.02,13.57,246.490005,
4,2016,6,3236,28932.0,2016-08-13 01:38:03.000,39.330002,28.02,0.09,324.890015,230.100006,


In [29]:
ngs_1.query('GameKey == 21')['PlayID'].unique()

array([1238,  519,  606])