In [1]:
import numpy as np
import pandas as pd

def load_and_normalize(path, **read_csv_kwargs):
    """Load a CSV file and normalize its column names.

    Parameters
    ----------
    path : str, path-like or file-like
        Passed as the first argument to ``pandas.read_csv``.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pandas.read_csv``
        (for example ``dtype=...``, ``usecols=...``, ``nrows=...`` or
        ``low_memory=False``). Options keyed by column name must use the
        names exactly as they appear in the file, because normalization
        happens only after the file has been read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame whose column names have had surrounding whitespace
        stripped and have been lower-cased.
    """
    df = pd.read_csv(path, **read_csv_kwargs)
    # Normalize headers so later code can rely on one spelling per column.
    df.columns = df.columns.str.strip().str.lower()
    return df


# NGS chunks for 2016 & 2017.
# The dataset directory is spelled out once; every file name below is derived
# from it, so relocating the data is a one-line change.
NGS_DATA_DIR = '/Users/petershmorhun/Documents/GitHub/summer_milestone_2/datasets/NFL-Punt-Analytics-Competition'
ngs_paths = [
    f'{NGS_DATA_DIR}/NGS-{season}-{part}.csv'
    for season in (2016, 2017)
    for part in ('pre', 'post', 'reg-wk1-6', 'reg-wk7-12', 'reg-wk13-17')
]
df = pd.concat([load_and_normalize(p) for p in ngs_paths], ignore_index=True)

# Compact dtypes for the numeric columns only.
# 'time' and 'event' are intentionally left as loaded: casting a column that
# contains missing values with astype('str') turns each missing entry into the
# literal text 'nan', which then no longer registers as missing.
ndtypes = {
    'gamekey': 'int16',
    'playid': 'int16',
    'gsisid': 'float32',  # float because the column still contains NaN here
    'x': 'float32',
    'y': 'float32',
    'dis': 'float32',
    'o': 'float32',
}

df = df.astype(ndtypes)
# Rows without a player id cannot be grouped per player, so drop them.
df = df.dropna(subset=['gsisid'])

# Silence pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): process_motion_chunk works on an explicit .copy() of its input,
# so this global filter may no longer be needed — confirm before removing.
import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

def process_motion_chunk(chunk_df: pd.DataFrame,
                         metrics=('dis', 'x', 'y', 'o'),
                         n_slices: int = 10) -> pd.DataFrame:
    """Summarize tracking samples into a fixed number of time slices per player and play.

    For every (season_year, gamekey, playid, gsisid) group the samples are
    binned into ``n_slices`` equal-width intervals of the time elapsed since
    the group's first sample, each metric is averaged within each interval,
    and the result is pivoted to one row per group.

    Parameters
    ----------
    chunk_df : pandas.DataFrame
        Must contain the columns 'season_year', 'gamekey', 'playid', 'gsisid',
        'time' (anything ``pandas.to_datetime`` can parse) and every column
        named in ``metrics``. The frame is copied; the caller's object is
        not modified.
    metrics : sequence of str, default ('dis', 'x', 'y', 'o')
        Numeric columns to average within each time slice.
    n_slices : int, default 10
        Number of equal-width time intervals per group.

    Returns
    -------
    pandas.DataFrame
        Indexed by (season_year, gamekey, playid, gsisid) with one column per
        metric and slice, named ``'{metric}_slice_{i}'``. All
        ``len(metrics) * n_slices`` columns are always present, even when no
        sample in the chunk falls into a given slice. Gaps are filled by
        linear interpolation across each row's slice columns, one metric at
        a time.
    """
    # Work on a private copy so the caller's frame is never modified.
    chunk_df = chunk_df.copy()

    group_keys = ['season_year', 'gamekey', 'playid', 'gsisid']
    slice_labels = [f'slice_{i}' for i in range(n_slices)]

    # Convert to datetime
    chunk_df['time'] = pd.to_datetime(chunk_df['time'])

    # Seconds elapsed since the first sample of the same player on the same
    # play. The built-in 'min' transform avoids one Python call per group.
    first_sample = chunk_df.groupby(group_keys)['time'].transform('min')
    chunk_df['relative_time'] = (chunk_df['time'] - first_sample).dt.total_seconds()

    # Equal-width time bins, computed separately for each group so that every
    # group's own duration is divided into n_slices intervals.
    # NOTE(review): a group whose samples all share one timestamp has a zero
    # time range; confirm which slice pd.cut assigns it to is acceptable.
    chunk_df['time_decile'] = chunk_df.groupby(group_keys)['relative_time'].transform(
        lambda x: pd.cut(x, bins=n_slices, labels=slice_labels)
    )

    # Pivot to wide: one row per group, one column per (metric, slice).
    motion_wide = chunk_df.pivot_table(
        index=group_keys,
        columns='time_decile',
        values=list(metrics),
        aggfunc='mean',
        observed=True
    )

    # Flatten the (metric, slice) column MultiIndex into 'metric_slice_i'.
    motion_wide.columns = [f'{metric}_{time}' for metric, time in motion_wide.columns]

    # observed=True omits slices that received no samples anywhere in the
    # chunk; restore them as NaN columns so the selection below cannot raise
    # KeyError and the output schema is identical for every chunk.
    expected_cols = [f'{metric}_{label}' for metric in sorted(metrics) for label in slice_labels]
    motion_wide = motion_wide.reindex(columns=expected_cols)

    # Fill missing values by interpolating across time slices
    for metric in metrics:
        metric_cols = [f'{metric}_{label}' for label in slice_labels]
        motion_wide[metric_cols] = motion_wide[metric_cols].interpolate(axis=1, method='linear')

    return motion_wide

# Build the wide motion table batch by batch (10 games per batch) so that only
# one batch of tracking rows is pivoted at a time.
unique_games = df['gamekey'].unique()
motion_wide_list = []

for i in range(0, len(unique_games), 10):
    game_chunk = unique_games[i:i+10]
    in_chunk = df['gamekey'].isin(game_chunk)
    chunk_df = df.loc[in_chunk]

    motion_wide_chunk = process_motion_chunk(chunk_df)
    motion_wide_list.append(motion_wide_chunk)

    print(f"Processed {i+len(game_chunk)} of {len(unique_games)} games")

# Stack the per-batch tables into a single frame.
motion_wide = pd.concat(motion_wide_list)

print(len(motion_wide), motion_wide.shape)

# Turn the (season_year, gamekey, playid, gsisid) index back into columns so
# it can serve as the merge key.
motion_wide = motion_wide.reset_index()

revs = load_and_normalize('/Users/petershmorhun/Documents/GitHub/summer_milestone_2/datasets/NFL-Punt-Analytics-Competition/video_review.csv')
df_final = pd.merge(
    motion_wide,
    revs,
    how='left',
    on=['season_year', 'gamekey', 'playid', 'gsisid'],
)

# A row is flagged 1 when the left merge brought in a non-null
# 'player_activity_derived' value from the review file, otherwise 0.
df_final['concussed'] = df_final['player_activity_derived'].notna().astype(int)

# Keep only rows that have a value for the first time slice.
df_final = df_final.dropna(subset=['dis_slice_0'])

  df = pd.read_csv(path)


Processed 10 of 632 games
Processed 20 of 632 games
Processed 30 of 632 games
Processed 40 of 632 games
Processed 50 of 632 games
Processed 60 of 632 games
Processed 70 of 632 games
Processed 80 of 632 games
Processed 90 of 632 games
Processed 100 of 632 games
Processed 110 of 632 games
Processed 120 of 632 games
Processed 130 of 632 games
Processed 140 of 632 games
Processed 150 of 632 games
Processed 160 of 632 games
Processed 170 of 632 games


  chunk_df['time'] = pd.to_datetime(chunk_df['time'])


Processed 180 of 632 games
Processed 190 of 632 games
Processed 200 of 632 games
Processed 210 of 632 games
Processed 220 of 632 games
Processed 230 of 632 games
Processed 240 of 632 games
Processed 250 of 632 games
Processed 260 of 632 games
Processed 270 of 632 games
Processed 280 of 632 games
Processed 290 of 632 games
Processed 300 of 632 games
Processed 310 of 632 games
Processed 320 of 632 games
Processed 330 of 632 games
Processed 340 of 632 games
Processed 350 of 632 games
Processed 360 of 632 games
Processed 370 of 632 games
Processed 380 of 632 games
Processed 390 of 632 games
Processed 400 of 632 games
Processed 410 of 632 games
Processed 420 of 632 games
Processed 430 of 632 games
Processed 440 of 632 games
Processed 450 of 632 games
Processed 460 of 632 games
Processed 470 of 632 games
Processed 480 of 632 games
Processed 490 of 632 games
Processed 500 of 632 games
Processed 510 of 632 games
Processed 520 of 632 games
Processed 530 of 632 games
Processed 540 of 632 games
P

In [5]:
# Inspect the schema of the merged frame: the four key columns, the per-slice
# motion features, the video-review fields and the derived 'concussed' flag.
print(df_final.columns)

Index(['season_year', 'gamekey', 'playid', 'gsisid', 'dis_slice_0',
       'dis_slice_1', 'dis_slice_2', 'dis_slice_3', 'dis_slice_4',
       'dis_slice_5', 'dis_slice_6', 'dis_slice_7', 'dis_slice_8',
       'dis_slice_9', 'o_slice_0', 'o_slice_1', 'o_slice_2', 'o_slice_3',
       'o_slice_4', 'o_slice_5', 'o_slice_6', 'o_slice_7', 'o_slice_8',
       'o_slice_9', 'x_slice_0', 'x_slice_1', 'x_slice_2', 'x_slice_3',
       'x_slice_4', 'x_slice_5', 'x_slice_6', 'x_slice_7', 'x_slice_8',
       'x_slice_9', 'y_slice_0', 'y_slice_1', 'y_slice_2', 'y_slice_3',
       'y_slice_4', 'y_slice_5', 'y_slice_6', 'y_slice_7', 'y_slice_8',
       'y_slice_9', 'player_activity_derived', 'turnover_related',
       'primary_impact_type', 'primary_partner_gsisid',
       'primary_partner_activity_derived', 'friendly_fire', 'concussed'],
      dtype='object')
