In [1]:
import pandas as pd

pd.options.mode.chained_assignment = None

import numpy as np

In [2]:
def create_ngs_df(data_dir="data", weeks=range(1, 9)):
    """Load the weekly tracking CSVs and stack them into one DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the ``week<N>.csv`` files. Defaults to ``"data"``.
    weeks : iterable of int, optional
        Week numbers to load, in the order they should be stacked.
        Defaults to weeks 1 through 8.

    Returns
    -------
    pandas.DataFrame
        The rows of every requested week, stacked in ``weeks`` order. The
        per-file row index is kept as-is (it is not reset), so index labels
        repeat across weeks.

    Raises
    ------
    ValueError
        If ``weeks`` is empty.
    """
    weekly_frames = []
    for position, week in enumerate(weeks):
        week_df = pd.read_csv(f"{data_dir}/week{week}.csv", engine='c')
        if position == 0:
            # Only the first file's shape is reported, not the combined shape.
            print(f"dataframe shape = {week_df.shape}")
        weekly_frames.append(week_df)

    if not weekly_frames:
        raise ValueError("weeks must contain at least one week number")

    # Concatenate once: calling pd.concat inside the loop re-copies all rows
    # accumulated so far on every iteration.
    return pd.concat(weekly_frames)


In [4]:
# Load the three supporting tables in one pass; the order of the paths
# matches the order of the names on the left-hand side.
pff, plays, games = (
    pd.read_csv(csv_path)
    for csv_path in ("data/pffScoutingData.csv", "data/plays.csv", "data/games.csv")
)

In [5]:
def filter_play_edges(ex_group):
    """Crop one play's tracking rows to the frames between snap and play end.

    The window starts at the first frame tagged with a ball-snap event and
    ends at the earliest frame tagged with one of the end conditions (pass
    thrown, sack, fumble, lateral, handoff, out of bounds). Both bounds are
    inclusive.

    Parameters
    ----------
    ex_group : pandas.DataFrame
        Rows of a single play; must provide ``frameId`` and ``event`` columns.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows inside the window. If no snap event
        is present the window starts at frame 0; if no end condition is
        present it runs to the last frame. An empty input yields an empty
        frame with the same columns.
    """
    # Nothing to crop, and taking max() of no frames would raise.
    if ex_group.empty:
        return ex_group.copy()

    ball_snap_events = ['autoevent_ballsnap', 'ball_snap']
    snap_frames = ex_group.loc[ex_group.event.isin(ball_snap_events), 'frameId'].values
    # Explicit "no snap found" fallback instead of a bare except, so that
    # unrelated errors (e.g. a missing column) are no longer silently hidden.
    ball_snap_frame_id = snap_frames[0] if len(snap_frames) else 0

    # filter out time after QB gets rid of ball or is sacked
    end_conditions = ['pass_forward', 'qb_sack', 'qb_strip_sack', 'fumble', 'lateral', 'handoff', 'out_of_bounds']
    end_frames = ex_group.loc[ex_group.event.isin(end_conditions), 'frameId'].values
    end_frame_id = end_frames.min() if len(end_frames) else ex_group.frameId.max()

    # crop play; copy so later column assignments act on an independent frame
    in_window = (ball_snap_frame_id <= ex_group.frameId) & (ex_group.frameId <= end_frame_id)
    return ex_group[in_window].copy()

def get_sack_info(play_df):
    """Report whether the play contains a sack and the frame where it occurs.

    The sack events are checked in priority order (``qb_sack`` first, then
    ``qb_strip_sack``); the first kind that appears in the play decides the
    result.

    Returns
    -------
    tuple
        ``(True, frame_id)`` with the first frame carrying that event, or
        ``(False, -1)`` when neither event is present.
    """
    events_seen = set(play_df.event.unique())
    for sack_event in ('qb_sack', 'qb_strip_sack'):
        if sack_event not in events_seen:
            continue
        matching_frames = play_df.loc[play_df.event == sack_event, 'frameId'].values
        return True, matching_frames[0]
    return False, -1

def adjust_play(play_df):
    """Add direction-normalised position and angle columns to a play.

    The play direction is read from the first distinct ``playDirection``
    value. For ``'left'`` plays x is kept and y is shifted by 26.65; for
    ``'right'`` plays x is mirrored about 120 and y about 53.3/2. Orientation
    and direction angles are rotated by 90 (left) or 270 (right) degrees,
    modulo 360. Rows whose team is ``'football'`` get ``adj_o``, ``adj_dir``
    and ``nflId`` set to 0.

    ``play_df`` is modified in place and also returned.
    """
    direction = play_df['playDirection'].unique()[0]

    if direction == 'left':
        new_x, new_y, turn = play_df['x'], play_df['y'] - 26.65, 90
    elif direction == 'right':
        new_x, new_y, turn = 120 - play_df['x'], (53.3/2) - play_df['y'], 270
    else:
        # Any other direction value: no adjusted columns are computed here.
        turn = None

    if turn is not None:
        play_df['adj_x'] = new_x
        play_df['adj_y'] = new_y
        play_df['adj_o'] = (play_df['o'] + turn) % 360
        play_df['adj_dir'] = (play_df['dir'] + turn) % 360

    # The ball's rows get zeroed angles and a zero id.
    ball_rows = play_df.team == 'football'
    for column in ('adj_o', 'adj_dir', 'nflId'):
        play_df.loc[ball_rows, column] = 0

    return play_df




In [6]:

def add_team_indicator(play_df, games_df, game_id):
    """Add a numeric ``team_indicator`` column to ``play_df``.

    The home and visitor abbreviations are looked up in ``games_df`` for
    ``game_id``. Each row is then coded as 1 (home team), 2 (visiting team),
    3 (the football) or 0 (anything else). ``play_df`` is modified in place
    and also returned.
    """
    matchup = games_df[games_df['gameId'] == game_id]
    home_abbr = matchup['homeTeamAbbr'].values[0]
    visitor_abbr = matchup['visitorTeamAbbr'].values[0]

    team_col = play_df.team
    play_df["team_indicator"] = np.select(
        [team_col == home_abbr, team_col == visitor_abbr, team_col == 'football'],
        [1, 2, 3],
    )
    return play_df


In [7]:
# Stack the weekly tracking files into a single table.
ngs_df = create_ngs_df()

# Distinct game ids, in order of first appearance in the tracking table.
all_games = ngs_df["gameId"].unique()

dataframe shape = (1118122, 16)


In [8]:
# Show every distinct event label found in the tracking table.
ngs_df["event"].unique()

array(['None', 'ball_snap', 'autoevent_passforward', 'pass_forward',
       'autoevent_ballsnap', 'line_set', 'play_action', 'pass_arrived',
       'autoevent_passinterrupted', 'fumble', 'fumble_offense_recovered',
       'qb_sack', 'run', 'man_in_motion', 'pass_outcome_caught',
       'pass_outcome_incomplete', 'pass_tipped', 'qb_strip_sack', 'shift',
       'first_contact', 'huddle_break_offense', 'lateral', 'handoff',
       'penalty_flag', 'tackle', 'dropped_pass', 'out_of_bounds'],
      dtype=object)

In [9]:
# Game table used to look up home/visitor abbreviations in add_team_indicator.
# NOTE(review): the same file is also read into `games` in an earlier cell;
# the two loads could be merged.
games_df = pd.read_csv("data/games.csv")

In [10]:
# Accumulators filled by the feature-extraction loop: one feature matrix and
# one target vector are appended for every frame that is kept.
features_list = []
target_list = []

In [11]:
# Columns copied into every per-frame feature matrix, in this order.
features_to_keep = ['gameId', 'playId', 'frameId', 'nflId', 'team_indicator', 'adj_x', 'adj_y', 's', 'a', 'adj_o', 'adj_dir']

# Start from empty accumulators so that re-running this cell rebuilds the
# lists instead of appending a second copy of every frame.
features_list = []
target_list = []

# Index the tracking rows by (gameId, playId) once, instead of scanning the
# whole tracking table with a query for every single play.
tracking_by_play = ngs_df.groupby(['gameId', 'playId'], sort=False)
tracked_play_keys = tracking_by_play.indices

for game_index, game_id in enumerate(all_games):
    all_play_ids = plays.query("gameId == @game_id").playId.values

    for play_id in all_play_ids:
        # A play listed in `plays` but absent from the tracking data has no
        # frames to extract; skip it rather than pass an empty frame along.
        if (game_id, play_id) not in tracked_play_keys:
            continue

        ngs_play = tracking_by_play.get_group((game_id, play_id))
        play_df = filter_play_edges(ngs_play)
        play_df = add_team_indicator(play_df, games_df, game_id)

        is_sack, sack_frame_id = get_sack_info(play_df)
        play_df = adjust_play(play_df)

        # for each frame, get our features and target
        # (sort=False keeps frames in order of first appearance)
        for frame_id, time_df in play_df.groupby('frameId', sort=False):

            # Only frames with exactly 23 rows are kept
            # (presumably 22 players plus the football - TODO confirm).
            if len(time_df) != 23:
                print(f"skipped at game {game_index} play {play_id} at time {frame_id}")
                continue

            feature_matrix = time_df[features_to_keep].values # (23, 11) array
            time_until_sack = -1
            if is_sack:
                # Frame difference divided by 10
                # (presumably 10 frames per second - TODO confirm).
                time_until_sack = (sack_frame_id - frame_id)/10
            target = np.array([is_sack, time_until_sack])

            features_list.append(feature_matrix)
            target_list.append(target)
            



In [12]:
# Turn the per-frame lists into arrays: one row per kept frame.
# Each target is a 2-element vector [is_sack, time_until_sack]; each feature
# entry is a 23-row matrix with one column per name in features_to_keep.
target_arr = np.array(target_list)
feature_arr = np.array(features_list)

In [13]:
import os
import pickle  # NOTE(review): not used in the visible cells

# np.save does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./cleaned_data", exist_ok=True)

# np.save appends the ".npy" extension to each of these names.
np.save("./cleaned_data/feature_arr_ids", feature_arr)
np.save("./cleaned_data/target_arr_ids", target_arr)

In [22]:
# Lift pandas' display limits so the following cells render every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [23]:
# Inspect the first 100 tracking rows of one play.
# NOTE(review): `game_id` and `play_id` are not set in this cell; they are
# whatever the feature-extraction loop left behind (its last iteration), so
# this only works after that loop has run.
ngs_df.query("gameId == @game_id and playId == @play_id").head(100)

Unnamed: 0,gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event,is_football
161184,2021091200,4367,33084.0,1,2021-09-12 19:53:51,2.0,ATL,left,89.47,26.02,0.44,1.35,0.04,311.53,76.52,,0
161185,2021091200,4367,33084.0,2,2021-09-12 19:53:51,2.0,ATL,left,89.53,26.03,0.69,1.57,0.06,311.53,78.13,,0
161186,2021091200,4367,33084.0,3,2021-09-12 19:53:52,2.0,ATL,left,89.61,26.05,0.93,1.66,0.09,314.0,78.77,,0
161187,2021091200,4367,33084.0,4,2021-09-12 19:53:52,2.0,ATL,left,89.71,26.06,1.13,1.59,0.1,315.3,81.78,,0
161188,2021091200,4367,33084.0,5,2021-09-12 19:53:52,2.0,ATL,left,89.84,26.08,1.36,1.57,0.13,315.3,81.63,,0
161189,2021091200,4367,33084.0,6,2021-09-12 19:53:52,2.0,ATL,left,90.0,26.11,1.61,1.55,0.16,314.05,81.13,,0
161190,2021091200,4367,33084.0,7,2021-09-12 19:53:52,2.0,ATL,left,90.17,26.14,1.81,1.45,0.18,314.05,81.49,,0
161191,2021091200,4367,33084.0,8,2021-09-12 19:53:52,2.0,ATL,left,90.36,26.16,1.98,1.24,0.19,314.05,81.88,,0
161192,2021091200,4367,33084.0,9,2021-09-12 19:53:52,2.0,ATL,left,90.57,26.2,2.12,1.04,0.21,311.62,81.34,,0
161193,2021091200,4367,33084.0,10,2021-09-12 19:53:52,2.0,ATL,left,90.78,26.24,2.22,0.82,0.22,309.32,79.87,,0
