In [2]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [2]:
''' 
Read in data files
'''

pff = pd.read_csv("~/Documents/Python/nfl-big-data-bowl-2023/data/pffScoutingData.csv")
plays = pd.read_csv("~/Documents/Python/nfl-big-data-bowl-2023/data/plays.csv")
games = pd.read_csv("~/Documents/Python/nfl-big-data-bowl-2023/data/games.csv")
games_df = pd.read_csv("~/Documents/Python/nfl-big-data-bowl-2023/data/games.csv")

In [3]:
def create_ngs_df():

    ngs_df = pd.read_csv("~/Documents/Python/nfl-big-data-bowl-2023/data/week1.csv", engine='c')
    print(f"dataframe shape = {ngs_df.shape}")

    for i in range(2,9):
        ngs_df = pd.concat([ngs_df,pd.read_csv(f"~/Documents/Python/nfl-big-data-bowl-2023/data/week{i}.csv", engine='c') ])
        
    # add column 'is_football'
    #ngs_df['is_football'] = ((ngs_df.team == 'football').values).astype(int)
    return ngs_df


In [4]:
# get frameId where ball is snapped and filter out plays before snap
def filter_play_edges(ex_group):
    ball_snap_events = ['autoevent_ballsnap', 'ball_snap']
    try:
        ball_snap_frame_id = ex_group.query("event in @ball_snap_events").frameId.values[0]
    except:
        ball_snap_frame_id = 0
    # filter out time after QB gets rid of ball or is sacked
    end_conditions = ['pass_forward', 'qb_sack', 'qb_strip_sack', 'fumble', 'lateral', 'handoff', 'out_of_bounds']
    end_frame_id = min(ex_group.query("event in @end_conditions").frameId.values, default=max(ex_group.frameId.values))
    # crop play
    play_df = ex_group[(ball_snap_frame_id <= ex_group.frameId) & (ex_group.frameId <= end_frame_id)]
    
    return play_df

# get if there was a sack
def get_sack_info(play_df):
    sack_list = ['qb_sack', 'qb_strip_sack']
    play_events = play_df.event.unique()
    is_sack = False
    sack_frame_id = -1
    for item in sack_list:
        if item in play_events:
            is_sack = True
            sack_frame_id = play_df.query("event == @item").frameId.values[0]
            break
    return is_sack, sack_frame_id

# adjust stats based on play Direction
def adjust_play(play_df):
    playDirection = play_df['playDirection'].unique()[0]
    if playDirection == 'left':
        play_df['adj_x'] = play_df['x']
        play_df['adj_y'] = play_df['y'] - 26.65
        play_df['adj_o'] = (play_df['o'] + 90) % 360
        play_df['adj_dir'] = (play_df['dir'] + 90) % 360
    if playDirection == 'right':
        play_df['adj_x'] = 120 - play_df['x']
        play_df['adj_y'] = (53.3/2) - play_df['y']
        play_df['adj_o'] = (play_df['o'] + 270) % 360
        play_df['adj_dir'] = (play_df['dir'] + 270) % 360
    
    play_df.loc[play_df.team == 'football', 'adj_o'] = 0
    play_df.loc[play_df.team == 'football', 'adj_dir'] = 0
    play_df.loc[play_df.team == 'football', 'nflId'] = 0
    
    return play_df




In [5]:

def add_team_indicator(play_df, games_df, game_id):
    game_df = games_df.query("gameId == @game_id")
    home_team = game_df['homeTeamAbbr'].values[0]
    away_team = game_df['visitorTeamAbbr'].values[0]
    
    conditions = [
        (play_df.team == home_team),
        (play_df.team == away_team),
        (play_df.team == 'football')
    ]
    choices = [1,2,3]
    play_df["team_indicator"] = np.select(conditions, choices)
    return play_df


In [6]:
ngs_df = create_ngs_df()

all_games = ngs_df.gameId.unique()

dataframe shape = (1118122, 16)


In [7]:
ngs_df.event.unique()

array(['None', 'ball_snap', 'autoevent_passforward', 'pass_forward',
       'autoevent_ballsnap', 'line_set', 'play_action', 'pass_arrived',
       'autoevent_passinterrupted', 'fumble', 'fumble_offense_recovered',
       'qb_sack', 'run', 'man_in_motion', 'pass_outcome_caught',
       'pass_outcome_incomplete', 'pass_tipped', 'qb_strip_sack', 'shift',
       'first_contact', 'huddle_break_offense', 'lateral', 'handoff',
       'penalty_flag', 'tackle', 'dropped_pass', 'out_of_bounds'],
      dtype=object)

In [8]:
# 203 should be fine

MAX_PLAY_LENGTH = ngs_df.frameId.max()

In [9]:
ngs_df.query("frameId > 170").playId.unique()

array([3406, 1143, 1547, 2734,  923, 2855])

In [10]:
features_list = []
target_list = []

In [11]:
features_to_keep = ['gameId', 'playId', 'frameId', 'nflId', 'team_indicator', 'adj_x', 'adj_y', 's', 'a', 'adj_o', 'adj_dir']

for game_index, game_id in enumerate(all_games):
    all_play_ids = plays.query("gameId == @game_id").playId.values
    
    for play_id in all_play_ids:
        ngs_play = ngs_df.query("gameId == @game_id and playId == @play_id")
        play_df = filter_play_edges(ngs_play)
        play_df = add_team_indicator(play_df, games_df, game_id)
        
        is_sack, sack_frame_id = get_sack_info(play_df)
        play_df = adjust_play(play_df)
        # for each frame, get our featuers and target
        all_frame_ids = play_df.frameId.unique()
        
        play_x_list = []
        play_y_list = []
        num_frames = len(all_frame_ids)
        for frame_id in all_frame_ids:
            
            time_df = play_df.query("frameId == @frame_id")
            
            feature_matrix = time_df[features_to_keep].values.flatten() # (23,11) -> array
            time_until_sack = -1
            if is_sack:
                time_until_sack = (sack_frame_id - frame_id)/10
                
            not_sack = abs(is_sack-1)
            target = np.array([not_sack, is_sack, time_until_sack]) # (3,1) array
            
            play_x_list.append(feature_matrix)
            play_y_list.append(target)
        
        # pad at beginning
        pad_list_x = []
        pad_list_y = []
        for i in range(MAX_PLAY_LENGTH - num_frames):
            pad_list_x.append(np.zeros(23*len(features_to_keep)))
            pad_list_y.append(np.zeros(3))
        
        
        features_list.append(pad_list_x+play_x_list)
        target_list.append(pad_list_y + play_y_list)
        
        


In [64]:
np.array(features_list).shape

(97, 203, 253)

In [12]:
target_arr = np.array(target_list)
feature_arr = np.array(features_list)

np.nan_to_num(feature_arr, copy=False, nan=0)
np.nan_to_num(target_arr, copy=False, nan=0)

array([[[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        ...,
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ]],

       [[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        ...,
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ]],

       [[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        ...,
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ]],

       ...,

       [[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        ...,
        [ 0. ,  1. ,  0.2],
        [ 0. ,  1. ,  0.1],
        [ 0. ,  1. ,  0. ]],

       [[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        ...,
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ],
        [ 1. ,  0. , -1. ]],

       [[ 0. ,  0. ,  0. ],
        [ 0. ,  0. ,  0. ],
        [ 0. ,  

In [13]:
import pickle

# np.save("./seq_data/feature_arr_ids", feature_arr)
# np.save("./seq_data/target_arr_ids", target_arr)

In [3]:
feature_arr = np.load("./seq_data/feature_arr_ids.npy")
target_arr = np.load("./seq_data/target_arr_ids.npy")

In [4]:
''' 
Create training, validation, and testing set
'''

print(target_arr.shape)

(8558, 203, 3)


In [5]:
#print(f"Shape of dataframe = {df.shape}")
#print(f"70% of length = {df.shape[0]*.7}")
#print(f"15% of length = {df.shape[0]*.15}")

# training set will start at. Pick these values so at the start of a new play
train_index_end = 6000
val_index_end = 7300

x_train = feature_arr[0:train_index_end, :, :]
x_val = feature_arr[train_index_end:val_index_end, :, :]
x_test = feature_arr[val_index_end:, :, :]

y_train = target_arr[0:train_index_end, :, :]
y_val = target_arr[train_index_end:val_index_end, :, :]
y_test = target_arr[val_index_end:, :, :]

In [6]:
# Confirm random seed functionality

# x = [2,3,4,5,6]
# y = [2,3,4,5,6]

# print("Setting seed once")
# np.random.seed(24)
# np.random.shuffle(x)
# np.random.shuffle(y)
# print(x)
# print(y)

# x = [2,3,4,5,6]
# y = [2,3,4,5,6]
# print("Setting seed multiple times")
# np.random.seed(24)
# np.random.shuffle(x)
# np.random.seed(24)
# np.random.shuffle(y)
# print(x)
# print(y)

# print(f"/n Need to reset seed every call")

In [7]:
print(x_train.shape)

(6000, 203, 253)


In [8]:
# Shuffle training set

np.random.seed(24)
np.random.shuffle(x_train)
np.random.seed(24)
np.random.shuffle(y_train)


In [7]:
# Standardize dataset - not doing it for small

# get mean/std of each feature. (N*23, 11) vector
train_mu = np.mean(x_train.reshape(-1,x_train.shape[-1]),axis=0)
train_std = np.std(x_train.reshape(-1,x_train.shape[-1]),axis=0)
train_std[train_std == 0] = 1  # so no divide by zero error

# don't standardize game, play, frame, nflid, or team_indicator features
x_train[:,:,5:] = (x_train[:,:,5:] - train_mu[5:])/train_std[5:]
x_val[:,:,5:] = (x_val[:,:,5:] - train_mu[5:])/train_std[5:]
x_test[:,:,5:] = (x_test[:,:,5:] - train_mu[5:])/train_std[5:]

In [9]:
print ("number of training examples = " + str(x_train.shape[0]))
print ("number of test examples = " + str(x_test.shape[0]))
print ("X_train shape: " + str(x_train.shape))
print ("Y_train shape: " + str(y_train.shape))
print ("X_val shape: " + str(x_val.shape))
print ("Y_val shape: " + str(y_val.shape))
print ("X_test shape: " + str(x_test.shape))
print ("Y_test shape: " + str(y_test.shape))

number of training examples = 6000
number of test examples = 1258
X_train shape: (6000, 203, 253)
Y_train shape: (6000, 203, 3)
X_val shape: (1300, 203, 253)
Y_val shape: (1300, 203, 3)
X_test shape: (1258, 203, 253)
Y_test shape: (1258, 203, 3)


In [10]:
# np.save("./seq_unnorm_data/x_train", x_train)
# np.save("./seq_unnorm_data/y_train", y_train)
# np.save("./seq_unnorm_data/x_val", x_val)
# np.save("./seq_unnorm_data/y_val", y_val)
# np.save("./seq_unnorm_data/x_test", x_test)
# np.save("./seq_unnorm_data/y_test", y_test)

In [9]:
np.save("./seq_data/x_train", x_train)
np.save("./seq_data/y_train", y_train)
np.save("./seq_data/x_val", x_val)
np.save("./seq_data/y_val", y_val)
np.save("./seq_data/x_test", x_test)
np.save("./seq_data/y_test", y_test)
np.save("./seq_data/train_mu", train_mu)
np.save("./seq_data/train_std", train_std)


In [None]:
''' 
Feature list:
['gameId', 'playId', 'frameId', 'nflId', 'team_indicator', 'adj_x', 'adj_y', 's', 'a', 'adj_o', 'adj_dir']
'''