In [2]:
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)


In [2]:
# Read in the raw data files (paths are relative to the notebook's working
# directory).
pff = pd.read_csv("data/pffScoutingData.csv")
plays = pd.read_csv("data/plays.csv")

# games.csv used to be parsed twice, once per name. Parse it once and give the
# second name its own copy so both existing names keep working independently.
games = pd.read_csv("data/games.csv")
games_df = games.copy()

In [3]:
def create_ngs_df(data_dir="data", weeks=range(1, 9)):
    """Load the weekly tracking CSVs and stack them into a single DataFrame.

    Parameters
    ----------
    data_dir : str, optional
        Directory that holds the ``week<N>.csv`` files. Defaults to ``"data"``.
    weeks : iterable of int, optional
        Week numbers to load, in the order given. Defaults to weeks 1-8.

    Returns
    -------
    pandas.DataFrame
        The rows of every requested week, stacked in order. Each file keeps
        its own row index (the index is not reset), so index labels repeat
        across weeks.

    Raises
    ------
    ValueError
        If ``weeks`` is empty.
    """
    week_numbers = list(weeks)
    if not week_numbers:
        raise ValueError("weeks must contain at least one week number")

    def _read_week(week):
        return pd.read_csv(f"{data_dir}/week{week}.csv", engine='c')

    weekly_frames = [_read_week(week_numbers[0])]
    # Same message as before: this is the shape of the FIRST week only,
    # reported before the remaining weeks are loaded.
    print(f"dataframe shape = {weekly_frames[0].shape}")

    weekly_frames.extend(_read_week(week) for week in week_numbers[1:])

    # Concatenate once; calling pd.concat inside the loop re-copies the
    # ever-growing frame on every iteration.
    return pd.concat(weekly_frames)


In [4]:
# find the frameId where the ball is snapped and drop the frames before the snap
def filter_play_edges(ex_group):
    """Crop one play's tracking rows to the window between snap and play end.

    The window starts at the frame of the first row tagged with a ball-snap
    event (frame 0 if the play has none) and ends at the earliest frame tagged
    with one of the end events (the play's last frame if it has none).

    Parameters
    ----------
    ex_group : pandas.DataFrame
        Tracking rows of a single play; must have ``frameId`` and ``event``
        columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose ``frameId`` lies inside the window
        (inclusive on both ends). An empty input yields an empty copy.
    """
    if ex_group.empty:
        # Nothing to crop; the original code crashed here in max().
        return ex_group.copy()

    ball_snap_events = ['autoevent_ballsnap', 'ball_snap']
    # events after which the QB no longer holds the ball (or the play is dead)
    end_conditions = ['pass_forward', 'qb_sack', 'qb_strip_sack', 'fumble', 'lateral', 'handoff', 'out_of_bounds']

    frame_ids = ex_group.frameId.values
    events = ex_group.event

    # Explicit emptiness checks replace the former bare `except:`, which hid
    # every kind of error (including KeyboardInterrupt) behind frame 0.
    snap_frames = frame_ids[events.isin(ball_snap_events).values]
    ball_snap_frame_id = snap_frames[0] if len(snap_frames) else 0

    end_frames = frame_ids[events.isin(end_conditions).values]
    end_frame_id = end_frames.min() if len(end_frames) else frame_ids.max()

    # crop play
    in_window = (ball_snap_frame_id <= frame_ids) & (frame_ids <= end_frame_id)

    # Return a copy: callers add columns to the result, which must not write
    # back into (or warn about) the frame it was sliced from.
    return ex_group[in_window].copy()

# get if there was a sack
def get_sack_info(play_df):
    """Report whether the play contains a sack event and at which frame.

    'qb_sack' is checked before 'qb_strip_sack'; the first of the two that
    occurs in the ``event`` column decides the result.

    Returns
    -------
    tuple
        ``(True, frame_id)`` with the frameId of the first row carrying the
        matching event, or ``(False, -1)`` when neither event is present.
    """
    frame_ids = play_df.frameId.values
    for sack_event in ('qb_sack', 'qb_strip_sack'):
        matching_frames = frame_ids[(play_df.event == sack_event).values]
        if len(matching_frames) > 0:
            return True, matching_frames[0]
    return False, -1

# adjust stats based on play Direction
def adjust_play(play_df):
    """Add direction-normalised position/angle columns to one play.

    Reads the play's direction from the first value of ``playDirection`` and
    adds ``adj_x``, ``adj_y``, ``adj_o`` and ``adj_dir``:

    * 'left':  x is kept, y is shifted by -26.65, angles are rotated by +90.
    * 'right': x is mirrored (120 - x), y is mirrored around 26.65, angles
      are rotated by +270.

    Rows whose ``team`` is 'football' get ``adj_o``, ``adj_dir`` and
    ``nflId`` set to 0.

    Parameters
    ----------
    play_df : pandas.DataFrame
        Rows of a single play with columns ``playDirection``, ``team``,
        ``nflId``, ``x``, ``y``, ``o`` and ``dir``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``play_df`` with the adjusted columns added.

    Raises
    ------
    IndexError
        If ``play_df`` has no rows.
    ValueError
        If the play direction is neither 'left' nor 'right'.
    """
    # 53.3 / 2 and the literal 26.65 are the same float; use one spelling.
    # NOTE(review): presumably half the field width in yards - confirm.
    half_width = 53.3 / 2

    playDirection = play_df['playDirection'].unique()[0]

    # Work on a copy so the caller's frame (often a slice of the full
    # tracking table) is never mutated.
    adjusted = play_df.copy()

    if playDirection == 'left':
        adjusted['adj_x'] = adjusted['x']
        adjusted['adj_y'] = adjusted['y'] - half_width
        rotation = 90
    elif playDirection == 'right':
        adjusted['adj_x'] = 120 - adjusted['x']
        adjusted['adj_y'] = half_width - adjusted['y']
        rotation = 270
    else:
        # Previously fell through silently and returned a frame without
        # adj_x / adj_y, which only failed later and far from the cause.
        raise ValueError(f"unexpected playDirection: {playDirection!r}")

    adjusted['adj_o'] = (adjusted['o'] + rotation) % 360
    adjusted['adj_dir'] = (adjusted['dir'] + rotation) % 360

    # The football has no player id or meaningful orientation/direction.
    is_football = adjusted.team == 'football'
    for column in ('adj_o', 'adj_dir', 'nflId'):
        adjusted.loc[is_football, column] = 0

    return adjusted




In [5]:

def add_team_indicator(play_df, games_df, game_id):
    """Tag every row of ``play_df`` with a numeric team code.

    The home and visitor abbreviations are looked up in ``games_df`` for
    ``game_id``. The new ``team_indicator`` column is 1 for the home team,
    2 for the visiting team, 3 for the 'football' rows and 0 for anything
    else. ``play_df`` is modified in place and also returned.
    """
    matching_games = games_df[games_df['gameId'] == game_id]
    team_labels = [
        matching_games['homeTeamAbbr'].values[0],
        matching_games['visitorTeamAbbr'].values[0],
        'football',
    ]

    team_masks = [play_df.team == label for label in team_labels]
    team_codes = list(range(1, len(team_labels) + 1))  # [1, 2, 3]

    play_df["team_indicator"] = np.select(team_masks, team_codes)
    return play_df


In [6]:
# Load every week of tracking data, then list the distinct games it covers.
ngs_df = create_ngs_df()

all_games = ngs_df["gameId"].unique()

dataframe shape = (1118122, 16)


In [7]:
# Distinct event tags present in the tracking data.
ngs_df["event"].unique()

array(['None', 'ball_snap', 'autoevent_passforward', 'pass_forward',
       'autoevent_ballsnap', 'line_set', 'play_action', 'pass_arrived',
       'autoevent_passinterrupted', 'fumble', 'fumble_offense_recovered',
       'qb_sack', 'run', 'man_in_motion', 'pass_outcome_caught',
       'pass_outcome_incomplete', 'pass_tipped', 'qb_strip_sack', 'shift',
       'first_contact', 'huddle_break_offense', 'lateral', 'handoff',
       'penalty_flag', 'tackle', 'dropped_pass', 'out_of_bounds'],
      dtype=object)

In [8]:
# Accumulators filled by the sample-building loop: one feature matrix and one
# target vector per sample.
features_list, target_list = [], []

In [18]:
features_to_keep = ['gameId', 'playId', 'frameId', 'nflId', 'team_indicator', 'adj_x', 'adj_y', 's', 'a', 'adj_o', 'adj_dir']

# A frame is only usable when it has exactly this many tracking rows.
# NOTE(review): presumably 22 players + the football - confirm.
ROWS_PER_FRAME = 23
# Frame-count -> seconds divisor. NOTE(review): presumably 10 frames/s - confirm.
FRAMES_PER_SECOND = 10

# Start from empty accumulators so re-running this cell does not append a
# second set of samples on top of the first.
features_list = []
target_list = []

# Seeded generator so the randomly sampled frame of each play is reproducible.
frame_rng = np.random.default_rng(24)

# Index the tracking rows by play once. The previous per-play
# ngs_df.query(...) rescanned the entire tracking table for every play.
play_groups = ngs_df.groupby(['gameId', 'playId'], sort=False)
tracked_plays = play_groups.indices

for game_index, game_id in enumerate(all_games):
    all_play_ids = plays.query("gameId == @game_id").playId.values

    for play_id in all_play_ids:
        play_key = (game_id, play_id)
        if play_key not in tracked_plays:
            # The play is listed in plays.csv but has no tracking rows.
            print(f"skipped game {game_id} play {play_id}: no tracking rows")
            continue

        ngs_play = play_groups.get_group(play_key)
        play_df = filter_play_edges(ngs_play)
        if play_df.empty:
            # e.g. an end event tagged before the snap leaves nothing to sample
            print(f"skipped game {game_id} play {play_id}: no frames after cropping")
            continue
        play_df = add_team_indicator(play_df, games_df, game_id)

        is_sack, sack_frame_id = get_sack_info(play_df)
        play_df = adjust_play(play_df)

        # Sample ONE random frame of the play and build its features and target.
        all_frame_ids = play_df.frameId.unique()
        frame_id = frame_rng.choice(all_frame_ids)

        time_df = play_df.query("frameId == @frame_id")

        if len(time_df) != ROWS_PER_FRAME:
            print(f"skipped at game {game_index} play {play_id} at time {frame_id}")
            continue

        feature_matrix = time_df[features_to_keep].values  # (23, 11) array

        # Target = [not_sack, is_sack, time until the sack frame], with -1 as
        # the time when the play has no sack.
        time_until_sack = (sack_frame_id - frame_id) / FRAMES_PER_SECOND if is_sack else -1
        target = np.array([int(not is_sack), int(is_sack), time_until_sack])

        features_list.append(feature_matrix)
        target_list.append(target)
        



In [3]:
# Stack the per-sample targets and feature matrices into arrays.
target_arr = np.asarray(target_list)
feature_arr = np.asarray(features_list)

# Replace NaN entries with 0, writing into the arrays themselves (copy=False).
np.nan_to_num(feature_arr, nan=0, copy=False)
np.nan_to_num(target_arr, nan=0, copy=False)

NameError: name 'target_list' is not defined

In [20]:
import pickle

# np.save("./cleaned_data/small/feature_arr_ids", feature_arr)
# np.save("./cleaned_data/small/target_arr_ids", target_arr)

In [4]:
# Reload the previously saved feature / target arrays from disk.
feature_arr, target_arr = (
    np.load(saved_path)
    for saved_path in (
        "./cleaned_data/feature_arr_ids.npy",
        "./cleaned_data/target_arr_ids.npy",
    )
)

In [12]:
''' 
Create training, validation, and testing set
'''

' \nCreate training, validation, and testing set\n'

In [12]:
# Row 1 of samples 196647..196649, all feature columns.
feature_arr[196647:196650, 1, :]

array([[ 2.0211017e+09,  4.7800000e+02,  3.4000000e+01,  4.3290000e+04,
         1.0000000e+00,  9.6620000e+01, -2.7000000e+00,  2.1900000e+00,
         1.0000000e+00,  8.9050000e+01,  2.5250000e+01],
       [ 2.0211017e+09,  5.2100000e+02,  6.0000000e+00,  3.8553000e+04,
         2.0000000e+00,  8.8470000e+01,  6.3500000e+00,  0.0000000e+00,
         0.0000000e+00,  3.2767000e+02,  2.6174000e+02],
       [ 2.0211017e+09,  5.2100000e+02,  7.0000000e+00,  3.8553000e+04,
         2.0000000e+00,  8.8470000e+01,  6.3500000e+00,  0.0000000e+00,
         0.0000000e+00,  3.2680000e+02,  2.6990000e+02]])

In [13]:
# Targets of samples 196647..196649 (the same index range inspected in feature_arr).
target_arr[196647:196650]

array([[ 1.,  0., -1.],
       [ 1.,  0., -1.],
       [ 1.,  0., -1.]])

In [15]:
# Shape of all rows whose column 0 equals 2021102407
# (column 0 is 'gameId' in features_to_keep).
in_game = feature_arr[:, :, 0] == 2021102407
feature_arr[in_game].shape

(50485, 11)

In [35]:
# 15% of the number of samples.
len(feature_arr) * .15

42142.35

In [59]:
# Candidate start index into feature_arr.
# NOTE(review): 196682 + 1 and 42186 look like the hand-picked train / validation
# boundaries used in the split cell - confirm.
s = 196682 + 1 + 42186

# Row 1 of the five consecutive samples starting at s.
feature_arr[s:s+5,1,:]

array([[2.02110241e+09, 3.32500000e+03, 3.50000000e+01, 3.99500000e+04,
        2.00000000e+00, 7.81600000e+01, 3.93000000e+00, 1.69000000e+00,
        2.41000000e+00, 9.95500000e+01, 1.61060000e+02],
       [2.02110241e+09, 3.35400000e+03, 6.00000000e+00, 3.99500000e+04,
        2.00000000e+00, 5.20700000e+01, 5.41000000e+00, 1.20000000e-01,
        9.70000000e-01, 3.37090000e+02, 1.24440000e+02],
       [2.02110241e+09, 3.35400000e+03, 7.00000000e+00, 3.99500000e+04,
        2.00000000e+00, 5.21100000e+01, 5.44000000e+00, 3.50000000e-01,
        1.66000000e+00, 3.39290000e+02, 1.38140000e+02],
       [2.02110241e+09, 3.35400000e+03, 8.00000000e+00, 3.99500000e+04,
        2.00000000e+00, 5.21600000e+01, 5.47000000e+00, 6.60000000e-01,
        2.24000000e+00, 3.49050000e+02, 1.43860000e+02],
       [2.02110241e+09, 3.35400000e+03, 9.00000000e+00, 3.99500000e+04,
        2.00000000e+00, 5.22400000e+01, 5.52000000e+00, 1.02000000e+00,
        2.39000000e+00, 3.53010000e+02, 1.47540000e+

In [42]:
#feature_arr.shape[0]*.7 = 196664
#feature_arr.shape[0]*.15 = 42142

# Offsets probed around the 70% / 15% marks noted in the comments above.
i, j = 196682, 42157

# Row 1 of the five consecutive samples starting at i + j.
feature_arr[i + j:i + j + 5, 1, :]

array([[ 2.02110241e+09,  3.30100000e+03,  3.00000000e+01,
         3.99500000e+04,  2.00000000e+00,  9.25200000e+01,
        -5.80000000e-01,  2.95000000e+00,  1.42000000e+00,
         1.52430000e+02,  1.97440000e+02],
       [ 2.02110241e+09,  3.32500000e+03,  6.00000000e+00,
         3.99500000e+04,  2.00000000e+00,  7.20400000e+01,
         5.34000000e+00,  2.80000000e-01,  1.36000000e+00,
         2.42000000e+00,  1.33500000e+02],
       [ 2.02110241e+09,  3.32500000e+03,  7.00000000e+00,
         3.99500000e+04,  2.00000000e+00,  7.20800000e+01,
         5.38000000e+00,  5.50000000e-01,  2.04000000e+00,
         3.30000000e+00,  1.36910000e+02],
       [ 2.02110241e+09,  3.32500000e+03,  8.00000000e+00,
         3.99500000e+04,  2.00000000e+00,  7.21400000e+01,
         5.43000000e+00,  8.60000000e-01,  2.23000000e+00,
         4.44000000e+00,  1.41690000e+02],
       [ 2.02110241e+09,  3.32500000e+03,  9.00000000e+00,
         3.99500000e+04,  2.00000000e+00,  7.22400000e+01,
  

In [60]:
# Hand-picked split boundaries (exclusive slice ends).
training_index_end = 196682 + 1 # one past the index of the last timestep of a play
val_index_end = training_index_end + 42186 + 1

# Fail loudly if the boundaries do not fit the array that is actually loaded,
# instead of silently producing empty validation / test splits.
assert 0 < training_index_end < val_index_end <= len(feature_arr), (
    "split boundaries do not fit feature_arr; re-pick them for this dataset"
)
assert len(feature_arr) == len(target_arr), "features and targets are misaligned"

# .copy() matters: plain slices are views into feature_arr / target_arr, so
# the in-place standardization of x_train / x_val / x_test would otherwise
# rewrite feature_arr too, and re-running this cell would then re-split
# already-standardized data.
x_train = feature_arr[:training_index_end, :, :].copy()
x_val = feature_arr[training_index_end:val_index_end, :, :].copy()
x_test = feature_arr[val_index_end:, :, :].copy()

y_train = target_arr[:training_index_end].copy()
y_val = target_arr[training_index_end:val_index_end].copy()
y_test = target_arr[val_index_end:].copy()

In [13]:
# reshaped_x = feature_arr.reshape(feature_arr.shape[0], -1)

# #print(f"Shape of dataframe = {df.shape}")
# #print(f"70% of length = {df.shape[0]*.7}")
# #print(f"15% of length = {df.shape[0]*.15}")

# # Split boundaries. Pick these values so that each split starts at the beginning of a new play
# train_index_end = 196648
# val_index_end = 238779

# x_train = reshaped_x[0:train_index_end, :].reshape(-1, feature_arr.shape[1], feature_arr.shape[2])
# x_val = reshaped_x[train_index_end:val_index_end, :].reshape(-1, feature_arr.shape[1], feature_arr.shape[2])
# x_test = reshaped_x[val_index_end:, :].reshape(-1, feature_arr.shape[1], feature_arr.shape[2])

# y_train = target_arr[0:train_index_end]
# y_val = target_arr[train_index_end:val_index_end]
# y_test = target_arr[val_index_end:]

In [14]:
# Confirm random seed functionality

# x = [2,3,4,5,6]
# y = [2,3,4,5,6]

# print("Setting seed once")
# np.random.seed(24)
# np.random.shuffle(x)
# np.random.shuffle(y)
# print(x)
# print(y)

# x = [2,3,4,5,6]
# y = [2,3,4,5,6]
# print("Setting seed multiple times")
# np.random.seed(24)
# np.random.shuffle(x)
# np.random.seed(24)
# np.random.shuffle(y)
# print(x)
# print(y)

# print(f"/n Need to reset seed every call")

In [25]:
# unnormalized 

# train_index_end = int(len(feature_arr)*.7)
# val_index_end = int(len(feature_arr)*.7 + len(feature_arr)*.15)

# x_train = feature_arr[0:train_index_end, :]
# x_val = feature_arr[train_index_end:val_index_end, :]
# x_test = feature_arr[val_index_end:, :]

# y_train = target_arr[0:train_index_end]
# y_val = target_arr[train_index_end:val_index_end]
# y_test = target_arr[val_index_end:]

In [26]:
# Shuffle training set

# np.random.seed(24)
# np.random.shuffle(x_train)
# np.random.seed(24)
# np.random.shuffle(y_train)


In [27]:
# Standardize dataset

# Columns 0-4 (game, play, frame, nflId, team_indicator) are identifiers and
# are left untouched; only columns from this index on are standardized.
first_scaled_col = 5

# Per-feature mean/std over every row of every training sample:
# (N*rows, n_features) -> (n_features,)
train_rows = x_train.reshape(-1, x_train.shape[-1])
train_mu = np.mean(train_rows, axis=0)
train_std = np.std(train_rows, axis=0)

feature_shift = train_mu[first_scaled_col:]
# A constant feature has std 0; divide by 1 there instead of producing NaN/inf.
feature_scale = np.where(train_std[first_scaled_col:] == 0, 1.0, train_std[first_scaled_col:])


def _standardize_split(split):
    """Return a standardized copy of `split` using the training statistics."""
    scaled = split.copy()
    scaled[:, :, first_scaled_col:] = (split[:, :, first_scaled_col:] - feature_shift) / feature_scale
    return scaled


# Out-of-place on purpose: if the splits are views of feature_arr, writing
# into them would also rewrite feature_arr.
# NOTE: running this cell twice standardizes twice; re-run the split cell first.
x_train = _standardize_split(x_train)
x_val = _standardize_split(x_val)
x_test = _standardize_split(x_test)

In [61]:
# Report the size and shape of every split.
print(f"number of training examples = {x_train.shape[0]}")
print(f"number of test examples = {x_test.shape[0]}")
print(f"X_train shape: {x_train.shape}")
print(f"Y_train shape: {y_train.shape}")
print(f"X_val shape: {x_val.shape}")
print(f"Y_val shape: {y_val.shape}")
print(f"X_test shape: {x_test.shape}")
print(f"Y_test shape: {y_test.shape}")

number of training examples = 196683
number of test examples = 42079
X_train shape: (196683, 23, 11)
Y_train shape: (196683, 3)
X_val shape: (42187, 23, 11)
Y_val shape: (42187, 3)
X_test shape: (42079, 23, 11)
Y_test shape: (42079, 3)


In [62]:
# np.save("./unnorm_no_shuffle_data/x_train", x_train)
# np.save("./unnorm_no_shuffle_data/y_train", y_train)
# np.save("./unnorm_no_shuffle_data/x_val", x_val)
# np.save("./unnorm_no_shuffle_data/y_val", y_val)
# np.save("./unnorm_no_shuffle_data/x_test", x_test)
# np.save("./unnorm_no_shuffle_data/y_test", y_test)
# np.save("./cleaned_data/train_mu", train_mu)
# np.save("./cleaned_data/train_std", train_std)


In [19]:
# Per-feature means computed over the training split
# (one value per feature column; see the feature list in the last cell).
train_mu

array([2.02109570e+09, 2.16106625e+03, 2.44495444e+01, 4.36565968e+04,
       1.56521739e+00, 6.02684855e+01, 4.11100611e-02, 2.75592956e+00,
       1.96032014e+00, 1.69615780e+02, 1.70138835e+02])

In [None]:
''' 
Feature list:
['gameId', 'playId', 'frameId', 'nflId', 'team_indicator', 'adj_x', 'adj_y', 's', 'a', 'adj_o', 'adj_dir']
'''