In [1]:
import pandas as pd
import numpy as np
import time
import json
from datetime import datetime
import pickle

import torch

In [2]:
# Sentinel nflId written onto the football's tracking rows (see the cell that
# sets nflId where club == 'football').
NEW_FOOTBALL_ID = -1
# Length of the per-frame tackle target vector used when zero-padding targets.
NUM_CLASSES = 23
# Number of tracked agents per frame used when zero-padding feature rows
# (presumably 22 players + the football — TODO confirm).
NUM_AGENTS = 23

In [3]:
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
import sys

sys.path.append('../')

In [5]:
from utils import augment_ngs_frame

In [6]:
def create_ngs_df(data_dir="../data", weeks=range(1, 10)):
    '''
    Load the weekly tracking CSVs and stack them into a single DataFrame.

    Parameters
    ----------
    data_dir : directory containing the `tracking_week_<n>.csv` files.
    weeks    : iterable of week numbers to load, in order (default: weeks 1-9).

    Returns
    -------
    pd.DataFrame with the rows of every requested week, in week order. The
    per-file index is kept as-is (it is NOT reset), so index labels repeat
    across weeks.

    Raises
    ------
    ValueError (from pd.concat) if `weeks` is empty.
    '''
    weekly_frames = []
    for week in weeks:
        week_df = pd.read_csv(f"{data_dir}/tracking_week_{week}.csv")
        if not weekly_frames:
            # Only the first file's shape is reported, as a quick sanity check.
            print(f"dataframe shape = {week_df.shape}")
        weekly_frames.append(week_df)
    # Concatenate once: growing a frame inside the loop re-copies all previously
    # loaded rows on every iteration.
    return pd.concat(weekly_frames)


In [25]:
def filter_play_edges(ngs_play, verbose=True):
    '''
    Drop penalty plays and clip a play's tracking rows at its first end event.

    Out of bounds plays don't need to have a tackle event: 2022110700_3429

    Parameters
    ----------
    ngs_play : tracking rows of a single play; must have `event` and `frameId`
               columns.
    verbose  : when True (default) print the end events found and the event of
               the last remaining row.

    Returns
    -------
    - an empty DataFrame if the play contains a 'penalty_flag' event,
    - the input unchanged if it is empty or contains no end event,
    - otherwise the rows with frameId <= the earliest frame carrying one of the
      end events.
    '''
    # Nothing to clip; also avoids iloc[-1] failing on an empty frame below.
    if len(ngs_play) == 0:
        return ngs_play

    events = set(ngs_play.event.unique())   # set of all events in the play
    if 'penalty_flag' in events:
        return pd.DataFrame()

    # filter edges
    tocheck = ['out_of_bounds', 'touchdown', 'fumble', 'safety', 'pass_outcome_touchdown', 'tackle'] # items to check
    end_events_in_play = [i for i in tocheck if i in events]
    if verbose:
        print(f"end events in play = {end_events_in_play}")
    if len(end_events_in_play) != 0:
        # Earliest frame that carries any end event. (Series.unique() returns a
        # plain ndarray, which has no `.values` attribute.)
        is_end_row = ngs_play.event.isin(end_events_in_play)
        end_frame_id = ngs_play.loc[is_end_row, 'frameId'].min()
        ngs_play = ngs_play.loc[ngs_play.frameId <= end_frame_id]

    if verbose:
        print(f"final row = {ngs_play.iloc[-1].event}")
    return ngs_play

In [8]:
def get_play(play_df : pd.DataFrame, game_id : np.int64, play_id : np.int64) -> pd.Series :
    '''
    Return the row of `play_df` for the given play as a pandas Series,
    with an extra 'line_of_scrimmage' entry.

    'line_of_scrimmage' is the distance from the possession team's own goal
    line: `yardlineNumber` when the ball is on the possession team's side
    (or at the 50), otherwise `100 - yardlineNumber`.

    Parameters
    ----------
    play_df : plays table with gameId, playId, yardlineSide, possessionTeam
              and yardlineNumber columns.
    game_id, play_id : identifiers of the requested play.

    Raises
    ------
    IndexError if no row matches (game_id, play_id).
    '''
    is_requested_play = (play_df.gameId == game_id) & (play_df.playId == play_id)
    # Copy the row so that adding a field below never writes into (or warns
    # about writing into) data derived from play_df.
    play_info = play_df.loc[is_requested_play].iloc[0].copy()

    # line of scrimmage
    if play_info.yardlineNumber == 50 or play_info.yardlineSide == play_info.possessionTeam:
        yards_from_own_goal = play_info.yardlineNumber
    else:
        yards_from_own_goal = 50 + (50 - play_info.yardlineNumber)
    # Store a built-in Python scalar rather than a numpy scalar.
    play_info['line_of_scrimmage'] = np.asarray(yards_from_own_goal).item()
    return play_info

# look up who made the tackle on a play
def get_tackler_id(tackles : pd.DataFrame, game_id : np.int64, play_id : np.int64, NEW_FOOTBALL_ID=-1):
    '''
    Return the nflId credited with the tackle on the given play.

    - a row with tackle == 1 exists  -> that row's nflId
      (`.item()` raises ValueError if more than one row has tackle == 1)
    - no tackle but some assist      -> None (caller drops the play)
    - neither tackle nor assist      -> NEW_FOOTBALL_ID, the football sentinel
    '''
    on_this_play = (tackles.gameId == game_id) & (tackles.playId == play_id)
    play_rows = tackles.loc[on_this_play]

    if play_rows.tackle.sum() != 0:
        return play_rows.loc[play_rows.tackle == 1, 'nflId'].item()
    if play_rows.assist.sum() != 0:
        # assists only: signal the caller to remove this play
        return None
    # nobody tackled: use the football id so later comparisons stay simple
    return NEW_FOOTBALL_ID

In [9]:
# Load the game-, player-, play- and tackle-level tables from ../data.
games = pd.read_csv("../data/games.csv")
players = pd.read_csv("../data/players.csv")
plays = pd.read_csv("../data/plays.csv")
tackles = pd.read_csv("../data/tackles.csv")

In [10]:
# Convert the "<feet>-<inches>" height string into total inches (feet*12 + inches).
players['height_inches'] = players.height.apply(lambda x: int(x.split("-")[0])*12 + int(x.split("-")[1]))

In [11]:
def calc_game_features(game_info : pd.Series, play_info : pd.Series) -> np.ndarray:
    '''
    Build the per-play context vector.

    Returns a numpy array of shape (9,), in this order:
    ['line_of_scrimmage', 'yards_to_go', 'quarter', 'down', 'seconds_left',
     'def_are_home', 'def_win_prob', 'week', 'is_thursday']

    Parameters
    ----------
    game_info : row of the games table; reads gameDate ('%m/%d/%Y'),
                gameTimeEastern ('%H:%M:%S'), homeTeamAbbr and week.
    play_info : row of the plays table (with 'line_of_scrimmage' already
                added); reads gameClock ('M:SS'), defensiveTeam, yardsToGo,
                quarter, down and the pre-snap home/visitor win probabilities.
    '''
    game_datetime = datetime.strptime(game_info.gameDate + " " + game_info.gameTimeEastern, '%m/%d/%Y %H:%M:%S')
    is_thursday = int(game_datetime.weekday() == 3)   # Monday == 0, so 3 is Thursday

    # Seconds left on the game clock (named so the builtin `min` is not shadowed).
    clock_minutes, clock_seconds = play_info.gameClock.split(":")
    seconds_left = int(clock_minutes)*60 + int(clock_seconds)

    def_are_home = int(game_info.homeTeamAbbr == play_info.defensiveTeam)   # 0/1 flag

    # Pre-snap win probability from the defence's point of view.
    if def_are_home:
        def_win_prob = play_info.preSnapHomeTeamWinProbability
    else:
        def_win_prob = play_info.preSnapVisitorTeamWinProbability

    game_features = np.array([play_info.line_of_scrimmage, play_info.yardsToGo, play_info.quarter, play_info.down, seconds_left, def_are_home, def_win_prob, game_info.week, is_thursday])

    return game_features

In [12]:
players.position.unique()

array(['QB', 'T', 'TE', 'WR', 'DE', 'NT', 'SS', 'FS', 'G', 'OLB', 'DT',
       'CB', 'RB', 'C', 'ILB', 'MLB', 'FB', 'LS', 'DB'], dtype=object)

In [13]:
# Load all tracking weeks, then list the distinct game ids present in them.
ngs_df = create_ngs_df()
all_games = ngs_df.gameId.unique()

dataframe shape = (1407439, 17)


In [14]:
# Rows with club == 'football' are the ball: give them the sentinel nflId.
ngs_df.loc[ngs_df.club == 'football', 'nflId'] = NEW_FOOTBALL_ID

In [42]:
# Configuration and accumulators for building the sequence dataset:
# one sequence per play, padded to MAX_FRAME_ID frames, IDed by (game_id)_(play_id).

MAX_FRAME_ID = 164      # every play is zero-padded up to this many frames
SAVE = False            # when True, per-play tensors are also written to ../cleaned_data

base_dir = "seq_clipped_sorted_data"    # sub-directory of ../cleaned_data used for output

game_play_id_list = []  # list of IDs
seq_length_list = []    # per-play [final_frame_in_play]

feature_list = []       # per-play list of flattened per-frame feature vectors
target_list = []        # per-play list of per-frame tackle target vectors
id_list = []            # per-play array of nflIds
context_list = []       # per-play padded game-context tensor

# Per-agent columns kept from each augmented tracking frame.
features_to_keep = ['attacking_team','football','ball_carrier','x_adj','y_adj','dir_adj','dir_o', 's', 'a', 'height_inches', 'weight']
# Names of the 9 context features, in the order calc_game_features emits them
# (the sixth is the defence-at-home flag, not a second 'def_win_prob').
game_features = ['line_of_scrimmage', 'yards_to_go', 'quarter', 'down', 'seconds_left', 'def_are_home', 'def_win_prob', 'week', 'is_thursday']


In [43]:
# Walk every play of every game and build, per kept play:
#   - a list of per-frame flattened feature vectors (zero-padded to MAX_FRAME_ID frames)
#   - a list of per-frame tackle target vectors     (zero-padded likewise)
#   - a padded (MAX_FRAME_ID, 9) game-context tensor
# Plays with penalty yards, assist-only tackles, or rejected by filter_play_edges are skipped.
num_plays_kept = 0
penalty_plays = 0
assist_plays = 0
plays_removed_in_clipped = 0

# Set to a small integer to process only the first few games while debugging;
# None processes every game.
DEBUG_MAX_GAMES = None

# Start from empty accumulators so that re-running this cell does not append
# duplicates (and still works if id_list was rebound to an array elsewhere).
game_play_id_list = []
seq_length_list = []
feature_list = []
target_list = []
id_list = []
context_list = []

start_time = time.time()
print(f"Starting collection...")

for game_index, game_id in enumerate(all_games):
    if DEBUG_MAX_GAMES is not None and game_index >= DEBUG_MAX_GAMES:
        break

    all_play_ids = plays.query("gameId == @game_id").playId.values
    for play_id in all_play_ids:
        try:
            current_id = f"{game_id}_{play_id}"

            play_info = get_play(plays, game_id, play_id)                   # store play info in series
            if not np.isnan(play_info.penaltyYards):
                penalty_plays += 1
                continue
            tackler_id = get_tackler_id(tackles, game_id, play_id)          # store tackler id
            if tackler_id is None:                                          # assist-only play
                assist_plays += 1
                continue

            ngs_full_play = ngs_df.loc[(ngs_df.gameId == game_id) & (ngs_df.playId == play_id)].copy()
            ngs_full_play = filter_play_edges(ngs_full_play)
            if len(ngs_full_play) == 0:
                plays_removed_in_clipped += 1
                continue

            # add height and weight; rows without a players match (e.g. the football) get 0.
            # Note fillna(0) also replaces every other NaN in the frame (e.g. missing events).
            ngs_full_play = pd.merge(ngs_full_play, players.loc[:,['nflId', 'height_inches', 'weight']], how='left',on='nflId').fillna(0)

            frame_ids = ngs_full_play.frameId.unique()
            final_frame_in_play = int(frame_ids.max())
            seq_lengths = [final_frame_in_play]

            game_info = games.query("gameId==@game_id").iloc[0] # series
            # Named differently from the module-level `game_features` label list
            # so that list is not overwritten.
            play_game_features = calc_game_features(game_info, play_info)    # array of 9 values
            all_timesteps_game_features = np.tile(play_game_features.reshape(1,-1), (final_frame_in_play,1))
            # update time remaining: column 4 is seconds_left, decreased by 0.1 s per frame
            all_timesteps_game_features[:,4] = all_timesteps_game_features[:,4] - np.arange(0,final_frame_in_play)*0.1

            # add info about play, which will be useful to visualize
            play_info_dict = {'tackler_id':tackler_id,
                                'final_frame_in_play':final_frame_in_play, 
                                'ball_carrier_id': play_info.ballCarrierId,
                                'playResult': play_info.playResult}

            player_ids = []

            play_features_list = []
            play_targets_list = []
            for frame_id in range(1, final_frame_in_play+1):

                ngs_frame_df = ngs_full_play.loc[ngs_full_play.frameId == frame_id].copy()

                # Return value is ignored, so this presumably adds the derived
                # columns to ngs_frame_df in place — TODO confirm in utils.
                augment_ngs_frame(play_info.possessionTeam, play_info.ballCarrierId, ngs_frame_df)
                ngs_frame_df['tackle'] = (ngs_frame_df.nflId == tackler_id).astype(int)     # 1 on the tackler's row (or the football's row when nobody tackled)

                # One row per agent, len(features_to_keep) columns, flattened to 1-D.
                # The padding below assumes NUM_AGENTS rows per frame.
                feature_matrix = ngs_frame_df[features_to_keep].to_numpy().flatten()
                target = ngs_frame_df.loc[:, 'tackle'].values  # one entry per agent row

                play_features_list.append(feature_matrix)
                play_targets_list.append(target)

                if len(player_ids) == 0:
                    # ids in first-frame row order (presumably sorted by augment_ngs_frame — TODO confirm)
                    player_ids = ngs_frame_df.nflId.unique()

            # pad values at end
            for pad_id in range(0, MAX_FRAME_ID - final_frame_in_play):
                play_features_list.append(np.zeros(len(features_to_keep)*(NUM_AGENTS), dtype=np.int64))
                play_targets_list.append(np.zeros(NUM_CLASSES, dtype=np.int64))
            # pad values at end of game features
            full_context_vector = np.concatenate([all_timesteps_game_features, np.zeros((MAX_FRAME_ID - final_frame_in_play, all_timesteps_game_features.shape[1]))], axis=0)
            full_context_vector = torch.tensor(full_context_vector)

            if SAVE:
                torch.save(full_context_vector, f"../cleaned_data/{base_dir}/context_vector/{current_id}.pt")
                torch.save(torch.Tensor(play_features_list), f"../cleaned_data/{base_dir}/features/{current_id}.pt")
                torch.save(torch.Tensor(play_targets_list), f"../cleaned_data/{base_dir}/labels/{current_id}.pt")
                with open(f"../cleaned_data/{base_dir}/play_info_dict/{current_id}", 'wb') as file:
                   pickle.dump(play_info_dict, file)
                np.save(f"../cleaned_data/{base_dir}/player_ids/{current_id}.npy", player_ids)

            game_play_id_list.append(current_id)      # add ID to id list
            seq_length_list.append(seq_lengths)

            feature_list.append(play_features_list)
            target_list.append(play_targets_list)
            id_list.append(player_ids)
            context_list.append(full_context_vector)

            num_plays_kept += 1
            # Report progress only when the count has just reached a multiple of
            # 1000, not on every skipped or failed play in between.
            if num_plays_kept % 1000 == 0:
                cur_time = time.time()
                print(f"Recorded {num_plays_kept} plays in {round((cur_time - start_time)/60, 3)} minutes")
        except Exception as e:
            # Best-effort collection: report the play and move on.
            print(f"skipping game {game_id}, play {play_id} bc exception {e}")

end_time = time.time()
print(f"Recorded {num_plays_kept} plays in {round((end_time - start_time)/60, 3)} minutes")

Starting collection...
final row =            gameId playId nflId displayName frameId  \
44251  2022090800   2184  -1.0    football      44   

                             time jerseyNumber      club playDirection      x  \
44251  2022-09-08 22:01:31.700000          NaN  football          left  25.68   

               y     s     a  dis    o  dir event  
44251  31.120001  1.59  3.31  0.2  NaN  NaN   NaN  
final row event = nan
end events in play = ['tackle']
final row = tackle
final POST row event = tackle
final POST row =            gameId playId nflId displayName frameId  \
44247  2022090800   2184  -1.0    football      40   

                             time jerseyNumber      club playDirection  \
44247  2022-09-08 22:01:31.299999          NaN  football          left   

               x      y     s     a   dis    o  dir   event  
44247  25.629999  30.25  3.11  2.18  0.35  NaN  NaN  tackle  
final row =            gameId playId nflId displayName frameId  \
71460  2022090800   3

KeyboardInterrupt: 

In [19]:

# Convert the per-play lists accumulated by the collection cell into tensors/arrays.
# NOTE(review): torch.stack raises "stack expects a non-empty TensorList" when
# context_list is empty, i.e. when the collection cell has not filled the lists.
seq_length_tensor = torch.Tensor(seq_length_list)
feature_tensor = torch.Tensor(feature_list)
target_tensor = torch.Tensor(target_list)
context_tensor = torch.stack(context_list)

# NOTE(review): this rebinds id_list from a list to an ndarray; a later
# id_list.append(...) would then fail unless the list is re-created first.
id_list = np.array(id_list)

RuntimeError: stack expects a non-empty TensorList

In [18]:
# Summary of how many plays were kept and why the others were dropped.
print(f"Number of plays:")
print(f"Kept={num_plays_kept}")
print(f"w/ penalties={penalty_plays}")
print(f"w/ assists (removed)={assist_plays}")
print(f"w/ clipped (removed)={plays_removed_in_clipped}")


Number of plays:
Kept=9928
w/ penalties=615
w/ assists (removed)=1934
w/ clipped (removed)=0


In [19]:
# Persist the stacked dataset under ../cleaned_data/<base_dir>/.
# The play-id list is written as JSON, in the same order as the tensors' first axis.
with open(f"../cleaned_data/{base_dir}/game_play_id.json", 'w') as f:
    json.dump(game_play_id_list, f, indent=2) 

torch.save(seq_length_tensor, f"../cleaned_data/{base_dir}/seq_length_tensor.pt")
torch.save(feature_tensor, f"../cleaned_data/{base_dir}/all_features.pt")
torch.save(target_tensor, f"../cleaned_data/{base_dir}/all_targets.pt")
torch.save(context_tensor, f"../cleaned_data/{base_dir}/all_context.pt")
np.save(f"../cleaned_data/{base_dir}/all_id_info.npy", np.array(id_list))

In [15]:
''' Test area '''
pd.set_option('display.max_columns', None)

In [32]:
# One row per (game, play, frame) at which a 'tackle' event is recorded.
ngs_df.query("event == 'tackle'").drop_duplicates(subset=['gameId', 'playId', 'frameId'])

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
17,2022090800,56,35472.0,Rodger Saffold,18,2022-09-08 20:24:06.900000,76.0,BUF,left,88.23,27.09,1.72,0.26,0.18,288.76,302.08,tackle
531,2022090800,80,35472.0,Rodger Saffold,26,2022-09-08 20:24:38.200000,76.0,BUF,left,81.32,27.31,1.84,0.61,0.19,313.15,304.54,tackle
1240,2022090800,101,35472.0,Rodger Saffold,45,2022-09-08 20:25:12.599999,76.0,BUF,left,67.52,35.38,1.56,0.93,0.16,263.27,302.92,tackle
2351,2022090800,122,35472.0,Rodger Saffold,29,2022-09-08 20:25:53.799999,76.0,BUF,left,61.27,42.23,5.97,2.44,0.62,311.46,304.49,tackle
3133,2022090800,146,35472.0,Rodger Saffold,52,2022-09-08 20:26:34.500000,76.0,BUF,left,54.09,26.31,1.79,1.54,0.18,162.09,177.59,tackle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143660,2022110700,3658,37724.0,Chris Harris,55,2022-11-07 22:59:53.200000,19.0,NO,left,47.31,17.86,0.47,2.84,0.06,99.00,218.21,tackle
1145034,2022110700,3686,38557.0,Kevin Zeitler,72,2022-11-07 23:00:38.299999,70.0,BAL,left,59.60,31.40,1.26,1.47,0.14,265.72,296.85,tackle
1146764,2022110700,3707,38557.0,Kevin Zeitler,54,2022-11-07 23:03:50.799999,70.0,BAL,left,59.08,29.28,1.55,0.30,0.16,328.32,330.42,tackle
1148082,2022110700,3740,38557.0,Kevin Zeitler,38,2022-11-07 23:04:34.900000,70.0,BAL,left,57.66,21.35,3.72,0.78,0.39,171.54,182.88,tackle


In [44]:
# Football-only tracking rows of one example play (game 2022090800, play 2184).
play_df = ngs_df.loc[(ngs_df.gameId == 2022090800) & (ngs_df.playId == 2184) & (ngs_df.displayName == 'football')]

In [46]:
play_df.head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
44208,2022090800,2184,-1.0,football,1,2022-09-08 22:01:27.400000,,football,left,28.02,12.04,22.540001,5.34,2.28,,,pass_arrived
44209,2022090800,2184,-1.0,football,2,2022-09-08 22:01:27.500000,,football,left,27.190001,11.58,4.74,0.57,0.96,,,
44210,2022090800,2184,-1.0,football,3,2022-09-08 22:01:27.599999,,football,left,27.25,12.05,4.64,1.22,0.47,,,
44211,2022090800,2184,-1.0,football,4,2022-09-08 22:01:27.700000,,football,left,27.32,12.5,4.51,1.42,0.46,,,
44212,2022090800,2184,-1.0,football,5,2022-09-08 22:01:27.799999,,football,left,27.370001,12.94,4.36,1.58,0.44,,,


In [49]:
tackles.query("gameId == 2022090800 and playId == 2184")

Unnamed: 0,gameId,playId,nflId,tackle,assist,forcedFumble,pff_missedTackle
638,2022090800,2184,52492,1,0,0,0


In [51]:
ngs_df.query("nflId == 52492").head()

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
5743,2022090800,191,52492.0,Terrell Lewis,1,2022-09-08 20:27:42.099999,52.0,LA,left,44.29,30.61,0.0,0.0,0.0,99.47,304.02,
5744,2022090800,191,52492.0,Terrell Lewis,2,2022-09-08 20:27:42.200000,52.0,LA,left,44.29,30.61,0.0,0.0,0.0,101.74,295.25,
5745,2022090800,191,52492.0,Terrell Lewis,3,2022-09-08 20:27:42.299999,52.0,LA,left,44.29,30.61,0.0,0.0,0.01,101.74,285.35,
5746,2022090800,191,52492.0,Terrell Lewis,4,2022-09-08 20:27:42.400000,52.0,LA,left,44.29,30.6,0.0,0.0,0.01,101.74,270.48,
5747,2022090800,191,52492.0,Terrell Lewis,5,2022-09-08 20:27:42.500000,52.0,LA,left,44.3,30.6,0.0,0.0,0.01,101.04,265.74,


In [48]:
plays.query("gameId == 2022090800 and playId == 2184")

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,penaltyYards,prePenaltyPlayResult,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox,passProbability,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2
209,2022090800,2184,42489,Stefon Diggs,(9:24) (Shotgun) J.Allen pass short left to S....,3,1,10,BUF,LA,LA,22,9:24,10,10,C,4.0,,7,7,N,32,SHOTGUN,5.0,0.796962,0.324996,0.675004,-0.013726,0.013726,4.7305,0.406682,,,,


In [29]:
# All tracking rows of the play identified by the current game_id / play_id.
# NOTE(review): relies on game_id and play_id already being set by an earlier
# cell; the result depends on which cell ran last.
play_df = ngs_df.loc[(ngs_df.gameId == game_id) & (ngs_df.playId == play_id)]

In [24]:
players.height.apply(lambda x: int(x.split("-")[0])*12 + int(x.split("-")[1]))

0       76
1       76
2       74
3       78
4       76
        ..
1678    78
1679    72
1680    74
1681    73
1682    74
Name: height, Length: 1683, dtype: int64

In [31]:
# Left-join height and weight onto the play's tracking rows; rows whose nflId
# has no match in players keep NaN in the two added columns.
play_df_with_info = pd.merge(play_df, players.loc[:,['nflId', 'height_inches', 'weight']], how='left',on='nflId')

In [43]:
play_df.fillna(0)

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
19113,2022090800,896,35472.0,Rodger Saffold,1,2022-09-08 20:55:39.900000,76.0,BUF,right,42.700000,25.54,0.01,0.01,0.00,115.69,22.90,0
19114,2022090800,896,35472.0,Rodger Saffold,2,2022-09-08 20:55:40.000000,76.0,BUF,right,42.700000,25.54,0.01,0.01,0.00,116.60,29.20,0
19115,2022090800,896,35472.0,Rodger Saffold,3,2022-09-08 20:55:40.099999,76.0,BUF,right,42.700000,25.54,0.01,0.01,0.01,117.76,41.92,0
19116,2022090800,896,35472.0,Rodger Saffold,4,2022-09-08 20:55:40.200000,76.0,BUF,right,42.710000,25.53,0.00,0.01,0.01,118.54,61.56,0
19117,2022090800,896,35472.0,Rodger Saffold,5,2022-09-08 20:55:40.299999,76.0,BUF,right,42.710000,25.52,0.01,0.14,0.01,118.54,130.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20051,2022090800,896,-1.0,football,37,2022-09-08 20:55:43.500000,0.0,football,right,45.889999,12.99,4.73,5.46,0.49,0.00,0.00,fumble
20052,2022090800,896,-1.0,football,38,2022-09-08 20:55:43.599999,0.0,football,right,46.110001,12.59,4.29,5.39,0.45,0.00,0.00,0
20053,2022090800,896,-1.0,football,39,2022-09-08 20:55:43.700000,0.0,football,right,46.279999,12.20,4.08,5.00,0.43,0.00,0.00,0
20054,2022090800,896,-1.0,football,40,2022-09-08 20:55:43.799999,0.0,football,right,46.400002,11.80,4.06,4.75,0.41,0.00,0.00,0


In [33]:
play_df_with_info.nflId.unique()

array([ 3.5472e+04,  3.8577e+04,  4.1239e+04,  4.2392e+04,  4.2489e+04,
        4.2816e+04,  4.3294e+04,  4.3298e+04,  4.4875e+04,  4.6076e+04,
        4.7844e+04,  4.7857e+04,  4.7862e+04,  4.7879e+04,  4.7917e+04,
        4.7939e+04,  4.8026e+04,  4.8512e+04,  5.2536e+04,  5.3522e+04,
        5.3532e+04,  5.4528e+04, -1.0000e+00])

In [16]:
ngs_df.event.unique()

array([nan, 'pass_arrived', 'pass_outcome_caught', 'tackle', 'run',
       'first_contact', 'ball_snap', 'handoff', 'touchdown',
       'out_of_bounds', 'man_in_motion', 'fumble', 'play_action',
       'pass_forward', 'lateral', 'autoevent_passforward',
       'autoevent_passinterrupted', 'line_set', 'qb_slide', 'shift',
       'run_pass_option', 'qb_sack', 'pass_shovel', 'autoevent_ballsnap',
       'snap_direct', 'fumble_defense_recovered',
       'fumble_offense_recovered', 'penalty_flag', 'safety',
       'pass_outcome_touchdown', 'penalty_accepted'], dtype=object)

In [17]:
ngs_df.event.value_counts()

event
first_contact                242262
tackle                       230924
ball_snap                    145408
handoff                      135381
pass_outcome_caught          130087
pass_arrived                 113825
out_of_bounds                 41629
run                           21804
man_in_motion                  9683
touchdown                      9523
play_action                    6463
shift                          3358
qb_slide                       3158
fumble                         2415
pass_forward                   1899
snap_direct                    1150
lateral                         618
line_set                        552
pass_shovel                     414
autoevent_passforward           360
autoevent_passinterrupted       324
qb_sack                         230
autoevent_ballsnap              162
fumble_defense_recovered        115
penalty_flag                     93
fumble_offense_recovered         69
safety                           69
run_pass_option       

In [30]:
ngs_df.query("event == 'pass_outcome_touchdown'").iloc[0:30,:]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
557138,2022092504,4168,38632.0,Kirk Cousins,10,2022-09-25 16:09:46.500000,8.0,MIN,right,72.66,32.6,0.89,0.97,0.09,181.0,164.56,pass_outcome_touchdown
557152,2022092504,4168,40488.0,Adam Thielen,10,2022-09-25 16:09:46.500000,19.0,MIN,right,94.69,10.82,3.47,1.92,0.35,138.43,159.52,pass_outcome_touchdown
557166,2022092504,4168,44834.0,Charles Harris,10,2022-09-25 16:09:46.500000,53.0,DET,right,72.8,30.04,1.31,1.16,0.13,103.33,102.29,pass_outcome_touchdown
557180,2022092504,4168,44888.0,Alex Anzalone,10,2022-09-25 16:09:46.500000,34.0,DET,right,84.71,44.04,4.87,2.53,0.5,16.07,6.26,pass_outcome_touchdown
557194,2022092504,4168,46099.0,Mike Hughes,10,2022-09-25 16:09:46.500000,23.0,DET,right,107.97,11.46,6.55,3.71,0.67,183.31,149.63,pass_outcome_touchdown
557208,2022092504,4168,46131.0,Brian O'Neill,10,2022-09-25 16:09:46.500000,75.0,MIN,right,75.56,25.05,2.7,0.08,0.29,126.19,129.96,pass_outcome_touchdown
557222,2022092504,4168,46259.0,DeShon Elliott,10,2022-09-25 16:09:46.500000,5.0,DET,right,108.72,19.82,6.03,3.0,0.61,168.6,171.22,pass_outcome_touchdown
557236,2022092504,4168,47801.0,Garrett Bradbury,10,2022-09-25 16:09:46.500000,56.0,MIN,right,80.61,23.93,3.03,0.63,0.31,114.25,135.13,pass_outcome_touchdown
557250,2022092504,4168,47833.0,Irv Smith,10,2022-09-25 16:09:46.500000,84.0,MIN,right,83.94,5.75,4.41,2.31,0.44,111.69,122.58,pass_outcome_touchdown
557264,2022092504,4168,47864.0,Will Harris,10,2022-09-25 16:09:46.500000,25.0,DET,right,92.5,6.4,6.08,1.73,0.61,114.48,112.29,pass_outcome_touchdown


In [48]:
# Example play used by the inspection cells that follow.
# NOTE(review): these reuse the names of the collection loop's variables.
game_id = 2022092504
play_id = 4168

In [32]:
play_info = get_play(plays, game_id, play_id)

In [33]:
tackler_id = get_tackler_id(tackles, game_id, play_id)

In [35]:
tackler_id

-1

In [36]:
pd.DataFrame(play_info).T

Unnamed: 0,gameId,playId,ballCarrierId,ballCarrierDisplayName,playDescription,quarter,down,yardsToGo,possessionTeam,defensiveTeam,yardlineSide,yardlineNumber,gameClock,preSnapHomeScore,preSnapVisitorScore,passResult,passLength,penaltyYards,prePenaltyPlayResult,playResult,playNullifiedByPenalty,absoluteYardlineNumber,offenseFormation,defendersInTheBox,passProbability,preSnapHomeTeamWinProbability,preSnapVisitorTeamWinProbability,homeTeamWinProbabilityAdded,visitorTeamWinProbilityAdded,expectedPoints,expectedPointsAdded,foulName1,foulName2,foulNFLId1,foulNFLId2,line_of_scrimmage
5905,2022092504,4168,52584,K.J. Osborn,(:50) (Shotgun) K.Cousins pass deep right to K...,4,1,10,MIN,DET,DET,28,0:50,21,24,C,27.0,,28,28,N,82,SHOTGUN,5.0,0.613358,0.475384,0.524616,0.471527,-0.471527,2.319934,4.680066,,,,,72


In [37]:
play_info.playDescription

'(:50) (Shotgun) K.Cousins pass deep right to K.Osborn for 28 yards, TOUCHDOWN [A.McNeill].'

In [38]:
ngs_df.loc[(ngs_df.gameId == game_id) & (ngs_df.playId == play_id) & (ngs_df.frameId == 1)].sort_values("nflId")

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
557437,2022092504,4168,-1.0,football,1,2022-09-25 16:09:45.599999,,football,right,100.419998,13.66,20.889999,4.49,2.11,,,
557129,2022092504,4168,38632.0,Kirk Cousins,1,2022-09-25 16:09:45.599999,8.0,MIN,right,72.89,32.49,2.1,4.59,0.23,144.85,332.43,
557143,2022092504,4168,40488.0,Adam Thielen,1,2022-09-25 16:09:45.599999,19.0,MIN,right,94.57,14.15,4.26,1.04,0.42,185.55,190.86,
557157,2022092504,4168,44834.0,Charles Harris,1,2022-09-25 16:09:45.599999,53.0,DET,right,71.53,29.83,1.62,0.19,0.16,97.95,69.68,
557171,2022092504,4168,44888.0,Alex Anzalone,1,2022-09-25 16:09:45.599999,34.0,DET,right,83.51,39.12,6.09,2.52,0.61,18.61,27.03,
557185,2022092504,4168,46099.0,Mike Hughes,1,2022-09-25 16:09:45.599999,23.0,DET,right,103.83,17.2,8.65,2.32,0.87,162.14,137.31,
557199,2022092504,4168,46131.0,Brian O'Neill,1,2022-09-25 16:09:45.599999,75.0,MIN,right,73.93,26.4,1.81,0.61,0.19,156.7,126.01,
557213,2022092504,4168,46259.0,DeShon Elliott,1,2022-09-25 16:09:45.599999,5.0,DET,right,108.4,25.82,6.91,2.02,0.69,179.02,174.84,
557227,2022092504,4168,47801.0,Garrett Bradbury,1,2022-09-25 16:09:45.599999,56.0,MIN,right,79.08,25.96,2.69,1.3,0.28,134.46,156.41,
557241,2022092504,4168,47833.0,Irv Smith,1,2022-09-25 16:09:45.599999,84.0,MIN,right,81.58,8.57,4.2,2.57,0.43,145.16,158.11,


In [39]:
ngs_df.loc[(ngs_df.gameId == game_id) & (ngs_df.playId == play_id) & (ngs_df.displayName == 'K.J. Osborn')]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
557353,2022092504,4168,52584.0,K.J. Osborn,1,2022-09-25 16:09:45.599999,17.0,MIN,right,106.1,12.34,8.72,2.36,0.88,219.41,144.5,
557354,2022092504,4168,52584.0,K.J. Osborn,2,2022-09-25 16:09:45.700000,17.0,MIN,right,106.6,11.62,8.68,2.16,0.87,221.81,145.92,
557355,2022092504,4168,52584.0,K.J. Osborn,3,2022-09-25 16:09:45.799999,17.0,MIN,right,107.09,10.9,8.66,1.82,0.87,221.81,146.61,
557356,2022092504,4168,52584.0,K.J. Osborn,4,2022-09-25 16:09:45.900000,17.0,MIN,right,107.58,10.17,8.68,1.35,0.88,212.61,146.59,
557357,2022092504,4168,52584.0,K.J. Osborn,5,2022-09-25 16:09:46.000000,17.0,MIN,right,108.07,9.45,8.64,1.23,0.87,206.89,146.23,pass_arrived
557358,2022092504,4168,52584.0,K.J. Osborn,6,2022-09-25 16:09:46.099999,17.0,MIN,right,108.56,8.73,8.58,1.34,0.87,198.16,145.63,pass_outcome_caught
557359,2022092504,4168,52584.0,K.J. Osborn,7,2022-09-25 16:09:46.200000,17.0,MIN,right,109.06,8.04,8.52,1.68,0.86,189.19,144.33,
557360,2022092504,4168,52584.0,K.J. Osborn,8,2022-09-25 16:09:46.299999,17.0,MIN,right,109.57,7.35,8.42,2.18,0.85,170.87,142.66,
557361,2022092504,4168,52584.0,K.J. Osborn,9,2022-09-25 16:09:46.400000,17.0,MIN,right,110.08,6.7,8.29,2.52,0.83,156.64,141.18,
557362,2022092504,4168,52584.0,K.J. Osborn,10,2022-09-25 16:09:46.500000,17.0,MIN,right,110.6,6.07,8.16,2.66,0.82,148.23,139.43,pass_outcome_touchdown


In [59]:
# Tracking rows of game 2022091100, play 2374, restricted to frames 1-8.
ngs_df.loc[(ngs_df.gameId == 2022091100) & (ngs_df.playId == 2374)].loc[ngs_df.loc[(ngs_df.gameId == 2022091100) & (ngs_df.playId == 2374)].frameId < 9]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
136689,2022091100,2374,38607.0,Demario Davis,1,2022-09-11 14:55:37.500000,56.0,NO,right,65.660000,13.66,5.22,1.60,0.52,156.74,159.21,
136690,2022091100,2374,38607.0,Demario Davis,2,2022-09-11 14:55:37.599999,56.0,NO,right,65.860000,13.17,5.31,1.36,0.53,152.47,157.68,pass_arrived
136691,2022091100,2374,38607.0,Demario Davis,3,2022-09-11 14:55:37.700000,56.0,NO,right,66.060000,12.68,5.36,1.10,0.54,149.49,156.57,
136692,2022091100,2374,38607.0,Demario Davis,4,2022-09-11 14:55:37.799999,56.0,NO,right,66.280000,12.18,5.37,0.93,0.54,145.87,155.64,
136693,2022091100,2374,38607.0,Demario Davis,5,2022-09-11 14:55:37.900000,56.0,NO,right,66.510000,11.70,5.31,1.10,0.53,142.89,154.35,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136978,2022091100,2374,-1.0,football,4,2022-09-11 14:55:37.799999,,football,right,69.699997,2.39,4.68,1.93,0.49,,,
136979,2022091100,2374,-1.0,football,5,2022-09-11 14:55:37.900000,,football,right,69.540001,1.95,4.77,2.03,0.48,,,
136980,2022091100,2374,-1.0,football,6,2022-09-11 14:55:38.000000,,football,right,69.360001,1.50,4.79,1.35,0.48,,,pass_outcome_caught
136981,2022091100,2374,-1.0,football,7,2022-09-11 14:55:38.099999,,football,right,69.180000,1.06,4.76,0.60,0.48,,,


In [None]:
def filter_play_edges(ngs_play):
    '''
    Stricter variant of play filtering: reject plays containing a fumble,
    penalty flag, safety or touchdown pass, and clip the rest at the first
    out-of-bounds / touchdown frame.

    Out of bounds plays don't need to have a tackle event: 2022110700_3429

    Parameters
    ----------
    ngs_play : tracking rows of a single play; must have `event` and `frameId`
               columns.

    Returns
    -------
    None if the play contains one of the rejected events; otherwise the rows
    with frameId <= the earliest 'out_of_bounds' / 'touchdown' frame, or the
    input unchanged when neither event occurs.

    NOTE(review): this returns None for rejected plays, while the collection
    loop tests `len(ngs_full_play) == 0`; callers of this variant must check
    for None instead.
    '''
    events = set(ngs_play.event.unique())   # set of all events in the play
    tocheck = ['fumble', 'penalty_flag', 'safety', 'pass_outcome_touchdown'] # items to check
    events_in_play = [i for i in tocheck if i in events]
    if len(events_in_play) != 0:
        return None

    # filter edges
    tocheck = ['out_of_bounds', 'touchdown'] # items to check
    end_events_in_play = [i for i in tocheck if i in events]
    if len(end_events_in_play) != 0:
        # Look the end frame up in THIS play's rows, not in the global tracking
        # frame (which holds every play and would give an unrelated frame id).
        is_end_row = ngs_play.event.isin(end_events_in_play)
        end_frame_id = ngs_play.loc[is_end_row, 'frameId'].min()
        ngs_play = ngs_play.loc[ngs_play.frameId <= end_frame_id]

    return ngs_play


In [11]:
all_games = ngs_df.gameId.unique()

In [12]:
game_ex = all_games[0]
play_ex = 56

In [14]:
# Independent copy of all tracking rows of the example play (game_ex, play_ex).
full_play = ngs_df.loc[(ngs_df.gameId == game_ex) & (ngs_df.playId == play_ex)].copy()

In [21]:
full_play.iloc[0:20,:]

Unnamed: 0,gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
0,2022090800,56,35472.0,Rodger Saffold,1,2022-09-08 20:24:05.200000,76.0,BUF,left,88.37,27.27,1.62,1.15,0.16,231.74,147.9,
1,2022090800,56,35472.0,Rodger Saffold,2,2022-09-08 20:24:05.299999,76.0,BUF,left,88.47,27.13,1.67,0.61,0.17,230.98,148.53,pass_arrived
2,2022090800,56,35472.0,Rodger Saffold,3,2022-09-08 20:24:05.400000,76.0,BUF,left,88.56,27.01,1.57,0.49,0.15,230.98,147.05,
3,2022090800,56,35472.0,Rodger Saffold,4,2022-09-08 20:24:05.500000,76.0,BUF,left,88.64,26.9,1.44,0.89,0.14,232.38,145.42,
4,2022090800,56,35472.0,Rodger Saffold,5,2022-09-08 20:24:05.599999,76.0,BUF,left,88.72,26.8,1.29,1.24,0.13,233.36,141.95,
5,2022090800,56,35472.0,Rodger Saffold,6,2022-09-08 20:24:05.700000,76.0,BUF,left,88.8,26.7,1.15,1.42,0.12,234.48,139.41,pass_outcome_caught
6,2022090800,56,35472.0,Rodger Saffold,7,2022-09-08 20:24:05.799999,76.0,BUF,left,88.87,26.64,0.93,1.69,0.09,235.77,134.32,
7,2022090800,56,35472.0,Rodger Saffold,8,2022-09-08 20:24:05.900000,76.0,BUF,left,88.91,26.59,0.68,1.74,0.07,240.0,131.01,
8,2022090800,56,35472.0,Rodger Saffold,9,2022-09-08 20:24:06.000000,76.0,BUF,left,88.94,26.57,0.42,1.74,0.04,243.56,122.29,
9,2022090800,56,35472.0,Rodger Saffold,10,2022-09-08 20:24:06.099999,76.0,BUF,left,88.95,26.58,0.14,1.83,0.01,246.07,85.87,


In [None]:
# Draft of the collection loop: applies only the play-level filters
# (penalty yards, assist-only tackles) and selects each kept play's tracking
# rows; nothing is accumulated or saved here.
for game_index, game_id in enumerate(all_games):
    all_play_ids = plays.query("gameId == @game_id").playId.values
    for play_id in all_play_ids:
        try:
            current_id = f"{str(game_id)}_{str(play_id)}"

            play_info = get_play(plays, game_id, play_id)                   # store play info in series
            if not np.isnan(play_info.penaltyYards):                        # play had a penalty
                continue
            tackler_id = get_tackler_id(tackles, game_id, play_id)          # store tackler id
            if tackler_id is None:                                          # assist-only play
                continue

            ngs_full_play = ngs_df.loc[(ngs_df.gameId == game_id) & (ngs_df.playId == play_id)].copy()

        except Exception as e:
            print(f"Skipping play bc {e}")