[Kaggle Competition](https://www.kaggle.com/c/nfl-big-data-bowl-2021)

The intent is to compare passing statistics for the targeted receiver against the closest defender, to see whether differences in physical traits between the two matter when determining defensive success.

1. May need to only look at man coverage for this - build model to predict man vs. zone scheme based on labeled data so that entirety of season play data can be used?
2. Include speed/agility as a differentiator? Will need to determine max speed/accel of players from full season tracking data
3. May need to find and exclude plays thrown into double coverage (i.e. second defender close to ball)
4. Calculate stats like passer rating for combinations, heat map for height-weight difference

## Notes

1. For plays, remove typeDropback == UNKNOWN when applying man/zone model (either spike, or trick plays on special teams)

In [1]:
# make modules that live next to this notebook importable (used for the local `nflutil` import)
import os
import sys

module_path = os.path.abspath(os.curdir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.optimize as optim

import joblib

# local import 
import nflutil

In [3]:
# load the competition tables (paths are relative to the notebook's working directory)
play_df = pd.read_csv('csv/plays.csv')  # read later for possessionTeam and typeDropback
game_df = pd.read_csv('csv/games.csv')  # read later for homeTeamAbbr / visitorTeamAbbr
player_df = pd.read_csv('csv/players.csv')  # read later for player height
target_df = pd.read_csv('csv/targetedReceiver.csv')  # targetNflId per (gameId, playId)
coverage_df = pd.read_csv('csv/coverages_week1.csv')  # labelled coverages, passed to the zone model for week 1 only

In [4]:
# load the zone classifier
# NOTE(review): joblib.load unpickles the file, so only load a classifier file from a trusted source
clf_zone = joblib.load('zone_classifier.joblib')

In [5]:
# load one week of tracking data; `track_df` is used by the single-play test cell further down
week_num = 15

track_df = pd.read_csv(f'csv/week{week_num}.csv')

# Call out bad plays to remove from analysis

1. gameId = 2018102101, playId = 3078: Offense positions are clearly wrong at start and during play
2. gameId = 2018092301, playId = 477: SS Jefferson position jumps around and is intermittent during play
3. gameId = 2018092301, playId = 949: Same as play 477

In [6]:
# scratch check: turn each row into a tuple so rows can be tested for membership in a list of pairs
a = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
a[['a', 'b']].apply(tuple, axis=1).isin([(1, 4), (1, 5)])

0     True
1    False
2    False
dtype: bool

In [7]:
# (gameId, playId) pairs whose tracking data was called out above as unusable
bad_plays = [
    (2018102101, 3078),  # offense positions clearly wrong at the start of and during the play
    (2018092301, 477),   # SS Jefferson's position jumps around and is intermittent
    (2018092301, 949),   # same problem as play 477
]

# 1. Determine the closest defender to the targeted receiver on each play

In [8]:
def closest_defender(track_df, game_df, play_df):
    """Find the defender closest to the targeted receiver when the pass resolves.

    Parameters
    ----------
    track_df : pandas.DataFrame
        Tracking rows for a single play with the targeted receiver merged in.
        Columns read: gameId, playId, frameId, event, nflId, team, position,
        x, y, targetNflId.
    game_df : pandas.DataFrame
        Game table; ``homeTeamAbbr`` is read for the play's gameId.
    play_df : pandas.DataFrame
        Play table; ``possessionTeam`` is read for the play's (gameId, playId).

    Returns
    -------
    pandas.Series
        Index ``['nflId_def', 'dist_def']``: the nflId of the closest defender
        and the distance to the target (same units as x/y). Both entries are
        NaN when the play has no target, no pass-end event, no tracking row
        for the target at that frame, a QB target (throwaway), or no
        defender rows at that frame.

    Raises
    ------
    IndexError
        If the play's game or play is missing from ``game_df`` / ``play_df``.
    """
    PASS_END_EVENTS = ['pass_arrived',
                       'pass_outcome_interception',
                       'pass_outcome_incomplete',
                       'pass_outcome_caught']

    # default result: "no closest defender could be determined"
    cd_series = pd.Series([np.nan, np.nan], index=['nflId_def', 'dist_def'])

    # get the current play gameId and playId
    game_id = track_df.gameId.iloc[0]
    play_id = track_df.playId.iloc[0]

    # the defense is whichever side ('home'/'away') does not have possession
    home_abbr = game_df[game_df.gameId == game_id].iloc[0]['homeTeamAbbr']
    abbr_possess = play_df[(play_df.gameId == game_id) & (play_df.playId == play_id)].iloc[0]['possessionTeam']
    team_def = 'away' if abbr_possess == home_abbr else 'home'

    if track_df.targetNflId.isna().any():
        # no targeted receiver data for this play
        return cd_series

    # find the frameId of the earliest pass-end event:
    # group by event, take the median frame of each event (a vote across the
    # rows carrying that event), then the minimum of those medians
    end_frames = track_df.loc[track_df.event.isin(PASS_END_EVENTS), ['event', 'frameId']]
    if end_frames.empty:
        return cd_series
    frame_id = end_frames.groupby('event')['frameId'].median().min()
    if np.isnan(frame_id):
        return cd_series
    # a median between two frames (e.g. 35.5) matches no frameId and would
    # silently discard the play; fall back to the earlier of the two frames
    frame_id = int(np.floor(frame_id))

    # get the data of the applicable frame
    frame_df = track_df[track_df.frameId == frame_id]

    is_target = frame_df.nflId == frame_df.targetNflId
    if not is_target.any():
        # the targeted receiver has no tracking row at this frame
        return cd_series

    # QB throwaways list the QB as the target; there is no matchup to measure
    if frame_df.loc[is_target, 'position'].iloc[0] == 'QB':
        return cd_series

    x_tgt = frame_df.loc[is_target, 'x'].iloc[0]
    y_tgt = frame_df.loc[is_target, 'y'].iloc[0]

    is_defender = frame_df.team == team_def
    if not is_defender.any():
        # no defender data at this frame
        return cd_series

    nfl_id_def = frame_df.loc[is_defender, 'nflId'].to_numpy()
    x_def = frame_df.loc[is_defender, 'x'].to_numpy()
    y_def = frame_df.loc[is_defender, 'y'].to_numpy()

    # distance between every defender and the targeted receiver
    dist_def = np.sqrt((x_def - x_tgt)**2 + (y_def - y_tgt)**2)
    idx_min = np.argmin(dist_def)

    # return ID and distance for closest defender
    cd_series['nflId_def'] = nfl_id_def[idx_min]
    cd_series['dist_def'] = dist_def[idx_min]
    return cd_series
    

In [9]:
def depth_of_pass(track_df):
    """Return how far downfield the targeted receiver is when the pass resolves.

    The depth is the target's x-coordinate at the pass-end frame minus the
    football's x-coordinate at frameId 1 (used as the line of scrimmage),
    both taken after ``nflutil.transform_tracking_data`` has been applied
    (presumably this normalizes play direction; verify against nflutil).

    Parameters
    ----------
    track_df : pandas.DataFrame
        Raw tracking rows for a single play with the targeted receiver merged
        in. Columns read after the transform: frameId, event, nflId, team,
        position, x, targetNflId.

    Returns
    -------
    float
        Depth of the target relative to the line of scrimmage, or NaN when
        the play has no target, no pass-end event, no tracking row for the
        target at that frame, a QB target (throwaway), or no football row at
        frameId 1.
    """
    PASS_END_EVENTS = ['pass_arrived',
                       'pass_outcome_interception',
                       'pass_outcome_incomplete',
                       'pass_outcome_caught']

    # transform the directionality of the data
    play_track_df = nflutil.transform_tracking_data(track_df)

    if play_track_df.targetNflId.isna().any():
        # no targeted receiver data for this play
        return np.nan

    # find the frameId of the earliest pass-end event:
    # group by event, take the median frame of each event (a vote across the
    # rows carrying that event), then the minimum of those medians
    end_frames = play_track_df.loc[play_track_df.event.isin(PASS_END_EVENTS), ['event', 'frameId']]
    if end_frames.empty:
        return np.nan
    frame_id = end_frames.groupby('event')['frameId'].median().min()
    if np.isnan(frame_id):
        return np.nan
    # a median between two frames (e.g. 35.5) matches no frameId and would
    # silently discard the play; fall back to the earlier of the two frames
    frame_id = int(np.floor(frame_id))

    # get the data of the applicable frame
    frame_df = play_track_df[play_track_df.frameId == frame_id]

    is_target = frame_df.nflId == frame_df.targetNflId
    if not is_target.any():
        # the targeted receiver has no tracking row at this frame
        return np.nan

    # QB throwaways list the QB as the target; a pass depth is not meaningful
    if frame_df.loc[is_target, 'position'].iloc[0] == 'QB':
        return np.nan

    # x-coordinate of the line of scrimmage: the football at the first frame
    x_los = play_track_df.x[(play_track_df.team == 'football') & (play_track_df.frameId == 1)]
    if x_los.empty:
        # no football row at frame 1, so the line of scrimmage is unknown
        return np.nan

    # depth of the target relative to the line of scrimmage
    return frame_df.loc[is_target, 'x'].iloc[0] - x_los.iloc[0]
    

Test the function for a given play:

In [10]:
# pick one play and attach its targeted receiver so both functions can be spot-checked
game_id, play_id = 2018121300, 84
x = track_df[(track_df.gameId == game_id) & (track_df.playId == play_id)].merge(
    target_df, how='left', on=['gameId', 'playId'])
x.head()

Unnamed: 0,time,x,y,s,a,dis,o,dir,event,nflId,displayName,jerseyNumber,position,frameId,team,gameId,playId,playDirection,route,targetNflId
0,2018-12-14T01:23:55.400Z,79.82,24.76,4.95,2.71,0.5,159.18,152.45,,496723.0,Eric Berry,29.0,SS,1,home,2018121300,84,left,,2553913.0
1,2018-12-14T01:23:55.400Z,89.06,22.81,1.74,4.09,0.2,253.9,199.21,,2495288.0,Virgil Green,88.0,TE,1,away,2018121300,84,left,,2553913.0
2,2018-12-14T01:23:55.400Z,86.28,32.45,0.01,0.01,0.0,122.22,69.31,,2495493.0,Justin Houston,50.0,OLB,1,home,2018121300,84,left,,2553913.0
3,2018-12-14T01:23:55.400Z,91.54,27.3,0.11,0.09,0.0,275.04,341.42,,2506121.0,Philip Rivers,17.0,QB,1,away,2018121300,84,left,,2553913.0
4,2018-12-14T01:23:55.400Z,79.44,44.93,0.33,0.41,0.03,85.63,319.27,,2530794.0,Ron Parker,38.0,FS,1,home,2018121300,84,left,,2553913.0


In [11]:
# spot-check: closest defender (nflId and distance) for the single play selected above
closest_defender(x, game_df, play_df)

nflId_def    2.552265e+06
dist_def     5.889822e-01
dtype: float64

In [12]:
# spot-check: depth of the target relative to the line of scrimmage for the same play
depth_of_pass(x)

25.110000000000007

## Generate list of closest defenders and coverage type for all plays in the season

In [13]:
def create_play_features(play_track_df, game_df, play_df, t_defender_thresh=1.5, t_scheme_develop=4, t_reaction_time=0):
    """Build play-level coverage features from the tracking data of one play.

    THE INPUT TRACKING DATA MUST BE NORMALIZED FOR DIRECTION BEFORE INPUT
    INTO THIS FUNCTION.

    Parameters
    ----------
    play_track_df : pandas.DataFrame
        Player tracking data for an individual play. MUST ONLY BE FOR A
        SINGLE PLAY, CANNOT HANDLE MULTIPLE PLAYS. Columns read: gameId,
        playId, frameId, event, nflId, team, position, x, y, s, a, dir, o,
        dis. The input is copied; the caller's frame is not modified.
    game_df : pandas.DataFrame
        Game table; homeTeamAbbr / visitorTeamAbbr are read for the play's game.
    play_df : pandas.DataFrame
        Play table; possessionTeam is read for the play.
    t_defender_thresh : float
        Time in seconds after the snap at which players are classified as
        being in coverage.
    t_scheme_develop : float
        Time in seconds after the snap used as a max time threshold (i.e.
        before the play breaks down, after which movement is not always
        indicative of the coverage scheme).
    t_reaction_time : float
        Time in seconds after the ball is thrown to continue taking
        statistics (paths won't change until players realize the ball has
        been thrown).

    Returns
    -------
    pandas.Series
        Entries: depth_mean, speed_var, off_mean (means over the cover
        players), df_rec_space_mean (mean over the downfield offensive
        players), n_deep_frz, dist_shadow_out_play_mean, cb_depth_snap_min.

    Notes
    -----
    Seconds are converted to frames with a factor of 10. The ``.iloc[0]``
    lookups presumably raise IndexError when the play has no 'ball_snap'
    event, no football row at frameId 1, or is missing from game_df/play_df.
    """

    # local constants
    DEF_DEEP_THRESH = 10  # depth (x - x_los) at or beyond which a cover player counts as "deep"

    # work on a copy of the data rather than the actual data (for temporary features)
    play_track_df = play_track_df.copy().drop_duplicates()
    
    # get the current play gameId and playId
    game_id = play_track_df.gameId.iloc[0]
    play_id = play_track_df.playId.iloc[0]
    
    # determine who has the ball (team code)
    home_abbr = game_df[game_df.gameId==game_id].iloc[0]['homeTeamAbbr']
    # NOTE(review): away_abbr is never read below
    away_abbr = game_df[game_df.gameId==game_id].iloc[0]['visitorTeamAbbr']

    
    abbr_possess = play_df[(play_df.gameId==game_id) & (play_df.playId==play_id)].iloc[0]['possessionTeam']
    
    # map possession onto the 'home'/'away' labels used in the tracking data's team column
    if abbr_possess == home_abbr:
        team_poss = 'home'
        team_def = 'away'
    else:
        team_poss = 'away'
        team_def = 'home'
        
 
    # ------------- FEATURE GENERATION SETUP/INTERMEDIATE CALCULATIONS ----------------------
     
    # line of scrimmage: x-coordinate of the football at the first frame
    x_los = play_track_df.x[(play_track_df.team == 'football') & (play_track_df.frameId == 1)].iloc[0]
    
    # save the distance downfield of all observations relative to the line of scrimmage
    play_track_df['depth'] = play_track_df['x'] - x_los
    
    # get frameId for specific points in the play (exclude handoff: not a material pivot part of the play,
    # also sometimes occurs prior to the snap)
    pivot_events = ['pass_forward', 'qb_sack', 'fumble', 'qb_strip_sack', 'pass_shovel']
    
    frame_max = play_track_df.frameId.max()
    frame_snap = play_track_df[play_track_df.event=='ball_snap']['frameId'].iloc[0]
    if np.any(play_track_df.event.isin(pivot_events)):
        # find the frameId of the earliest of the pivot events
        # - account for errors on individual players: group by event, find the median frame for each event, then
        #   take the minimum of the medians to get the first event (essentially voting then min)
        # NOTE(review): a median can fall between two frames, making frame_pivot fractional; the
        # equality test against frame_cover_freeze below would then match no rows
        frame_pivot = (play_track_df.loc[play_track_df.event.isin(pivot_events), ['event', 'frameId']]
                       .groupby('event').median().min().iloc[0]
                       + int(round(10 * t_reaction_time)))
    else:
        # no pivot event in the play: use the last frame
        frame_pivot = frame_max
    
    # save important frameId's in the play:
    frame_start = frame_snap
    # frame at which coverage players are identified (never later than the pivot or the last frame)
    frame_cover_freeze = min(frame_max, frame_pivot, int(round(frame_snap + 10*t_defender_thresh)))
    frame_scheme_develop = int(frame_snap + round(10*t_scheme_develop))
    frame_end = min(frame_pivot, frame_scheme_develop, frame_max)
    
    
    # filter out data from frames outside of the range (frame_start <= F <= frame_end)
    play_track_df = play_track_df[(play_track_df.frameId >= frame_start) & (play_track_df.frameId <= frame_end)]
    
    
    # ----- SAVE SLICES OF DATAFRAME FOR DEFENDERS AND COVERAGE AND ELIGIBLE RECEIVERS ----
    
    ### get defensive player tracks that are in coverage (i.e. not blitzing/rushing the passer)
    # a defender counts as "in coverage" when its depth is positive at the freeze frame
    cover_players = play_track_df.nflId[(play_track_df.frameId == frame_cover_freeze) &
                          (play_track_df.team == team_def) & 
                          (play_track_df.depth > 0)]
    # wide table: one row per frame, one column per (value, nflId)
    def_track = play_track_df[play_track_df.nflId.isin(cover_players)].pivot(
        index='frameId', columns='nflId', values=['x', 'depth', 'y', 's', 'a', 'dir', 'o'])
    
    ### get offensive player tracks of eligible receivers (minus QB)
    # all players
    off_track = play_track_df[(play_track_df.team == team_poss) & (play_track_df.position != 'QB')].pivot(
        index='frameId', columns='nflId', values=['x', 'depth', 'y', 's', 'a', 'dir', 'o'])
    
    # players downfield (depth > 0)
    downfield_players = play_track_df.nflId[(play_track_df.frameId == frame_cover_freeze) &
                          (play_track_df.team == team_poss) & 
                          (play_track_df.depth > 0)]
    # edge case where there are no downfield receivers at the time of throw: quick screen,
    # goal-line, etc. --> classify all offensive players as "downfield"
    if len(downfield_players) == 0:
        downfield_players = play_track_df.nflId[(play_track_df.frameId == frame_cover_freeze) &
                          (play_track_df.team == team_poss) &
                          (play_track_df.position != 'QB')]
    
    downfield_track = play_track_df[play_track_df.nflId.isin(downfield_players)].pivot(
        index='frameId', columns='nflId', values=['x', 'depth', 'y', 's', 'a', 'dir', 'o'])
    
    # ----- OUTERMOST OFF-DEF MATCHUP FEATURES --------------------------------------------------------
    
    # get attributes for the outermost receivers and defenders at the snap to identify 
    # coverage as inside or outside technique
    
    # identify outermost defenders at snap (row labels of the smallest / largest y)
    min_def_y_idx = play_track_df.loc[(play_track_df.frameId == frame_snap) &
                              (play_track_df.team == team_def), 'y'].idxmin()
    max_def_y_idx = play_track_df.loc[(play_track_df.frameId == frame_snap) &
                              (play_track_df.team == team_def), 'y'].idxmax()
    # the code labels the smallest y as "right" and the largest y as "left"
    right_def_nfl_id = play_track_df.nflId.loc[min_def_y_idx]
    left_def_nfl_id = play_track_df.nflId.loc[max_def_y_idx]
    
    # get outermost defender at snap X- and Y-coordinates during the play: (n_frame, 2) arrays, [left, right]
    # NOTE(review): outer_def_x is never read below
    outer_def_x = np.hstack([play_track_df.x[play_track_df.nflId == left_def_nfl_id].to_numpy().reshape(-1,1),
                            play_track_df.x[play_track_df.nflId == right_def_nfl_id].to_numpy().reshape(-1,1)])
    outer_def_y = np.hstack([play_track_df.y[play_track_df.nflId == left_def_nfl_id].to_numpy().reshape(-1,1),
                            play_track_df.y[play_track_df.nflId == right_def_nfl_id].to_numpy().reshape(-1,1)])
    
    # identify outermost receivers at snap (row labels of the smallest / largest y)
    min_off_y_idx = play_track_df.loc[(play_track_df.frameId == frame_snap) &
                              (play_track_df.team == team_poss), 'y'].idxmin()
    max_off_y_idx = play_track_df.loc[(play_track_df.frameId == frame_snap) &
                              (play_track_df.team == team_poss), 'y'].idxmax()
    right_off_nfl_id = play_track_df.nflId.loc[min_off_y_idx]
    left_off_nfl_id = play_track_df.nflId.loc[max_off_y_idx]
    
    # get outermost receiver at snap X- and Y-coordinates during the play: (n_frame, 2) arrays, [left, right]
    # NOTE(review): outer_off_x and outer_off_dis are never read below
    outer_off_x = np.hstack([play_track_df.x[play_track_df.nflId == left_off_nfl_id].to_numpy().reshape(-1,1),
                            play_track_df.x[play_track_df.nflId == right_off_nfl_id].to_numpy().reshape(-1,1)])
    outer_off_y = np.hstack([play_track_df.y[play_track_df.nflId == left_off_nfl_id].to_numpy().reshape(-1,1),
                            play_track_df.y[play_track_df.nflId == right_off_nfl_id].to_numpy().reshape(-1,1)])
    outer_off_dis = np.hstack([play_track_df.dis[play_track_df.nflId == left_off_nfl_id].to_numpy().reshape(-1,1),
                            play_track_df.dis[play_track_df.nflId == right_off_nfl_id].to_numpy().reshape(-1,1)])
    
    # -- Determine if the defender shadows outside or inside of the outer receiver:
    # -- inner likely man, outer likely zone. look at snap and aggregate until throw
    
    # distance from the middle of the field 
    dist_def_mid = np.abs(53.3/2 - outer_def_y)
    dist_off_mid = np.abs(53.3/2 - outer_off_y)
    # def - off: positive if defender is outside, negative if defender is inside
    dist_shadow_out = dist_def_mid - dist_off_mid

    # aggregate over play (mean offset inside or outside): one value per side
    dist_shadow_out_play = np.nanmean(dist_shadow_out, axis=0)
    
    # ------ PLAY CHARACTERISTICS AT SPECIFIC FRAMES/POINTS IN TIME -----------------------
    
    # find characteristics of scheme at the snap (line of scrimmage naturally divides offense + defense)
    # every player whose position is 'CB' within the retained frames (not filtered by team)
    cb_id = play_track_df[play_track_df.position=='CB']['nflId'].unique()
    n_cb = len(cb_id)
    
    if n_cb > 0:
        # depth at snap
        cb_depth_at_snap = play_track_df.loc[(play_track_df.frameId == frame_start) 
                                             & (play_track_df.nflId.isin(cb_id)), 'depth']
        
    
    # find characteristics of players in coverage at the "cover freeze time":
    # number of cover players at or beyond DEF_DEEP_THRESH at the freeze frame
    n_deep_freeze = np.sum((play_track_df.nflId.isin(cover_players)) & 
                           (play_track_df.depth >= DEF_DEEP_THRESH) &
                           (play_track_df.frameId == frame_cover_freeze)) 
    
    
    # ------GENERATE FEATURES FOR EACH COVERAGE PLAYER AT EACH FRAME ---------------------
    
    feature_data = {'depth_mean': [],
                    'speed_var': [],
                    'off_mean': [],
                   }
    
    # data that is not dependent on the specific player
    x_off = off_track['x'].to_numpy()  # (n_frame, n_off) array
    y_off = off_track['y'].to_numpy()  # (n_frame, n_off) array
    x_def_full = def_track['x'].to_numpy()  # (n_frame, n_def) array
    y_def_full = def_track['y'].to_numpy()  # (n_frame, n_def) array
    
    
    ### --- loop over each cover player (defense) ---------------------------------------
    for i, player in enumerate(cover_players):
        x_player = def_track['x'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array
        depth_player = def_track['depth'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array
        y_player = def_track['y'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array
        s_player = def_track['s'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array

        # calculate distance to each offensive player at each frame (column vector broadcasts across players)
        dist_off = np.sqrt((x_player - x_off)**2 + (y_player - y_off)**2)  # (n_frame, n_off) array
        # distance to the nearest offensive player at each frame
        dist_off_min = np.nanmin(dist_off, axis=1) # (n_frame,) array
        if np.any(np.isnan(dist_off_min)):
            print(f'WARNING: All-nan row found in dist_off_min for gameId = {game_id}, playId = {play_id}')
        
        # per-player aggregates over the retained frames
        feature_data['depth_mean'].append(np.nanmean(depth_player))
        feature_data['speed_var'].append(np.nanvar(s_player))
        feature_data['off_mean'].append(np.nanmean(dist_off_min))
        
    # put results into a dataframe (one row per cover player)
    def_df = pd.DataFrame(feature_data, index=cover_players)
    
    
    ### -- loop over each downfield offensive player  (offense) ----------------------
    downfield_data = {
        'df_rec_space_mean': [],
    }
    
    for i, player in enumerate(downfield_players):
        # extract location of player and put as column vector
        x_player = downfield_track['x'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array
        y_player = downfield_track['y'][player].to_numpy().reshape(-1, 1)  # (n_frame,1) array
        
        # calculate distance to each cover player at each frame
        dist_to_defender = np.sqrt((x_player - x_def_full)**2 + (y_player - y_def_full)**2)  # (n_frame, n_def) array
        # get distance to closest defender
        dist_to_defender_min = np.nanmin(dist_to_defender, axis=1) # (n_frame,) array
        if np.any(np.isnan(dist_to_defender_min)):
            print(f'WARNING: All-nan row found in dist_to_defender_min for gameId = {game_id}, playId = {play_id}')
        
        # save average of distance to closest defender
        downfield_data['df_rec_space_mean'].append(np.nanmean(dist_to_defender_min))
    
    # put results into a dataframe (one row per downfield offensive player)
    downfield_df = pd.DataFrame(downfield_data, index=downfield_players)
    
    
    # ----------- COLLECT ALL FEATURES INTO OUTPUT SERIES -------------------------
    
    # return averages of features generated from applicable players, for play-level feature
    out_data = pd.concat([def_df.mean(), downfield_df.mean()])
    
    # add in number of deep defenders at the "freeze frame"
    out_data['n_deep_frz'] = n_deep_freeze
    
    # add in inside-outside technique feature (mean of the left and right side values)
    out_data['dist_shadow_out_play_mean'] = np.mean(dist_shadow_out_play)
    
    # add in CB-specific feature (minimum depth at snap over all cornerbacks; 0 when the play has none)
    if n_cb > 0:
        out_data['cb_depth_snap_min'] = np.nanmin(cb_depth_at_snap)
    else:
        out_data['cb_depth_snap_min'] = 0
        
    return out_data

In [14]:
def create_zone_predict_dataframe(clf_zone, track_df, game_df, play_df,
                                  t_scheme_develop=4, t_reaction_time=0,
                                  coverage_df=None, bad_plays=None):
    """Build per-play features and label every usable play as man or zone.

    Parameters
    ----------
    clf_zone : fitted classifier
        Object with a ``predict`` method; it is called on the feature columns
        produced by ``create_play_features``.
    track_df : pandas.DataFrame
        Raw tracking data for any number of plays (not yet direction-normalized).
    game_df, play_df : pandas.DataFrame
        Game and play tables, forwarded to ``create_play_features``.
    t_scheme_develop, t_reaction_time : float
        Forwarded to ``create_play_features``.
    coverage_df : pandas.DataFrame, optional
        Known coverages with columns gameId, playId, coverage. A coverage
        containing 'Zone' is recorded as 1 and one containing 'Man' as 0
        ('Man' wins if both appear); these overwrite the predictions.
        Labelled plays that were filtered out of ``track_df`` are ignored.
    bad_plays : iterable of (gameId, playId), optional
        Plays to drop before feature generation.

    Returns
    -------
    pandas.DataFrame
        One row per play: gameId, playId, the feature columns and ``zone``.
        When no play survives filtering, an empty frame with columns
        gameId, playId, zone is returned.

    Raises
    ------
    ValueError
        If a ``coverage`` value contains neither 'Man' nor 'Zone'.
    """
    play_keys = ['gameId', 'playId']

    # remove bad plays (if provided); a MultiIndex membership test avoids
    # building a Python tuple for every tracking row
    if bad_plays is not None and len(bad_plays) > 0:
        bad_keys = [tuple(play) for play in bad_plays]
        is_bad = pd.MultiIndex.from_frame(track_df[play_keys]).isin(bad_keys)
        track_df = track_df[~is_bad]

    def _is_usable(df):
        # keep plays that have all of 'home', 'away' and 'football' (the
        # defense is sometimes missing in the dataset) and are not spikes
        return len(df.team.unique()) == 3 and not (df.event == 'qb_spike').any()

    track_df = track_df.groupby(play_keys).filter(_is_usable)

    # Transform the raw tracking data so that all offensive plays face the same direction,
    # group the tracking data for each play together
    test_df_group = nflutil.transform_tracking_data(track_df).groupby(play_keys)

    # ------ Create the features for each play ---------------------
    col_names = []
    values = []

    for (loop_game_id, loop_play_id), loop_track_df in test_df_group:

        # report which play failed before re-raising, for easier debugging
        try:
            features = create_play_features(loop_track_df,
                                            game_df,
                                            play_df,
                                            t_scheme_develop=t_scheme_develop,
                                            t_reaction_time=t_reaction_time)
        except Exception:
            print(f'error in gameId {loop_game_id}, playId {loop_play_id}')
            raise

        # first play: record the output column names (gameId, playId, all feature names)
        if not col_names:
            col_names.extend(play_keys)
            col_names.extend(features.index.tolist())

        values.append([loop_game_id, loop_play_id, *features.values.tolist()])

    if not values:
        # nothing left to classify
        return pd.DataFrame(columns=play_keys + ['zone'])

    # convert the features into a dataframe (1 row per play)
    feature_df = pd.DataFrame(values, columns=col_names)

    # make predictions
    labeled_play_df = feature_df.copy()
    labeled_play_df['zone'] = clf_zone.predict(feature_df.drop(columns=play_keys))

    # if the actual labels are known, set the plays to those values
    if coverage_df is not None:
        # classify the actual coverage as man or zone (missing labels count as neither)
        is_zone = coverage_df.coverage.str.contains('Zone', na=False)
        is_man = coverage_df.coverage.str.contains('Man', na=False)

        if not (is_zone | is_man).all():
            raise ValueError('coverage_df contains a value in the "coverage" field that does not contain "Man" or "Zone"')

        known_zone = pd.Series(np.where(is_man, 0.0, 1.0),
                               index=pd.MultiIndex.from_frame(coverage_df[play_keys]))
        known_zone = known_zone[~known_zone.index.duplicated(keep='last')]

        # overwrite only plays that exist in both tables: a labelled play may
        # have been filtered out above, and looking it up would raise KeyError
        labeled_play_df.set_index(play_keys, inplace=True)
        common_idx = labeled_play_df.index.intersection(known_zone.index)
        if len(common_idx) > 0:
            labeled_play_df.loc[common_idx, 'zone'] = known_zone.reindex(common_idx).to_numpy()
        # set index back to original
        labeled_play_df.reset_index(inplace=True)

    return labeled_play_df

## loop through weeks

In [None]:
processed_weeks = []  # list of DataFrames containing processed data
zone_predict = []  # list of DataFrames containing zone prediction data

max_week = 17

for week in range(1, max_week + 1):  # weeks 1-17
    print(f'Analyzing tracking data for week {week}...')
    # load tracking data
    temp_track_df = pd.read_csv(f'csv/week{week}.csv')
    # add the targeted receiver to the tracking data dataframe
    temp_track_df = pd.merge(temp_track_df, target_df, how='left', on=['gameId', 'playId'])
    # calculate closest defender (one row per play: nflId_def, dist_def)
    closest_def = temp_track_df.groupby(['gameId','playId']).apply(closest_defender, game_df, play_df).reset_index()
    # calculate depth of pass (the scalar result lands in a column named 0)
    dop_df = temp_track_df.groupby(['gameId','playId']).apply(depth_of_pass).reset_index()
    dop_df.rename(columns={0: 'pass_depth'}, inplace=True)
    # combine into single dataframe
    processed_weeks.append(pd.merge(closest_def, dop_df, on=['gameId','playId']))

    # predict man/zone coverage; labelled coverages are only available for week 1
    zone_df = create_zone_predict_dataframe(clf_zone, temp_track_df, game_df, play_df,
                                            coverage_df=coverage_df if week == 1 else None,
                                            bad_plays=bad_plays)
    zone_predict.append(zone_df)


print('Loop complete.')
# concatenate into a full matrix
closest_def_df = pd.concat(processed_weeks, ignore_index=True)
# remove "special" plays (typeDropback == UNKNOWN). Membership is tested with
# isin rather than .loc[...] because .loc raises KeyError when a play listed in
# play_df has no processed row; rows keep their week order.
valid_plays_mi = pd.MultiIndex.from_frame(play_df.loc[play_df.typeDropback!='UNKNOWN', ['gameId','playId']])
closest_def_df.set_index(['gameId','playId'], inplace=True)
closest_def_df = closest_def_df[closest_def_df.index.isin(valid_plays_mi)]
closest_def_df.reset_index(inplace=True)

# concatenate zone dataframes
zone_def_df = pd.concat(zone_predict, ignore_index=True)

Analyzing tracking data for week 1...
Analyzing tracking data for week 2...
Analyzing tracking data for week 3...
Analyzing tracking data for week 4...
Analyzing tracking data for week 5...
Analyzing tracking data for week 6...
Analyzing tracking data for week 7...
Analyzing tracking data for week 8...
Analyzing tracking data for week 9...
Analyzing tracking data for week 10...
Analyzing tracking data for week 11...
Analyzing tracking data for week 12...
Analyzing tracking data for week 13...
Analyzing tracking data for week 14...


In [None]:
# create base dataframe for pass play characteristics
# base table of pass-play characteristics: closest-defender rows with the
# zone features/labels attached where available
pass_df = closest_def_df.merge(zone_def_df, how='left', on=['gameId', 'playId'])
pass_df.head()

In [None]:
# summary statistics of the closest-defender distance, with extra upper-tail percentiles
print('Distribution of dist_def:')
pass_df['dist_def'].describe(percentiles=[.25, .5, .75, .9, .95, .97, .98, .99, .995])

In [None]:
# same summary restricted to plays whose mean cover-player depth (depth_mean) exceeds 15
print('Distribution of dist_def when depth_mean > 15:')
pass_df.loc[pass_df.depth_mean > 15, 'dist_def'].describe(percentiles=[.25, .5, .75, .9, .95, .97])

In over 99.5% of cases the closest defender is within 15 yards of the targeted receiver. This makes 15 yards a reasonable cutoff for downstream analysis, so that prevent or soft zone coverage (designed to give up yards but not points) does not count against an individual defender's performance. Even among plays where the mean depth of the coverage defenders during the play is over 15 yards, the closest defender is within 15 yards about 90% of the time, which signifies soft coverage rather than prevent. Removing all cases where the closest defender is more than 15 yards away therefore filters out the vast majority of prevent defense plays, where there is no real attempt to force an incompletion.

In [None]:
# covered = 1 when the closest defender is within the cutoff distance, else 0
# (0 marks plays where the closest defender is not really contesting the pass;
#  a missing dist_def also compares False and yields 0)
dist_def_cutoff = 15
pass_df['covered'] = pass_df['dist_def'].lt(dist_def_cutoff).astype(int)

In [None]:
# inspect plays ordered by closest-defender distance; dropna() hides rows with any missing value
pass_df.sort_values('dist_def', ascending=True).dropna()

# 2. Get the height of each player (target and defender)

### Convert player height into consistent format (inches)

In [None]:
def ftin_to_in(ftin):
    """Convert a feet-inches height string (e.g. '6-2') to total inches (74)."""
    feet, inches = map(int, ftin.split('-'))
    return feet * 12 + inches

In [None]:
# Heights come in two string formats: plain inches or feet-inches ("6-2").
# Convert the feet-inches entries first, then make the whole column integer.
ind_ftin = player_df['height'].str.contains('-', regex=False)
player_df.loc[ind_ftin, 'height'] = player_df.loc[ind_ftin, 'height'].apply(ftin_to_in)
# Replace the column outright: `df.loc[:, col] = ...` writes into the existing
# (object-dtype) column in newer pandas, which would leave the dtype unchanged.
player_df['height'] = player_df['height'].astype('int')

In [None]:
player_df.height.value_counts().sort_index(ascending=False)

### Add the height of target and defender to dataframe

Add targeted player:

In [None]:
# attach the targeted receiver's id to each pass play
targets = target_df.rename(columns={'targetNflId': 'nflId_target'})
pass_df = pass_df.merge(targets, on=['gameId', 'playId'], how='left')
pass_df.head()

Add defender height:

In [None]:
# look up the defender's height by nflId
def_heights = player_df[['nflId', 'height']].rename(
    columns={'nflId': 'nflId_def', 'height': 'def_height'}
)
pass_df = pass_df.merge(def_heights, on='nflId_def', how='left')
pass_df.head()

Add targeted player height:

In [None]:
# look up the targeted receiver's height by nflId
tgt_heights = player_df[['nflId', 'height']].rename(
    columns={'nflId': 'nflId_target', 'height': 'tgt_height'}
)
pass_df = pass_df.merge(tgt_heights, on='nflId_target', how='left')
pass_df.head()

## 2a. Calculate the height difference between each player 

Calculate the height difference between receiver and defender:

In [None]:
# target height advantage: receiver height minus defender height
# (positive means the receiver is taller)
pass_df['tgt_height_adv'] = pass_df['tgt_height'].sub(pass_df['def_height'])

In [None]:
pass_df.tgt_height_adv.describe()

In [None]:
plt.figure(figsize=(10,4))
sns.countplot(data=pass_df, x='tgt_height_adv')
print(pass_df.tgt_height_adv.value_counts().sort_index(ascending=True))

Grouping together the values <= -7 and >= 8 will group the extremes to roughly 200 pass plays.

In [None]:
# collapse the tails: anything at or beyond a bound is set to that bound
min_bin_bound = -7
max_bin_bound = 8
pass_df['tgt_height_adv_bin'] = pass_df['tgt_height_adv'].clip(
    lower=min_bin_bound, upper=max_bin_bound
)

# 3. Calculate passing performance metrics

In [None]:
# bring in the play-level columns (result, EPA, description, ...) for each pass
pass_df = pass_df.merge(play_df, how='left', on=['gameId', 'playId'])

### EPA

In [None]:
plt.figure(figsize=(12,6))
sns.boxplot(data=pass_df, x='tgt_height_adv_bin', y='epa')

In [None]:
# mean EPA per height-advantage bin, drawn without error bars
plt.figure(figsize=(11, 6))
epa_ax = sns.barplot(data=pass_df, x='tgt_height_adv_bin', y='epa', ci=None, color='b')
epa_ax.set_title('Mean EPA of All Pass Attempts vs. Height Difference');
epa_ax.set_ylabel('EPA')
epa_ax.set_xlabel('Target Height Advantage (in)')

### Completion Percentage

In [None]:
# yards gained on completions only (NaN for anything that is not a completion),
# so that a NaN-skipping mean yields yards per completion
is_complete = pass_df['passResult'] == 'C'
pass_df['yds_cmp'] = pass_df['offensePlayResult'].where(is_complete).astype(float)

# per-bin attempt count, completion rate and yardage averages,
# leaving out plays whose description contains 'No Play'
live_plays = pass_df[~pass_df.playDescription.str.contains('No Play')]
cmp_agg = live_plays.groupby('tgt_height_adv_bin').agg(
    count=('playId', 'count'),
    cmp_pct=('passResult', lambda res: (res == 'C').sum() / len(res)),
    yd_per_att=('offensePlayResult', 'mean'),
    yd_per_cmp=('yds_cmp', np.nanmean),
).reset_index()
cmp_agg

In [None]:
sns.scatterplot(data=cmp_agg, x='tgt_height_adv_bin', y='cmp_pct');

In [None]:
sns.scatterplot(data=cmp_agg, x='tgt_height_adv_bin', y='yd_per_att');

In [None]:
sns.scatterplot(data=cmp_agg, x='tgt_height_adv_bin', y='yd_per_cmp');

This is an interesting trend. The ends need to be grouped together due to low value counts, but this goes against the typical wisdom about height advantage. Need to look at the average depth of pass to see whether the large mismatches come from short RBs/speedsters catching in open space rather than truly beating coverage.

### Depth of pass

In [None]:
# per-bin attempt count plus mean/median pass depth (NaN depths ignored),
# leaving out plays whose description contains 'No Play'
counted_plays = pass_df[~pass_df.playDescription.str.contains('No Play')]
dop_agg = counted_plays.groupby('tgt_height_adv_bin').agg(
    count=('playId', 'count'),
    pass_depth_mean=('pass_depth', np.nanmean),
    pass_depth_median=('pass_depth', np.nanmedian),
).reset_index()
dop_agg

In [None]:
plt.figure(figsize=(10,6))
sns.boxplot(data=pass_df, x='tgt_height_adv_bin', y='pass_depth');

In [None]:
sns.scatterplot(data=dop_agg, x='tgt_height_adv_bin', y='pass_depth_median');

## NEXT UP: Look at completion pctg. vs. depth of pass to determine if there is a height advantage that performs better than expected

# 4. Completion percentage vs. depth of pass

In [None]:
pass_df.columns

In [None]:
# pair each play's pass depth with its pass result (inner join on the shared
# gameId/playId keys), then drop rows missing either value
depth_cols = closest_def_df[['gameId', 'playId', 'pass_depth']]
result_cols = play_df[['gameId', 'playId', 'passResult']]
comp_depth_df = depth_cols.merge(result_cols, on=['gameId', 'playId'], how='inner')
comp_depth_df = comp_depth_df.dropna()
comp_depth_df.info()

In [None]:
comp_depth_df.passResult.value_counts()

In [None]:
# 1 for a completed pass ('C'), 0 for every other result
comp_depth_df['comp'] = comp_depth_df['passResult'].eq('C').astype(int)

Look at the distribution of pass depth (the bin width below can be changed to compare different binnings):

In [None]:
# Build the pass-depth bins used for the completion-percentage fit, plus one
# representative depth per bin, and plot the depth histogram on those bins.
min_depth = -6  # all below this value will be binned together
max_depth = 36 # all above this value will be binned together
bin_width = 3  # yards

# bin edges: evenly spaced interior edges from min_depth to max_depth, with -100 / 100
# as outer edges for the two catch-all end bins
depth_bins = np.concatenate([np.array([-100]), np.arange(min_depth, max_depth + 0.01, bin_width), np.array([100])])
# define the depth to use for regression (middle of each interior bin; the two end bins use the mean observed depth instead)
depth_points_mid = (depth_bins[1:-2] + depth_bins[2:-1]) / 2
# first and last points are temporary
depth_points = np.concatenate([np.array([-100]), depth_points_mid, np.array([100])])
# set first and last points to the mean value for the extreme bins
depth_points[0] = comp_depth_df.loc[comp_depth_df.pass_depth <= min_depth, 'pass_depth'].mean()
depth_points[-1] = comp_depth_df.loc[comp_depth_df.pass_depth > max_depth, 'pass_depth'].mean()

# histogram of pass depth on the bins above, with a horizontal reference line at 100
sns.displot(data=comp_depth_df, x='pass_depth', bins=depth_bins, kind='hist')
plt.axhline(100, color='k')

In [None]:
comp_depth_df['pass_depth_bin'] = pd.cut(comp_depth_df['pass_depth'], depth_bins)

### calculate the completion percentage in each bin

In [None]:
# Completion rate per pass-depth bin.
# observed=False is passed explicitly so that every bin of the categorical gets
# a row (an empty bin yields NaN); the depth_points assignment below needs
# exactly one row per bin, in bin order.
comp_depth_agg = comp_depth_df.groupby('pass_depth_bin', observed=False).agg(
    comp_pct=pd.NamedAgg(column='comp', aggfunc='mean')
)
comp_depth_agg['depth_point'] = depth_points
comp_depth_agg

In [None]:
# completion rate of each depth bin against the bin's representative depth
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=comp_depth_agg, x='depth_point', y='comp_pct', ax=ax)
ax.set_xlabel('Pass Attempt Depth (yds)')
ax.set_ylabel('Completion Percentage')
ax.set_title(f'2018 Aggregate Completion Percentage by Pass Attempt Depth, Relative to LOS ({bin_width}-yd Bins)');

Modeling the completion percentage vs. pass attempt depth as a logistic curve (with asymptotes at the extremes) makes sense: passes behind the line of scrimmage are easy, difficulty increases as depth increases, and the curve then levels off at a "natural" completion rate for passes over 35 yards (the marginal difficulty of increasingly long throws is near zero).

In [None]:
def logistic_fcn(depth, a, b, k, q, v):
    """Generalized logistic curve evaluated at `depth`.

    Computes a + (k - a) / (1 + q * exp(-b * depth)) ** (1 / v).
    For b > 0 the value tends to `a` as depth becomes very negative and to
    `k` as depth becomes very large; `q` and `v` shape the transition.
    `depth` may be a scalar or a numpy array.
    """
    denom_base = 1 + q * np.exp(-b * depth)
    span = k - a
    return a + span / denom_base ** (1 / v)

In [None]:
# starting guess and box constraints for the curve parameters (a, b, k, q, v)
p0 = np.array([.85, .05, .25, .25, 1])
lower_bounds = [0, 0, 0, 0, .00000000001]
upper_bounds = [1, 3, 1, 1000, 10]
log_bounds = (lower_bounds, upper_bounds)

# fit the curve to the per-bin completion rates
x = comp_depth_agg['depth_point'].to_numpy()
y = comp_depth_agg['comp_pct'].to_numpy()
cmp_model_params, _ = optim.curve_fit(logistic_fcn, x, y, p0=p0, bounds=log_bounds)
cmp_model_params

In [None]:
def cmp_pct_model(depth):
    """Expected completion rate at `depth`, using the fitted curve parameters."""
    return logistic_fcn(depth, *cmp_model_params)

In [None]:
# evaluate the fitted model on a dense grid of depths
x_curve = np.linspace(-20, 60, 500)
cmp_curve = cmp_pct_model(x_curve)

# binned data as points, fitted curve as a black line
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x, y, marker='o', linestyle='')
ax.plot(x_curve, cmp_curve, linestyle='-', marker=None, color='k')
ax.set_xlabel('Pass Attempt Depth (yds)')
ax.set_ylabel('Completion Percentage')
ax.set_title(f'2018 Aggregate Completion Percentage by Pass Attempt Depth, Relative to LOS ({bin_width}-yd Bins)');

Look at the residuals:

In [None]:
# residuals of the fit at the binned depths (model prediction minus observed rate)
cmp_predict = cmp_pct_model(x)
resid = cmp_predict - y
plt.figure()
# completion rates are fractions in [0, 1]; scale by 100 so the plotted values
# really are percentage points, as the axis label states
plt.plot(x, 100 * resid, marker='o', ls='')
plt.axhline(0, ls='--', c='gray')
plt.xlabel('Pass Attempt Depth (yds)')
plt.ylabel('Residual (Percentage Points)')
plt.title('Completion Percentage Logistic Regression Residuals');

The fit is within a few percentage points below 25 yards, which is good enough for a comparative analysis of individual players.

# 5. Compare Cmp Pct Above Average (CPOA) between players

Calculate the expected completion percentage for each play of the season:

In [None]:
# columns needed for the per-defender comparison; rows missing any of them are dropped
cpoa_cols = ['gameId', 'playId', 'nflId_def', 'pass_depth', 'zone', 'covered', 'passResult', 'epa']
cpoa_df = pass_df[cpoa_cols].dropna().copy()
cpoa_df

In [None]:
# expected completion rate for each play, from the depth-only model
cpoa_df['cp_expect'] = cmp_pct_model(cpoa_df['pass_depth'])

# keep only plays flagged as covered (drops the very soft zone plays)
cpoa_df = cpoa_df.loc[cpoa_df['covered'].eq(1)]

In [None]:
# how many defenders survive each candidate minimum-play threshold
plays_per_defender = cpoa_df.groupby('nflId_def')['gameId'].count()
n_plays = plays_per_defender.to_numpy()
play_range = np.arange(5, 100, 5)
n_players = [np.sum(n_plays >= min_plays) for min_plays in play_range]

plt.figure()
plt.scatter(play_range, n_players, ls='--', marker='o')
plt.xlabel('Minimum Plays')
plt.ylabel('Players')
plt.title('Number of Players Remaining after Minimum Play Count');

Using 30 as a cutoff is reasonable and leaves over 200 players to analyze. This also leaves room for high-level players who are not targeted often by design (~2 targets/game).

In [None]:
n_cutoff = 30

# keep only defenders with at least n_cutoff plays
cpoa_df = cpoa_df.groupby('nflId_def').filter(lambda grp: len(grp) >= n_cutoff)

# per-defender play counts (total / zone / man), mean EPA, and actual vs.
# expected completion rate
cpoa_agg = cpoa_df.groupby('nflId_def').agg(
    plays_total=('passResult', len),
    plays_zone=('zone', lambda cov: (cov == 1).sum()),
    plays_man=('zone', lambda cov: (cov == 0).sum()),
    epa_avg_tot=('epa', 'mean'),
    cp=('passResult', lambda res: (res == 'C').mean()),
    cp_expect=('cp_expect', 'mean'),
).reset_index()

# play counts as integers
for count_col in ('plays_zone', 'plays_man'):
    cpoa_agg[count_col] = cpoa_agg[count_col].astype(int)

# CPOA is computed as expected minus actual completion rate, so a positive
# value means fewer completions than the depth model expects
cpoa_agg['cpoa'] = cpoa_agg['cp_expect'] - cpoa_agg['cp']

# attach player name and position
player_info = player_df[['nflId', 'displayName', 'position']]
cpoa_agg = cpoa_agg.merge(
    player_info, left_on='nflId_def', right_on='nflId', how='left'
).drop(columns='nflId')
cpoa_agg

In [None]:
# Average EPA per defender, split by coverage type (zone == 0 -> man,
# zone == 1 -> zone, matching the plays_man / plays_zone counts).
# Columns are named by mapping the coverage labels explicitly rather than by
# position, so the result does not depend on column order and still has both
# columns (NaN-filled) if one coverage type is absent from the data.
epa_by_coverage = cpoa_df.groupby(['nflId_def', 'zone'])['epa'].mean().unstack('zone')
epa_agg = (
    epa_by_coverage
    .rename(columns={0: 'epa_avg_man', 1: 'epa_avg_zone'})
    .reindex(columns=['epa_avg_man', 'epa_avg_zone'])
    .rename_axis(columns=None)
    .reset_index()
)
epa_agg

In [None]:
# add EPA breakdown into cpoa dataframe
cpoa_agg = pd.merge(cpoa_agg, epa_agg, on='nflId_def')
cpoa_agg

In [None]:
# top 15 players listed as CB or DB, ranked by CPOA (highest first)
is_cb_or_db = cpoa_agg['position'].isin(['CB', 'DB'])
cpoa_agg[is_cb_or_db].sort_values('cpoa', ascending=False).head(15)

In [None]:
cpoa_agg[['plays_zone','plays_man']].describe()

In [None]:
cpoa_agg.sort_values('plays_man', ascending=False)

# ------- MISC CALCULATIONS ---------------------------

In [None]:
track_df.loc[track_df.event=='pass_outcome_interception', ['gameId','playId','event','frameId']].groupby(['gameId','playId']).head(1)

In [None]:
# list the (event, frameId) pairs of a single play, one row per frame
gid = 2018121605
pid = 3489
in_play = (track_df.gameId == gid) & (track_df.playId == pid)
temp = (
    track_df.loc[in_play, ['gameId', 'playId', 'event', 'frameId']]
    .groupby('frameId')
    .head(1)
)

# only frames whose event is not the 'None' placeholder string
has_event = temp.event != 'None'
temp.loc[has_event, ['event', 'frameId']].apply(tuple, axis=1)

### -------- Play outcome ---------------------------------------------

In [None]:
play_df.columns

In [None]:
play_df.head()

In [None]:
play_df[~play_df.penaltyCodes.isna()]

In [None]:
play_df.passResult.value_counts()

In [None]:
# print the description of every play whose only penalty code is DPI
dpi_plays = play_df[play_df.penaltyCodes == 'DPI']
for game, play, desc in zip(dpi_plays.gameId, dpi_plays.playId, dpi_plays.playDescription):
    print(f"[{game}, {play}] {desc}")
    print()

In [None]:
game_id=2018090900
play_id=742
play_df[(play_df.gameId==game_id) & (play_df.playId == play_id)].iloc[0]

In [None]:
play_df[play_df.playDescription.str.contains("No Play.")].penaltyCodes.value_counts(dropna=False)

### --------------------------------------------------------------

In [None]:
game_df.head()

In [None]:
target_df.head()

In [None]:
temp_df = pd.merge(target_df, player_df.rename(columns={'nflId': 'targetNflId'}), how='left', on='targetNflId')
temp_df.head()

In [None]:
coverage_df.coverage.value_counts()

In [None]:
track_df.event.value_counts()

In [None]:
out_df = track_df.groupby(['playId','gameId']).apply(lambda df: np.any(df.event == 'pass_forward') & np.all(df.event != 'pass_arrived')).reset_index()

In [None]:
out_df[out_df[0]]

In [None]:
# collect the distinct sequences of events that follow 'pass_forward' on the
# plays flagged in out_df
events = []
flagged_plays = out_df[out_df[0]]

for pid, gid in zip(flagged_plays.playId, flagged_plays.gameId):
    # unique events of this play, in order of first appearance
    in_play = (track_df.gameId == gid) & (track_df.playId == pid)
    play_events = track_df.loc[in_play, 'event'].unique()

    fwd_positions = np.flatnonzero(play_events == 'pass_forward')
    if fwd_positions.size == 0:
        continue

    # everything after the first forward-pass event
    after_pass_events = play_events[fwd_positions[0] + 1:].tolist()
    if after_pass_events and after_pass_events not in events:
        events.append(after_pass_events)

events

In [None]:
game_id = 2018121605
play_id = 1770
track_df[(track_df.gameId == game_id) & (track_df.playId == play_id)].event.unique()

In [None]:
pass_end_events = ['pass_arrived', 'pass_outcome_interception', 'pass_outcome_incomplete', 'pass_outcome_caught']

In [None]:
# tracking events treated as the end of a pass
PASS_END_EVENTS = ['pass_arrived',
                       'pass_outcome_interception',
                       'pass_outcome_incomplete',
                       'pass_outcome_caught']
# NOTE(review): the list above is immediately overwritten with a placeholder;
# presumably this checks what the lookup below returns when no event matches —
# confirm before reusing this cell
PASS_END_EVENTS = ['fake_bad_event']
# smallest frameId among rows whose event is in PASS_END_EVENTS
track_df.loc[track_df.event.isin(PASS_END_EVENTS), 'frameId'].min()

In [None]:
play_df.columns

In [None]:
play_df[(play_df.gameId.isin([2018111111, 2018121604])) & (play_df.playId.isin([2502, 1333]))]

In [None]:
game_df[(game_df.gameId.isin([2018111111, 2018121604]))]

In [None]:
z=pd.merge(play_df, game_df, on='gameId')
z.head()

In [None]:
z.typeDropback.value_counts()

In [None]:
# plays that have both a coverage label and tracking data
play_keys = ['gameId', 'playId']
tracked_plays = track_df.groupby(play_keys).head(1)[play_keys]
cover_plays = coverage_df.merge(tracked_plays, on=play_keys, how='inner')[play_keys]

# plays whose dropback type is anything other than 'UNKNOWN'
non_unknown_plays = play_df[play_df.typeDropback != 'UNKNOWN']

# labelled, tracked plays that also have a known dropback type
remaining_plays = cover_plays.merge(non_unknown_plays, on=play_keys, how='inner')

remaining_plays.info()

In [None]:
for desc in z[(z.week==1) & (z.typeDropback=='UNKNOWN')]['playDescription']:
    print(desc)