# Overview

This "Tackling_Data" code uses previous dataframes, SHAP values, and play-by-play data from 'nfl-data-py' to create the dataframe "df_shap_final_pbp". This contains the Defensive Tackling Contribution (DTC) metric that can be further analyzed in "Data_Viz_and_Insights".

The main steps are the following:

- A) Bring in play-by-play data from 'nfl-data-py' to get the following data:
      third_down_failed, fourth_down_failed, goal-to-go, tackled_for_loss, run_gap, run_location, EPA
- B) Calculate player SHAP values by adding all attributes related to the player
- C) Use logic to end the play once the play has ended to remove instances where the defender jumps on top when the ball carrier's been stopped
- D) Use only the last 15 frames and remove plays that end in a touchdown or are brought back due to penalty
- E) Find frames where defender has negative total SHAP value to create **Defensive Tackling Contribution (DTC) metric** by:
      * Find all player frames where their total SHAP value is negative
      * Remove players if their total contribution of total negative SHAP values are < 10% of the play's total
      * Divide each player's sum by the total to get each player's DTC for the play

In [2]:
!pip install nfl-data-py

Collecting nfl-data-py
  Downloading nfl_data_py-0.3.1.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastparquet>0.5 (from nfl-data-py)
  Downloading fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-snappy>0.5 (from nfl-data-py)
  Downloading python_snappy-0.6.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet>0.5->nfl-data-py)
  Downloading cramjam-2.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: nfl-data-py
  Build

In [None]:
import pandas as pd
import numpy as np

all_players_sorted_1 = pd.read_pickle("all_players_sorted_1.pkl")
all_players_sorted_2 = pd.read_pickle("all_players_sorted_2.pkl")
all_players_sorted_3 = pd.read_pickle("all_players_sorted_3.pkl")
all_players_sorted_4 = pd.read_pickle("all_players_sorted_4.pkl")
all_players_sorted_5 = pd.read_pickle("all_players_sorted_5.pkl")
all_players_sorted_6 = pd.read_pickle("all_players_sorted_6.pkl")
all_players_sorted_7 = pd.read_pickle("all_players_sorted_7.pkl")
all_players_sorted_8 = pd.read_pickle("all_players_sorted_8.pkl")
all_players_sorted_9 = pd.read_pickle("all_players_sorted_9.pkl")

df_all = pd.read_pickle("df_all.pkl")

shap_values_wks_7_9 = pd.read_csv("shap_values_7_9.csv")

games = pd.read_csv("games.csv")
players = pd.read_csv("players.csv")
plays = pd.read_csv("plays.csv")
tackles = pd.read_csv("tackles.csv")

**A) Bring in play-by-play data from 'nfl-data-py'**

In [5]:
import nfl_data_py as nfl

pbp_data = nfl.import_pbp_data(years=[2022])

2022 done.
Downcasting floats.


In [6]:
# additional play-by-play data from nfl-data-py

pbp_df = pbp_data[['old_game_id', 'play_id', 'third_down_failed', 'fourth_down_failed', 'goal_to_go',
                   'tackled_for_loss', 'run_gap', 'run_location', 'epa']]

pbp_df = pbp_df.rename(columns={'old_game_id': 'gameId', 'play_id': 'playId'})

pbp_df['gameId'] = pbp_df['gameId'].astype(int)

In [7]:
pbp_df.head()

Unnamed: 0,gameId,playId,third_down_failed,fourth_down_failed,goal_to_go,tackled_for_loss,run_gap,run_location,epa
0,2022091107,1.0,,,0.0,,,,0.0
1,2022091107,43.0,0.0,0.0,0.0,0.0,,,-0.443521
2,2022091107,68.0,0.0,0.0,0.0,0.0,end,left,1.468819
3,2022091107,89.0,0.0,0.0,0.0,0.0,,,-0.492192
4,2022091107,115.0,0.0,0.0,0.0,0.0,end,left,-0.325931


**B) Calculate player SHAP values by adding all attributes related to the player**

In [8]:
wks_1_6 = games[['gameId', 'week']].merge(df_all[['gameId']])
wks_7_9 = wks_1_6[wks_1_6['week'] > 6.5]
wks_1_6 = wks_1_6[wks_1_6['week'] < 6.5]

df_all_1_6 = df_all[df_all['gameId'] < 2022101900]
df_all_7_9 = df_all[df_all['gameId'] > 2022101900]

In [9]:
# SHAP value sums for defenders 1-3
shap_values_wks_7_9['def1_sum'] = shap_values_wks_7_9['player2.0_s'] + shap_values_wks_7_9['player2.0_dist_ball'] + shap_values_wks_7_9['player2.0_ang_ball'] + shap_values_wks_7_9['player2.0_dis_closest_o'] + shap_values_wks_7_9['player2.0_ang_closest_o'] + shap_values_wks_7_9['player2.0_min_dist'] + shap_values_wks_7_9['player2.0_ang_min'] + shap_values_wks_7_9['player2.0_2_min_dist'] + shap_values_wks_7_9['player2.0_2_ang_min']

shap_values_wks_7_9['def2_sum'] = shap_values_wks_7_9['player3.0_s'] + shap_values_wks_7_9['player3.0_dist_ball'] + shap_values_wks_7_9['player3.0_ang_ball'] + shap_values_wks_7_9['player3.0_dis_closest_o'] + shap_values_wks_7_9['player3.0_ang_closest_o'] + shap_values_wks_7_9['player3.0_min_dist'] + shap_values_wks_7_9['player3.0_ang_min'] + shap_values_wks_7_9['player3.0_2_min_dist'] + shap_values_wks_7_9['player3.0_2_ang_min']

shap_values_wks_7_9['def3_sum'] = shap_values_wks_7_9['player4.0_s'] + shap_values_wks_7_9['player4.0_dist_ball'] + shap_values_wks_7_9['player4.0_ang_ball'] + shap_values_wks_7_9['player4.0_dis_closest_o'] + shap_values_wks_7_9['player4.0_ang_closest_o'] + shap_values_wks_7_9['player4.0_min_dist'] + shap_values_wks_7_9['player4.0_ang_min'] + shap_values_wks_7_9['player4.0_2_min_dist'] + shap_values_wks_7_9['player4.0_2_ang_min']

In [10]:
all_players_sorted_7_9 = pd.concat([all_players_sorted_7, all_players_sorted_8, all_players_sorted_9], axis=0)

In [11]:
df_all_7_9 = df_all_7_9.reset_index(drop=True)

In [12]:
df_shap_7_9 = df_all_7_9[['gameId', 'playId', 'frameId', 'frame_since_bc', 'player1.0_y',
                          'actual_dist_from_final']].merge(shap_values_wks_7_9[['def1_sum', 'def2_sum', 'def3_sum']],
                                                                                  left_index=True, right_index=True)

In [13]:
df_shap_7_9.head()

Unnamed: 0,gameId,playId,frameId,frame_since_bc,player1.0_y,actual_dist_from_final,def1_sum,def2_sum,def3_sum
0,2022102000,80,18,0,24.56,22.74,2.851314,0.248037,-0.184807
1,2022102000,80,19,1,24.24,22.32,3.001441,-0.183921,-0.332242
2,2022102000,80,20,2,23.94,21.87,2.789211,0.139855,-0.045122
3,2022102000,80,21,3,23.66,21.38,2.226721,-0.198384,-0.220069
4,2022102000,80,22,4,23.4,20.85,1.814105,-0.038021,-0.150498


In [14]:
df_shap_7_9_def = df_shap_7_9.merge(all_players_sorted_7_9[all_players_sorted_7_9['rank'] == 2][['gameId', 'playId', 'frameId', 'nflId', 'displayName', 'new_pos']],
      on=['gameId', 'playId', 'frameId'], how='inner').merge(all_players_sorted_7_9[all_players_sorted_7_9['rank'] == 3][['gameId', 'playId', 'frameId', 'nflId', 'displayName', 'new_pos']],
      on=['gameId', 'playId', 'frameId'], how='inner').merge(all_players_sorted_7_9[all_players_sorted_7_9['rank'] == 4][['gameId', 'playId', 'frameId', 'nflId', 'displayName', 'new_pos']],
      on=['gameId', 'playId', 'frameId'], how='inner')

df_shap_7_9_def = df_shap_7_9_def.rename(columns={'nflId_x': 'def1_nflId', 'displayName_x': 'def1_name', 'nflId_y': 'def2_nflId',
                                'displayName_y': 'def2_name', 'nflId': 'def3_nflId', 'displayName': 'def3_name'})

**C) Use logic to end the play once the play has ended to remove instances where the defender jumps on top when the ball carrier's been stopped**

In [15]:
# cutoff play once the frame gets to the point where the ball carrier has reached final distance and doesn't move side to side anymore

def filter_rows(group):
    # Identify if any frame's distance_to_final drops below 0.1
    cutoff_reached = group['actual_dist_from_final'].lt(0).any()
    if cutoff_reached:
        # Find the first frameId where distance_to_final drops below 0.1
        first_cutoff_frame = group[group['actual_dist_from_final'] < 0]['frameId'].iloc[0]
        # Check if any subsequent frames have distance_to_final > 0.5
        subsequent_frames = group[group['frameId'] > first_cutoff_frame]
        if not subsequent_frames['actual_dist_from_final'].gt(0.5).any():
            # Filter out rows after the first cutoff frame (plus 1 additional frame) and keep the last 15 frames
            remaining_frames = group[(group['frameId'] <= (first_cutoff_frame + 1)) ]            # & (group['frameId'] >= (first_cutoff_frame - 13))
            removing_frames = group[(group['frameId'] > first_cutoff_frame)]
            # Check if the difference between max and min of 'player1.0_y' is less than 2
            if removing_frames['player1.0_y'].max() - removing_frames['player1.0_y'].min() < 2:
                return remaining_frames
    return group

In [16]:
df_shap_7_9_def = df_shap_7_9_def.groupby(['gameId', 'playId']).apply(filter_rows).reset_index(drop=True)

**D) Use only the last 15 frames and remove plays that end in a touchdown or are brought back due to penalty**

In [17]:
# cutoff to the last 15 frames
df_shap_7_9_def = df_shap_7_9_def.groupby(['gameId', 'playId']).apply(lambda x: x.tail(15)).reset_index(drop=True)

In [18]:
# filter out touchdowns and nullified penalties
plays['TD'] = plays['playDescription'].str.contains('TOUCHDOWN', case=False).astype(int)
filtered_plays = plays[(plays['TD'] == 0) & (plays['playNullifiedByPenalty'] == 'N')].reset_index(drop=True)

**E) Find frames where defender has negative total SHAP value to create Defensive Tackling Contribution (DTC) metric**

In [19]:
df_shap_7_9_def_fp = df_shap_7_9_def.merge(filtered_plays[['gameId', 'playId']], on=['gameId', 'playId'], how='inner')

df_shap_7_9_def_fp['def1_sum_neg_transform'] = np.where(df_shap_7_9_def_fp['def1_sum'] < 0, df_shap_7_9_def_fp['def1_sum'] * -1, 0)
df_shap_7_9_def_fp['def2_sum_neg_transform'] = np.where(df_shap_7_9_def_fp['def2_sum'] < 0, df_shap_7_9_def_fp['def2_sum'] * -1, 0)
df_shap_7_9_def_fp['def3_sum_neg_transform'] = np.where(df_shap_7_9_def_fp['def3_sum'] < 0, df_shap_7_9_def_fp['def3_sum'] * -1, 0)

In [22]:
df_shap_7_9_def_fp.head(5).T

Unnamed: 0,0,1,2,3,4
gameId,2022102000,2022102000,2022102000,2022102000,2022102000
playId,80,80,80,80,80
frameId,45,46,47,48,49
frame_since_bc,27,28,29,30,31
player1.0_y,20.1,19.7,19.29,18.89,18.49
actual_dist_from_final,5.72,5.29,4.89,4.51,4.16
def1_sum,-3.1459,-3.43579,-3.257644,-2.757328,-3.171229
def2_sum,-0.859514,-0.51994,-0.336973,0.434542,0.110076
def3_sum,-0.07979,-0.086945,-0.057229,-0.201417,-0.10988
def1_nflId,44848.0,44848.0,44848.0,44848.0,44848.0


In [23]:
a = df_shap_7_9_def_fp.groupby(['gameId', 'playId', 'def1_nflId', 'def1_name'])['def1_sum_neg_transform'].sum()
b = df_shap_7_9_def_fp.groupby(['gameId', 'playId', 'def2_nflId', 'def2_name'])['def2_sum_neg_transform'].sum()
c = df_shap_7_9_def_fp.groupby(['gameId', 'playId', 'def3_nflId', 'def3_name'])['def3_sum_neg_transform'].sum()

a = pd.DataFrame(a).reset_index()
b = pd.DataFrame(b).reset_index()
c = pd.DataFrame(c).reset_index()

In [24]:
d = df_shap_7_9_def_fp.groupby(['gameId', 'playId'])['def1_sum_neg_transform'].sum()
e = df_shap_7_9_def_fp.groupby(['gameId', 'playId'])['def2_sum_neg_transform'].sum()
f = df_shap_7_9_def_fp.groupby(['gameId', 'playId'])['def3_sum_neg_transform'].sum()

d = pd.DataFrame(d).reset_index()
e = pd.DataFrame(e).reset_index()
f = pd.DataFrame(f).reset_index()

d = d.rename(columns={'def1_sum_neg_transform': 'def1_total'})
e = e.rename(columns={'def2_sum_neg_transform': 'def2_total'})
f = f.rename(columns={'def3_sum_neg_transform': 'def3_total'})

In [25]:
df_shap_7_9_def_group = a.merge(b, left_on=['gameId', 'playId', 'def1_nflId', 'def1_name'],
                            right_on=['gameId', 'playId', 'def2_nflId', 'def2_name'], how='left').drop(columns=['def2_nflId', 'def2_name'], axis=1).merge(c,
                            left_on=['gameId', 'playId', 'def1_nflId', 'def1_name'], right_on=['gameId', 'playId', 'def3_nflId', 'def3_name'],
                            how='left').drop(columns=['def3_nflId', 'def3_name'], axis=1).merge(d, on=['gameId', 'playId'], how='inner').merge(e,
                            on=['gameId', 'playId'], how='inner').merge(f, on=['gameId', 'playId'], how='inner')

In [26]:
df_shap_7_9_def_group['def1_sum_neg_transform'] = df_shap_7_9_def_group['def1_sum_neg_transform'].fillna(0)
df_shap_7_9_def_group['def2_sum_neg_transform'] = df_shap_7_9_def_group['def2_sum_neg_transform'].fillna(0)
df_shap_7_9_def_group['def3_sum_neg_transform'] = df_shap_7_9_def_group['def3_sum_neg_transform'].fillna(0)

In [27]:
df_shap_7_9_def_group['def_sum'] = df_shap_7_9_def_group['def1_sum_neg_transform'] + df_shap_7_9_def_group['def2_sum_neg_transform'] + df_shap_7_9_def_group['def3_sum_neg_transform']
df_shap_7_9_def_group['play_total'] = df_shap_7_9_def_group['def1_total'] + df_shap_7_9_def_group['def2_total'] + df_shap_7_9_def_group['def3_total']

In [28]:
df_shap_7_9_def_group['prop'] = df_shap_7_9_def_group['def_sum'] / df_shap_7_9_def_group['play_total']
df_shap_7_9_def_group['large_enough'] = np.where(df_shap_7_9_def_group['prop'] > 0.1,1,0)

In [29]:
df_shap_7_9_def_group_over_10pct = df_shap_7_9_def_group[df_shap_7_9_def_group['large_enough'] == 1].reset_index(drop=True)

In [30]:
new_sum = df_shap_7_9_def_group_over_10pct.groupby(['gameId', 'playId'])['def_sum'].sum().reset_index()
new_sum = new_sum.rename(columns={'def_sum': 'new_total_sum'})

In [31]:
df_shap_7_9_def_group_over_10pct = df_shap_7_9_def_group_over_10pct.merge(new_sum, on=['gameId', 'playId'], how='inner')

In [32]:
df_shap_7_9_def_group_over_10pct['new_prop'] = df_shap_7_9_def_group_over_10pct['def_sum'] / df_shap_7_9_def_group_over_10pct['new_total_sum']

In [33]:
df_shap_final = df_shap_7_9_def_group_over_10pct[['gameId', 'playId', 'def1_nflId', 'def1_name', 'new_prop']]

In [None]:
pbp_df['gameId'] = pbp_df['gameId'].astype(int)

In [None]:
df_shap_final_pbp = df_shap_final.merge(pbp_df, on=['gameId', 'playId'], how='inner')

In [None]:
df_shap_final_pbp

Unnamed: 0,gameId,playId,def1_nflId,def1_name,new_prop,third_down_failed,fourth_down_failed,goal_to_go,tackled_for_loss,run_gap,run_location,epa
0,2022102000,80,44848.0,Budda Baker,0.679478,0.0,0.0,0.0,0.0,,middle,1.318386
1,2022102000,80,49410.0,Jalen Thompson,0.320522,0.0,0.0,0.0,0.0,,middle,1.318386
2,2022102000,101,52539.0,Rashard Lawrence,0.641223,0.0,0.0,0.0,0.0,,middle,-0.152057
3,2022102000,101,53445.0,Zaven Collins,0.358777,0.0,0.0,0.0,0.0,,middle,-0.152057
4,2022102000,201,40017.0,Tyrann Mathieu,0.511015,0.0,0.0,0.0,0.0,,middle,0.279343
...,...,...,...,...,...,...,...,...,...,...,...,...
3317,2022110700,3707,44851.0,Marcus Maye,0.294283,0.0,0.0,0.0,0.0,end,right,-0.149736
3318,2022110700,3707,48027.0,Kaden Elliss,0.705717,0.0,0.0,0.0,0.0,end,right,-0.149736
3319,2022110700,3740,48537.0,Carl Granderson,1.000000,1.0,0.0,0.0,1.0,end,left,-1.144850
3320,2022110700,3787,52578.0,Broderick Washington,0.552720,0.0,0.0,0.0,0.0,guard,right,-0.099311


In [None]:
#df_shap_final_pbp.to_csv('df_shap_final_pbp.csv')