In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This is a notebook that explores a few NFL Special Teams topics that could give an advantage to an informed team.

The topics I researched are as follows:
1. Improving onside kick recovery rate
2. Predicting the length of a punt
     * as well as whether day of week has an impact on punt length
3. Checking the effect of hang time on how a returner fields a punt (ideally, to see whether we can make the returner muff more often)
4. Ranking the longest punters in the league

Aside from trying to improve special teams performance, I also created my own metric, the Composite Kicking Score (the name could use work). It is a scoreboard that the NFL can use to rank the most accurate long-range kickers, possibly for a season-long competition/award.

In [None]:
#Importing Libraries
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import calendar
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
#%%
#Loading in datasets
# All four CSVs are read from the same competition input directory.
DATA_DIR = '/kaggle/input/nfl-big-data-bowl-2022'

# games.csv: teams playing in each game
df_games = pd.read_csv(f'{DATA_DIR}/games.csv')
# PFFScoutingData.csv: play-level scouting
df_PFF = pd.read_csv(f'{DATA_DIR}/PFFScoutingData.csv')
# players.csv: player info
# (height has some players listed in inches, I think)
df_players = pd.read_csv(f'{DATA_DIR}/players.csv')
# plays.csv: play-level info for each game
df_plays = pd.read_csv(f'{DATA_DIR}/plays.csv')

In [None]:
# Sanity check that the games file was read in properly (head() shows five rows by default).
df_games.head()

In [None]:
#%%
# Derive the weekday name (e.g. 'Sunday') of every game from its 'gameDate',
# which is parsed as MM/DD/YYYY.
# This is done in one vectorised step: the previous row-by-row chained
# assignment (df_games['dayOfWeek'][index] = ...) triggers SettingWithCopyWarning,
# writes nothing at all when pandas copy-on-write is active, and put strings
# into a column that had been created as float NaN.
df_games['dayOfWeek'] = pd.to_datetime(df_games['gameDate'], format='%m/%d/%Y').dt.day_name()

# Lookup table gameId -> weekday name, used later to tag plays with their game day.
gameday = dict(zip(df_games.gameId, df_games.dayOfWeek))

In [None]:
# Preview the first five games again (head() defaults to five rows).
df_games.head()

In [None]:
#%%
# Can the outcome of an onside kick be predicted from pre-kick information?
# Plays are taken from plays.csv when their description mentions 'onside';
# scouting rows are taken from the PFF data when kickType is 'O' or 'S'.
onside_plays = df_plays[df_plays['playDescription'].str.contains('onside', na=False)]
onside_PFF = df_PFF.loc[df_PFF['kickType'].isin(['O', 'S'])]

# playId is only unique within a game, so a play is identified by the
# (gameId, playId) pair. Joining on gameId alone paired every onside play with
# every onside scouting row of the same game, and the follow-up de-duplication
# on playId then kept an arbitrary scouting row and also discarded plays from
# different games that happened to share a playId.
onside = (
    pd.merge(onside_plays, onside_PFF, on=['gameId', 'playId'])
    .drop_duplicates(subset=['gameId', 'playId'])
    .reset_index(drop=True)
)

# 1.0 when the kicking team recovered the ball, 0.0 otherwise.
onside['Success'] = (onside['specialTeamsResult'] == 'Kickoff Team Recovery').astype(float)

# Success rate = successes / all attempts, reported as a percentage.
# (The previous value was successes / failures, i.e. the odds, and was not scaled to percent.)
success_rate = onside['Success'].mean()
print ('Onside kicks have a' , round(100 * success_rate, 2), 'percent success rate.')

clf = MLPRegressor(activation = 'logistic' , solver='sgd', alpha=0.3, hidden_layer_sizes=(50, 50, 50, 50, 50),
                    random_state=42, shuffle = False)

# The network cannot be fitted on NaN, so rows missing a numeric feature are
# left out of the model data (the categorical features are one-hot encoded,
# where a missing value simply becomes all zeros).
numeric_features = ['preSnapVisitorScore', 'preSnapHomeScore', 'kickLength']
categorical_features = ['quarter', 'kickType', 'kickDirectionActual', 'kickoffReturnFormation']
model_rows = onside.dropna(subset=numeric_features)

y = model_rows['Success']
X = model_rows[['quarter', 'preSnapVisitorScore', 'preSnapHomeScore', 'kickLength', 'kickType', 'kickDirectionActual', 'kickoffReturnFormation']]
X = pd.get_dummies(X, columns = categorical_features)

# Seed the split so that Restart & Run All reproduces the same predictions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

clf.fit(X_train, y_train)

# Predicted success score for each held-out onside kick.
results = clf.predict(X_test)
'''
Nothing looks to be a good indicator of successful onside kicks by current metrics
'''

In [None]:
# Display the model's predictions for the held-out onside kicks.
results

**Nothing looks to be a good indicator of successful onside kicks by current metrics**

In [None]:
#%%
# Punt analysis: (1) average punt length by day of week, (2) predicting punt length.
# Plays are taken from plays.csv when their description mentions 'punt';
# scouting rows are taken from the PFF data when kickType is 'A', 'R' or 'N'.
punt_plays = df_plays[df_plays['playDescription'].str.contains('punt', na=False)]
punt_PFF = df_PFF.loc[df_PFF['kickType'].isin(['A', 'R', 'N'])]

# playId is only unique within a game, so join on (gameId, playId).
# Joining on gameId alone and then de-duplicating on playId attached the
# scouting row of a *different* punt from the same game to each play (so
# hangTime / snapTime did not belong to the kickLength being predicted) and
# dropped punts from different games that shared a playId.
punt = (
    pd.merge(punt_plays, punt_PFF, on=['gameId', 'playId'])
    .drop_duplicates(subset=['gameId', 'playId'])
)

# Tag every punt with the weekday of its game.
punt['dayOfWeek'] = punt['gameId'].map(gameday)

# Author's observation from an earlier run (re-check after re-running):
# no significant difference in punt length based on day of week. Though, on
# short rest (TNF), they kick the furthest, and on more rest (MNF), they kick
# the shortest. Probably coincidence.
k = punt.groupby('dayOfWeek')['kickLength'].mean().reset_index()

# Need to do this for fitting; the later punt cells also work on this filtered frame.
punt = punt.dropna(subset=['hangTime', 'kickLength'])

# snapTime is a model feature too and the network cannot be fitted on NaN;
# those rows are excluded from the model data only, not from `punt`.
model_rows = punt.dropna(subset=['snapTime'])

#Algorithm for predicting punt length
y = model_rows['kickLength']
X = model_rows[['snapDetail', 'snapTime', 'hangTime', 'kickType', 'kickDirectionActual']]

X = pd.get_dummies(X, columns = ['snapDetail', 'kickType', 'kickDirectionActual'])

# Seed the split so that Restart & Run All reproduces the same predictions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = MLPRegressor(activation = 'logistic' , solver='sgd', alpha=0.5, hidden_layer_sizes=(10,10,10),
                    random_state=42, shuffle = False)

clf.fit(X_train, y_train)

# Predicted punt length for each held-out punt.
temp = clf.predict(X_test)
'''
This does not lead to anything. Everything comes out to ~45 yards punt
'''

In [None]:
# Show the first 101 predicted punt lengths for the test split.
temp[0:101]

**This does not lead to anything. Every prediction comes out to a punt of about 45 yards.**

In [None]:
#%%
# Effect of hang time: does the hang time of a punt change how the returner
# handles the ball? Compute the mean hang time for each kickContactType.
n = punt.groupby('kickContactType', as_index=False)['hangTime'].mean()
'''
Nothing here
'''

In [None]:
# Contact types ordered from longest to shortest mean hang time.
n.sort_values('hangTime', ascending=False)

**Nothing here**

In [None]:
#%%
#Ranking punters lengths
# Minimum number of punts a kicker needs to appear on the leaderboard.
MIN_PUNTS = 30

# One row per kicker: how many punts they have and their average length.
n = (
    punt.groupby('kickerId')['kickLength']
    .agg(**{'Number of Punts': 'count', 'Average Punt Length': 'mean'})
    .reset_index()
)

# Translate the kicker's nflId into a display name. Only the id column is
# mapped; the previous stack().map().unstack() pushed every cell of the frame
# (counts and averages included) through the id lookup and overwrote `temp`,
# which an earlier cell uses for the punt predictions.
id_name = dict(zip(df_players.nflId, df_players.displayName))
n['kickerId'] = n['kickerId'].map(id_name)
n = n.rename(columns = {'kickerId': 'Kicker'})

#Getting sample size
n = n.loc[n['Number of Punts'] >= MIN_PUNTS].sort_values(by = ['Average Punt Length'], ascending = False)

#Final Punt Leaderboard
print (n[['Kicker', 'Number of Punts', 'Average Punt Length']])
'''
Tress way and Andy Lee have the longest punt average with a very large sample size.
'''

**Tress Way and Andy Lee have the longest punt average with a very large sample size**

In [None]:
#%%
#Creating my own kicking metric. Composite Kicker Score (CKS)
#Kickers gain score by completing kicks and lose score from missing kicks.
#If a kick is made, the score increases based on the distance kicked.
#For example, making 3 kicks at 25, 35, and 50 yards would result in a score of 110
#If a kick is missed, the kicker is penalized the closer they are to the goal.
#The formula to calculate lost points is: Pts lost = -67 + kick length
#-67 accounts for the longest kick in NFL History (Justin Tucker) being 66 yards.
#Missing anything above that wouldnt be deserving of a penalty.
MISS_PENALTY_BASE = -67
# Yards added to the line of scrimmage to get the kick distance.
YARDLINE_TO_KICK_DISTANCE = 18

# Keep only extra points / field goals that were clearly made or missed.
is_kick = df_plays['specialTeamsPlayType'].isin(['Extra Point', 'Field Goal'])
is_decided = df_plays['specialTeamsResult'].isin(['Kick Attempt Good', 'Kick Attempt No Good'])
#Yardline number will break if a 69 yarder is attempted.
#This dataset should have kick distance as a metric. kickLength only accounts for made kicks
# .copy() so the column assignments below modify this frame rather than a
# slice of df_plays (the old in-place replace on a slice raised SettingWithCopyWarning).
kick_plays = df_plays.loc[
    is_kick & is_decided,
    ['specialTeamsPlayType', 'specialTeamsResult', 'kickerId' ,'yardlineNumber'],
].copy()

# Base score of the attempt: 0 for a make, -67 for a miss. Only the result
# column is touched instead of replacing values across the whole frame.
kick_plays['specialTeamsResult'] = kick_plays['specialTeamsResult'].map(
    {'Kick Attempt Good': 0, 'Kick Attempt No Good': MISS_PENALTY_BASE}
)

# Convert the line of scrimmage into the kick distance.
kick_plays['yardlineNumber'] = kick_plays['yardlineNumber'] + YARDLINE_TO_KICK_DISTANCE

# Make: +distance. Miss: -67 + distance.
kick_plays['Points Gained or Lost'] = kick_plays['specialTeamsResult'] + kick_plays['yardlineNumber']

# Translate the kicker's nflId into a display name (only the id column is mapped).
id_name = dict(zip(df_players.nflId, df_players.displayName))
kick_plays['Kicker Name'] = kick_plays['kickerId'].map(id_name)

# Total score per kicker, best first. The sorted frame is stored: the previous
# bare sort_values() call discarded its result, leaving `scoreboard` unsorted.
scoreboard = (
    kick_plays.groupby('Kicker Name')['Points Gained or Lost'].sum()
    .reset_index()
    .sort_values(by = ['Points Gained or Lost'], ascending = False)
    .reset_index(drop=True)
)

'''
Names that you would expect to be at the top of a leaderboard like this are there.
I believe that this is a stat that the NFL could use on screen to show who the
most prolific kickers in the league are.

On a yearly basis, fans could take interest and root for their teams kicker 
to be at the top of the leaderboard. At the end of the year, the NFL could even
acknowledge the highest score with an award (similar to an NBA scoring leader).
'''

In [None]:
# Kicker leaderboard, highest Composite Kicker Score first.
scoreboard.sort_values('Points Gained or Lost', ascending=False)


**Names that you would expect to be at the top of a leaderboard like this are there.
I believe that this is a stat that the NFL could use on screen to show who the
most prolific kickers in the league are.**

**On a yearly basis, fans could take interest and root for their team's kicker 
to be at the top of the leaderboard. At the end of the year, the NFL could even
acknowledge the highest score with an award (similar to an NBA scoring leader).**

In [None]:
# Write the kicker scoreboard to submission.csv in the working directory, without the index column.
my_submission = pd.DataFrame(data=scoreboard)
my_submission.to_csv(path_or_buf='submission.csv', index=False)