# What makes a great punter?

## Defining a new tool for analyzing punter effectiveness

In recent seasons, NFL teams have opted to punt less and less often, and many argue that this has made for a more exciting game. Whether or not this is an effective strategy for teams depends greatly on the quality of their punter--the more effective the punter, the greater the opportunity cost of going for it on fourth down. 

This project aims to create a new tool for analyzing punter effectiveness over the course of one or more seasons. 

In working on this project, I collaborated with an avid football fan and non-coder (my mom). To make the notebook as readable for her as possible, I tried to err on the side of explaining what I was doing as often as possible. I hope, as a byproduct, that this makes the code more readable for all who choose to explore. 

I welcome any and all feedback on the code.

Thank you for reading.


Step 1: Prepare the environment, open the data, and build dfs.

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os

In [None]:
# Access the csv files and build dfs
# Each file is read from the competition's input folder into its own DataFrame
# with pandas' default settings, so every df gets a plain 0..n-1 row index.
scout_df = pd.read_csv("../input/nfl-big-data-bowl-2022/PFFScoutingData.csv")
games_df = pd.read_csv("../input/nfl-big-data-bowl-2022/games.csv")
players_df = pd.read_csv("../input/nfl-big-data-bowl-2022/players.csv")
plays_df = pd.read_csv("../input/nfl-big-data-bowl-2022/plays.csv")
# The three per-season tracking files are not loaded: these lines are commented
# out, so track18_df / track19_df / track20_df do not exist unless re-enabled.
# track18_df = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2018.csv")
# track19_df = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2019.csv")
# track20_df = pd.read_csv("../input/nfl-big-data-bowl-2022/tracking2020.csv")

In [None]:
#Make a list of all dfs for cleaning and analysis
# df_list = [scout_df, games_df, players_df, plays_df, track18_df, track19_df, track20_df]

Step 2: Preview all dfs to understand available data.

In [None]:
# Define a function to insert commas into shape descriptions to make them human-legible.
def insert_commas (tup):
    """Return the numbers in ``tup`` as strings with thousands separators.

    Args:
        tup: An iterable of numbers, e.g. a DataFrame's ``shape`` tuple.

    Returns:
        A list of strings, one per input number, in the same order
        (``1234567`` becomes ``"1,234,567"``).
    """
    return [f"{number:,}" for number in tup]


    
# Format the (rows, columns) shape of every loaded df with thousands separators.
scout_shape, games_shape, players_shape, plays_shape = [
    insert_commas(frame.shape)
    for frame in (scout_df, games_df, players_df, plays_df)
]
# track18_shape = insert_commas(track18_df.shape)
# track19_shape = insert_commas(track19_df.shape)
# track20_shape = insert_commas(track20_df.shape)

# Print one summary line per df: element 0 is the row count, element 1 the column count.
print("Number of rows and columns in each df:")
print(f'scout_df has {scout_shape[0]} rows and {scout_shape[1]} columns')
print(f'games_df: has {games_shape[0]} rows and {games_shape[1]} columns')
print(f'players_df: has {players_shape[0]} rows and {players_shape[1]} columns')
print(f'plays_df: has {plays_shape[0]} rows and {plays_shape[1]} columns')
# print(f'track18_df: has {track18_shape[0]} rows and {track18_shape[1]} columns')
# print(f'track19_df: has {track19_shape[0]} rows and {track19_shape[1]} columns')
# print(f'track20_df: has {track20_shape[0]} rows and {track20_shape[1]} columns')

In [None]:
scout_df.head()

In [None]:
scout_df.describe()

In [None]:
games_df.head()

In [None]:
games_df.describe()

In [None]:
players_df.head()

In [None]:
players_df.describe()

In [None]:
plays_df.head()

In [None]:
plays_df.describe()

In [None]:
# track18_df.head()

In [None]:
# track18_df.describe()

In [None]:
# track19_df.head()

In [None]:
# track19_df.describe()

In [None]:
# track20_df.head()

In [None]:
# track20_df.describe()

Step 3: Build a dataframe of all punters in players_df and all the punts in plays_df. This will allow for analyzing performance across punters and punts.

In [None]:
punters_df = players_df.loc[players_df['Position'] == "P"]
punters_df.head()

In [None]:
punters_df.info()

In [None]:
# Keep only the plays whose special-teams play type is a punt.
punts_df = plays_df[plays_df['specialTeamsPlayType'].eq("Punt")]
# Group the punts by the id of the player who kicked them. Note that this is
# a GroupBy object, not a DataFrame, despite the name.
puntsbypunter_df = punts_df.groupby(by='kickerId')
# On a GroupBy, head() shows the first five punts of every punter.
puntsbypunter_df.head()

In [None]:
# Calculate each punter's mean data.
# The punts carry text columns as well as numbers; numeric_only=True averages
# just the numeric ones instead of raising a TypeError on recent pandas.
puntsbypunter_df.mean(numeric_only=True)

In [None]:
# Determine the number of punts each punter kicked:
# size() counts the rows in each kickerId group, to_frame() turns that count
# into a one-column table named 'npunts', and astype(int) stores it as integers.
punt_count = puntsbypunter_df.size().to_frame(name='npunts').astype(int)
punt_count



In [None]:
# Merge punt counts with punter information.
# punt_count is indexed by kickerId, whereas punters_df still has the row
# numbers it inherited from players_df as its index. Matching index to index
# would pair unrelated rows, so the punter's nflId column is matched against
# punt_count's index instead. Punters without any punts keep a NaN count.
punters_df.merge(punt_count, how='left', left_on='nflId', right_index=True)

Step 4: Calculate summary statistics for each punter and add them to punters_df. Statistics include...
- Mean punt length, kick return yards, and play result


In [None]:
# Average punt length, return yardage and play result for every punter.
# Picking the three columns before calling mean() avoids averaging columns
# that are thrown away anyway, and keeps text columns out of the calculation
# (recent pandas raises a TypeError when asked to average text).
punt_mean_df = puntsbypunter_df[['kickLength', 'kickReturnYardage', 'playResult']].mean()
punt_mean_df

Merge summary statistics to punters_df

In [None]:
# punt_mean_df is indexed by kickerId, so each punter must be looked up by the
# nflId column. Joining on punters_df's own index (row numbers inherited from
# players_df) would attach the wrong statistics or none at all.
# Dropping any previously joined statistic columns first lets this cell be
# re-run without an "overlapping columns" error.
punters_df = (
    punters_df
    .drop(columns=punt_mean_df.columns, errors='ignore')
    .join(punt_mean_df, on='nflId')
)
punters_df

Step 5: create data vizzes to understand punters data.

In [None]:
sns.histplot(data=punters_df, x="kickLength", kde=True).set(title="Each Punter's Average Punt Length Histogram")

Step 6: Categorize punts based on starting yard line.

In [None]:
print(punts_df.columns)

In [None]:
pd.set_option('display.max_columns', None)  
print(punts_df[1:5])

In [None]:
# Create a calculation to normalize starting yard line.
punt_starts = punts_df[['gameId','playId','possessionTeam',"yardlineSide", "yardlineNumber", 
                        "specialTeamsResult", "kickerId", "kickLength", "kickReturnYardage", "playResult"]].copy()


In [None]:
# True where the yard line is on the punting team's own side of the field.
on_own_side = punt_starts['possessionTeam'] == punt_starts['yardlineSide']
yardline = punt_starts['yardlineNumber']

# Own side: the distance is 100 minus the yard-line number.
punt_starts.loc[on_own_side, 'startYdsToEndzone'] = 100 - yardline
# Every other row (including rows with no side recorded): the yard-line
# number is used as the distance directly.
punt_starts.loc[~on_own_side, 'startYdsToEndzone'] = yardline
punt_starts

In [None]:
sns.stripplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")


In [None]:
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")


In [None]:
# One 10x10 figure with three layers drawn on the same axes.
f, ax = plt.subplots(figsize=(10, 10))
punt_axes = dict(data=punt_starts, x="startYdsToEndzone", y="kickLength")
# Layer 1: every punt as a small dark dot.
sns.scatterplot(**punt_axes, s=5, color=".15", ax=ax)
# Layer 2: 2-D histogram shading.
sns.histplot(**punt_axes, bins=50, pthresh=.1, cmap="mako", ax=ax)
# Layer 3: white density contour lines on top.
sns.kdeplot(**punt_axes, levels=15, color="w", linewidths=1, ax=ax)

Based on the vizzes above, there appears to be a distinction between punts from more than 60 yards away from the endzone (where the punter aims to kick as far as possible) and from less than 60 yards away (where the punter aims to pin the opponent as close as possible to the endzone without kicking a touchback).

Using this information, I will split the punts data into these two groups.

In [None]:
# Label every punt by how far from the endzone it started:
# 60 yards or more -> "MaxLength", fewer than 60 yards -> "Precision".
start_distance = punt_starts['startYdsToEndzone']
punt_starts.loc[start_distance.ge(60), 'puntCategory'] = "MaxLength"
punt_starts.loc[start_distance.lt(60), 'puntCategory'] = "Precision"
punt_starts

In [None]:
# Compare the two punt categories by their average numbers.
punt_starts_grouped = punt_starts.groupby('puntCategory')
# punt_starts also holds text columns (possessionTeam, yardlineSide,
# specialTeamsResult); numeric_only=True averages only the numeric columns
# instead of raising a TypeError on recent pandas.
punt_starts_grouped.mean(numeric_only=True)

Produce graphs for each punter.

In [None]:
punters_df
punter_list = punters_df['displayName'].tolist()
punter_list

In [None]:
punters_df.head()

In [None]:
punter_ids = punters_df["nflId"].tolist()
punter_ids

Create a dictionary with names and id numbers for labeling figures.

In [None]:
punters_df

In [None]:
# Create a directory for figures.
# exist_ok=True makes this safe to re-run: an existing "figures" folder is
# reused instead of raising FileExistsError.
os.makedirs("figures", exist_ok=True)

Create an "optimal" punt line by graphing a series of dots along an angled line, where each dot marks a punt that would come down 10 yards from the endzone.

In [None]:
# Starting distances (yards from the endzone) for the reference dots: 40, 45, ..., 90.
optimal_dist_to_ez = list(range(40, 95, 5))
# The matching "optimal" punt length leaves the ball 10 yards from the endzone.
optimal_length = [dist - 10 for dist in optimal_dist_to_ez]
# `palette` only applies when a `hue` variable is mapped; `color` is the
# argument that actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
# Testing out different plot options
# Option 1: scatterplot
# Background layer: density contours for every punt in the data set.
league_ax = sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength")
league_ax.set(title="Punt Lengths Based on Initial Distance to Endzone")

# Pick one punter to try the chart styles on; kickerId is matched as a float.
punter = 46903
punter_id = float(punter)
is_selected_punter = punt_starts['kickerId'] == punter_id
temp_punts_df_1 = punt_starts.loc[is_selected_punter]

# Foreground layer: that punter's individual punts. Setting the title again
# replaces the one set above with the punter's id.
punter_ax = sns.scatterplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength")
fig = punter_ax.set(title=punter_id)


In [None]:
# Option 2: Stacked density plots
# Start from an empty figure, then layer: all punts, the selected punter's
# punts, and the red "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped; `color` is what
# actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
# Option 3: Filled density plot, "Mako" palette
# Start from an empty figure, then layer: contours for all punts, a filled
# density for the selected punter, and the red "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength",fill=True, thresh=0, levels=100, cmap="mako").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped; `color` is what
# actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
# Option 4: Filled density plot, "Rocket" palette
# Start from an empty figure, then layer: contours for all punts, a filled
# density for the selected punter, and the red "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength",fill=True, thresh=0, levels=100, cmap="rocket").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped; `color` is what
# actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
# Option 5: Filled density plot, "Crest" palette
# Start from an empty figure, then layer: contours for all punts, a filled
# density for the selected punter, and the red "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength",fill=True, thresh=0, levels=100, cmap="crest").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped; `color` is what
# actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
# Option 6: Filled density plot, "Magma" palette, fewer levels
# Start from an empty figure, then layer: contours for all punts, a filled
# density for the selected punter, and grey "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength",fill=True, thresh=0, levels=10, cmap="magma").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped, so the grey palette
# requested here never reached the dots; `color` draws them in grey.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='grey')


In [None]:
# Option 7: Filled density plot, "Magma" palette, fewer levels, higher threshold
# Start from an empty figure, then layer: contours for all punts, a filled
# density for the selected punter (thresh=.1 leaves the lowest-density area
# unfilled), and the red "optimal" reference dots.
plt.clf()
sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength").set(title="Punt Lengths Based on Initial Distance to Endzone")
fig = sns.kdeplot(data=temp_punts_df_1, x="startYdsToEndzone", y="kickLength",fill=True, thresh=.1, levels=10, cmap="magma").set(title=punter_id)
# `palette` only applies when a `hue` variable is mapped; `color` is what
# actually draws the dots in red.
sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')


In [None]:
name_id = punters_df.loc[:,['displayName','nflId']]


In [None]:
# punter_id = 46903
# name = name_id.loc[name_id['nflId'] == punter_id,'displayName']
# # name = name.iloc[0]['displayName']
# print(name)
# # sub_df.iloc[0]['A']

In [None]:
def punter_grapher (punter_ids, punt_starts, output_dir="figures", names=None, min_punts=3):
    """Save one PDF chart per punter comparing their punts with all punts.

    Every chart has three layers: unfilled density contours for all rows of
    ``punt_starts``, a filled density for the selected punter's own punts, and
    red reference dots taken from the module-level ``optimal_dist_to_ez`` and
    ``optimal_length`` lists.

    Args:
        punter_ids: Iterable of punter ids. Each id is matched (as a float)
            against the ``kickerId`` column and used as the PDF file name.
        punt_starts: DataFrame with ``kickerId``, ``startYdsToEndzone`` and
            ``kickLength`` columns.
        output_dir: Folder the PDFs are written to; created if it is missing.
        names: Optional mapping from punter id to the text to use as the
            chart title. Ids without an entry, or ``names=None``, fall back
            to the numeric id.
        min_punts: Fewest punts with both plotted values present that a
            punter needs before their own density layer is drawn. A PDF is
            still saved for punters below this count, just without that layer.
    """
    # savefig does not create folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    density_columns = ["startYdsToEndzone", "kickLength"]

    for punter in punter_ids:
        punter_id = float(punter)
        temp_punts_df = punt_starts.loc[punt_starts['kickerId'] == punter_id]

        # The figure is wiped at the end of every pass, so the contours for
        # all punts have to be redrawn for each punter.
        sns.kdeplot(data=punt_starts, x="startYdsToEndzone", y="kickLength")

        # A two-dimensional density cannot be estimated from a handful of
        # points, so punters with too few usable punts skip this layer.
        usable_punts = temp_punts_df.dropna(subset=density_columns)
        if len(usable_punts) >= min_punts:
            sns.kdeplot(data=temp_punts_df, x="startYdsToEndzone", y="kickLength",
                        fill=True, thresh=0, levels=10, cmap="magma")

        # `palette` only applies when a `hue` variable is mapped; `color` is
        # what actually draws the reference dots in red.
        sns.scatterplot(x=optimal_dist_to_ez, y=optimal_length, color='red')

        title = punter_id if names is None else names.get(punter, punter_id)
        plt.title(title)
        plt.savefig(os.path.join(output_dir, f"{punter}.pdf"))
        # Clear the figure so the next punter starts from a blank canvas.
        plt.clf()

        
    

In [None]:
plt.clf()
punter_grapher(punter_ids, punt_starts)