# NFL Punt Safety Analysis
Carlin Eng

**Head on over to Youtube for a narrated version of this analysis: https://youtu.be/EHAPOUm1IG4**  
**Presentation slides are in [this Google Doc](https://docs.google.com/presentation/d/1GKqfaTn7BzZydY3__NUnrmFJmYPXIdO506Vg-FnUWCg/edit?usp=sharing)**

I am proposing the following rule changes:  

1. Amend Rule 9, Article 2, to allow ineligible receivers on the line of scrimmage to advance more than 1 yard beyond the line of scrimmage prior to the kick.
2. Require the punter to set up at least 15 yards behind the line of scrimmage when receiving the snap.
3. Change pre-season rules to eliminate punt returns. All punts during the preseason should be fair caught, downed, or out of bounds.

The kernel below shows the analysis that led me to these proposals.

In [None]:
# kaggle/python docker image: https://github.com/kaggle/docker-python
import os
import datetime

import numpy as np
import pandas as pd
import holoviews as hv
hv.extension('bokeh')

from scipy import stats

For data manipulation and exploratory analysis, I prefer to use SQL.

I've loaded the data provided into a local Postgresql instance, and run the following queries to create the "plays_categorized" dataset:

```sql
-- Lookup table: one row per punt-formation role code, tagged with the unit it plays on
CREATE TABLE nfl_punt.role_types (
	role varchar(32)
	, role_type varchar(32)
);

-- Hand-maintained mapping of every role code to either the 'return' or the 'coverage' unit
INSERT INTO nfl_punt.role_types (role, role_type) VALUES
	-- return unit: returner and fullback
	('PR', 'return')
	, ('PFB', 'return')
	-- return unit: PDL* / PDR* roles
	, ('PDL1', 'return'), ('PDL2', 'return'), ('PDL3', 'return')
	, ('PDL4', 'return'), ('PDL5', 'return'), ('PDL6', 'return')
	, ('PDR1', 'return'), ('PDR2', 'return'), ('PDR3', 'return')
	, ('PDR4', 'return'), ('PDR5', 'return'), ('PDR6', 'return')
	-- return unit: PLL* / PLM* / PLR* roles
	, ('PLL', 'return'), ('PLL1', 'return'), ('PLL2', 'return'), ('PLL3', 'return')
	, ('PLM', 'return'), ('PLM1', 'return')
	, ('PLR', 'return'), ('PLR1', 'return'), ('PLR2', 'return'), ('PLR3', 'return')
	-- return unit: VL* / VR* roles
	, ('VL', 'return'), ('VLi', 'return'), ('VLo', 'return')
	, ('VR', 'return'), ('VRi', 'return'), ('VRo', 'return')

	-- coverage unit: punter, PC and long snapper
	, ('P', 'coverage')
	, ('PC', 'coverage')
	, ('PLS', 'coverage')
	-- coverage unit: PLG/PLT/PLW and PRG/PRT/PRW roles
	, ('PLG', 'coverage'), ('PLT', 'coverage'), ('PLW', 'coverage')
	, ('PRG', 'coverage'), ('PRT', 'coverage'), ('PRW', 'coverage')
	-- coverage unit: PPR* / PPL* roles
	, ('PPR', 'coverage'), ('PPRi', 'coverage'), ('PPRo', 'coverage')
	, ('PPLi', 'coverage'), ('PPLo', 'coverage')
	-- coverage unit: GL* / GR* roles
	, ('GL', 'coverage'), ('GLi', 'coverage'), ('GLo', 'coverage')
	, ('GR', 'coverage'), ('GRi', 'coverage'), ('GRo', 'coverage')
;

-- Subset of the NGS tracking rows: only plays that also appear in video_review (the injury plays)
CREATE TABLE ngs_injuries AS
SELECT
	tracking.*
FROM ngs AS tracking
INNER JOIN video_review AS review
	ON tracking.season_year = review.season_year
	AND tracking.game_key = review.game_key
	AND tracking.play_id = review.play_id
;

-- One wide row per video_review record (injury), enriched with:
--   * play context from play_information (season type, yard line, possessing team)
--   * the punt role of the injured player and of the primary partner
--   * the injured player's unit (return / coverage) from role_types
--   * the preview link of the injury footage
-- IF EXISTS lets the script run on a fresh database as well as on a re-run.
DROP TABLE IF EXISTS nfl_punt.injury_wide;
CREATE TABLE nfl_punt.injury_wide AS
SELECT
	a.season_year
	, a.game_key
	, a.play_id
	, a.gsisid
	, pi.season_type
	, pi.yard_line
	, pi.poss_team
	, a.player_activity_derived
	, a.turnover_related
	, a.primary_impact_type
	, a.primary_partner_gsisid
	, a.primary_partner_activity_derived
	, a.friendly_fire
	, role_1.role AS player_role
	, rt1.role_type AS player_role_type
	, role_2.role AS primary_partner_role
	, footage.preview_link

FROM video_review a
-- role of the injured player on this play
LEFT JOIN player_role role_1 
	ON a.season_year = role_1.season_year
	AND a.game_key = role_1.game_key
	AND a.play_id = role_1.play_id
	AND a.gsisid = role_1.gsisid
-- role of the primary partner on this play
LEFT JOIN player_role role_2
	ON a.season_year = role_2.season_year
	AND a.game_key = role_2.game_key
	AND a.play_id = role_2.play_id
	AND a.primary_partner_gsisid = role_2.gsisid
-- unit (return / coverage) of the injured player only
LEFT JOIN nfl_punt.role_types rt1
	ON role_1.role = rt1.role
-- note: the footage table names its season column "season", not "season_year"
LEFT JOIN nfl_punt.video_footage_injury footage
	ON a.season_year = footage.season
	AND a.game_key = footage.game_key
	AND a.play_id = footage.play_id
LEFT JOIN nfl_punt.play_information pi
	ON a.season_year = pi.season_year
	AND a.game_key = pi.game_key
	AND a.play_id = pi.play_id
;

-- NGS tracking rows for the injured player and for the primary partner on every injury play.
-- Each row is tagged with role = 'injured' or 'partner' and with the time elapsed since that
-- player's first tracking sample on the play.
DROP TABLE IF EXISTS ngs_injured_players;
CREATE TABLE ngs_injured_players AS
SELECT
	*
	-- 'epoch' yields the full elapsed seconds; 'seconds' would return only the 0-59 seconds
	-- field of the interval and wrap around once a track runs longer than a minute
	, DATE_PART('epoch', time - play_start) AS time_since_start
FROM (
	-- tracking rows of the injured player
	SELECT
		a.*
		, 'injured' AS role
		, MIN(a.time) OVER (PARTITION BY a.season_year, a.game_key, a.play_id, a.gsisid) AS play_start
	FROM ngs_injuries a
	JOIN injury_wide b
	ON a.season_year = b.season_year
	AND a.game_key = b.game_key
	AND a.play_id = b.play_id
	AND a.gsisid = b.gsisid
	UNION ALL
	-- tracking rows of the primary partner
	SELECT
		c.*
		, 'partner' AS role
		, MIN(c.time) OVER (PARTITION BY c.season_year, c.game_key, c.play_id, c.gsisid) AS play_start
	FROM ngs_injuries c
	JOIN injury_wide d
	ON c.season_year = d.season_year
	AND c.game_key = d.game_key
	AND c.play_id = d.play_id
	AND c.gsisid = d.primary_partner_gsisid
) z
;

-- Derive a (season, team, punter id) -> punter name mapping from the free-text play description.
-- The name is taken as the text between the last ')' and the first lowercase 'punt' in the description.
-- Penalty plays are skipped, and a name must occur more than once for a given
-- season / team / punter id to be kept.
DROP TABLE IF EXISTS nfl_punt.punter_names;
CREATE TABLE nfl_punt.punter_names AS
SELECT 
	season_year
	, poss_team
	, punter_gsisid
	, player_name
	, COUNT(*) AS nrows
FROM (
	-- the candidate name is computed once here instead of being repeated in SELECT and WHERE
	SELECT
		season_year
		, poss_team
		, punter_gsisid
		, TRIM(both ' ' FROM t[array_length(t, 1)]) AS player_name
	FROM (
		SELECT
			*
			, regexp_split_to_array(first_desc, '\)') AS t
		FROM (
			SELECT 
				pi.season_year
				, pi.season_type
				, pi.game_key
				, pi.play_id
				, pi.poss_team
				, pr.gsisid  AS punter_gsisid
				-- everything before the first 'punt' (case-sensitive) in the description
				, SPLIT_PART(play_description, 'punt', 1) AS first_desc
			FROM nfl_punt.play_information pi
			JOIN (SELECT * FROM nfl_punt.player_role WHERE role = 'P' ) pr
			ON pi.season_year = pr.season_year
			and pi.game_key = pr.game_key
			and pi.play_id = pr.play_id
			WHERE play_description NOT ILIKE '%PENALTY%'
		) a
	) b
) c

WHERE player_name != '.'
AND player_name NOT ILIKE '%fake punt%'
GROUP BY 1,2,3,4
HAVING COUNT(*) > 1
;
-- Looks like some mis-categorized data, need to do some cleanup. R.Leone is mistakenly listed as punter 31876.
-- DELETE FROM nfl_punt.punter_names WHERE punter_gsisid = 31876 AND player_name = 'R.Leone';

-- Want to create a punter stats table
-- In each season:
-- n_punts, n_injuries

-- NGS tracking rows for punters only: rows whose player holds role 'P' on the play,
-- labelled with the punter name derived in punter_names.
DROP TABLE IF EXISTS nfl_punt.punter_ngs;
CREATE TABLE nfl_punt.punter_ngs AS
SELECT
	ngs.*
	, names.player_name
FROM nfl_punt.ngs ngs
-- explicit INNER JOIN: the previous LEFT JOIN + "WHERE role.role = 'P'" discarded
-- unmatched rows anyway, so it already behaved as an inner join
INNER JOIN nfl_punt.player_role role
	ON ngs.season_year = role.season_year
	AND ngs.game_key = role.game_key
	AND ngs.play_id = role.play_id
	AND ngs.gsisid = role.gsisid
	AND role.role = 'P'
-- one gsisid per name: if a name maps to several ids, the largest id is used
INNER JOIN (SELECT player_name, MAX(punter_gsisid) AS punter_gsisid FROM nfl_punt.punter_names GROUP BY 1) names
	ON ngs.gsisid = names.punter_gsisid
;

CREATE INDEX ON nfl_punt.punter_ngs (season_year, game_key, play_id, gsisid);

-- One row per punt (per punter tracking series): pivots the NGS event markers into
-- one timestamp column per event, and flags plays that appear in injury_wide.
DROP TABLE IF EXISTS nfl_punt.punts_summarized;
CREATE TABLE nfl_punt.punts_summarized AS
SELECT
	a.season_year
	, a.game_key
	, c.season_type
	, a.play_id
	, a.player_name
	, a.gsisid
	-- 1 when the play has at least one injury row, else 0
	, CASE WHEN b.gsisid IS NOT NULL THEN 1 ELSE 0 END AS injury
	-- MAX over a CASE picks the timestamp of the row carrying the given event (NULL if the event never occurs)
	, MAX(CASE WHEN a.event = 'line_set' THEN a.time ELSE NULL END) AS line_set_time
	, MAX(CASE WHEN a.event = 'ball_snap' THEN a.time ELSE NULL END) AS snap_time
	, MAX(CASE WHEN a.event = 'punt' THEN a.time ELSE NULL END) AS punt_time
	, MAX(CASE WHEN a.event = 'punt_received' THEN a.time ELSE NULL END) AS receive_time
	, MAX(CASE WHEN a.event = 'tackle' THEN a.time ELSE NULL END) AS tackle_time
	, MAX(CASE WHEN a.event = 'first_contact' THEN a.time ELSE NULL END) AS first_contact_time
	, MAX(CASE WHEN a.event = 'punt_downed' THEN a.time ELSE NULL END) AS punt_downed_time
	, MAX(CASE WHEN a.event = 'fair_catch' THEN a.time ELSE NULL END) AS fair_catch_time
	, MAX(CASE WHEN a.event = 'out_of_bounds' THEN a.time ELSE NULL END) AS out_of_bounds_time
	, MAX(CASE WHEN a.event = 'play_submit' THEN a.time ELSE NULL END) AS submit_time

FROM nfl_punt.punter_ngs a
LEFT JOIN nfl_punt.injury_wide b
	ON a.season_year = b.season_year
	AND a.game_key = b.game_key
	AND a.play_id = b.play_id
LEFT JOIN nfl_punt.game_data c 
	ON a.season_year = c.season_year
	AND a.game_key = c.game_key
-- positions 1-7 are the non-aggregated columns, including the injury flag
GROUP BY 1,2,3,4,5,6,7
;

-- For each punt, record where the punter (P) and the long snapper (PLS) were at the moment of the snap
-- (rows are restricted to the 'ball_snap' event below).
DROP TABLE IF EXISTS nfl_punt.punter_location;
CREATE TABLE nfl_punt.punter_location AS
SELECT
	ngs.season_year
	, ngs.game_key
	, ngs.play_id
	, ngs.time AS snap_time
	-- season_type comes from play_information (alias pi); the alias "pc" used before does not exist in this query
	, pi.season_type
	, MAX(CASE WHEN role.role = 'P' THEN ngs.x ELSE NULL END) AS p_x
	, MAX(CASE WHEN role.role = 'P' THEN ngs.y ELSE NULL END) AS p_y
	, MAX(CASE WHEN role.role = 'PLS' THEN ngs.x ELSE NULL END) AS ls_x
	-- ls_y must be taken from y (it was previously a second copy of x)
	, MAX(CASE WHEN role.role = 'PLS' THEN ngs.y ELSE NULL END) AS ls_y

FROM nfl_punt.ngs ngs
LEFT JOIN nfl_punt.player_role role
	ON ngs.season_year = role.season_year
	AND ngs.game_key = role.game_key
	AND ngs.play_id = role.play_id
	AND ngs.gsisid = role.gsisid
LEFT JOIN nfl_punt.play_information pi
	ON ngs.season_year = pi.season_year
	AND ngs.game_key = pi.game_key
	AND ngs.play_id = pi.play_id
WHERE role.role IN ('P', 'PLS')
AND ngs.event = 'ball_snap'
-- group only by the five non-aggregated columns; aggregates are not allowed in GROUP BY
GROUP BY 1,2,3,4,5
;

-- For each punt, capture the tracking row of the punt returner (PR) at the moment the ball was
-- received / fair caught / downed / ruled a touchback
-- event = 'punt_received', 'fair_catch', 'punt_downed', 'touchback'
DROP TABLE IF EXISTS nfl_punt.returner_location;
CREATE TABLE nfl_punt.returner_location AS
SELECT
	ngs.season_year
	, c.season_type
	, ngs.game_key
	, ngs.play_id
	, ngs.gsisid AS returner_gsisid
	, d.gsisid AS punter_gsisid
	, ngs.time
	, ngs.event
	, ngs.x
	, ngs.y
	, ngs.dis
	, ngs.o
	, ngs.dir
	-- 1 when the play has at least one injury row, else 0
	, CASE WHEN b.gsisid IS NOT NULL THEN 1 ELSE 0 END AS injury

FROM nfl_punt.ngs ngs
JOIN (SELECT * FROM nfl_punt.player_role WHERE role = 'PR') r
	ON ngs.season_year = r.season_year
	AND ngs.game_key = r.game_key
	AND ngs.play_id = r.play_id
	AND ngs.gsisid = r.gsisid
LEFT JOIN nfl_punt.injury_wide b
	ON ngs.season_year = b.season_year
	AND ngs.game_key = b.game_key
	AND ngs.play_id = b.play_id
LEFT JOIN nfl_punt.game_data c 
	ON ngs.season_year = c.season_year
	AND ngs.game_key = c.game_key
LEFT JOIN nfl_punt.punts_summarized d
	ON ngs.season_year = d.season_year
	AND ngs.game_key = d.game_key
	AND ngs.play_id = d.play_id
WHERE ngs.event IN ('punt_received', 'fair_catch', 'punt_downed', 'touchback')
;


-- Categorize each play using the description field
-- Include a lot of other relevant columns calculated above
DROP TABLE IF EXISTS nfl_punt.plays_categorized;
CREATE TABLE nfl_punt.plays_categorized AS
SELECT

	a.season_year
	, a.season_type
	, a.game_key
	, a.game_date
	, a.week
	, a.play_id
	, a.game_clock
	, a.yard_line
	, a.quarter
	, a.play_type
	, a.poss_team
	, a.home_team_visit_team
	, a.score_home_visiting
	, a.punt_outcome
	, c.gsisid AS punter_gsisid
	, a.name_match[1] AS punter_name
	, a.yards_punted[1]::integer AS yards_punted
	, CASE WHEN b.play_id IS NOT NULL THEN 1 ELSE 0 END AS injury
	, b.gsisid AS injured_gsisid
	, d.player_numbers AS injured_numbers
	, b.primary_impact_type
	, b.player_activity_derived
	, b.primary_partner_gsisid
	, b.primary_partner_activity_derived
	, b.friendly_fire
	, b.player_role
	, b.player_role_type
	, b.primary_partner_role
	, b.preview_link

	-- event timestamps from punts_summarized
	, e.line_set_time
	, e.snap_time
	, e.punt_time
	, e.receive_time
	, e.tackle_time
	, e.first_contact_time
	, e.punt_downed_time
	, e.fair_catch_time
	, e.out_of_bounds_time
	, e.submit_time

	-- punter / long snapper coordinates at the snap, from punter_location
	, f.p_x AS punter_x
	, f.p_y AS punter_y
	-- longsnap_x must come from ls_x (it was previously mapped from ls_y)
	, f.ls_x AS longsnap_x
	, f.ls_y AS longsnap_y

FROM (
	SELECT
		*
		-- First matching pattern wins, so the order of the WHEN branches matters.
		-- Descriptions that match nothing fall through unchanged (ELSE play_description).
		, CASE WHEN play_description ILIKE '%Touchback%' THEN 'touchback'
			WHEN play_description ILIKE '%punts%' AND play_description ILIKE '%out of bounds%' THEN 'out_of_bounds'
			WHEN play_description ILIKE '%fair catch%' THEN 'fair_catch'
			WHEN play_description ILIKE '%downed%' THEN 'downed'
			WHEN play_description ILIKE '%MUFFS catch%' THEN 'muffs_catch'
			WHEN play_description ILIKE '%Direct snap%' THEN 'direct_snap'
			WHEN play_description ILIKE '%punts%' AND play_description ILIKE '%for __ yards%' THEN 'catch_and_run'
			WHEN play_description ILIKE '%punts%' AND play_description ILIKE '%for _ yard%' THEN 'catch_and_run'
			WHEN play_description ILIKE '%punts%' AND play_description ILIKE '%for no gain%' THEN 'catch_and_run'

			WHEN play_description ILIKE '%false start%' THEN 'false_start'
			WHEN play_description ILIKE '%PENALTY%' THEN 'penalty'
			WHEN play_description ILIKE '%pass incomplete%' THEN 'pass_incomplete'
			WHEN play_description ILIKE '%pass%' THEN 'pass'
			WHEN play_description ILIKE '%fake%' THEN 'fake_punt'
			WHEN play_description ILIKE '%BLOCKED%' THEN 'blocked'
			WHEN play_description ILIKE '%FUMBLES%' THEN 'fumbled'
			WHEN play_description ILIKE '%safety%' THEN 'safety'
			-- NOTE(review): '%right%' is broader than '%left end%'; possibly '%right end%' was intended - confirm
			WHEN play_description ILIKE '%up the middle%' OR play_description ILIKE '%left end%' OR play_description ILIKE '%right%' THEN 'fake_punt'

			ELSE play_description
			END AS punt_outcome
		-- punter name: an initial, a dot, then word characters/apostrophes, searched in the text before 'punts'
		, REGEXP_MATCHES(SPLIT_PART(play_description, 'punts', 1), '([\w]\.[\w'']*)') AS name_match
		-- punt distance: the token between 'punts ' and ' yards'
		, REGEXP_MATCHES(play_description, 'punts ([\w]*) yards') AS yards_punted
	FROM nfl_punt.play_information
) a

LEFT JOIN nfl_punt.injury_wide b
	ON a.season_year = b.season_year
	AND a.game_key = b.game_key
	AND a.play_id = b.play_id
LEFT JOIN nfl_punt.player_role c
	ON a.season_year = c.season_year
	AND a.game_key = c.game_key
	AND a.play_id = c.play_id
-- all jersey numbers recorded for the injured player, comma-separated
LEFT JOIN (SELECT gsisid, STRING_AGG(number, ',') AS player_numbers FROM nfl_punt.player_punt_data GROUP BY 1) d
	ON b.gsisid = d.gsisid
LEFT JOIN nfl_punt.punts_summarized e
	ON a.season_year = e.season_year
	AND a.game_key = e.game_key
	AND a.play_id = e.play_id
LEFT JOIN nfl_punt.punter_location f
	ON a.season_year = f.season_year
	AND a.game_key = f.game_key
	AND a.play_id = f.play_id
-- keep plays with a plausible extracted punter name, and only the player_role row of the punter
WHERE LENGTH( a.name_match[1] ) > 2
AND c.role = 'P'
;

CREATE INDEX ON nfl_punt.plays_categorized(season_year, game_key, play_id);

-- Summarize stats for each punter by year: number of punts, number of punts with an injury,
-- a breakdown by punt outcome, and injury plays split by the injured player's unit.
-- play_id is only unique within a game, so every distinct count is taken over
-- (game_key, play_id); counting play_id alone merges plays from different games.
DROP TABLE IF EXISTS nfl_punt.punter_stats_by_year;
CREATE TABLE nfl_punt.punter_stats_by_year AS
SELECT
	ngs.season_year
	, ngs.gsisid
	, ngs.player_name
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) AS n_punts
	-- injury plays: plays with at least one matching injury_wide row
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE inj.play_id IS NOT NULL) AS n_injuries
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.punt_outcome = 'catch_and_run') AS num_catch_and_run
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.punt_outcome = 'fair_catch') AS num_fair_catch
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.punt_outcome = 'out_of_bounds') AS num_out_of_bounds
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.punt_outcome = 'downed') AS num_downed
	-- plays without a plays_categorized match have a NULL outcome and are not counted here
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.punt_outcome NOT IN ('catch_and_run', 'fair_catch', 'out_of_bounds', 'downed')) AS num_other_outcome
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.player_role_type = 'coverage') AS num_coverage_injured
	, COUNT(DISTINCT (ngs.game_key, ngs.play_id)) FILTER (WHERE pc.player_role_type = 'return') AS num_return_injured

FROM nfl_punt.punter_ngs ngs
LEFT JOIN nfl_punt.injury_wide inj
	ON ngs.season_year = inj.season_year
	AND ngs.game_key = inj.game_key
	AND ngs.play_id = inj.play_id
LEFT JOIN nfl_punt.plays_categorized pc
	ON ngs.season_year = pc.season_year
	AND ngs.game_key = pc.game_key
	AND ngs.play_id = pc.play_id
GROUP BY 1,2,3
;

-- Career totals per punter: sums of the per-year stats plus the overall injury rate
-- (injury plays divided by punts).
DROP TABLE IF EXISTS nfl_punt.punter_stats;
CREATE TABLE nfl_punt.punter_stats AS
SELECT
	gsisid
	, player_name
	, SUM(n_punts) AS total_punts
	, SUM(n_injuries) AS total_injuries
	-- NULLIF yields a NULL rate instead of a division-by-zero error if a punter has no punts
	, SUM(n_injuries)::float / NULLIF(SUM(n_punts), 0) AS inj_rate
	, SUM(num_catch_and_run) AS num_catch_and_run
	, SUM(num_fair_catch) AS num_fair_catch
	, SUM(num_out_of_bounds) AS num_out_of_bounds
	, SUM(num_downed) AS num_downed
	, SUM(num_other_outcome) AS num_other_outcome
	, SUM(num_coverage_injured) AS num_coverage_injured
	, SUM(num_return_injured) AS num_return_injured
FROM nfl_punt.punter_stats_by_year
GROUP BY 1,2
;
```

In [None]:
# Load the pipe-delimited export of the plays_categorized table built by the SQL above
# (the table was produced on a local Postgres server).
plays = pd.read_csv("../input/punt-plays-categorized/plays_categorized.csv", delimiter='|')

Create some filters to help with our analysis:

In [None]:
# Boolean row masks over `plays`, reused by the cells below
where_kern = plays['punter_name'].eq('B.Kern')
not_kern = plays['punter_name'].ne('B.Kern')
where_hekker = plays['punter_name'].eq('J.Hekker')
did_punt = plays['punt_time'].notna()    # a punt event timestamp was recorded for the play
is_injury = plays['injury'].eq(1)
not_injury = plays['injury'].eq(0)

Of all the punts in the list, how many injuries were there?

In [None]:
# Row count per value of the injury flag
plays.groupby('injury')[['game_key']].count()

What were the players doing when they suffered a concussion?
Most players were either tackling (13/37), being blocked (10/37) or blocking (8/37).
Tackled players only made up 6/37 of the injured players.

In [None]:
# Injury rows broken down by what the injured player was doing
plays[is_injury].groupby('player_activity_derived')[['game_key']].count()

Let's take a look at injured players by role:
"role_type" was manually mapped from punt position to one of either coverage or return.

In [None]:
# Injury rows broken down by the injured player's unit (coverage vs. return)
plays[is_injury].groupby('player_role_type')[['game_key']].count()

27 of the 37 concussions were suffered by those on the coverage unit!

This is somewhat counter-intuitive. I would have expected the "tackled" player to suffer concussions most often. I suspect this is related to player's general awareness of their situation. A punt returner is expecting to get tackled, and can often brace themselves for impact. On the other hand, the coverage unit is usually fixated on the returner, or the football, and has higher potential to be blindsided by a blocker.

Of the 37 injuries, how do they break down in terms of the outcome of the punt? E.g., whether the punt resulted in a fair catch, or the returner caught the ball and tried to return it. These were categorized using the "description" field. The logic is in the attached SQL.

In [None]:
# Injury rows broken down by the categorized outcome of the punt
plays[is_injury].groupby('punt_outcome')[['game_key']].count()

Using concussion data that the NFL published via its [Play Smart Play Safe](https://www.playsmartplaysafe.com/newsroom/reports/2017-injury-data/) site: 

* In 2016, there were
  * 45 preseason in-game concussions
  * 172 regular+postseason in-game concussions
* In 2017, there were
  * 46 preseason in-game concussions
  * 189 regular+postseason in-game concussions
* 91 total pre-season
* 361 total regular+postseason

In [None]:
# Injury rows broken down by season type
plays[is_injury].groupby('season_type')[['game_key']].count()

Preseason makes up about 19.3% of all games in a given season (used here as a proxy for the share of plays), but accounts for 32.4% (12 of 37) of the punt-play concussions in this dataset!

In [None]:
# Games per season, by season type
preseason_games = 64.0
regular_season_games = 256
postseason_games = 11

# Share of all games that are preseason games
print(preseason_games / sum((preseason_games, regular_season_games, postseason_games)))
# Share of the 37 punt concussions that happened in preseason (12 preseason, 25 otherwise)
print(12 / (12 + 25.0))

12 preseason punt concussions, 25 regular season punt concussions
91 - 12 = 79 preseason non-punt concussions, 361 - 25 = 336 regular season non-punt concussions

We can run a [chi squared test](https://en.wikipedia.org/wiki/Chi-squared_test) to see if there is an interaction effect between punts and concussions, i.e., are preseason punts just as risky as regular season punts?

"Expected" values for each cell in the 2x2 season-type vs. play-type table are computed by taking (row total) * (column total / grand total)

In [None]:
# Observed concussion counts for the 2x2 table: season type (pre / regular) x play type (punt / non-punt)
pre_punt, pre_nonpunt = 12, 79
reg_punt, reg_nonpunt = 25, 336

# Marginal totals
pre_total = pre_punt + pre_nonpunt
reg_total = reg_punt + reg_nonpunt
punt_total = pre_punt + reg_punt
nonpunt_total = pre_nonpunt + reg_nonpunt
grand_total = pre_punt + reg_punt + pre_nonpunt + reg_nonpunt

observed = [pre_punt, pre_nonpunt, reg_punt, reg_nonpunt]
# Expected count per cell = (row total) * (column total) / (grand total),
# listed in the same cell order as `observed`
expected = [col_total * row_total / float(grand_total)
            for row_total in (pre_total, reg_total)
            for col_total in (punt_total, nonpunt_total)]
# ddof=2 reduces the default k - 1 = 3 degrees of freedom to 1
stats.chisquare(f_obs=observed, f_exp=expected, ddof=2)



P-value is 0.052 - doesn't cross the magic 0.05 p-value threshold, but definitely worth collecting more data.

Punter names were computed using the play_description field in the SQL above.
Here I ask the question, "Do some punters' punts cause more injuries than others?"

In [None]:
# Per-punter injury summary: number of injury rows, number of rows (punts) and injury rate,
# showing the 25 punters with the most injuries.
# The nested-dict renaming form of .agg() was removed from pandas (it raises SpecificationError),
# so aggregate with a plain list of functions and name the resulting columns explicitly.
punter_injuries = plays.groupby('punter_name')['injury'].agg(['sum', 'size', 'mean'])
punter_injuries.columns = ['num_injuries', 'total_punts', 'injury_rate']
punter_injuries.sort_values('num_injuries', ascending=False).head(25)

It looks like some punters' punts are associated with more injuries than others.
Let's compare two Pro Bowlers: Brett Kern (2017, 2018), with 204 punts and 5 players injured during his plays,
and Johnny Hekker (2013, 2015-2017), with 209 punts and 0 players injured during his plays.

In [None]:
# Plays with a recorded punt, split by punter: Kern, Hekker, and everyone except Kern
punts_kern = plays.loc[did_punt & where_kern]
punts_hekker = plays.loc[did_punt & where_hekker]
punts_not_kern = plays.loc[did_punt & not_kern]

How do Brett Kern and Johnny Hekker compare in terms of set-up distance from line of scrimmage?
To tell this, we compute the X location of the punter and the X location of the snapper at the time of the snap

In [None]:
# Absolute x-distance between punter and long snapper at the snap, per group of punts
kern_snap_punt_distance = (punts_kern.punter_x - punts_kern.longsnap_x).abs()
hekker_snap_punt_distance = (punts_hekker.punter_x - punts_hekker.longsnap_x).abs()
not_kern_snap_punt_distance = (punts_not_kern.punter_x - punts_not_kern.longsnap_x).abs()

# One-element lists so the densities can be concatenated into an Overlay later
kern_distance_density = [hv.Distribution(kern_snap_punt_distance, label='Kern')]
hekker_distance_density = [hv.Distribution(hekker_snap_punt_distance, label='Hekker')]
not_kern_distance_density = [hv.Distribution(not_kern_snap_punt_distance, label='Not Kern')]

In [None]:
%%opts Overlay [width=600 legend_position='right']
overlay = hv.Overlay(kern_distance_density + hekker_distance_density)
overlay

On a typical play, Brett Kern sets up about 13.3 yards away from the longsnapper
Hekker is 13.6 yards away, 0.3 yards farther.

In [None]:
# Mean set-up distance for each punter, and the gap between them
kern_mean_distance = kern_snap_punt_distance.mean()
hekker_mean_distance = hekker_snap_punt_distance.mean()
print('Brett Kern mean distance from long snapper: {0}'.format(kern_mean_distance))
print('Johnny Hekker mean distance from long snapper: {0}'.format(hekker_mean_distance))
print('Difference: {0}'.format(hekker_mean_distance - kern_mean_distance))

Using a two-sample t-test, this result is statistically significant:

In [None]:
# Two-sample t-test without assuming equal variances; NaN distances are dropped from both samples
stats.ttest_ind(
    a=kern_snap_punt_distance.dropna(),
    b=hekker_snap_punt_distance.dropna(),
    equal_var=False,
)

How does Kern compare to the rest of punters in general?

In [None]:
%%opts Overlay [width=600 legend_position='right']
overlay = hv.Overlay(kern_distance_density + not_kern_distance_density)
overlay

Compared with the punting population in general, Kern again sets up closer to the long snapper than average (by about 0.28 yards), and again this result is statistically significant.

In [None]:
# Mean set-up distance: Kern vs. every other punter, followed by the two-sample t-test
kern_mean_distance = kern_snap_punt_distance.mean()
rest_mean_distance = not_kern_snap_punt_distance.mean()
print('Brett Kern mean distance from long snapper: {0}'.format(kern_mean_distance))
print('Rest of punters mean distance from long snapper: {0}'.format(rest_mean_distance))
print('Difference: {0}'.format(rest_mean_distance - kern_mean_distance))
print(stats.ttest_ind(
    a=kern_snap_punt_distance.dropna(),
    b=not_kern_snap_punt_distance.dropna(),
    equal_var=False,
))

How about time from snap to punt? How do Kern and Hekker compare, and how does Kern compare to the rest of punters as a group?

In [None]:
# Seconds elapsed between snap and punt, for Kern, Hekker, and everyone except Kern
kern_snap_punt = (
    pd.to_datetime(plays.loc[did_punt & where_kern, 'punt_time'])
    - pd.to_datetime(plays.loc[did_punt & where_kern, 'snap_time'])
).dt.total_seconds()
hekker_snap_punt = (
    pd.to_datetime(plays.loc[did_punt & where_hekker, 'punt_time'])
    - pd.to_datetime(plays.loc[did_punt & where_hekker, 'snap_time'])
).dt.total_seconds()
not_kern_snap_punt = (
    pd.to_datetime(plays.loc[did_punt & not_kern, 'punt_time'])
    - pd.to_datetime(plays.loc[did_punt & not_kern, 'snap_time'])
).dt.total_seconds()

# One-element lists so the densities can be concatenated into an Overlay later
kern_snap_punt_time = [hv.Distribution(kern_snap_punt, label='Kern')]
hekker_snap_punt_time = [hv.Distribution(hekker_snap_punt, label='Hekker')]
not_kern_snap_punt_time = [hv.Distribution(not_kern_snap_punt, label='Not Kern')]

Kern typically gets his punts off in 1.98 seconds. Hekker in 2.12 seconds. Difference again is statistically significant:

In [None]:
# Mean snap-to-punt time for each punter, and the gap between them
print('Brett Kern time from snap to punt: {0}'.format(kern_snap_punt.mean()))
print('Johnny Hekker time from snap to punt: {0}'.format(hekker_snap_punt.mean()))
print('Difference in time from snap to punt: {0}'.format(hekker_snap_punt.mean() - kern_snap_punt.mean()))

# Two-sample t-test without assuming equal variances.
# NaNs are dropped from BOTH samples: a single NaN in either one makes the statistic NaN.
print(stats.ttest_ind(kern_snap_punt.dropna(), hekker_snap_punt.dropna(), equal_var=False))

In [None]:
%%opts Overlay [width=600 legend_position='right']
overlay = hv.Overlay(kern_snap_punt_time + hekker_snap_punt_time)
overlay

Again, comparing Kern to all other punters in the league, Kern is about 0.11 seconds faster than all the other punters

In [None]:
# Mean snap-to-punt time of all other punters, Kern's gap to them, and the two-sample t-test
rest_mean_time = not_kern_snap_punt.mean()
print('All punters but Kern: {0}'.format(rest_mean_time))
print('Difference in time from snap to punt: {0}'.format(kern_snap_punt.mean() - rest_mean_time))
print(stats.ttest_ind(
    a=kern_snap_punt.dropna(),
    b=not_kern_snap_punt.dropna(),
    equal_var=False,
))

In [None]:
%%opts Overlay [width=600 legend_position='right']
overlay = hv.Overlay(kern_snap_punt_time + not_kern_snap_punt_time)
overlay

Rather than compare punters, let's compare injury vs non-injury punts. As before, we'll look at distance from longsnapper to punter, and time from snap to punt

In [None]:
# Plays with a recorded punt, split by whether an injury occurred on the play
punts_injury = plays.loc[is_injury & did_punt]
punts_non_injury = plays.loc[not_injury & did_punt]

# Distance: absolute x-distance between punter and long snapper at the snap
injury_snap_punt_distance = (punts_injury.punter_x - punts_injury.longsnap_x).abs()
non_injury_snap_punt_distance = (punts_non_injury.punter_x - punts_non_injury.longsnap_x).abs()
injury_distance_density = [hv.Distribution(injury_snap_punt_distance, label='injury')]
non_injury_distance_density = [hv.Distribution(non_injury_snap_punt_distance, label='not injury')]

# Time: seconds elapsed between snap and punt
injury_snap_punt_time = (
    pd.to_datetime(punts_injury.punt_time) - pd.to_datetime(punts_injury.snap_time)
).dt.total_seconds()
non_injury_snap_punt_time = (
    pd.to_datetime(punts_non_injury.punt_time) - pd.to_datetime(punts_non_injury.snap_time)
).dt.total_seconds()
injury_time_density = [hv.Distribution(injury_snap_punt_time, label='injury')]
non_injury_time_density = [hv.Distribution(non_injury_snap_punt_time, label='not injury')]

Comparing injury punts vs non-injury punts. Distance from long-snapper to punter:

In [None]:
%%opts Overlay [width=600 legend_position='right']

overlay = hv.Overlay(injury_distance_density + non_injury_distance_density)
overlay

In this case, it doesn't look like distance from snap to punt changes much from injury vs non-injury:

In [None]:
# Two-sample t-test without assuming equal variances; NaN distances are dropped from both samples
stats.ttest_ind(
    a=injury_snap_punt_distance.dropna(),
    b=non_injury_snap_punt_distance.dropna(),
    equal_var=False,
)

Comparing injury punts vs non-injury punts. Time from snap to punt:

In [None]:
%%opts Overlay [width=600 legend_position='right']

overlay = hv.Overlay(injury_time_density + non_injury_time_density)
overlay

Time from snap to punt, injury vs. non-injury. Statistically significant at p=0.05

In [None]:
# Mean snap-to-punt time for injury and non-injury punts, their gap, and the two-sample t-test
injury_mean_time = injury_snap_punt_time.mean()
non_injury_mean_time = non_injury_snap_punt_time.mean()
print('Injury punts, time from snap to punt: {0} seconds'.format(injury_mean_time))
print('Non-injury punts, time from snap to punt: {0} seconds'.format(non_injury_mean_time))
print('Non-injury punts were {0} seconds slower from snap to punt'.format(non_injury_mean_time - injury_mean_time))
print(stats.ttest_ind(
    a=injury_snap_punt_time.dropna(),
    b=non_injury_snap_punt_time.dropna(),
    equal_var=False,
))

Are time from snap to punt, and distance from longsnapper to punter related? We can run a regression, and compare the relationship:

In [None]:
# Keep only punts where every input to the regression is present.
# The distance is computed from longsnap_x, so that is the column that must be non-null
# (the filter previously checked longsnap_y instead).
valid_punts = plays[did_punt & plays.punter_x.notnull() & plays.longsnap_x.notnull() & plays.punt_time.notnull() & plays.snap_time.notnull()]
# Seconds elapsed between snap and punt
snap_punt_time = (pd.to_datetime(valid_punts.punt_time) - pd.to_datetime(valid_punts.snap_time)).dt.total_seconds()
# Absolute x-distance between punter and long snapper at the snap
snap_punt_distance =  abs(valid_punts.punter_x - valid_punts.longsnap_x)
# Linear regression of snap-to-punt time on set-up distance
print(stats.linregress(x=snap_punt_distance, y=snap_punt_time))

These two variables are definitely correlated (rvalue = 0.24, and p-value <<< 0.05).
Let's plot to see what it looks like.

This means if we want to increase time from snap to punt, we should be able to achieve this 
by increasing distance from snap to punt.

In [None]:
%%opts Scatter [width=600]

slope, intercept, r_value, p_value, std_err = stats.linregress(x=snap_punt_distance, y=snap_punt_time)
fitted_x = np.linspace(7, 16, 200)
fitted_y = fitted_x * slope + intercept

observed = [hv.Scatter(list(zip(snap_punt_distance, snap_punt_time)), label = 'Observed')]
fitted = [hv.Scatter(list(zip(fitted_x,fitted_y)), label = 'Fitted')]
overlay = hv.Overlay(observed + fitted)
overlay


Using the analysis presented above, I propose the following rule changes:
1. Amend Rule 9, Article 2, to allow ineligible receivers on the line of scrimmage to advance more than 1 yard beyond the line of scrimmage prior to the kick.
  * The goal is to increase the likelihood that the coverage unit makes it downfield to the returner before the returner has a chance to start a punt return for yardage, thus increasing the chance of a fair catch or downed ball, which are less likely to cause injuries.
2. Require the punter to set up at least 15 yards behind the line of scrimmage when receiving the snap.
  * The goal is to slow down the punter, and again, give the coverage unit a better chance at forcing a fair catch or downed ball.
3. Change pre-season rules to eliminate punt returns. All punts during the preseason should be fair caught, downed, or out of bounds.
  * Because pre-season looks potentially more dangerous, we could take steps to address this part specifically.