In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found beneath the Kaggle input directory and echo
# its full path, one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Games data

Load the Games data to answer questions like:
- How many games are in the data set?
- How many games from each season?

The answers will be used to put special-team penalties in perspective (penalties per game, penalty yards per game, etc.).

In [None]:
# Read the games table and preview its first rows.
games_csv_path = '/kaggle/input/nfl-big-data-bowl-2022/games.csv'
game_data = pd.read_csv(games_csv_path)
game_data.head(5)

In [None]:
# Print the (rows, columns) dimensions of the games table, then count the
# non-null gameId values within each season.
n_game_rows, n_game_columns = game_data.shape
print((n_game_rows, n_game_columns))
games_by_season = game_data.groupby('season')
games_by_season['gameId'].count()

### Plays data

Load the Plays data set to answer questions like:
- [How many total special-teams plays are in the data set?](#plays_q1)
- [How many penalties were called?](#plays_q2)
- [What proportion of special-teams plays resulted in a penalty?](#plays_q3)
- [What are the most penalized teams in the NFL?](#plays_q4)
- [Which types of penalties are the most common on special-teams plays?](#plays_q5)
- [What type of penalty is most common for each play type?](#plays_q6)
- [Which play type resulted in the most penalty yards?](#plays_q7)
- [What is the distribution of penalty yards for each play type?](#plays_q8)

In [None]:
# Read the plays table and preview its first rows.
plays_csv_path = '/kaggle/input/nfl-big-data-bowl-2022/plays.csv'
play_data = pd.read_csv(plays_csv_path)
play_data.head(5)

<a id='plays_q1'></a>
#### How many total special-teams plays are in the data set?

In [None]:
# Print the (rows, columns) dimensions of the plays table.
n_play_rows, n_play_columns = play_data.shape
print((n_play_rows, n_play_columns))

There are 19,979 total special-teams plays in the data set.

In [None]:
# Summary statistics for the plays table using pandas' default describe().
play_data_summary = play_data.describe()
play_data_summary

In [None]:
# Columns summarised separately from the default describe() call
# (presumably the text-valued ones - confirm against the loaded dtypes).
obj_cols = [
    'playDescription',
    'possessionTeam',
    'specialTeamsPlayType',
    'specialTeamsResult',
    'penaltyCodes',
    'penaltyJerseyNumbers',
    'passResult',
]
play_data.loc[:, obj_cols].describe()

<a id='plays_q2'></a>
#### How many penalties were called?

In [None]:
# Keep only the plays that have a penalty code recorded, restricted to the
# columns needed for the penalty analysis.
penalty_cols = ['playDescription', 'specialTeamsPlayType', 'possessionTeam',
                'penaltyCodes', 'penaltyYards', 'penaltyJerseyNumbers']
# .copy() makes penalty_df an independent frame: later cells add and overwrite
# columns on it, which would otherwise be writes into a selection taken from
# play_data and can trigger pandas' SettingWithCopyWarning.
penalty_df = play_data.loc[play_data['penaltyCodes'].notnull(), penalty_cols].copy()
penalty_df

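At least one penalty was called on 1,102 of the 19,979 plays. Because a single play can carry several penalty codes, the number of individual penalties is somewhat higher.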

<a id='plays_q3'></a>
#### What proportion of special-teams plays resulted in a penalty?

In [None]:
# Horizontal bar chart of how many plays of each special-teams type appear
# in the full plays table.
play_type_totals = play_data['specialTeamsPlayType'].value_counts()
play_type_totals.plot.barh(title="Total special-teams plays by type", figsize=(6,3))

In [None]:
# Horizontal bar chart of how many penalized plays fall under each play type.
penalized_play_totals = penalty_df['specialTeamsPlayType'].value_counts()
penalized_play_totals.plot.barh(title="Special-teams penalties by type", figsize=(6,3))

In [None]:
# Share of plays with a penalty, per play type: penalized-play counts divided
# by total play counts (the two Series are aligned on the play-type label).
penalized_by_type = penalty_df['specialTeamsPlayType'].value_counts()
plays_by_type = play_data['specialTeamsPlayType'].value_counts()
penalty_rate_by_type = penalized_by_type.div(plays_by_type)
penalty_rate_by_type.plot.barh(title="Proportion of penalties by play type", figsize=(6,3))

Since 1,102 of the 19,979 plays in the data set drew at least one penalty, about 5.5% of special-teams plays overall result in a penalty. Here's how it breaks down by play type:
- Over 10% of punt plays in the past three seasons have resulted in a penalty.
- Less than 5% of kickoffs have resulted in penalties.
- Less than 2% of field goal and extra point attempts result in penalties.

<a id='plays_q4'></a>
#### What are the most penalized teams in the NFL?

To answer this question, we can look at the `penaltyJerseyNumbers` column, since it contains the team as a 2 or 3-letter prefix. We'll first create a new column `penaltyTeam` to isolate just the information we're interested in.

In [None]:
# Derive the penalized team from the first three characters of
# penaltyJerseyNumbers, stripping the padding left by 2-letter abbreviations.
# assign() returns a new frame instead of writing a column into a frame that
# was sliced out of play_data, so the cell is safe to re-run and does not
# trigger SettingWithCopyWarning.
# NOTE(review): only the leading team prefix is captured; if a play lists
# penalties against more than one team, the later ones are ignored - confirm
# this is acceptable for the per-team counts.
penalty_df = penalty_df.assign(
    penaltyTeam=penalty_df['penaltyJerseyNumbers'].str[0:3].str.strip()
)
penalty_df

Now to find the most penalized teams we can group by the new column and count the results.

In [None]:
# Count penalized plays per team (non-null playDescription rows), order the
# teams from fewest to most, and draw them as a horizontal bar chart.
penalties_per_team = penalty_df.groupby('penaltyTeam')['playDescription'].count()
team_penalty_df = penalties_per_team.sort_values(ascending=True)
team_penalty_ax = team_penalty_df.plot.barh(title="Special-teams penalties by Team", figsize=(12,10))
team_penalty_ax.set_xlabel('Penalties')
team_penalty_ax.set_ylabel('Team')

Jacksonville, Pittsburgh, and Cleveland had the most penalties called on special-teams plays over the last three seasons. Note that the Las Vegas Raiders had the fewest, but that's because the team moved from Oakland between seasons, so its penalties are split across two team labels. (We could combine the two by changing the labels for one or both teams if we wanted to do deeper analysis.)

Let's see which teams were most penalized in terms of total yards.

In [None]:
# Total penalty yardage per penalized team, ordered from fewest to most.
# The absolute value is taken on a temporary Series rather than written back
# into penalty_df, so the shared frame displayed in earlier cells is not
# modified and this cell gives the same result however often it is re-run.
abs_penalty_yards = penalty_df['penaltyYards'].abs()
team_penalty_yards_df = (
    abs_penalty_yards
    .groupby(penalty_df['penaltyTeam'])
    .sum()
    .sort_values(ascending=True)
)

team_penalty_yards_df.plot.barh(figsize=(12,10), title="Special-teams penalty yards by Team")
plt.xlabel('Penalty Yards')
plt.ylabel('Team')

That didn't change things by a lot. Jacksonville is still at the top, but Detroit jumped up a few spots to #2. Las Vegas, as expected, is still last.

<a id='plays_q5'></a>
#### Which types of penalties are the most common on special-teams plays?

In [None]:
# Collect the distinct penaltyCodes values, print how many there are, and
# display the array itself.
unique_penalties = pd.unique(penalty_df['penaltyCodes'])
print(unique_penalties.size)
unique_penalties

Since there are 71 different penalty codes in this dataset, and many plays resulted in multiple penalties, let's reduce the number of unique values by combining multiple calls into one group.

In [None]:
# Number of penalized plays whose penaltyCodes value contains a ';'.
has_multiple_codes = penalty_df['penaltyCodes'].str.contains(';', regex=False)
has_multiple_codes.sum()

In [None]:
# Replace every penaltyCodes value containing ';' with the single label
# 'Multiple', then chart how often each resulting code occurs.
contains_separator = penalty_df['penaltyCodes'].str.contains(';')
penalty_codes = penalty_df['penaltyCodes'].mask(contains_separator, 'Multiple')
code_frequencies = penalty_codes.value_counts()
code_frequencies.plot.bar(figsize=(12,3))

We can refer to the [NFL Rulebook](https://operations.nfl.com/the-rules/2021-nfl-rulebook/#table-of-foul-codes) to see what the most often called penalty codes mean.
- **OH** - Offensive Holding
- **IBW** - Illegal Block Above the Waist
- **UNRd** - Unnecessary Roughness
- **ILF** - Illegal Formation
- **DOF** - Defensive Offside
- **UNR** - Unnecessary Roughness

<a id='plays_q6'></a>
#### What type of penalty is most common for each play type?

In [None]:
# Tally each (play type, penalty code) pair by summing a constant 1 per row,
# then show the two most frequent penalty codes within every play type.
play_type_penalties_df = penalty_df[['specialTeamsPlayType', 'penaltyCodes']].assign(count=1)
play_type_penalty_counts = play_type_penalties_df.groupby(
    ['specialTeamsPlayType', 'penaltyCodes'], as_index=False
)['count'].sum()
ranked_penalty_counts = play_type_penalty_counts.sort_values(
    by=['specialTeamsPlayType', 'count'], ascending=[False, False]
)
ranked_penalty_counts.groupby('specialTeamsPlayType').head(2)

Offensive Holding is the most common penalty called on Punt and Kickoff plays, while Defensive Offside is most common on Field Goal and Extra Point attempts.

<a id='plays_q7'></a>
#### Which play type resulted in the most penalty yards?

In [None]:
# Work on a two-column frame with penalty yardage as absolute values, then
# report the count, total and mean yardage for each play type.
penalty_yards_df = penalty_df[['specialTeamsPlayType', 'penaltyYards']].assign(
    penaltyYards=lambda frame: frame['penaltyYards'].abs()
)

play_type_penalty_yards = (
    penalty_yards_df
    .groupby('specialTeamsPlayType')['penaltyYards']
    .agg(['count', 'sum', 'mean'])
)
play_type_penalty_yards

Punt plays resulted in by far the most penalty yards on special-teams plays with 5,730. Field Goals resulted in the fewest. We'll see next why Field Goal attempts resulted in so few yards penalized when we look at the distributions.

<a id='plays_q8'></a>
#### What is the distribution of penalty yards for each play type?

In [None]:
plt.figure(figsize=(12,6))

# Draw one filled density curve of penalty yards per play type on shared axes.
# The play types are listed explicitly so the legend order stays fixed.
for play_type in ['Extra Point', 'Field Goal', 'Kickoff', 'Punt']:
    play_type_yards = penalty_yards_df.loc[
        penalty_yards_df['specialTeamsPlayType'] == play_type, 'penaltyYards'
    ]
    # `fill=True` is the supported spelling of the deprecated `shade=True`.
    sns.kdeplot(data=play_type_yards, label=play_type, fill=True)

plt.title("Distribution of Penalty Yards by Play Type")
plt.legend()

Kickoffs and Punts both had a peak at penalties of 10 yards. Extra Point attempts had two peaks at 5 and 15 yards, while Field Goal attempts had a high number of 0-yard penalties. That's probably worth further investigation.

In [None]:
# Pull the Field Goal plays whose recorded penalty yardage is exactly zero,
# keeping the columns needed to read what happened on each play.
cols = ['playDescription', 'specialTeamsResult', 'penaltyCodes', 'penaltyYards']
is_field_goal = play_data['specialTeamsPlayType'] == 'Field Goal'
has_zero_penalty_yards = play_data['penaltyYards'] == 0.0
zero_yard_penalty = play_data.loc[is_field_goal & has_zero_penalty_yards, cols]
zero_yard_penalty

These are mostly defensive penalties where the result was 'Kick Attempt Good'. The one exception was an offensive penalty on a kick that was missed. If you read the play description, you'll see that each of these penalties was declined, which fully explains the `penaltyYards = 0.0` anomaly.