# Exploring NFL special teams data for seasons 2018-20

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import os, gc, re, warnings
from wordcloud import WordCloud, STOPWORDS
# from IPython.html import widgets
# from IPython.display import display
warnings.filterwarnings("ignore")

# Play data

In [None]:
# Load the play-level table and add the absolute pre-snap score margin
plays = pd.read_csv('../input/nfl-big-data-bowl-2022/plays.csv')
score_margin = plays['preSnapHomeScore'] - plays['preSnapVisitorScore']
plays['scoreDiff'] = score_margin.abs()
plays.head(2)

In [None]:
# 5x2 grid of overview charts for the play-level columns
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(15, 20))
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9, ax10 = axes.ravel()

# row 1: kick length and return yardage (missing return yardage dropped)
plays['kickLength'].plot.hist(bins=50, title='Kick length', grid=True, ax=ax1)
return_yards = plays['kickReturnYardage'].dropna()
return_yards.plot.hist(bins=50, title='Return result (yds)', grid=True, ax=ax2)

# row 2: play result and yards to go
plays['playResult'].plot.hist(bins=50, title='Play result (yds)', grid=True, ax=ax3)
plays['yardsToGo'].plot.hist(bins=20, title='Yards to go at play start', grid=True, ax=ax4)

# row 3: penalties
plays['penaltyYards'].plot.hist(title='Penalty yards', grid=True, ax=ax5)
top_penalties = plays['penaltyCodes'].value_counts()[:10]
top_penalties.plot.bar(title='Penalty codes (top 10)', ax=ax6)

# row 4: play type and special teams result
play_type_counts = plays['specialTeamsPlayType'].value_counts()
play_type_counts.plot.bar(title='Play type', ax=ax7)
result_counts = plays['specialTeamsResult'].value_counts()
result_counts.plot.bar(title='Play result breakdown', ax=ax8)

# row 5: pass results (missing values dropped) and field position
pass_counts = plays['passResult'].dropna().value_counts()
pass_counts.plot.bar(title='Pass result breakdown', ax=ax9)
plays['yardlineNumber'].plot.hist(bins=20, title='Where plays happen (yardline #)', grid=True, ax=ax10)

plt.tight_layout()

In [None]:
# Side-by-side pie charts: share of plays by down and by quarter
fig, pie_axes = plt.subplots(1, 2, figsize=(15, 15))
ax1, ax2 = pie_axes

down_counts = plays['down'].value_counts()
quarter_counts = plays['quarter'].value_counts()

down_counts.plot.pie(title='Down when plays happen', ax=ax1)
quarter_counts.plot.pie(title='Quarter when plays happen', ax=ax2)
plt.tight_layout()

In [None]:
stopwords = set(STOPWORDS)

# Build the word cloud input in a single join instead of growing a string
# inside a loop. Each description is converted with str() (so a missing
# description becomes the literal text "nan"), lower-cased, and has its
# whitespace runs collapsed to single spaces.
comment_words = " ".join(
    " ".join(str(description).lower().split())
    for description in plays.playDescription
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

# plot WordCloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Play description word cloud')
plt.tight_layout(pad=0)

plt.show()

# Scout data

In [None]:
# Load the PFF scouting table and preview its first two rows
scout = pd.read_csv('../input/nfl-big-data-bowl-2022/PFFScoutingData.csv')
scout.head(2)

In [None]:
# 2x3 grid of overview charts for the scouting columns
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))
ax1, ax2, ax3, ax4, ax5, ax6 = axes.ravel()

# top row: hang time, kick type, kick direction
scout['hangTime'].plot.hist(bins=20, grid=True, title='Hangtime (seconds)', ax=ax1)
kick_types = scout['kickType'].dropna().value_counts()
kick_types.plot.bar(title='Kick type', ax=ax2)
kick_directions = scout['kickDirectionActual'].dropna().value_counts()
kick_directions.plot.bar(title='Kick direction', ax=ax3)

# bottom row: snap time, kick contact type, return direction
snap_times = scout['snapTime'].dropna()
snap_times.plot.hist(bins=20, grid=True, title='Snap time', ax=ax4)
contact_types = scout['kickContactType'].dropna().value_counts()
contact_types.plot.bar(title='Kick contact type', ax=ax5)
return_directions = scout['returnDirectionActual'].dropna().value_counts()
return_directions.plot.bar(title='Return direction', ax=ax6)

plt.tight_layout()

Merge scout and play data.

In [None]:
# Left-join the scouting columns onto the plays, matching on play and game id
play_scout = plays.merge(scout, how='left', on=['playId', 'gameId'])

# Keep only the integer and float columns
num_play_scout = play_scout.select_dtypes(include=['int', 'float'])

print('Numeric columns:')
for column_name in num_play_scout.columns:
    print(f'-{column_name}')

In [None]:
# Numeric columns included in the correlation matrix
corr_columns = [
    'quarter', 'down', 'yardsToGo', 'yardlineNumber',
    'penaltyYards', 'preSnapHomeScore', 'preSnapVisitorScore',
    'kickLength', 'kickReturnYardage', 'playResult',
    'absoluteYardlineNumber', 'snapTime', 'operationTime',
    'hangTime',
]
corr_df = num_play_scout[corr_columns]
corr = corr_df.corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(19, 10))
sns.heatmap(corr, annot=True)
plt.title('Plays-Scout data correlation heatmap')
plt.show()

In [None]:
# creates linear regression plots
def regress(input1, input2):
    '''
    Scatter-plot two columns of play_scout with a fitted regression line.

    Rows where either column is missing are dropped before fitting. The
    fitted line and its equation are drawn on the chart, and the r-squared
    of the fit is printed.

    input1: name of the play_scout column used as the x variable
    input2: name of the play_scout column used as the y variable
    '''
    pairs = play_scout[[input1, input2]].dropna(how='any')
    x = np.asarray(pairs[input1], dtype=np.float64)
    y = np.asarray(pairs[input2], dtype=np.float64)

    # calculates linear regression
    fit = st.linregress(x, y)
    regress_values = x * fit.slope + fit.intercept

    # plots scatter plot and regression
    plt.figure(figsize=(10, 8))
    plt.scatter(x, y)
    plt.plot(x, regress_values, "r-")

    # annotates graph with equation
    line_eq = "y = " + str(round(fit.slope, 2)) + "x + " + str(round(fit.intercept, 2))
    plt.annotate(line_eq, xy=(x.min(), y.min()), fontsize=15, color="red")
    plt.xlabel(input1)
    plt.ylabel(input2)
    plt.title(f'{input1} vs. {input2}')

    # linregress reports the correlation coefficient r; square it to get
    # the r-squared value that the message announces
    print(f'The r-squared is: {fit.rvalue ** 2}')

regress('kickReturnYardage','hangTime')

# Player data

In [None]:
# Load the player table and preview its first two rows
players = pd.read_csv('../input/nfl-big-data-bowl-2022/players.csv')
players.head(2)

Convert height to feet

In [None]:
# Express every height in feet (rounded to 2 decimals) in a new 'feet' column.
# Presumably heights come either as "feet-inches" or as a bare inch count:
# a leading number above 8 is divided by 12 -- verify against the raw data.
height_parts = players['height'].str.split('-', expand=True)
leading_number = height_parts[0].astype('int')
trailing_inches = height_parts[1].astype('float').fillna(0.0)

base_feet = np.where(leading_number > 8, leading_number / 12, leading_number)
players['feet'] = (base_feet + trailing_inches / 12).round(2)
players.head(2)

In [None]:
# Histograms of player height and weight, side by side
fig, hist_axes = plt.subplots(1, 2, figsize=(10, 4))
ax1, ax2 = hist_axes

panels = (
    ('feet', 10, 'Player height (ft)', ax1),
    ('weight', 20, 'Player weight (lbs)', ax2),
)
for column, n_bins, chart_title, target_ax in panels:
    players[column].plot.hist(bins=n_bins, grid=True, title=chart_title, ax=target_ax)
plt.tight_layout()

In [None]:
# Mean weight per position, heaviest first
avg_weight = players.groupby('Position')['weight'].mean()
avg_weight = avg_weight.sort_values(ascending=False)
avg_weight.plot.bar(
    figsize=(15, 5),
    title='Avg. player weight (lbs) by position',
    grid=True,
    ylim=(150, 325),
)
plt.tight_layout()

In [None]:
# Mean height per position, tallest first
avg_height = players.groupby('Position')['feet'].mean()
avg_height = avg_height.sort_values(ascending=False)
avg_height.plot.bar(
    figsize=(15, 5),
    title='Avg. player height (ft) by position',
    grid=True,
    ylim=(5, 7),
)
plt.tight_layout()

# Game data

In [None]:
# Load the game-level table and preview its first two rows.
# Only `games` is bound here; the tracking data gets its own `track`
# variable when it is loaded.
games = pd.read_csv('../input/nfl-big-data-bowl-2022/games.csv')
games.head(2)

# Tracking data

In [None]:
# Load the 2018 tracking file and preview its first two rows
track = pd.read_csv('../input/nfl-big-data-bowl-2022/tracking2018.csv')
track.head(2)

In [None]:
# List the distinct values found in the tracking 'event' column
print('Tracking events:')
track.event.unique()

In [None]:
# Replace the 'time' column with an integer 'ts' column.
# NOTE(review): the division by 10 ** 9 assumes the parsed datetimes are
# stored at nanosecond resolution, giving whole epoch seconds (any
# sub-second part is discarded) -- confirm this is the intended precision.
parsed_time_ints = pd.to_datetime(track['time']).values.astype(np.int64)
track['ts'] = parsed_time_ints // 10 ** 9
track = track.drop(columns=['time'])
track.head(2)

In [None]:
# messing with aggregations
def _net_change(values):
    '''Return the last value of a group minus its first value.'''
    return values.iat[-1] - values.iat[0]


# One row per player per play. gameId is part of the key because playId
# values repeat from game to game (plays are matched on gameId + playId
# everywhere else in this notebook).
track.groupby(['gameId', 'playId', 'nflId']).agg({
    'x': _net_change,                    # x pos difference
    'y': _net_change,                    # y pos difference
    's': 'mean',                         # avg speed
    'dis': 'sum',                        # total dist
    'o': 'mean',                         # avg orientation
    'dir': 'mean',                       # avg direction
    'frameId': 'last',                   # last frame id of the group
    'ts': lambda x: x.max() - x.min(),   # play time
    'position': 'first',
    'team': 'first',
    'playDirection': 'first',
    'event': 'first',
})

## Eye on the ball

In [None]:
# Keep only the tracking rows for the football and attach the play columns
football_rows = track[track['team'] == 'football']
ball_df = football_rows.merge(plays, how='left', on=['gameId', 'playId'])

In [None]:
# One panel per special teams play type: mean ball speed per frame,
# limited to the first 100 frames
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

panels = (
    ('Kickoff', ax1),
    ('Punt', ax2),
    ('Field Goal', ax3),
    ('Extra Point', ax4),
)
for play_type, panel in panels:
    of_type = ball_df[ball_df['specialTeamsPlayType'] == play_type]
    mean_speed = of_type.groupby('frameId')['s'].mean().iloc[:100]
    mean_speed.plot.line(
        figsize=(10, 6),
        title=f'Avg. ball speed on {play_type.lower()} plays (first 100 frames)',
        ylabel='speed',
        ax=panel,
    )
plt.tight_layout()

In [None]:
def ball_speed_plotter(play, game):
    '''
    Plot the ball speed over a single play and mark the frames where
    events occur.

    play: playId of the play to plot
    game: gameId of the game the play belongs to

    Returns an error string when ball_df has no rows for that play in that
    game; otherwise draws the chart, prints the events, and returns None.
    '''
    # Match on both ids: a playId can exist in a different game
    in_play = (ball_df.playId == play) & (ball_df.gameId == game)
    if not in_play.any():
        return 'Error: Play number does not exist.'

    temp_df = ball_df.loc[in_play].sort_values('frameId')

    # Plot speed against frameId so the x axis agrees with the event
    # markers below, which are positioned by frameId
    temp_df.set_index('frameId')['s'].plot.line(
        figsize=(15, 8),
        title=f'Game {game}, play {play} ball speed and events',
        xlabel='Frame id',
        ylabel='Ball speed')

    # Leave room below zero for the event labels
    plt.gca().set_ylim(bottom=-3)
    print('Play events')
    print('------')
    # A frame carries an event when the value is present and not the
    # placeholder string 'None'
    has_event = temp_df.event.notna() & (temp_df.event != 'None')
    events = temp_df.loc[has_event, ['event', 'frameId']]
    for event, frame_id in events.itertuples(index=False):
        print(f"-{event} at frame {frame_id}")
        plt.axvline(x=frame_id, color='r', alpha=.4)
        plt.annotate(event, xy=(frame_id, -2), color='r')

ball_speed_plotter(36, 2018123000)

In [None]:
# Ball speed and event markers for play 892 of game 2018123000
ball_speed_plotter(892,2018123000)

In [None]:
# Ball speed and event markers for play 373 of game 2018123000
ball_speed_plotter(373,2018123000)

In [None]:
# ball_df is not used past this point: drop the name and run a
# garbage-collection pass (ball_speed_plotter cannot be called after this)
del ball_df
gc.collect()

## Fake plays

In [None]:
# assemble df of fake plays
# One tracking file is loaded at a time and reduced to the rows belonging
# to fake plays before the next file is read.
all_fakes = {}
for year in ('2018', '2019', '2020'):
    print(f'Loading {year} data....')
    df = pd.read_csv(f'../input/nfl-big-data-bowl-2022/tracking{year}.csv')
    print('Filtering fake play data....')
    # na=False: a frame whose event is missing is not a fake-play frame
    # (without it a missing event would make the mask unusable for indexing)
    is_fake_frame = df.event.str.contains('fake', na=False)
    # playId values repeat from game to game, so a play is identified by
    # the (gameId, playId) pair rather than by playId alone
    fake_play_keys = df.loc[is_fake_frame, ['gameId', 'playId']].drop_duplicates()
    all_fakes[year] = df.merge(fake_play_keys, how='inner', on=['gameId', 'playId'])
    print('Freeing memory....')
    del df
    gc.collect()
    print('Done.')

# pd.concat replaces DataFrame.append, which newer pandas no longer provides
fake_df = pd.concat(
    [all_fakes['2018'], all_fakes['2019'], all_fakes['2020']],
    ignore_index=True,
)

# fake_df.to_csv('all_fake_plays.csv')

print(f'\nShape of fake_df: {fake_df.shape}\n')
fake_df.head(2)

Merge fake df with player, scout, and play data.

In [None]:
# Enrich the fake-play tracking rows with play/scout, game and player columns
player_columns = players[['nflId', 'feet', 'weight', 'birthDate', 'collegeName']]
merged = (
    fake_df
    .merge(play_scout, how='left', on=['gameId', 'playId'])
    .merge(games, how='left', on='gameId')
    .merge(player_columns, how='left', on='nflId')
)

del fake_df
gc.collect()

# Grouping key used by the charts that follow: "<playId>_<season>".
# NOTE(review): playId may repeat across games within one season, in which
# case distinct plays would share a key -- confirm whether gameId should be
# part of it.
play_id_text = merged['playId'].astype(str)
season_text = merged['season'].astype(str)
merged["playSeason"] = play_id_text.str.cat(season_text, sep='_')

print(f'Shape of merged df: {merged.shape}')

In [None]:
# Fake-play overview charts, one value per play (plays keyed by playSeason)
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(15, 8))

by_play = merged.groupby('playSeason')
# First non-null value of every column per play; computed once and reused
first_rows = by_play.first()

first_rows.season.value_counts().plot.bar(title='Fake plays by season', ax=ax1)
by_play['frameId'].max().plot.hist(grid=True, title='Number of frames in fake plays', ax=ax2)
first_rows.playDirection.value_counts().plot.bar(title='Fake play directions', ax=ax3)

# First event of each play other than the 'None' placeholder and the snap
other_events = merged.loc[~merged.event.isin(['None', 'ball_snap'])]
first_other_event = other_events.groupby('playSeason')['event'].first()
first_other_event.value_counts().head(20).plot.bar(
    title='Most common fake play "events" (top 20)', ax=ax4)

# na=False: rows with a missing event are not fake-play frames (without it
# a missing event would make the mask unusable for indexing)
fake_frames = merged.loc[merged.event.str.contains('fake', na=False)]
fake_frames.groupby('playSeason')['frameId'].mean().plot.hist(
    bins=15, grid=True, title='Frame id when fake play takes place', ax=ax5)

first_rows.scoreDiff.plot.hist(grid=True, title='Score diff at time of fake play', ax=ax6)
plt.tight_layout()

In [None]:
# Second grid of fake-play charts, again one value per play
fig, chart_axes = plt.subplots(2, 3, figsize=(15, 8))
ax1, ax2, ax3, ax4, ax5, ax6 = chart_axes.ravel()

by_play = merged.groupby('playSeason')

# numeric distributions
yardlines = by_play['yardlineNumber'].first()
yardlines.plot.hist(bins=15, grid=True, title='Yardline No. where fake play takes place', ax=ax1)
play_results = by_play['playResult'].first()
play_results.plot.hist(bins=10, grid=True, title='Fake play results', ax=ax2)

# categorical breakdowns
kick_type_counts = by_play['kickType'].first().value_counts()
kick_type_counts.plot.bar(title='Fake play kick types', ax=ax3)
snap_detail_counts = by_play['snapDetail'].first().value_counts()
snap_detail_counts.plot.pie(title='Snap target', ax=ax4)
team_counts = by_play['possessionTeam'].first().value_counts()
team_counts.plot.bar(title='Teams who do fake plays', ax=ax5)

# plays per week, in week order
per_week = by_play['week'].first().value_counts().sort_index()
per_week.plot.line(title='Fake plays by week', ax=ax6)

plt.tight_layout()