In [1]:
# importing pandas and numpy

import pandas as pd

import numpy as np

In [2]:
# reading the play-by-play workbook that sits next to this notebook
# into the working frame `df`

df = pd.read_excel(io='./data.xlsx')

In [3]:
# previewing the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,play_id,game_id,home_team,away_team,possession_team,game_date,time,yrdln,desc,play_type,yards_gained
0,46,2009091000,PIT,TEN,PIT,2009-09-10,15:00,TEN 30,R.Bironas kicks 67 yards from TEN 30 to PIT 3....,kickoff,0.0
1,68,2009091000,PIT,TEN,PIT,2009-09-10,14:53,PIT 42,(14:53) B.Roethlisberger pass short left to H....,pass,5.0
2,92,2009091000,PIT,TEN,PIT,2009-09-10,14:16,PIT 47,(14:16) W.Parker right end to PIT 44 for -3 ya...,run,-3.0
3,113,2009091000,PIT,TEN,PIT,2009-09-10,13:35,PIT 44,(13:35) (Shotgun) B.Roethlisberger pass incomp...,pass,0.0
4,139,2009091000,PIT,TEN,PIT,2009-09-10,13:27,PIT 44,(13:27) (Punt formation) D.Sepulveda punts 54 ...,punt,0.0


In [4]:
# parsing game_date into pandas datetimes so the .dt accessor can be used on it;
# assign swaps the column in place of the old one and keeps the column order

df = df.assign(game_date=pd.to_datetime(df['game_date']))

In [5]:
# adding a game_year column (calendar year of game_date) as the last column

df = df.assign(game_year=df['game_date'].dt.year)

In [6]:
# keeping only the rows that have a play type recorded

df = df[df['play_type'].notna()]

In [66]:
# keeping only plays whose type is exactly 'pass' or 'run';
# isin compares whole values in a single pass, whereas str.contains does a
# substring (regex) search and would also keep any play type that merely
# contains one of those words

df_filtered = df[df['play_type'].isin(['pass', 'run'])]

In [110]:
# cross-tabulating yards gained: one row per game_year, one column per play_type,
# each cell holding the summed yards_gained;
# the margin adds a 'total_yards_gained' column (kept) and a row of the same
# name (dropped straight away, only the per-year totals are wanted)

df_pivoted = (
    df_filtered
    .pivot_table(values='yards_gained',
                 index='game_year',
                 columns='play_type',
                 aggfunc='sum',
                 margins=True,
                 margins_name='total_yards_gained')
    .drop(labels='total_yards_gained', axis='index')
)

df_pivoted

play_type,pass,run,total_yards_gained
game_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009,105629.0,56062.0,161691.0
2010,113251.0,59422.0,172673.0
2011,116813.0,60060.0,176873.0
2012,125637.0,63464.0,189101.0
2013,119526.0,57762.0,177288.0
2014,121212.0,57280.0,178492.0
2015,117213.0,52665.0,169878.0
2016,123646.0,56080.0,179726.0
2017,122153.0,60117.0,182270.0
2018,114119.0,55283.0,169402.0


In [112]:
# share of each year's total yards that came from run plays, as a percentage
# rounded to two decimals; the column label has a single space after '%'
# (it previously carried an accidental double space, which made the column
# awkward to reference by name)

df_pivoted['% Yards Gained by Run'] = (
    df_pivoted['run'] / df_pivoted['total_yards_gained'] * 100
).round(2)

df_pivoted

play_type,pass,run,total_yards_gained,% Yards Gained by Run
game_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009,105629.0,56062.0,161691.0,34.67
2010,113251.0,59422.0,172673.0,34.41
2011,116813.0,60060.0,176873.0,33.96
2012,125637.0,63464.0,189101.0,33.56
2013,119526.0,57762.0,177288.0,32.58
2014,121212.0,57280.0,178492.0,32.09
2015,117213.0,52665.0,169878.0,31.0
2016,123646.0,56080.0,179726.0,31.2
2017,122153.0,60117.0,182270.0,32.98
2018,114119.0,55283.0,169402.0,32.63


In [114]:
# keeping only run plays: exact comparison on play_type rather than a
# substring search, so a play type that merely contains 'run' is not included

df_filtered2 = df[df['play_type'] == 'run']

df_filtered2.head()

Unnamed: 0,play_id,game_id,home_team,away_team,possession_team,game_date,time,yrdln,desc,play_type,yards_gained,game_year
2,92,2009091000,PIT,TEN,PIT,2009-09-10,14:16,PIT 47,(14:16) W.Parker right end to PIT 44 for -3 ya...,run,-3.0,2009
5,162,2009091000,PIT,TEN,TEN,2009-09-10,13:16,TEN 2,(13:16) C.Johnson up the middle to TEN 2 for n...,run,0.0,2009
7,207,2009091000,PIT,TEN,TEN,2009-09-10,12:11,TEN 6,(12:11) (Shotgun) C.Johnson left end to TEN 4 ...,run,-2.0,2009
11,301,2009091000,PIT,TEN,PIT,2009-09-10,10:21,TEN 30,(10:21) W.Parker right guard to TEN 31 for -1 ...,run,-1.0,2009
15,393,2009091000,PIT,TEN,TEN,2009-09-10,08:20,TEN 11,(8:20) C.Johnson right end to TEN 43 for 32 ya...,run,32.0,2009


In [126]:
# number of run plays per possession team, reduced to the three teams
# with the highest counts

(
    df_filtered2
    .groupby('possession_team')
    .size()
    .reset_index(name='count')
    .nlargest(n=3, columns='count')
)

Unnamed: 0,possession_team,count
24,NYJ,4665
4,CAR,4551
12,HOU,4545
