In [1]:
import nfl_data_py as nfl
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [5]:
cols = ['rusher_player_name', 'rusher_player_id','posteam','play_id','week','down', 'yardline_100', 'ydstogo' ,'yards_gained', 'ep','epa', 'rush_touchdown']

In [16]:
def top_rb(year, min_yds):
    df = nfl.import_pbp_data([year])
    roster = nfl.import_rosters([year])
    pos = roster[['player_id','position']]
    #load dfs. get roster data with player_id and position.
    
    runs = df.loc[(df['season_type'] == 'REG') & (df['play_type'] == 'run'), cols]
    #filter out regular season running plays
    
    ids = runs[['rusher_player_name', 'rusher_player_id']]
    ids = ids.rename(columns={'rusher_player_id':'player_id'})
    ids = ids.drop_duplicates()
    #get a table of ids for players
    
    top_filter = runs.groupby(['rusher_player_name','posteam'])[['yards_gained']].agg('sum')
    top_filter = top_filter.loc[top_filter['yards_gained'] >= min_yds]
    top_filter = pd.merge(top_filter,ids, on='rusher_player_name')
    top_filter = pd.merge(top_filter,pos,on='player_id')
    top_filter = top_filter[top_filter['position']=='RB']
    #get RBs who ran for 1000 or more yards in season
    
    runs_by_top_rbs = runs[runs['rusher_player_name'].isin(top_filter['rusher_player_name'])]
    #filter pbp data for only RBs that ran for 1000+ yds in season
    
    top_rbs_grouped = runs_by_top_rbs.groupby('rusher_player_name').agg(attempts=('rusher_player_id', 'count'),
                                                                    rushing_yards=('yards_gained','sum'),
                                                                    rushing_tds=('rush_touchdown', 'sum'),
                                                                    total_epa=('epa', 'sum'),
                                                                    positive_pct_epa=('epa', lambda x: (x>=0).mean()))
    #top_rbs_grouped['positive_pct_epa'] = top_rbs_grouped['positive_pct_epa'].map(format_pct)
    top_rbs_grouped['season'] = str(year)
    top_rbs_grouped['name_and_season'] = top_rbs_grouped.index + ', ' + top_rbs_grouped['season']
    #group pbp data by running back. aggregate columns to get season attempts, rushing yards, rushing touchdowns,
    #season total epa, and percentage of plays that gained positive epa
    #format percentage epa column into %
    
    return top_rbs_grouped

In [17]:
def tbl_concat(years, threshold):
    tbl = top_rb(years[0], threshold)
    for x in years[1:]:
        temp = top_rb(x, threshold)
        tbl = pd.concat([tbl, temp])
    return tbl 
    
tbl = tbl_concat([2022,2021,2020,2019,2018], 1000)
tbl.sort_values(by='positive_pct_epa',ascending=False)

2022 done.
Downcasting floats.
2021 done.
Downcasting floats.
2020 done.
Downcasting floats.
2019 done.
Downcasting floats.
2018 done.
Downcasting floats.


Unnamed: 0_level_0,attempts,rushing_yards,rushing_tds,total_epa,positive_pct_epa,season,name_and_season
rusher_player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M.Sanders,260,1269.0,11.0,12.698614,0.488462,2022,"M.Sanders, 2022"
D.Henry,380,2029.0,17.0,33.751442,0.481579,2020,"D.Henry, 2020"
D.Cook,315,1563.0,16.0,16.153913,0.47619,2020,"D.Cook, 2020"
T.Gurley,260,1255.0,17.0,21.408102,0.465385,2018,"T.Gurley, 2018"
A.Jones,213,1121.0,2.0,12.084998,0.464789,2022,"A.Jones, 2022"
P.Lindsay,193,1037.0,9.0,5.10675,0.46114,2018,"P.Lindsay, 2018"
A.Jones,202,1104.0,9.0,13.48881,0.455446,2020,"A.Jones, 2020"
A.Gibson,259,1039.0,7.0,-24.129261,0.451737,2021,"A.Gibson, 2021"
J.Taylor,255,1279.0,11.0,16.964924,0.45098,2020,"J.Taylor, 2020"
J.Jacobs,341,1653.0,12.0,-1.681085,0.442815,2022,"J.Jacobs, 2022"


In [18]:
tbl.groupby('season')['positive_pct_epa'].mean()

season
2018    0.414476
2019    0.396813
2020    0.426902
2021    0.412406
2022    0.409337
Name: positive_pct_epa, dtype: float64

In [19]:
tbl.groupby('season')['total_epa'].mean()

season
2018    -0.941937
2019   -11.455646
2020     5.372243
2021    -7.835158
2022    -8.559671
Name: total_epa, dtype: float32