Code from: 
  https://medium.com/@thiscuriousquest/graphing-nfl-running-back-production-by-age-using-python-part-1-61072802ea0f
  https://medium.com/@thiscuriousquest/graphing-nfl-running-back-production-by-age-2-e2fbc35919a4

Data from:
  https://www.pro-football-reference.com/


In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
def top_RB_per_year(year, position="RB"):
    """Helper function to scrape RB stats by year"""
    
    #position dictionary with indexing
    pos ={"RB":['/rushing.htm',   1, 2, ["RB", "rb",""]],
          "RR":['/receiving.htm', 0, 1, ["RB", "rb",""]]}
    
    #get html
    base_url= "https://www.pro-football-reference.com/years/"
    
    html = requests.get(base_url+year+pos[position][0]).text
    page = soup(html, features='lxml')
    rows = page.findAll('tr')
    
    #get column headers
    cols = [i.getText() for i in rows[pos[position][1]].findAll('th')]    
    
    #get table of stats
    stats = []
    stat_rows = rows[pos[position][2]:] 
    for row in range(len(stat_rows)):
        stats.append([i.getText() for i in stat_rows[row].findAll('td')])
        
    season_stats = pd.DataFrame(stats, columns=cols[1:])
    season_stats = season_stats[season_stats["Pos"].isin(pos[position][3])]
    
    return season_stats

In [3]:
rush_stats = top_RB_per_year('2020')
rec_stats = top_RB_per_year('2020', 'RR')

print("----- Results: -----\n")

print(rush_stats.iloc[:5, :10])
    
print("\n")

print(rec_stats.iloc[:5, :10])
    
print("\n---- End Results ---")

IndexError: list index out of range

In [None]:
def join_rb_season_df(year):
    temp_rush = top_RB_per_year(year, 'RB')
    temp_rec = top_RB_per_year(year, 'RR')
    total_prod = pd.merge(temp_rush, temp_rec, 
                          on=['Player', 'Tm', 'Age', 'Pos', 'G', 'GS'], 
                          how='left', suffixes=('_rush', '_rec'))
    total_prod['year'] = year
    return total_prod

In [None]:

list_of_rb_dfs = []
past_ten_years=[str(i) for i in range(2011,2021)]

for year in past_ten_years:
    total_prod = join_rb_season_df(year)
    list_of_rb_dfs.append(total_prod[:50])

for i in list_of_rb_dfs:
    print(i.iloc[0:2, 0:8])

In [None]:
cols = ["Player", "Age", "Yds_rush", "TD_rush", "Fmb_rush", 
        "Rec", "Yds_rec", "TD_rec", "Fmb_rec", "year"]

df = (pd.concat(list_of_rb_dfs, ignore_index=True)
      .filter(items=cols))
df[cols[1:-1]] = df[cols[1:-1]].apply(pd.to_numeric)

In [None]:
def calc_fant_points(x):
    total = ((x['Yds_rush'] *  0.1) + 
             (x['TD_rush']  *  6) + 
             (x['Fmb_rush'] * -1) +
             (x['Yds_rec']  *  0.1) + 
             (x['Rec']      *  0.5) +
             (x['TD_rec']   *  6) + 
             (x['Fmb_rec']  * -1))
    return total

In [None]:
df['total_pts'] = df.apply(calc_fant_points, axis=1)
print(df.loc[df['year'] == '2020', ["Player", "Age", "total_pts"]][:5])

In [None]:
age_prod_df = df.groupby(['Age']).mean()
age_prod_df.index = pd.to_numeric(age_prod_df.index)

print(age_prod_df.iloc[:,[0,1,4,5,7]])

In [None]:
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(11.7,8.27)})

fig, ax = plt.subplots()
sns.lineplot(x=age_prod_df.index, y=age_prod_df['total_pts'], 
             label='Mean pt production for top 50 RBs')
sns.regplot(x=age_prod_df.index, y=age_prod_df['total_pts'], 
            ci=75, order=2, ax=ax)

plt.legend()
plt.xlabel('Age')
plt.ylabel('Avg point production')
plt.title('Age impact on fantasy production for RB')

plt.show()

In [4]:
def top_pos_per_year(year, position):
    """Helper function to scrape pos stats by year"""
    
    #position dictionary with indexing
    pos ={"QB":['/passing.htm', 0, 1, ["QB", "qb", ""]],
         "RB":['/rushing.htm', 1, 2, ["RB","rb",""]],
         "RR":['/receiving.htm', 0, 1, ["RB","rb",""]],
         "AR":['/receiving.htm', 0, 1, ["RB","rb","WR","wb","TE","te",""]],
         "WR":['/receiving.htm', 0, 1, ["WR", "wr", ""]],
         "TE":['/receiving.htm', 0, 1, ["TE", "te", ""]]}
    
    #get html
    base_url= "https://www.pro-football-reference.com/years/"
    
    html = requests.get(base_url+year+pos[position][0]).text
    page = soup(html, features='lxml')
    rows = page.findAll('tr')
    
    #get column headers
    cols = [i.getText() for i in rows[pos[position][1]].findAll('th')]    
    
    #get table of stats
    stats = []
    stat_rows = rows[pos[position][2]:] 
    for row in range(len(stat_rows)):
        stats.append([i.getText() for i in stat_rows[row].findAll('td')])
        
    season_stats = pd.DataFrame(stats, columns=cols[1:])
    season_stats = season_stats[season_stats["Pos"].isin(pos[position][3])]
    
    return season_stats

In [5]:
list_of_wr_dfs = []
past_ten_years=[str(i) for i in range(2011,2021)]

for year in past_ten_years:
    total_prod = top_pos_per_year(year, "WR")
    total_prod["year"] = year
    list_of_wr_dfs.append(total_prod[:50])
    
cols = ["Player", "Age", "Rec", "Yds", "TD", "Fmb", "year"]

#drop the second Yds column
df = pd.concat(list_of_wr_dfs, ignore_index=True)
#df = df.loc[:,~df.columns.duplicated()].filter(items=cols)

df[cols[1:-1]] = df[cols[1:-1]].apply(pd.to_numeric)

IndexError: list index out of range

In [6]:
list_of_wr_dfs = []
past_ten_years=[str(i) for i in range(2011,2021)]

for year in past_ten_years:
    total_prod = top_pos_per_year(year, "WR")
    total_prod["year"] = year
    list_of_wr_dfs.append(total_prod[:50])
    
cols = ["Player", "Age", "Rec", "Yds", "TD", "Fmb", "year"]

#drop the second Yds column
df = pd.concat(list_of_wr_dfs, ignore_index=True)
#df = df.loc[:,~df.columns.duplicated()].filter(items=cols)

df[cols[1:-1]] = df[cols[1:-1]].apply(pd.to_numeric)

IndexError: list index out of range

In [None]:
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(11.7,8.27)})

fig, ax = plt.subplots()
sns.lineplot(x=age_prod_df.index, y=age_prod_df['total_pts'], 
             label='Mean pt production for top 50 WRs')
sns.regplot(x=age_prod_df.index, y=age_prod_df['total_pts'], 
            ci=75, order=2, ax=ax)

plt.legend()
plt.xlabel('Age')
plt.ylabel('Avg point production')
plt.title('Age impact on fantasy production for WR')

plt.show()