In [288]:
"""Topic: Exploring and visualizing NFL statistics in pandas)
Web scraping data from Pro-Football-Reference for analysis
"""

from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np

In [289]:
def all_position_stats(year, stat):

    base_url = 'https://www.pro-football-reference.com/years/'
    
    #Create full url to scrape
    url = base_url + year + '/' + stat + '.htm'

    response = requests.get(url)
    page = response.text

    soup = BeautifulSoup(page,"lxml")

    table = soup.find('tbody')

    #Breaking down all data via rows 
    rows = [row for row in table.select('tr')]  # tr tag is for rows

    #Calling running backs
    position_dict = {}

    for row in rows[:25]:
        items = row.find_all('td')
        link = items[0].find('a')
        position_stat, url = link.text, link['href']
        position_dict[position_stat] = [url] + [i.text for i in items]

    #Create a dataframe of movies
    df_all_stats = pd.DataFrame(position_dict).T

    return df_all_stats

In [325]:
def all_rb_defense_headers(year, stat):

    base_url = 'https://www.pro-football-reference.com/years/'
    
    #Create full url to scrape
    url = base_url + year + '/' + stat + '.htm'

    response = requests.get(url)
    page = response.text

    soup = BeautifulSoup(page,"lxml")

    table_head = soup.find('thead').find_all('tr')[1]

    #Breaking down all data via rows 
    column_headers = [row.text for row in table_head.find_all('th')]  # tr tag is for rows
    column_headers[0] = 'Link'

    return column_headers

In [326]:
df_all_running_backs_2019 = all_position_stats('2019','rushing')
df_all_running_backs_2019.columns = all_rb_defense_headers('2019','rushing')

In [327]:
df_all_running_backs_2019

Unnamed: 0,Link,Player,Tm,Age,Pos,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb
Derrick Henry,/players/H/HenrDe00.htm,Derrick Henry *,TEN,25,RB,15,15,303,1540,16,73,74,5.1,102.7,5
Ezekiel Elliott,/players/E/ElliEz00.htm,Ezekiel Elliott*,DAL,24,RB,16,16,301,1357,12,78,33,4.5,84.8,3
Nick Chubb,/players/C/ChubNi00.htm,Nick Chubb*,CLE,24,RB,16,16,298,1494,8,62,88,5.0,93.4,3
Christian McCaffrey,/players/M/McCaCh01.htm,Christian McCaffrey *+,CAR,23,RB,16,16,287,1387,15,57,84,4.8,86.7,1
Chris Carson,/players/C/CarsCh00.htm,Chris Carson,SEA,25,RB,15,15,278,1230,7,75,59,4.4,82.0,7
Joe Mixon,/players/M/MixoJo00.htm,Joe Mixon,CIN,23,RB,16,15,278,1137,5,56,41,4.1,71.1,0
Leonard Fournette,/players/F/FourLe00.htm,Leonard Fournette,JAX,24,RB,15,15,265,1152,3,55,81,4.3,76.8,1
Dalvin Cook,/players/C/CookDa01.htm,Dalvin Cook*,MIN,24,RB,14,14,250,1135,13,60,75,4.5,81.1,4
Marlon Mack,/players/M/MackMa00.htm,Marlon Mack,IND,23,RB,14,12,247,1091,8,67,63,4.4,77.9,0
Sony Michel,/players/M/MichSo00.htm,Sony Michel,NWE,24,RB,16,14,247,912,7,55,26,3.7,57.0,2


In [293]:
def all_wide_receiver_headers(year, stat):

    base_url = 'https://www.pro-football-reference.com/years/'
    
    #Create full url to scrape
    url = base_url + year + '/' + stat + '.htm'

    response = requests.get(url)
    page = response.text

    soup = BeautifulSoup(page,"lxml")

    table_head = soup.find('thead').find('tr')

    #Breaking down all data via rows 
    column_headers = [row.text for row in table_head.find_all('th')]  # tr tag is for rows
    column_headers[0] = 'Link'

    return column_headers

In [294]:
df_all_wide_receivers_2019 = all_position_stats('2019','receiving')
df_all_wide_receivers_2019.columns = all_wide_receiver_headers('2019','receiving')

In [295]:
df_all_wide_receivers_2019

Unnamed: 0,Link,Player,Tm,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,1D,Lng,Y/Tgt,R/G,Y/G,Fmb
Michael Thomas,/players/T/ThomMi05.htm,Michael Thomas*+,NOR,26,WR,16,15,185,149,80.5%,1725,11.6,9,91,49,9.3,9.3,107.8,1
Christian McCaffrey,/players/M/McCaCh01.htm,Christian McCaffrey *+,CAR,23,RB,16,16,142,116,81.7%,1005,8.7,4,58,28,7.1,7.3,62.8,1
Keenan Allen,/players/A/AlleKe00.htm,Keenan Allen*,LAC,27,WR,16,16,149,104,69.8%,1199,11.5,6,63,45,8.0,6.5,74.9,0
DeAndre Hopkins,/players/H/HopkDe00.htm,DeAndre Hopkins*+,HOU,27,WR,15,15,150,104,69.3%,1165,11.2,7,68,43,7.8,6.9,77.7,0
Julian Edelman,/players/E/EdelJu00.htm,Julian Edelman,NWE,33,WR,16,13,153,100,65.4%,1117,11.2,6,54,44,7.3,6.3,69.8,3
Julio Jones,/players/J/JoneJu02.htm,Julio Jones *,ATL,30,WR,15,15,157,99,63.1%,1394,14.1,6,77,54,8.9,6.6,92.9,1
Allen Robinson,/players/R/RobiAl02.htm,Allen Robinson,CHI,26,WR,16,15,154,98,63.6%,1147,11.7,7,63,49,7.4,6.1,71.7,0
Travis Kelce,/players/K/KelcTr00.htm,Travis Kelce*,KAN,30,TE,16,16,136,97,71.3%,1229,12.7,5,65,47,9.0,6.1,76.8,1
Cooper Kupp,/players/K/KuppCo00.htm,Cooper Kupp,LAR,26,WR,16,14,134,94,70.1%,1161,12.4,10,51,66,8.7,5.9,72.6,3
Austin Ekeler,/players/E/EkelAu00.htm,Austin Ekeler,LAC,24,rb,16,8,108,92,85.2%,993,10.8,8,42,84,9.2,5.8,62.1,3


In [296]:
#Pickling
df_all_running_backs_2019.to_pickle("./df_all_running_backs_2019.pkl")

In [297]:
df_all_running_backs_2019 = pd.read_pickle("./df_all_running_backs_2019.pkl")

In [314]:
def get_running_back_df(link):
    
    """
    From Pro-Football-Reference link stub, request running back html, parse with BeautifulSoup, and
    collect 
        - Name 
        - Carries
        - Rush Yards
        - Rush Yard Per Attempt
        - Touchdowns
    Return information as a dictionary.
    """
    
    base_url = 'https://www.pro-football-reference.com'
    season = '/gamelog/2019'
    
    #Create full url to scrape
    url = base_url + link + season

    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    name = soup.find('h1').find('span').text

    table = soup.find('tbody')

    #Breaking down all data via rows via tr tag
    rows = [row for row in table.select('tr')] 

    running_backs = {}

    for row in rows:
        items = row.find_all('td')
        running_backs[row] = [i.text for i in items[:15]]

    # #Create a dataframe of running back stats
    df_running_backs = pd.DataFrame(running_backs).T.reset_index() #transpose

    df_running_backs.columns = ['Name','Date','Game', 'Week', 'Age', 'Team', '','Opp', 'Result','Game_Started',
    'Carries', 'Total_Rushing_Yards', 'Rushing_Yards_per_Attempt', 'Rushing_Touchdowns','None','None'] 

    # table_head = soup.find('thead').find_all('tr')[1]

    #Breaking down all data via rows 
    # column_headers = [row.text for row in table_head.find_all('th')[:14]]  # tr tag is for rows
    # column_headers[0] = 'Name'
    # df_running_backs.columns = column_headers

    df_running_backs['Name'] = np.where(df_running_backs['Name'], name , df_running_backs['Name'])
    df_running_backs = df_running_backs.drop(columns=['Game','Age','','Game_Started'])

    return df_running_backs

In [315]:
#creates list of all movies with features
all_running_backs = get_running_back_df(df_all_running_backs_2019.Link[0])

for link in range(len(df_all_running_backs_2019.Link)):
    all_running_backs = all_running_backs.append(get_running_back_df(df_all_running_backs_2019.Link[link]))
all_running_backs.reset_index()

Unnamed: 0,index,Name,Date,Week,Team,Opp,Result,Carries,Total_Rushing_Yards,Rushing_Yards_per_Attempt,Rushing_Touchdowns,None,None.1
0,0,Derrick Henry,2019-09-08,1,TEN,CLE,W 43-13,19,84,4.42,1,2,1
1,1,Derrick Henry,2019-09-15,2,TEN,IND,L 17-19,15,82,5.47,1,3,2
2,2,Derrick Henry,2019-09-19,3,TEN,JAX,L 7-20,17,44,2.59,1,2,1
3,3,Derrick Henry,2019-09-29,4,TEN,ATL,W 24-10,27,100,3.70,0,1,1
4,4,Derrick Henry,2019-10-06,5,TEN,BUF,L 7-14,20,78,3.90,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,9,Alvin Kamara,2019-11-28,13,NOR,ATL,W 26-18,11,61,5.55,0,8,4
389,10,Alvin Kamara,2019-12-08,14,NOR,SFO,L 46-48,13,25,1.92,0,6,4
390,11,Alvin Kamara,2019-12-16,15,NOR,IND,W 34-7,14,66,4.71,0,5,5
391,12,Alvin Kamara,2019-12-22,16,NOR,TEN,W 38-28,11,80,7.27,2,7,6


In [316]:
def get_wide_receiver_df(link):
    
    """
    From Pro-Football-Reference link stub, request wide receiver html, parse with BeautifulSoup, and
    collect 
        - Name 
        - Carries
        - Rush Yards
        - Rush Yard Per Attempt
        - Touchdowns
    Return information as a dictionary.
    """
    
    base_url = 'https://www.pro-football-reference.com'
    season = '/gamelog/2019'
    
    #Create full url to scrape
    url = base_url + link + season

    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    name = soup.find('h1').find('span').text

    table = soup.find('tbody')

    #Breaking down all data via rows via tr tag
    rows = [row for row in table.select('tr')] 

    running_backs = {}

    for row in rows:
        items = row.find_all('td')
        running_backs[row] = [i.text for i in items[:15]]

    # #Create a dataframe of running back stats
    df_wide_receivers = pd.DataFrame(running_backs).T.reset_index() #transpose

    df_wide_receivers.columns = ['Name','Date','Game', 'Week', 'Age', 'Team', '','Opp', 'Result','Game_Started',
    'Targets', 'Receptions', 'Total_Yards', 'Yards/Reception', 'Touchdowns', 'Catch%'] 

    df_wide_receivers['Name'] = np.where(df_wide_receivers['Name'], name , df_wide_receivers['Name'])
    df_wide_receivers = df_wide_receivers.drop(columns=['Game','Age','','Game_Started'])

    return df_wide_receivers

In [317]:
#creates list of all movies with features
all_wide_receivers = get_wide_receiver_df(df_all_wide_receivers_2019.Link[0])

for link in range(len(df_all_wide_receivers_2019.Link)):
    all_wide_receivers = all_wide_receivers.append(get_wide_receiver_df(df_all_wide_receivers_2019.Link[link]))
all_wide_receivers.reset_index()

Unnamed: 0,index,Name,Date,Week,Team,Opp,Result,Targets,Receptions,Total_Yards,Yards/Reception,Touchdowns,Catch%
0,0,Michael Thomas,2019-09-09,1,NOR,HOU,W 30-28,13,10,123,12.30,0,76.9%
1,1,Michael Thomas,2019-09-15,2,NOR,LAR,L 9-27,13,10,89,8.90,0,76.9%
2,2,Michael Thomas,2019-09-22,3,NOR,SEA,W 33-27,7,5,54,10.80,1,71.4%
3,3,Michael Thomas,2019-09-29,4,NOR,DAL,W 12-10,9,9,95,10.56,0,100.0%
4,4,Michael Thomas,2019-10-06,5,NOR,TAM,W 31-24,13,11,182,16.55,2,84.6%
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,10,Leonard Fournette,2019-11-24,12,JAX,TEN,L 20-42,24,97,4.04,2,12,9
396,11,Leonard Fournette,2019-12-01,13,JAX,TAM,L 11-28,14,38,2.71,0,11,9
397,12,Leonard Fournette,2019-12-08,14,JAX,LAC,L 10-45,15,50,3.33,0,6,3
398,13,Leonard Fournette,2019-12-15,15,JAX,OAK,W 20-16,15,42,2.80,0,7,5


In [328]:
df_all_defense_2019 = all_position_stats('2019', 'opp')
df_all_defense_2019.columns = all_rb_defense_headers('2019','opp')

In [329]:
df_all_defense_2019

Unnamed: 0,Link,Tm,G,PF,Yds,Ply,Y/P,TO,FL,1stD,...,Yds.1,TD,Y/A,1stD.1,Pen,Yds.2,1stPy,Sc%,TO%,EXP
New England Patriots,/teams/nwe/2019.htm,New England Patriots,16,225,4414,948,4.7,36,11,261,...,1528,7,4.2,72,107,920,39,19.4,17.3,166.75
Buffalo Bills,/teams/buf/2019.htm,Buffalo Bills,16,259,4772,985,4.8,23,9,295,...,1649,12,4.3,93,94,815,33,23.6,12.4,48.85
Baltimore Ravens,/teams/rav/2019.htm,Baltimore Ravens,16,282,4809,921,5.2,25,12,276,...,1494,12,4.4,74,97,795,39,32.9,14.6,21.61
Chicago Bears,/teams/chi/2019.htm,Chicago Bears,16,298,5186,1017,5.1,19,9,306,...,1632,16,3.9,86,113,923,30,31.3,10.6,0.85
Minnesota Vikings,/teams/min/2019.htm,Minnesota Vikings,16,303,5465,1053,5.2,31,14,324,...,1728,8,4.3,78,83,713,31,34.5,17.0,-3.88
Pittsburgh Steelers,/teams/pit/2019.htm,Pittsburgh Steelers,16,303,4866,1030,4.7,38,18,304,...,1753,7,3.8,110,115,1118,30,29.9,19.0,86.78
Kansas City Chiefs,/teams/kan/2019.htm,Kansas City Chiefs,16,308,5594,1043,5.4,23,7,344,...,2051,14,4.9,115,116,844,39,34.6,13.6,-56.69
San Francisco 49ers,/teams/sfo/2019.htm,San Francisco 49ers,16,310,4509,968,4.7,27,15,285,...,1802,11,4.5,105,116,957,30,29.0,14.2,84.41
Green Bay Packers,/teams/gnb/2019.htm,Green Bay Packers,16,313,5642,998,5.7,25,8,310,...,1921,15,4.7,99,97,968,20,34.3,14.0,-59.65
Denver Broncos,/teams/den/2019.htm,Denver Broncos,16,316,5392,1003,5.4,17,7,305,...,1783,9,4.2,96,121,1041,34,37.3,8.4,-31.98


In [364]:
def get_defense_df(link):
    
    """
    From Pro-Football-Reference link stub, request running back html, parse with BeautifulSoup, and
    collect 
        - Name 
        - Carries
        - Rush Yards
        - Rush Yard Per Attempt
        - Touchdowns
    Return information as a dictionary.
    """
    
    base_url = 'https://www.pro-football-reference.com'
    
    #Create full url to scrape
    url = base_url + link

    #Request HTML and parse
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")

    name = soup.find('h1').find_all('span')[1].text

    table = soup.find_all('tbody')[1]

    #Breaking down all data via rows via tr tag
    rows = [row for row in table.select('tr')] 

    defense = {}

    for row in rows:
        items = row.find_all('td')
        defense[row] = [i.text for i in items[:21]]

    # #Create a dataframe of running back stats
    df_defense = pd.DataFrame(defense).T.reset_index() #transpose

    df_defense.columns = ['Team','Day','Date','Time', 'Boxscore','Win/Loss','OT','Record','','Opponent','Points',
    'Opponent Score','1st Downs','Total_Yards','Pass_Yards','Rush_Yards','TurnOvers','1stDowns','Defense_Total_Yards','Defense_Pass_Yards','Defense_Rush_Yards','Turnovers'] 

    df_defense['Team'] = np.where(df_defense['Team'], name , df_defense['Team'])
    df_defense = df_defense.drop(columns=['Time','Boxscore','Win/Loss','OT','Record','','1st Downs','Total_Yards','Pass_Yards','Rush_Yards','TurnOvers'])

    return df_defense

In [369]:
#creates list of all movies with features
all_defense = get_defense_df(df_all_defense_2019.Link[0])

for link in range(len(df_all_defense_2019.Link)):
    all_defense = all_defense.append(get_defense_df(df_all_defense_2019.Link[link]))
all_defense.reset_index()

Unnamed: 0,index,Team,Day,Date,Opponent,Points,Opponent Score,1stDowns,Defense_Total_Yards,Defense_Pass_Yards,Defense_Rush_Yards,Turnovers
0,0,New England Patriots,Sun,September 8,Pittsburgh Steelers,33,3,15,308,276,32,1
1,1,New England Patriots,Sun,September 15,Miami Dolphins,43,0,11,184,142,42,4
2,2,New England Patriots,Sun,September 22,New York Jets,30,14,6,105,69,36,1
3,3,New England Patriots,Sun,September 29,Buffalo Bills,16,10,23,375,240,135,4
4,4,New England Patriots,Sun,October 6,Washington Redskins,33,7,11,223,78,145,2
...,...,...,...,...,...,...,...,...,...,...,...,...
473,12,Cincinnati Bengals,Sun,December 1,New York Jets,22,6,15,271,209,62,
474,13,Cincinnati Bengals,Sun,December 8,Cleveland Browns,19,27,17,333,187,146,2
475,14,Cincinnati Bengals,Sun,December 15,New England Patriots,13,34,19,291,116,175,
476,15,Cincinnati Bengals,Sun,December 22,Miami Dolphins,35,38,29,502,406,96,1
