In [1]:
#import dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pymongo
# SQL Alchemy
from sqlalchemy import create_engine, event
import time
from datetime import datetime

# PyMySQL 
import pymysql
pymysql.install_as_MySQLdb()

# Config variables
from config import local_db_user, local_db_pwd, local_db_endpoint, local_db_port, local_db_name
from config import remote_db_user, remote_db_pwd, remote_db_endpoint, remote_db_port, remote_db_name

https://towardsdatascience.com/scraping-nfl-stats-to-compare-quarterback-efficiencies-4989642e02fe
https://stmorse.github.io/journal/pfr-scrape-python.html
https://www.pro-football-reference.com/players/H/HenrDe00/fantasy/2020/
https://ffpredictor.thefantasytakeaway.com/

# Long test scrape

In [52]:
combined_all_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')

    url = 'https://www.pro-football-reference.com'
    maxp = 300
    
    # grab fantasy players
    r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
    soup = BeautifulSoup(r.content, 'html.parser')
    parsed_table = soup.find_all('table')[0]  

    df = []

    # first 2 rows are col headers
    for i,row in enumerate(parsed_table.find_all('tr')[2:]):
        if i % 10 == 0: print(i, end=' ')
        if i >= maxp: 
            print('\nComplete.')
            break

        try:

            name_td = row.find('td', attrs={'data-stat': 'player'})
            name = name_td.a.get_text()
            stub = name_td.a.get('href')
            stub = stub[:-4] + '/fantasy/' + str(year)
            pos = row.find('td', attrs={'data-stat': 'fantasy_pos'}).get_text()
            total_Fpts = row.find('td', attrs={'data-stat': 'fantasy_points_ppr'}).get_text()
            vbd = row.find('td', attrs={'data-stat': 'vbd'}).get_text()
            pos_rank = row.find('td', attrs={'data-stat': 'fantasy_rank_pos'}).get_text()
            ov_rank = row.find('td', attrs={'data-stat': 'fantasy_rank_overall'}).get_text()

            # grab this players stats
            tdf = pd.read_html(url + stub)[0]  

            # get rid of MultiIndex, just keep last row
            tdf.columns = tdf.columns.get_level_values(-1)

            # fix the away/home column
            tdf = tdf.rename(columns={'Unnamed: 4_level_2': 'Away'})
            tdf['Away'] = [1 if r=='@' else 0 for r in tdf['Away']]

            # drop all intermediate stats
            tdf = tdf.iloc[:,[1,2,3,4,5,-9,-8,-3]]

            # drop "Total" row
            tdf = tdf.query('Date != "Total"')

            # add other info
            tdf['Name'] = name
            tdf['Position'] = pos
            tdf['Season'] = year
            tdf['total_Fpts'] = total_Fpts
            tdf['vbd'] = vbd
            tdf['pos_rank'] = pos_rank
            tdf['ov_rank'] = ov_rank

            df.append(tdf)
        except:
            pass

    df = pd.concat(df)
    combined_all_df = combined_all_df.append(df)

combined_all_df

2002
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2003
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2004
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2005
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2006
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2007
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2008
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 
Complete.
2009
---
0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170

Unnamed: 0,G#,Date,Tm,Away,Opp,Yds,TD,FantPt,Name,Position,...,total_Fpts,vbd,pos_rank,ov_rank,Att,Rec,Result,Pos,Num,Pct
0,1.0,2002-09-08,KAN,1,CLE,14.0,2.0,38.1,Priest Holmes,RB,...,440.7,220,1,1,,,,,,
1,2.0,2002-09-15,KAN,0,JAX,0.0,0.0,11.6,Priest Holmes,RB,...,440.7,220,1,1,,,,,,
2,3.0,2002-09-22,KAN,1,NWE,7.0,2.0,35.8,Priest Holmes,RB,...,440.7,220,1,1,,,,,,
3,4.0,2002-09-29,KAN,0,MIA,3.0,0.0,17.5,Priest Holmes,RB,...,440.7,220,1,1,,,,,,
4,5.0,2002-10-06,KAN,1,NYJ,0.0,0.0,35.3,Priest Holmes,RB,...,440.7,220,1,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,12.0,2020-12-06,NYJ,0,LVR,,,7.4,Josh Adams,RB,...,36.6,,83,,,,,,19.0,30.6%
4,13.0,2020-12-13,NYJ,1,SEA,,,3.0,Josh Adams,RB,...,36.6,,83,,,,,,12.0,22.6%
5,14.0,2020-12-20,NYJ,1,LAR,,,,Josh Adams,RB,...,36.6,,83,,,,,,1.0,1.5%
6,15.0,2020-12-27,NYJ,0,CLE,,,,Josh Adams,RB,...,36.6,,83,,,,,,0.0,0.0%


In [55]:
# saving the dataframe 
combined_all_df.to_csv('weekly_stats_all.csv') 

# Pro Reference all players per year scrape

In [7]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/88.0.4324.96/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96]


In [25]:
# URL of page to be scraped

combined_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    combined_df = combined_df.append(stats_df) 
        
combined_df



2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,Season
0,1,Priest Holmes*+,KAN,RB,29,14,14,0,1,0,...,,,371,440.7,447.7,405.7,220,1,1,2002
1,2,Ricky Williams*+,MIA,RB,25,16,16,0,0,0,...,,,316,362.6,372.6,339.1,165,2,2,2002
2,3,LaDainian Tomlinson*,SDG,RB,23,16,16,0,0,0,...,,,305,384.2,391.2,344.7,155,3,3,2002
3,4,Clinton Portis,DEN,RB,21,16,12,0,0,0,...,,,283,316.2,325.2,299.7,133,4,4,2002
4,5,Marvin Harrison*+,IND,WR,30,16,16,0,0,0,...,1,,241,384.2,387.2,312.7,122,1,5,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,622,Andre Roberts*,BUF,WR,32,15,0,0,0,0,...,,,-1,3.1,5.1,1.1,,243,,2020
642,623,Nate Sudfeld,PHI,QB,27,1,0,5,12,32,...,,,-2,-1.5,0.5,-0.5,,83,,2020
643,624,Nsimba Webster,LAR,,24,16,0,0,0,0,...,,,-2,-2.0,-1.0,-2.0,,246,,2020
644,625,Dede Westbrook,JAX,WR,27,2,0,0,0,0,...,,,-2,-0.6,0.4,-1.1,,245,,2020


In [26]:
combined_df.columns = ['Rk', 'Player', 'Team', 'Pos', 'Age', 'Games Played', 'Games Started', 'Pass Comp', 'Pass Att', 'Pass Yds', 'Pass TD', 'Pass Int', 'Rush Att', 'Rush Yds', 'Rush Y/A', 'Rush TD', 'Rec Tgt', 'Receptions', 'Rec Yds', 'Rec Y/R', 'Rec TD', 'Fumbles', 'Fumbles Lost', 'Total TD', '2pt Convs Made', '2pt Conv Passes', 'FPts', 'PPR', 'DK Fpts', 'FD Fpts', 'VBD', 'Pos Rank', 'Overall Rank', 'Season']
combined_df['Player'] = combined_df['Player'].str.replace('*', '')
combined_df['Player'] = combined_df['Player'].str.replace('+', '')
combined_df['Player'] = combined_df['Player'].str.strip()
combined_df

Unnamed: 0,Rk,Player,Team,Pos,Age,Games Played,Games Started,Pass Comp,Pass Att,Pass Yds,...,2pt Convs Made,2pt Conv Passes,FPts,PPR,DK Fpts,FD Fpts,VBD,Pos Rank,Overall Rank,Season
0,1,Priest Holmes,KAN,RB,29,14,14,0,1,0,...,,,371,440.7,447.7,405.7,220,1,1,2002
1,2,Ricky Williams,MIA,RB,25,16,16,0,0,0,...,,,316,362.6,372.6,339.1,165,2,2,2002
2,3,LaDainian Tomlinson,SDG,RB,23,16,16,0,0,0,...,,,305,384.2,391.2,344.7,155,3,3,2002
3,4,Clinton Portis,DEN,RB,21,16,12,0,0,0,...,,,283,316.2,325.2,299.7,133,4,4,2002
4,5,Marvin Harrison,IND,WR,30,16,16,0,0,0,...,1,,241,384.2,387.2,312.7,122,1,5,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,622,Andre Roberts,BUF,WR,32,15,0,0,0,0,...,,,-1,3.1,5.1,1.1,,243,,2020
642,623,Nate Sudfeld,PHI,QB,27,1,0,5,12,32,...,,,-2,-1.5,0.5,-0.5,,83,,2020
643,624,Nsimba Webster,LAR,,24,16,0,0,0,0,...,,,-2,-2.0,-1.0,-2.0,,246,,2020
644,625,Dede Westbrook,JAX,WR,27,2,0,0,0,0,...,,,-2,-0.6,0.4,-1.1,,245,,2020


In [28]:
combined_df = combined_df.fillna(0)
combined_df

Unnamed: 0,Rk,Player,Team,Pos,Age,Games Played,Games Started,Pass Comp,Pass Att,Pass Yds,...,2pt Convs Made,2pt Conv Passes,FPts,PPR,DK Fpts,FD Fpts,VBD,Pos Rank,Overall Rank,Season
0,1,Priest Holmes,KAN,RB,29,14,14,0,1,0,...,0,0,371,440.7,447.7,405.7,220,1,1,2002
1,2,Ricky Williams,MIA,RB,25,16,16,0,0,0,...,0,0,316,362.6,372.6,339.1,165,2,2,2002
2,3,LaDainian Tomlinson,SDG,RB,23,16,16,0,0,0,...,0,0,305,384.2,391.2,344.7,155,3,3,2002
3,4,Clinton Portis,DEN,RB,21,16,12,0,0,0,...,0,0,283,316.2,325.2,299.7,133,4,4,2002
4,5,Marvin Harrison,IND,WR,30,16,16,0,0,0,...,1,0,241,384.2,387.2,312.7,122,1,5,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641,622,Andre Roberts,BUF,WR,32,15,0,0,0,0,...,0,0,-1,3.1,5.1,1.1,0,243,0,2020
642,623,Nate Sudfeld,PHI,QB,27,1,0,5,12,32,...,0,0,-2,-1.5,0.5,-0.5,0,83,0,2020
643,624,Nsimba Webster,LAR,0,24,16,0,0,0,0,...,0,0,-2,-2.0,-1.0,-2.0,0,246,0,2020
644,625,Dede Westbrook,JAX,WR,27,2,0,0,0,0,...,0,0,-2,-0.6,0.4,-1.1,0,245,0,2020


In [30]:
# saving the dataframe 
combined_df.to_csv('all_players_yearly_stats_2002-2020.csv') 

# The huddle QB Scrape

In [4]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [5]:
# URL of page to be scraped

combined_QB_df = pd.DataFrame()

for year in range(2012, 2021):
    
    print(year)
    print('---')
    
    for week in range(1, 18):

        #print(year)
        #print(week)
        #print('---')
        pos = 'QB'
        url = f'https://tools.thehuddle.com/weekly?week={week}&year={year}&position={pos}&formulaId=6'
        #print(url)

        # Retrieve page with the browser module
        browser.visit(url)

        # setup html parser
        html = browser.html
        fantasy_soup = BeautifulSoup(html, 'html.parser')

        # save the stats table as a df
        stats_table = fantasy_soup.find("table", {"class":"dataTable"})
        stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]
        stats_df.dropna(axis = 0, how = 'all', inplace = True)
        stats_df.reset_index(inplace = True)

        #stats_df[['FF Pts','Plays','PAATT','PACMP','PAYDS','PATDS','RUATT','RUYDS','RUTDS','INT','FUM']] = stats_df[['FF Pts','Plays','PAATT','PACMP','PAYDS','PATDS','RUATT','RUYDS','RUTDS','INT','FUM']].apply(pd.to_numeric)
        #stats_df.replace('-', 0)

        stats_df['Player'] = stats_df['Player'].str[3:]
        stats_df['Week'] = week
        stats_df['Season'] = year


        combined_QB_df = combined_QB_df.append(stats_df) 
        
combined_QB_df

2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,index,Player,Team,FF Pts,Plays,PAATT,PACMP,PAYDS,PATDS,RUATT,RUYDS,RUTDS,INT,FUM,Week,Season
0,0,Aaron Rodgers,GB,53.2,83,73,51,554,7,10,70,–,2,–,1,2012
1,3,Jay Cutler,CHI,38.9,68,63,41,567,4,5,22,–,1,1,1,2012
2,6,Eli Manning,NYG,38.6,81,79,49,568,5,2,-1,–,2,1,1,2012
3,9,Michael Vick,PHI,35.1,71,60,39,416,3,11,65,–,–,3,1,2012
4,12,Andy Dalton,CIN,33.7,84,74,46,478,3,10,26,1,3,–,1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,114,Nate Sudfeld,PHI,0.5,14,12,5,32,–,2,12,–,1,1,17,2020
39,117,Jacoby Brissett,IND,0.3,2,–,–,–,–,2,3,–,–,–,17,2020
40,120,Jameis Winston,NO,-0.1,1,–,–,–,–,1,-1,–,–,–,17,2020
41,123,Brandon Allen,CIN,-1.9,22,21,6,48,–,1,2,–,2,–,17,2020


In [14]:
# saving the dataframe 
combined_QB_df.to_csv('weekly_QB_stats.csv') 


# The huddle RB Scrape

In [7]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [10]:
# URL of page to be scraped

combined_RB_df = pd.DataFrame()

for year in range(2012, 2021):
    
    print(year)
    print('---')
    
    for week in range(1, 18):

        #print(year)
        #print(week)
        #print('---')
        pos = 'RB'
        url = f'https://tools.thehuddle.com/weekly?week={week}&year={year}&position={pos}&formulaId=6'
        #print(url)

        # Retrieve page with the browser module
        browser.visit(url)

        # setup html parser
        html = browser.html
        fantasy_soup = BeautifulSoup(html, 'html.parser')

        # save the stats table as a df
        stats_table = fantasy_soup.find("table", {"class":"dataTable"})
        stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]
        stats_df.dropna(axis = 0, how = 'all', inplace = True)
        stats_df.reset_index(inplace = True)

        stats_df['Player'] = stats_df['Player'].str[3:]
        stats_df['Week'] = week
        stats_df['Season'] = year


        combined_RB_df = combined_RB_df.append(stats_df) 
        
combined_RB_df

2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,index,Player,Team,FF Pts,Plays,RUATT,RUYDS,RUTDS,TRGT,REC,RCYDS,RCTDS,INT,FUM,Week,Season
0,0,Ahmad Bradshaw,NYG,36.6,53,43,239,1,10,7,67,–,–,1,1,2012
1,3,Alfred Morris,WAS,30.4,42,39,228,1,3,2,16,–,–,–,1,2012
2,6,LeSean McCoy,PHI,28,48,39,176,–,9,7,44,1,–,1,1,2012
3,9,Willis McGahee,DEN,19.5,25,19,112,1,6,6,23,–,–,–,1,2012
4,12,T. Richardson,CLE,18.8,24,17,81,1,7,5,47,–,–,–,1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,330,. S. Phillips,HOU,0,–,–,–,–,–,–,–,–,–,–,17,2020
111,333,. LeVante Bellamy,DEN,0,1,1,–,–,–,–,–,–,–,–,17,2020
112,336,. Xavier Jones,LAR,0,–,–,–,–,–,–,–,–,–,–,17,2020
113,339,. J.J. Taylor,NE,0,–,–,–,–,–,–,–,–,–,–,17,2020


In [13]:
# saving the dataframe 
combined_RB_df.to_csv('weekly_RB_stats.csv') 

# The huddle WR Scrape

In [15]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [17]:
# URL of page to be scraped

combined_WR_df = pd.DataFrame()

for year in range(2012, 2021):
    
    print(year)
    print('---')
    
    for week in range(1, 18):

        #print(year)
        #print(week)
        #print('---')
        pos = 'WR'
        url = f'https://tools.thehuddle.com/weekly?week={week}&year={year}&position={pos}&formulaId=6'
        #print(url)

        # Retrieve page with the browser module
        browser.visit(url)

        # setup html parser
        html = browser.html
        fantasy_soup = BeautifulSoup(html, 'html.parser')

        # save the stats table as a df
        stats_table = fantasy_soup.find("table", {"class":"dataTable"})
        stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]
        stats_df.dropna(axis = 0, how = 'all', inplace = True)
        stats_df.reset_index(inplace = True)

        stats_df['Player'] = stats_df['Player'].str[3:]
        stats_df['Week'] = week
        stats_df['Season'] = year


        combined_WR_df = combined_WR_df.append(stats_df) 
        
combined_WR_df

2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,index,Player,Team,FF Pts,Plays,TRGT,REC,RCYDS,RCTDS,RUATT,RUYDS,RUTDS,INT,FUM,Week,Season
0,0,B. Marshall,CHI,40.2,25,25,19,282,2,–,–,–,–,–,1,2012
1,3,Victor Cruz,NYG,39.9,21,21,14,159,4,–,–,–,–,–,1,2012
2,6,James Jones,GB,34.2,13,13,9,102,4,–,–,–,–,–,1,2012
3,9,A.J. Green,CIN,30.2,22,22,15,182,2,–,–,–,–,1,1,2012
4,12,Reggie Wayne,IND,27.2,20,20,13,212,1,–,–,–,–,–,1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,477,. Chris Rowland,ATL,0,–,–,–,–,–,–,–,–,–,–,17,2020
160,480,. Dan Chisena,MIN,0,–,–,–,–,–,–,–,–,–,–,17,2020
161,483,. Matt Cole,SF,0,–,–,–,–,–,–,–,–,–,–,17,2020
162,486,. Isaiah Wright,WAS,0,–,–,–,–,–,–,–,–,–,–,17,2020


In [18]:
# saving the dataframe 
combined_WR_df.to_csv('weekly_WR_stats.csv') 

# The huddle TE Scrape

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [20]:
# URL of page to be scraped

combined_TE_df = pd.DataFrame()

for year in range(2012, 2021):
    
    print(year)
    print('---')
    
    for week in range(1, 18):

        #print(year)
        #print(week)
        #print('---')
        pos = 'TE'
        url = f'https://tools.thehuddle.com/weekly?week={week}&year={year}&position={pos}&formulaId=6'
        #print(url)

        # Retrieve page with the browser module
        browser.visit(url)

        # setup html parser
        html = browser.html
        fantasy_soup = BeautifulSoup(html, 'html.parser')

        # save the stats table as a df
        stats_table = fantasy_soup.find("table", {"class":"dataTable"})
        stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]
        stats_df.dropna(axis = 0, how = 'all', inplace = True)
        stats_df.reset_index(inplace = True)

        stats_df['Player'] = stats_df['Player'].str[3:]
        stats_df['Week'] = week
        stats_df['Season'] = year


        combined_TE_df = combined_TE_df.append(stats_df) 
        
combined_TE_df

2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,index,Player,Team,FF Pts,Plays,TRGT,REC,RCYDS,RCTDS,RUATT,RUYDS,RUTDS,INT,FUM,Week,Season
0,0,Tony Gonzalez,ATL,18.3,14,14,13,123,1,–,–,–,–,–,1,2012
1,3,Jason Witten,DAL,17.2,14,14,13,112,1,–,–,–,–,–,1,2012
2,6,Brent Celek,PHI,12.6,10,10,7,66,1,–,–,–,–,–,1,2012
3,9,Fred Davis,WAS,12.4,10,10,9,124,–,–,–,–,–,–,1,2012
4,12,Marcedes Lewis,JAC,11.6,10,10,8,56,1,–,–,–,–,–,1,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,288,Colby Parkinson,SEA,0,–,–,–,–,–,–,–,–,–,–,17,2020
97,291,Tyler Davis,JAC,0,–,–,–,–,–,–,–,–,–,–,17,2020
98,294,Hunter Bryant,DET,0,1,1,–,–,–,–,–,–,–,–,17,2020
99,297,. Sean McKeon,DAL,0,–,–,–,–,–,–,–,–,–,–,17,2020


In [21]:
# saving the dataframe 
combined_TE_df.to_csv('weekly_TE_stats.csv') 

# The huddle DEF Scrape

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [25]:
# URL of page to be scraped

combined_DEF_df = pd.DataFrame()

for year in range(2012, 2021):
    
    print(year)
    print('---')
    
    for week in range(1, 18):

        #print(year)
        #print(week)
        #print('---')
        pos = 'DF'
        url = f'https://tools.thehuddle.com/weekly?week={week}&year={year}&position={pos}&formulaId=6'
        #print(url)

        # Retrieve page with the browser module
        browser.visit(url)

        # setup html parser
        html = browser.html
        fantasy_soup = BeautifulSoup(html, 'html.parser')

        # save the stats table as a df
        stats_table = fantasy_soup.find("table", {"class":"dataTable"})
        stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]
        stats_df.dropna(axis = 0, how = 'all', inplace = True)
        stats_df.reset_index(inplace = True)

        stats_df['Player'] = stats_df['Player'].str[3:]
        stats_df['Week'] = week
        stats_df['Season'] = year


        combined_DEF_df = combined_DEF_df.append(stats_df) 
        
combined_DEF_df

2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,index,Player,Team,FF Pts,SCK,DEFFR,DEFINT,DEFTD,DEFSAF,Week,Season
0,0,Broncos,DEN,13,5,–,1,1,–,1,2012
1,3,Steelers,PIT,4,2,1,–,–,–,1,2012
0,0,Steelers,PIT,4,2,1,–,–,–,2,2012
1,3,Jets,NYJ,3,3,–,–,–,–,2,2012
0,0,Raiders,LVR,5,1,2,–,–,–,3,2012
...,...,...,...,...,...,...,...,...,...,...,...
27,81,Chiefs,KC,3,3,–,–,–,–,17,2020
28,84,Raiders,LVR,2,2,–,–,–,–,17,2020
29,87,Panthers,CAR,2,2,–,–,–,–,17,2020
30,90,Bengals,CIN,2,–,–,1,–,–,17,2020


In [26]:
# saving the dataframe 
combined_DEF_df.to_csv('weekly_DEF_stats.csv') 

# Pro football reference passing yearly scrape

In [2]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


 


[WDM] - Driver [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [7]:
# URL of page to be scraped

yearly_passing_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'passing'
    url = f'https://www.pro-football-reference.com/years/{year}/{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_passing_df = yearly_passing_df.append(stats_df) 
        
yearly_passing_df

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,Season,30
0,1,Rich Gannon*+,OAK,37,QB,16,16,11-5-0,418,618,...,97.3,36,214,6.84,6.95,5.5,1,1,2002,
1,2,Drew Bledsoe*,BUF,30,QB,16,16,8-8-0,375,610,...,86.0,54,369,6.01,5.72,8.1,2,4,2002,
2,3,Tom Brady,NWE,25,QB,16,16,9-7-0,373,601,...,85.7,31,190,5.66,5.54,4.9,2,3,2002,
3,4,Peyton Manning *,IND,26,QB,16,16,10-6-0,392,591,...,88.8,23,145,6.60,6.09,3.7,4,5,2002,
4,5,Brett Favre *,GNB,33,QB,16,16,12-4-0,341,551,...,85.6,26,188,6.01,5.70,4.5,3,3,2002,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,108,Logan Thomas,WAS,29,TE,16,15,,1,1,...,118.7,47.7,0,0,28.00,28.00,0.0,,2020,
111,109,Tommy Townsend,KAN,24,p,16,0,,1,1,...,118.7,13.9,0,0,13.00,13.00,0.0,,2020,
112,110,Greg Ward,PHI,25,wr,16,10,,1,1,...,118.7,0.9,0,0,15.00,15.00,0.0,,2020,
113,111,Sammy Watkins,KAN,27,wr,10,9,,0,1,...,0.0,0.0,0,0,0.00,-45.00,0.0,,2020,


In [8]:
yearly_passing_df.columns = ['Rk', 'Player', 'Team', 'Age', 'Pos', 'Games Played', 'Games Started', 'QB rec', 'Comp', 'Att', 'Comp %', 'Yds', 'TD', 'TD %', 'Int', 'Int %', '1st Downs', 'Long', 'Yds/Att', 'Adj Yds/Att', 'Yds/Comp', 'Yds/Game', 'Rate', 'QB Rating', 'Sacks', 'Sack Yards', 'Net Yds/Att', 'Adj Net Yds/Att', 'Sack %', 'Comebacks', 'Game Winning Drives', 'Season']
yearly_passing_df['Player'] = yearly_passing_df['Player'].str.replace('*', '')
yearly_passing_df['Player'] = yearly_passing_df['Player'].str.replace('+', '')
yearly_passing_df['Player'] = yearly_passing_df['Player'].str.strip()
yearly_passing_df = yearly_passing_df.fillna(0)
yearly_passing_df

Unnamed: 0,Rk,Player,Team,Age,Pos,Games Played,Games Started,QB rec,Comp,Att,...,Rate,QB Rating,Sacks,Sack Yards,Net Yds/Att,Adj Net Yds/Att,Sack %,Comebacks,Game Winning Drives,Season
0,1,Rich Gannon,OAK,37,QB,16,16,11-5-0,418,618,...,97.3,36,214,6.84,6.95,5.5,1,1,2002,0
1,2,Drew Bledsoe,BUF,30,QB,16,16,8-8-0,375,610,...,86.0,54,369,6.01,5.72,8.1,2,4,2002,0
2,3,Tom Brady,NWE,25,QB,16,16,9-7-0,373,601,...,85.7,31,190,5.66,5.54,4.9,2,3,2002,0
3,4,Peyton Manning,IND,26,QB,16,16,10-6-0,392,591,...,88.8,23,145,6.60,6.09,3.7,4,5,2002,0
4,5,Brett Favre,GNB,33,QB,16,16,12-4-0,341,551,...,85.6,26,188,6.01,5.70,4.5,3,3,2002,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,108,Logan Thomas,WAS,29,TE,16,15,0,1,1,...,118.7,47.7,0,0,28.00,28.00,0.0,0,2020,0
111,109,Tommy Townsend,KAN,24,p,16,0,0,1,1,...,118.7,13.9,0,0,13.00,13.00,0.0,0,2020,0
112,110,Greg Ward,PHI,25,wr,16,10,0,1,1,...,118.7,0.9,0,0,15.00,15.00,0.0,0,2020,0
113,111,Sammy Watkins,KAN,27,wr,10,9,0,0,1,...,0.0,0.0,0,0,0.00,-45.00,0.0,0,2020,0


In [9]:
# saving the dataframe 
yearly_passing_df.to_csv('yearly_passing_stats_2002-2020.csv') 

# Pro football reference rushing yearly scrape

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [14]:
# URL of page to be scraped

yearly_rushing_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'rushing'
    url = f'https://www.pro-football-reference.com/years/{year}/{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_rushing_df = yearly_rushing_df.append(stats_df) 
        
yearly_rushing_df

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Season
0,1,Ricky Williams*+,MIA,25,RB,16,16,383,1853,16,94,63,4.8,115.8,7,2002
1,2,LaDainian Tomlinson*,SDG,23,RB,16,16,372,1683,14,96,76,4.5,105.2,3,2002
2,3,Eddie George,TEN,29,RB,16,16,343,1165,12,62,35,3.4,72.8,1,2002
3,4,Travis Henry *,BUF,24,RB,16,16,325,1438,13,78,34,4.4,89.9,11,2002
4,5,Deuce McAllister*,NOR,24,RB,15,15,325,1388,13,68,62,4.3,92.5,4,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,368,Jonathan Williams,DET,26,,5,0,1,5,0,0,5,5.0,1.0,1,2020
380,369,Mike Williams,LAC,26,WR,15,11,1,1,0,0,1,1.0,0.1,0,2020
381,370,Javon Wims,CHI,26,wr,13,1,1,2,0,0,2,2.0,0.2,0,2020
382,371,Olamide Zaccheaus,ATL,23,wr,11,2,1,0,0,0,0,0.0,0.0,0,2020


In [15]:
yearly_rushing_df.columns = ['Rk', 'Player', 'Team', 'Age', 'Pos', 'Games Played', 'Games Started', 'Rush Att', 'Rush Yds', 'Rush TD', 'First Downs', 'Long', 'Yds/Att', 'Yds/Game', 'Fumbles', 'Season']
yearly_rushing_df['Player'] = yearly_rushing_df['Player'].str.replace('*', '')
yearly_rushing_df['Player'] = yearly_rushing_df['Player'].str.replace('+', '')
yearly_rushing_df['Player'] = yearly_rushing_df['Player'].str.strip()
yearly_rushing_df = yearly_rushing_df.fillna(0)
yearly_rushing_df

Unnamed: 0,Rk,Player,Team,Age,Pos,Games Played,Games Started,Rush Att,Rush Yds,Rush TD,First Downs,Long,Yds/Att,Yds/Game,Fumbles,Season
0,1,Ricky Williams,MIA,25,RB,16,16,383,1853,16,94,63,4.8,115.8,7,2002
1,2,LaDainian Tomlinson,SDG,23,RB,16,16,372,1683,14,96,76,4.5,105.2,3,2002
2,3,Eddie George,TEN,29,RB,16,16,343,1165,12,62,35,3.4,72.8,1,2002
3,4,Travis Henry,BUF,24,RB,16,16,325,1438,13,78,34,4.4,89.9,11,2002
4,5,Deuce McAllister,NOR,24,RB,15,15,325,1388,13,68,62,4.3,92.5,4,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,368,Jonathan Williams,DET,26,0,5,0,1,5,0,0,5,5.0,1.0,1,2020
380,369,Mike Williams,LAC,26,WR,15,11,1,1,0,0,1,1.0,0.1,0,2020
381,370,Javon Wims,CHI,26,wr,13,1,1,2,0,0,2,2.0,0.2,0,2020
382,371,Olamide Zaccheaus,ATL,23,wr,11,2,1,0,0,0,0,0.0,0.0,0,2020


In [16]:
# saving the dataframe 
yearly_rushing_df.to_csv('yearly_rushing_stats_2002-2020.csv') 

# Pro football reference receiving yearly scrape¶

In [None]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

In [19]:
# URL of page to be scraped

yearly_receiving_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'receiving'
    url = f'https://www.pro-football-reference.com/years/{year}/{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_receiving_df = yearly_receiving_df.append(stats_df) 
        
yearly_receiving_df

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,Season
0,1,Marvin Harrison*+,IND,30,WR,16,16,205,143,69.8%,1722,12.0,11,92,69,8.4,8.9,107.6,0,2002
1,2,Hines Ward *,PIT,26,WR,16,16,160,112,70.0%,1329,11.9,12,65,72,8.3,7.0,83.1,1,2002
2,3,Randy Moss*,MIN,25,WR,16,16,185,106,57.3%,1347,12.7,7,67,60,7.3,6.6,84.2,1,2002
3,4,Eric Moulds*,BUF,29,WR,16,15,180,100,55.6%,1292,12.9,10,66,70,7.2,6.3,80.8,1,2002
4,5,Terrell Owens*+,SFO,29,WR,14,14,159,100,62.9%,1300,13.0,13,63,76,8.2,7.1,92.9,0,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,495,Trevon Wesco,NYJ,25,te,12,4,2,1,50.0%,5,5.0,0,0,5,2.5,0.1,0.4,0,2020
511,496,Dede Westbrook,JAX,27,,2,0,1,1,100.0%,4,4.0,0,0,4,4.0,0.5,2.0,1,2020
512,497,Antonio Williams,BUF,23,,1,0,1,1,100.0%,20,20.0,0,1,20,20.0,1.0,20.0,0,2020
513,498,Luke Willson,2TM,30,,8,0,3,1,33.3%,12,12.0,0,0,12,4.0,0.1,1.5,0,2020


In [20]:
yearly_receiving_df.columns = ['Rk', 'Player', 'Team', 'Age', 'Pos', 'Games Played', 'Games Started', 'Targets', 'Receptions', 'Catch %', 'Rec Yards', 'Yds/Rec', 'TD', 'First Downs', 'Long', 'Yds/Target', 'Rec/Game', 'Yds/Game', 'Fumbles', 'Season']
yearly_receiving_df['Player'] = yearly_receiving_df['Player'].str.replace('*', '')
yearly_receiving_df['Player'] = yearly_receiving_df['Player'].str.replace('+', '')
yearly_receiving_df['Player'] = yearly_receiving_df['Player'].str.strip()
yearly_receiving_df = yearly_receiving_df.fillna(0)
yearly_receiving_df

Unnamed: 0,Rk,Player,Team,Age,Pos,Games Played,Games Started,Targets,Receptions,Catch %,Rec Yards,Yds/Rec,TD,First Downs,Long,Yds/Target,Rec/Game,Yds/Game,Fumbles,Season
0,1,Marvin Harrison,IND,30,WR,16,16,205,143,69.8%,1722,12.0,11,92,69,8.4,8.9,107.6,0,2002
1,2,Hines Ward,PIT,26,WR,16,16,160,112,70.0%,1329,11.9,12,65,72,8.3,7.0,83.1,1,2002
2,3,Randy Moss,MIN,25,WR,16,16,185,106,57.3%,1347,12.7,7,67,60,7.3,6.6,84.2,1,2002
3,4,Eric Moulds,BUF,29,WR,16,15,180,100,55.6%,1292,12.9,10,66,70,7.2,6.3,80.8,1,2002
4,5,Terrell Owens,SFO,29,WR,14,14,159,100,62.9%,1300,13.0,13,63,76,8.2,7.1,92.9,0,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,495,Trevon Wesco,NYJ,25,te,12,4,2,1,50.0%,5,5.0,0,0,5,2.5,0.1,0.4,0,2020
511,496,Dede Westbrook,JAX,27,0,2,0,1,1,100.0%,4,4.0,0,0,4,4.0,0.5,2.0,1,2020
512,497,Antonio Williams,BUF,23,0,1,0,1,1,100.0%,20,20.0,0,1,20,20.0,1.0,20.0,0,2020
513,498,Luke Willson,2TM,30,0,8,0,3,1,33.3%,12,12.0,0,0,12,4.0,0.1,1.5,0,2020


In [21]:
# saving the dataframe 
yearly_receiving_df.to_csv('yearly_receiving_stats_2002-2020.csv') 

# Pro football reference defense vs RB yearly scrape¶

In [24]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\Steven\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


 


In [37]:
# URL of page to be scraped

yearly_defense_vs_RB_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'RB'
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy-points-against-{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_defense_vs_RB_df = yearly_defense_vs_RB_df.append(stats_df) 
        

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


In [38]:
yearly_defense_vs_RB_df.columns = ['Team', 'Games', 'Rush Att', 'Rush Yds', 'Rush TD', 'Rec Tgt', 'Rec', 'Rec Yds', 'Rec TD', 'Fpts', 'DK Fpts', 'FD Fpts', 'Fpts/Game', 'DK Fpts/Game', 'FD Fpts/Game', 'Season']
yearly_defense_vs_RB_df['Team'] = yearly_defense_vs_RB_df['Team'].str.strip()
yearly_defense_vs_RB_df = yearly_defense_vs_RB_df.fillna(0)
yearly_defense_vs_RB_df

Unnamed: 0,Team,Games,Rush Att,Rush Yds,Rush TD,Rec Tgt,Rec,Rec Yds,Rec TD,Fpts,DK Fpts,FD Fpts,Fpts/Game,DK Fpts/Game,FD Fpts/Game,Season
0,Seattle Seahawks,16,427,2124,17,143,104,925,4,423.20,563.9,472.9,26.5,35.2,29.6,2002
1,Arizona Cardinals,16,426,1926,16,131,95,813,4,386.00,502.9,433.4,24.1,31.4,27.1,2002
2,New York Jets,16,371,1741,17,146,112,885,2,376.60,506.6,432.6,23.5,31.7,27.0,2002
3,Kansas City Chiefs,16,374,1770,16,132,109,783,5,375.40,510.3,429.8,23.5,31.9,26.9,2002
4,Detroit Lions,16,423,1687,17,128,102,877,4,368.60,486.4,419.4,23.0,30.4,26.2,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,Atlanta Falcons,16,292,1130,8,115,89,612,4,242.30,333.2,286.7,15.1,20.8,17.9,2020
30,Washington Football Team,16,340,1335,8,87,61,379,4,239.40,305.4,269.9,15.0,19.1,16.9,2020
31,Los Angeles Rams,16,322,1218,8,111,81,585,3,238.30,323.3,278.8,14.9,20.2,17.4,2020
32,Tampa Bay Buccaneers,16,285,964,8,125,101,671,3,226.00,331.5,276.0,14.1,20.7,17.3,2020


In [40]:
# saving the dataframe 
yearly_defense_vs_RB_df.to_csv('yearly_defense_vs_RB_stats_2002-2020.csv') 

# Pro football reference defense vs WR yearly scrape¶

In [41]:
# URL of page to be scraped

yearly_defense_vs_WR_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'WR'
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy-points-against-{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_defense_vs_WR_df = yearly_defense_vs_WR_df.append(stats_df) 

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


In [42]:
yearly_defense_vs_WR_df.columns = ['Team', 'Games', 'Rec Tgt', 'Rec', 'Rec Yds', 'Rec TD', 'Fpts', 'DK Fpts', 'FD Fpts', 'Fpts/Game', 'DK Fpts/Game', 'FD Fpts/Game', 'Season']
yearly_defense_vs_WR_df['Team'] = yearly_defense_vs_WR_df['Team'].str.strip()
yearly_defense_vs_WR_df = yearly_defense_vs_WR_df.fillna(0)
yearly_defense_vs_WR_df

Unnamed: 0,Team,Games,Rec Tgt,Rec,Rec Yds,Rec TD,Fpts,DK Fpts,FD Fpts,Fpts/Game,DK Fpts/Game,FD Fpts/Game,Season
0,Minnesota Vikings,16,327,193,2926,22,431.40,658.4,533.9,27.0,41.2,33.4,2002
1,Arizona Cardinals,16,296,182,2568,21,399.70,605.4,494.4,25.0,37.8,30.9,2002
2,Kansas City Chiefs,16,366,227,2887,14,396.22,651.2,509.7,24.8,40.7,31.9,2002
3,Detroit Lions,16,353,200,2842,16,378.00,610.0,478.0,23.6,38.1,29.9,2002
4,San Diego Chargers,16,347,199,2831,14,367.40,596.9,466.4,23.0,37.3,29.2,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,Buffalo Bills,16,325,200,2334,13,318.48,535.4,418.4,19.9,33.5,26.2,2020
30,Kansas City Chiefs,16,283,169,2159,15,315.60,500.6,400.1,19.7,31.3,25.0,2020
31,Baltimore Ravens,16,341,215,2361,11,312.64,545.6,428.1,19.5,34.1,26.8,2020
32,Washington Football Team,16,319,199,2321,10,304.70,518.4,409.9,19.0,32.4,25.6,2020


In [43]:
# saving the dataframe 
yearly_defense_vs_WR_df.to_csv('yearly_defense_vs_WR_stats_2002-2020.csv') 

# Pro football reference defense vs QB yearly scrape¶

In [49]:
# URL of page to be scraped

yearly_defense_vs_QB_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'QB'
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy-points-against-{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_defense_vs_QB_df = yearly_defense_vs_QB_df.append(stats_df) 

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


In [50]:
yearly_defense_vs_QB_df.columns = ['Team', 'Games', 'Pass Comp', 'Pass Att', 'Pass Yds', 'Pass TD', 'Pass Int', '2pt Conv', 'Sacks', 'Rush Att', 'Rush Yds', 'Rush TD', 'Fpts', 'DK Fpts', 'FD Fpts', 'Fpts/Game', 'DK Fpts/Game', 'FD Fpts/Game', 'Season']
yearly_defense_vs_QB_df['Team'] = yearly_defense_vs_QB_df['Team'].str.strip()
yearly_defense_vs_QB_df = yearly_defense_vs_QB_df.fillna(0)
yearly_defense_vs_QB_df

Unnamed: 0,Team,Games,Pass Comp,Pass Att,Pass Yds,Pass TD,Pass Int,2pt Conv,Sacks,Rush Att,Rush Yds,Rush TD,Fpts,DK Fpts,FD Fpts,Fpts/Game,DK Fpts/Game,FD Fpts/Game,Season
0,Minnesota Vikings,16,335,542,4300,33,16,2,27,39,266,2,308.60,345.6,324.6,19.3,21.6,20.3,2002
1,Detroit Lions,16,371,591,4404,27,10,1,33,56,206,2,290.96,323.8,300.8,18.2,20.2,18.8,2002
2,San Diego Chargers,16,375,607,4526,26,17,1,39,46,235,2,284.54,327.5,301.5,17.8,20.5,18.8,2002
3,New Orleans Saints,16,343,591,4058,25,19,2,39,50,289,6,283.22,320.2,302.2,17.7,20.0,18.9,2002
4,Cincinnati Bengals,16,287,453,3415,30,9,0,24,58,197,2,262.50,278.3,271.3,16.4,17.4,17.0,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,Green Bay Packers,16,346,536,3793,23,11,2,41,46,268,2,256.62,275.6,268.1,16.0,17.2,16.8,2020
30,New Orleans Saints,16,333,557,3759,28,18,2,45,54,249,1,253.36,278.3,271.3,15.8,17.4,17.0,2020
31,Washington Football Team,16,328,527,3363,21,16,1,46,69,373,6,251.82,278.8,267.8,15.7,17.4,16.7,2020
32,Pittsburgh Steelers,16,298,525,3494,22,18,2,56,66,350,1,226.76,249.8,244.8,14.2,15.6,15.3,2020


In [51]:
# saving the dataframe 
yearly_defense_vs_QB_df.to_csv('yearly_defense_vs_QB_stats_2002-2020.csv') 

# Pro football reference defense vs TE yearly scrape¶

In [46]:
# URL of page to be scraped

yearly_defense_vs_TE_df = pd.DataFrame()

for year in range(2002, 2021):
    
    print(year)
    print('---')
    
    pos = 'TE'
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy-points-against-{pos}.htm'
    #print(url)

    # Retrieve page with the browser module
    browser.visit(url)

    # setup html parser
    html = browser.html
    fantasy_soup = BeautifulSoup(html, 'html.parser')

    # save the stats table as a df
    stats_table = fantasy_soup.find("table", {"class":"per_match_toggle"})
    stats_df = pd.read_html(f'<table>${stats_table.tbody}</table>')[0]

    stats_df['Season'] = year
    yearly_defense_vs_TE_df = yearly_defense_vs_TE_df.append(stats_df) 

2002
---
2003
---
2004
---
2005
---
2006
---
2007
---
2008
---
2009
---
2010
---
2011
---
2012
---
2013
---
2014
---
2015
---
2016
---
2017
---
2018
---
2019
---
2020
---


In [47]:
yearly_defense_vs_TE_df.columns = ['Team', 'Games', 'Rec Tgt', 'Rec', 'Rec Yds', 'Rec TD', 'Fpts', 'DK Fpts', 'FD Fpts', 'Fpts/Game', 'DK Fpts/Game', 'FD Fpts/Game', 'Season']
yearly_defense_vs_TE_df['Team'] = yearly_defense_vs_TE_df['Team'].str.strip()
yearly_defense_vs_TE_df = yearly_defense_vs_TE_df.fillna(0)
yearly_defense_vs_TE_df

Unnamed: 0,Team,Games,Rec Tgt,Rec,Rec Yds,Rec TD,Fpts,DK Fpts,FD Fpts,Fpts/Game,DK Fpts/Game,FD Fpts/Game,Season
0,Kansas City Chiefs,16,89,64,712,7,113.20,180.2,145.2,7.1,11.3,9.1,2002
1,Pittsburgh Steelers,16,86,56,600,7,104.10,163.0,132.0,6.5,10.2,8.3,2002
2,San Diego Chargers,16,92,66,672,4,99.60,171.6,132.6,6.2,10.7,8.3,2002
3,New York Jets,16,102,74,686,5,99.50,175.9,135.9,6.2,11.0,8.5,2002
4,Jacksonville Jaguars,16,92,61,629,6,97.60,159.6,128.1,6.1,10.0,8.0,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,New England Patriots,16,100,69,813,3,99.30,168.3,133.8,6.2,10.5,8.4,2020
30,Green Bay Packers,16,101,67,712,5,97.20,166.2,130.7,6.1,10.4,8.2,2020
31,Indianapolis Colts,16,116,74,720,3,96.10,170.1,133.1,6.0,10.6,8.3,2020
32,Pittsburgh Steelers,16,114,64,638,3,82.20,146.8,113.8,5.1,9.2,7.1,2020


In [48]:
# saving the dataframe 
yearly_defense_vs_TE_df.to_csv('yearly_defense_vs_TE_stats_2002-2020.csv') 