### 0) Description

This notebook contains code to pull seasonal data (2000-2020) from the following https://www.sports-reference.com/ websites:
    - College Football: https://www.sports-reference.com/cfb/
    - NFL: https://www.pro-football-reference.com/
    
Tables/sites to be scraped:
    - NCAA team Ratings: https://www.sports-reference.com/cfb/years/yyyy-ratings.html
    - NCAA Team Offense: https://www.sports-reference.com/cfb/years/yyyy-team-offense.html
    - NCAA Team Defense: https://www.sports-reference.com/cfb/years/yyyy-team-defense.html
    - NCAA Player Passing Stats: https://www.sports-reference.com/cfb/years/yyyy-passing.html
    - NCAA Player Receiving Stats: https://www.sports-reference.com/cfb/years/yyyy-receiving.html
    - NCAA Player Rushing Stats: https://www.sports-reference.com/cfb/years/yyyy-rushing.html
    - NCAA Player Kicking Stats: https://www.sports-reference.com/cfb/years/yyyy-kicking.html
    - NCAA Player Punting Stats: https://www.sports-reference.com/cfb/years/yyyy-punting.html
    - NCAA Season Summary: https://www.sports-reference.com/cfb/years/yyyy.html
    - NFL Combine Results: https://www.pro-football-reference.com/draft/yyyy-combine.htm
    - NFL Draft Results: https://www.pro-football-reference.com/years/yyyy/draft.htm
    - NFL Standings and Team Stats: https://www.pro-football-reference.com/years/yyyy/
    - NFL Combine Results: https://nflcombineresults.com/nflcombinedata_expanded.php?year=yyyy&pos=&college=
    

### 1) Import packages

In [1]:
import requests                   # to make html requests
from bs4 import BeautifulSoup     # to pull data from html websites
import pandas as pd               # to create and concat dataframes + read html tables
import time                       # to set delays between requests
import numpy as np                # to choose random elements from list
import re                         # to perform regular expression operations

### 2) Set parameters

In [5]:
# 2000-2020 - e.g 2000 nfl draft is based on 1999 college season
years_nfl = ['2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019', '2020']

# 1999-2019 - e.g 1999 season ends in 2000 and players get drafted in 2000
years_ncaa = ['1999','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']

# list of tables for player stats
player_tables = ['passing','rushing','receiving','kicking','punting']

# list of delays to put between get requests
delays = [5, 6, 7, 8, 9, 10, 14, 17, 19, 23, 27, 29, 31, 33, 35, 42, 45]

# set request headers based on website
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'http://www.google.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}

### 3) Define functions to pull data

#### 3.1 Pull combine results

In [9]:
# get combine data
def pullCombineData(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/draft/' + year + '-combine.htm'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with headers
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get combine table
            table = soup.find('table', {'id': 'combine'})
            # get column headers
            column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True) 

            # get player ids and ncaa links, if not exist put N/A
            player_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for th in tr.find_all('th'):
                        if th.text not in column_headers:
                            try:
                                player_ids.append(th['data-append-csv'])
                            except KeyError:
                                player_ids.append('N/A')
                    for td in tr.find_all('td'):
                            if td['data-stat'] == "college":
                                if td.find_all('a'):
                                    for a in td.find_all('a'):
                                        ncaa_links.append(a['href'])
                                else:
                                    ncaa_links.append('N/A')
            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_ID", player_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # merge into one DataFrame
    combines_df = pd.concat(dfs, ignore_index=True)
    combines_df.to_csv('nfl_combine_2000_2020.csv', index=False)

In [10]:
pullCombineData(years_nfl)

[]


#### 3.2 Pull draft results

In [2]:
# get draft data
def pullDraftData(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/years/' + year + '/draft.htm'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get draft table
            table = soup.find('table', {'id': 'drafts'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            df.columns = df.columns.droplevel(0)

            player_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "college_link":
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                        elif td['data-stat'] == "player":
                            try:
                                player_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ids.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_ID", player_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dfs into a single df
    draft_df = pd.concat(dfs, ignore_index=True)
    draft_df.to_csv('nfl_draft_2000_2020.csv', index=False)

In [3]:
pullDraftData(years_nfl)

[]


#### 3.3 Pull NCAA team stats

In [13]:
# get ncaa team offense data
def pullTeamOffData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-team-offense.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get team offense table
            table = soup.find('table', {'id': 'offense'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers (first header row)
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_offense_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_offense_df.to_csv('ncaaf_team_offense_1999_2019.csv', index=False)


In [14]:
pullTeamOffData(years_ncaa)

[['https://www.sports-reference.com/cfb/years/1999-team-offense.html', ValueError('No tables found')]]


In [9]:
# get ncaa team defense data
def pullTeamDefData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-team-defense.html'
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # get html
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get team defense table
            table = soup.find('table', {'id': 'defense'})
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append dataframe to dataframes list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_defense_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_defense_df.to_csv('ncaaf_team_defense_1999_2019.csv', index=False)


In [10]:
pullTeamDefData(years_ncaa)

[['https://www.sports-reference.com/cfb/years/1999-team-defense.html', ValueError('No tables found')]]


In [52]:
# get NCAAF ratings
def pullTeamRatings(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-ratings.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get ratings table
            table = soup.find('table', {'id': 'ratings'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers (first header row)
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_ratings_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_ratings_df.to_csv('ncaaf_team_ratings_1999_2019.csv', index=False)


In [53]:
pullTeamRatings(years_ncaa)

[]


#### 3.4 Pull NFL team stats

In [4]:
# get nfl team stats
def pullNFLTeamStats(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/years/' + year + '/'
            # get html
            delay = np.random.choice(delays)
            time.sleep(delay)
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get NFL team stats
            table = soup.find_all('table', {'id': ['AFC', 'NFC']})
            # read table as dataframe
            tables = pd.read_html(str(table))
            
            # concatenate tables into single dataframe
            df = pd.concat(tables, ignore_index=True)
            
            # remove division rows
            df = df[~df['Tm'].isin(divisions)]
            
            # add year
            df.insert(0, "Year", year)
            
            # append dataframe to dataframes list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    nfl_team_stats_df = pd.concat(dfs, ignore_index=True)
    
    # save dataframe to csv
    nfl_team_stats_df.to_csv('nfl_team_stats_1999_2019.csv', index=False)

In [5]:
pullNFLTeamStats(years_ncaa)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




#### 3.5 Pull player stats (minimum 14 Att/G, 75% of school games played)

In [None]:
# get ncaa player data
def pullPlayerData(years, tableID):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    # list of tables with multilevel headers
    multi_list = ['passing','rushing','receiving']
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-' + tableID + '.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get table
            table = soup.find('table', {'id': tableID})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            if tableID in multi_list:
                df.columns = df.columns.droplevel(0)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    out_df = pd.concat(dfs, ignore_index=True)
    out_df.to_csv('ncaa_player_' + tableID + '_stats_1999_2019.csv', index=False)

In [None]:
for table in player_tables:
    pullPlayerData(years, table)

### 3.6 Pull Consensus All-Americans

In [159]:
# pull All-Americans
def pullAllAmericans(years):

    base_url = 'https://www.sports-reference.com'
    lst = []
    
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '.html'
            
            # put random delays between get requests 
            delay = np.random.choice(delays)
            time.sleep(delay)
            
            # get html
            res = requests.get(url, headers = headers)
            
            # Work around comments
            comm = re.compile("<!--|-->")
            soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
            for row in soup.find_all('div', id = 'div_all_americans'):
                for p in row.find_all('p'):
                    for a in p.find_all('a',limit=1):
                        line = [year, p.text.split(',')[0].replace('*',''), p.text.split(',')[1].replace(' ',''), p.text.split(',')[2].lstrip(), base_url + a['href']]
                    
                    lst.append(line)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # add list to dataframe
    df=pd.DataFrame(lst,columns=['Year','Player','Pos','School','NCAA_Link'])
    # save as csv
    df.to_csv('ncaa_all_americans__1999_2019.csv', index=False)
    

In [160]:
pullAllAmericans(years_ncaa)

[]


In [7]:
def PullNFLCombineResults(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://nflcombineresults.com/nflcombinedata_expanded.php?year=' + year + ' &pos=&college='
                        
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get ratings table
            table = soup.find('table', {'class': 'sortable'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            
            # append df to dfs list
            dfs.append(df)
            
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    combineresults_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    combineresults_df.to_csv('nflcombineresults_2000_2020.csv', index=False)

In [8]:
PullNFLCombineResults(years_nfl)

       Year              Name             College  POS  Height (in)  \
0    2000.0    Darnell Alford      Boston College   OT         76.0   
1    2000.0      Kyle Allamon          Texas Tech   TE         74.5   
2    2000.0  Rashard Anderson  Jackson State (MS)   CB         74.4   
3    2000.0      Corey Atkins      South Carolina  OLB         72.4   
4    2000.0     Reggie Austin         Wake Forest   CB         69.4   
..      ...               ...                 ...  ...          ...   
254  2000.0     Destry Wright  Jackson State (MS)   RB         71.3   
255  2000.0      Spergon Wynn         Texas State   QB         75.4   
256  2000.0     Bashir Yamini                Iowa   WR         75.3   
257  2000.0       Brian Young       Texas-El Paso   DE         74.4   
258     NaN               NaN                 NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           334.0           10.50            34.00        NaN     5.58   

       Year              Name         College  POS  Height (in)  Weight (lbs)  \
0    2003.0   Khalid Abdullah  Mars Hill (NC)  OLB         74.1         227.0   
1    2003.0     Anthony Adams      Penn State   DT         71.6         299.0   
2    2003.0         Sam Aiken  North Carolina   WR         73.9         209.0   
3    2003.0   Nnamdi Asomugha      California   CB         74.4         213.0   
4    2003.0        Rod Babers           Texas   CB         68.8         192.0   
..      ...               ...             ...  ...          ...           ...   
255  2003.0  George Wrighster          Oregon   TE         74.1         249.0   
256  2003.0     Thomas Wright  Michigan State   SS         73.4         201.0   
257  2003.0      Walter Young        Illinois   WR         76.6         214.0   
258  2003.0      Doug Zeigler     Mississippi   TE         76.0         257.0   
259     NaN               NaN             NaN  NaN          NaN           NaN   

     Hand Size (in)  Arm Le

       Year             Name              College  POS  Height (in)  \
0    2006.0    Darrell Adams            Villanova   DE       76.625   
1    2006.0      Titus Adams             Nebraska   DT       75.380   
2    2006.0    Jahmile Addae        West Virginia   FS       70.250   
3    2006.0     Joseph Addai      Louisiana State   RB       71.250   
4    2006.0  Victor Adeyanju              Indiana   DE       76.250   
..      ...              ...                  ...  ...          ...   
350  2006.0    Claude Wroten      Louisiana State   DT       74.000   
351  2006.0     Justin Wyatt  Southern California   CB       69.125   
352  2006.0      James Wyche             Syracuse   DE       77.400   
353  2006.0      Vince Young                Texas   QB       76.600   
354     NaN              NaN                  NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           277.0            9.63            33.50        NaN     5.08   

       Year               Name           College  POS  Height (in)  \
0    2009.0     Spencer Adkins        Miami (FL)  ILB       71.125   
1    2009.0         Al Afalava      Oregon State   SS       70.875   
2    2009.0        Kevin Akins    Boston College  OLB       74.000   
3    2009.0        Asher Allen           Georgia   CB       69.500   
4    2009.0        Roger Allen  Missouri Western   OG       74.750   
..      ...                ...               ...  ...          ...   
378  2009.0      Morris Wooten     Arizona State  ILB       71.625   
379  2009.0  Jahi Word-Daniels      Georgia Tech   CB       72.000   
380  2009.0     DeAndre Wright        New Mexico   CB       70.750   
381  2009.0        Jarius Wynn           Georgia   DE       74.625   
382     NaN                NaN               NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           230.0             NaN              NaN        NaN     9.99   
1          

       Year             Name           College  POS  Height (in)  \
0    2012.0    Emmanuel Acho             Texas  OLB       73.625   
1    2012.0       Jeff Adams     Columbia (NY)   OT       78.000   
2    2012.0        Joe Adams          Arkansas   WR       70.625   
3    2012.0       Mike Adams        Ohio State   OT       79.250   
4    2012.0     Chas Alecxih        Pittsburgh   DT       75.750   
..      ...              ...               ...  ...          ...   
373  2012.0     Desmond Wynn           Rutgers   OG       77.500   
374  2012.0    Kevin Zeitler         Wisconsin   OG       75.875   
375  2012.0    Greg Zuerlein  Missouri Western    K       72.000   
376  2012.0  Markus Zusevics              Iowa   OT       77.375   
377     NaN              NaN               NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           238.0           10.00            33.00        NaN     4.64   
1           306.0             NaN  

       Year               Name              College  POS  Height (in)  \
0    2015.0     Ameer Abdullah             Nebraska   RB       68.750   
1    2015.0     Nelson Agholor  Southern California   WR       72.125   
2    2015.0      Malcolm Agnew    Southern Illinois   RB       69.125   
3    2015.0          Jay Ajayi          Boise State   RB       71.750   
4    2015.0  Brandon Alexander      Central Florida   FS       72.000   
..      ...                ...                  ...  ...          ...   
752  2015.0     Sterling Young       Arkansas State   FS       73.375   
753  2015.0   Konrad Zagzebski            Wisconsin   DE       75.000   
754  2015.0        Zach Zenner   South Dakota State   FB       71.500   
755  2015.0        Zach Zwinak           Penn State   FB       73.000   
756     NaN                NaN                  NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           205.0            8.63            30.0

       Year                 Name               College  POS  Height (in)  \
0    2018.0           Josh Adams            Notre Dame   RB       73.625   
1    2018.0        Matthew Adams               Houston  ILB       72.000   
2    2018.0           Tony Adams  North Carolina State   OG       73.375   
3    2018.0  Olasunkanmi Adeniyi                Toledo   DE       73.125   
4    2018.0         Jordan Akins       Central Florida   TE       75.000   
..      ...                  ...                   ...  ...          ...   
608  2018.0        Jonathan Wynn            Vanderbilt   DE       76.250   
609  2018.0         Isaac Yiadom        Boston College   CB       72.875   
610  2018.0          Kenny Young                  UCLA  ILB       73.000   
611  2018.0         Trevon Young            Louisville   DE       75.875   
612     NaN                  NaN                   NaN  NaN          NaN   

     Weight (lbs)  Hand Size (in)  Arm Length (in)  Wonderlic  40 Yard  \
0           2

#### Pull player stats (minimum 14 Att/G, 75% of school games played) -- OBSOLETE

In [12]:
'''
# get ncaa player passing data
def pullPassingData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-passing.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get passing table
            table = soup.find('table', {'id': 'passing'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            df.columns = df.columns.droplevel(0)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    #print(len(ncaa_links))
    #print(len(player_ncaa_ids))
    #print(df.head())
    passing_df = pd.concat(dfs, ignore_index=True)
    passing_df.to_csv('ncaa_player_passing_stats_1999_2019.csv', index=False)
'''

In [13]:
#pullPassingData(years_ncaa)

[]


In [15]:
'''
# get ncaa player rushing data
def pullRushingData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-rushing.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get rushing table
            table = soup.find('table', {'id': 'rushing'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            df.columns = df.columns.droplevel(0)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    #print(len(ncaa_links))
    #print(len(player_ncaa_ids))
    #print(df.head())
    rushing_df = pd.concat(dfs, ignore_index=True)
    rushing_df.to_csv('ncaa_player_rushing_stats_1999_2019.csv', index=False)
'''

In [16]:
#pullRushingData(years_ncaa)

[]


In [19]:
'''
# get ncaa player receiving data
def pullReceivingData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-receiving.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get receiving table
            table = soup.find('table', {'id': 'receiving'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            df.columns = df.columns.droplevel(0)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    #print(len(ncaa_links))
    #print(len(player_ncaa_ids))
    #print(df.head())
    receiving_df = pd.concat(dfs, ignore_index=True)
    receiving_df.to_csv('ncaa_player_receiving_stats_1999_2019.csv', index=False)
'''

In [20]:
#pullReceivingData(years_ncaa)

[]


In [23]:
'''
# get ncaa player kicking data
def pullKickingData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-kicking.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get kicking table
            table = soup.find('table', {'id': 'kicking'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    #print(len(ncaa_links))
    #print(len(player_ncaa_ids))
    #print(df.head())
    kicking_df = pd.concat(dfs, ignore_index=True)
    kicking_df.to_csv('ncaa_player_kicking_stats_1999_2019.csv', index=False)
'''

In [24]:
#pullKickingData(years_ncaa)

[['https://www.sports-reference.com/cfb/years/1999-kicking.html', ValueError('No tables found')]]


In [25]:
'''
# get ncaa player punting data
def pullPuntingData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-punting.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get punting table
            table = soup.find('table', {'id': 'punting'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    #print(len(ncaa_links))
    #print(len(player_ncaa_ids))
    #print(df.head())
    punting_df = pd.concat(dfs, ignore_index=True)
    punting_df.to_csv('ncaa_player_punting_stats_1999_2019.csv', index=False)
'''

In [26]:
#pullPuntingData(years_ncaa)

[['https://www.sports-reference.com/cfb/years/1999-punting.html', ValueError('No tables found')]]
