In [1]:
# import libraries for web scraping
import requests
from bs4 import BeautifulSoup
import selenium as se
import pandas as pd
import numpy as np
import time

`get_table` looks up the table by its id, `fantasy`. Relying on the id alone is not a robust web-scraping approach: if the script cannot find a table with that id, it should fall back to an alternative lookup. This makes `get_table` perhaps the most fragile function here.

`get_headers` assumes that the table has two header rows (similar to a pandas MultiIndex).

`get_data` also rests on the assumption that the table includes two header rows: it skips the first two `tr` elements and treats every remaining row as data.

In [3]:

# get the proxy
# possible solution: use https://www.scraperapi.com/solutions/scraping-api/ to get the proxy
def get_proxy(timeout=10):
    """Fetch a proxy from gimmeproxy.com and format it for ``requests``.

    Args:
        timeout: Seconds to wait for the proxy API before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        A dict with ``'http'`` and ``'https'`` proxy URLs suitable for the
        ``proxies=`` argument of ``requests.get``, or ``None`` if the proxy
        could not be obtained (the error is printed).
    """
    try:
        # ask the API for a proxy; fail fast on a non-2xx answer instead of
        # trying to read 'ip'/'port' from an error payload
        response = requests.get(
            'https://gimmeproxy.com/api/getProxy', timeout=timeout
        )
        response.raise_for_status()
        proxy = response.json()

        # get the ip and port
        ip = proxy['ip']
        port = proxy['port']

        # create the proxy mapping
        # NOTE(review): the 'https' entry uses an https:// proxy URL, which
        # requires the proxy itself to speak TLS -- confirm the returned
        # proxies support that.
        return {
            'http': f'http://{ip}:{port}',
            'https': f'https://{ip}:{port}'
        }
    except Exception as e:
        # best-effort: callers treat None as "no proxy available"
        print('Error getting proxy')
        print(e)
        return None


# get table from url with id fantasy
# handle errors
def get_table(url, timeout=30):
    """Download ``url`` and return its ``<table id="fantasy">`` element.

    If the first request is answered with HTTP 429 (too many requests), the
    request is retried once through a proxy obtained from ``get_proxy``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The table element found by BeautifulSoup, or ``None`` when the page
        has no table with id ``fantasy`` or an error occurred (the error is
        printed).
    """
    try:
        # get the page
        page = requests.get(url, timeout=timeout)

        # check if 429 error and retry through a proxy
        if page.status_code == 429:
            print('Error 429: Too many requests')
            print('Getting proxy')
            proxy = get_proxy()
            if proxy is None:
                # proxies=None makes requests connect directly
                print('No proxy available, retrying without proxy')
            else:
                print('Using proxy')
            page = requests.get(url, proxies=proxy, timeout=timeout)

        # parse only after the optional retry, so the table is looked up in
        # the response that was actually kept (not in the 429 error page)
        soup = BeautifulSoup(page.content, 'html.parser')

        # find and return the table with id fantasy
        return soup.find('table', {'id': 'fantasy'})
    except Exception as e:
        print('Error getting table')
        # print the error
        print(e)
        return None

# extract the header labels
# the first and second tr tags in the table are treated as header rows
def get_headers(table):
    """Return the text of the table's first two header rows as a DataFrame.

    Args:
        table: Parsed table element; must support ``find_all('tr')`` and its
            rows must support ``find_all('th')``.

    Returns:
        A two-row ``pandas.DataFrame`` (row 0 = first ``tr``, row 1 = second
        ``tr``) holding the text of every ``th`` cell, or ``None`` if the
        headers could not be read (the error is printed).
    """
    try:
        all_rows = table.find_all('tr')
        # indexing (rather than slicing) keeps the failure explicit when the
        # table has fewer than two rows
        header_rows = (all_rows[0], all_rows[1])

        # one list of th texts per header row
        labels = [
            [cell.getText() for cell in header_row.find_all('th')]
            for header_row in header_rows
        ]

        return pd.DataFrame(labels)
    except Exception as err:
        print('Error getting headers')
        print(err)
        return None

# get the data
def get_data(table):
    """Extract the table's data rows as lists of cell text.

    The first two ``tr`` elements are assumed to be header rows and are
    skipped. Rows that contain no ``td`` cell at all (for example header rows
    repeated inside the table body) are skipped as well, so they do not end
    up as empty records in the result.

    Args:
        table: Parsed table element; must support ``find_all('tr')`` and its
            rows must support ``find_all('td')``.

    Returns:
        A list with one list of ``td`` texts per data row, or ``None`` if the
        rows could not be read (the error is printed).
    """
    try:
        data = []
        # rows[0:2] are the two header rows
        for row in table.find_all('tr')[2:]:
            cells = row.find_all('td')
            if not cells:
                # no td cells: not a data row
                continue
            data.append([cell.getText() for cell in cells])
        return data
    except Exception as e:
        print('Error getting data')
        print(e)
        return None


# define the indexes
# TODO: decide if we want to use MultiIndex
# games = [6, 7]
# passing = [8, 9, 10, 11, 12]
# rushing = [13, 14, 15, 16]
# receiving = [17, 18, 19, 20, 21]
# fumbles = [22, 23]
# scoring = [24, 25, 26]
# fantasy_points = [27, 28, 29, 30, 31, 32, 33]

# change the names of the headers
# 0 = rank, 1 = player, 2 = team, 3 = FantPosition, 4 = age, 5 = games, 6 = games started, 7 = passing completions
# 8 = passing attempts, 9 = passing yards, 10 = passing touchdowns, 11 = interceptions, 12 = rushing attempts
# 13 = rushing yards, 14 = rushing yards per attempt, 15 = rushing touchdowns, 16 = targets
# 17 = receptions, 18 = receiving yards, 19 = receiving yards per reception, 20 = receiving touchdowns
# 21 = fumbles, 22 = fumbles lost, 23 = scoring touchdowns, 24 = scoring 2-point conversions made
# 25 = 2-point conversion passes, 26 = fantasy points, 27 = fantasy points ppr (point per reception) league
# 28 = fantasy points draft kings, 29 = fantasy points fanduel, 30 = fantasy points above baseline, 31 = positional rank
# 32 = overall rank

# local import so this cell runs on its own; used to create the output folder
import os

# define the years: 1970-2023
years = np.arange(1970, 2024)

# make sure the output folder exists before the first to_csv call
output_dir = 'data/raw/league'
os.makedirs(output_dir, exist_ok=True)

# loop through the years
# wait between requests
for year in years:
    # define the url
    url = f'https://www.pro-football-reference.com/years/{year}/fantasy.htm#fantasy'

    # get the table, split into headers and data
    table = get_table(url)
    if table is None:
        # download failed or the page has no table with id fantasy
        print(f'Skipping {year}: table not found')
    else:
        headers = get_headers(table)
        data = get_data(table)

        if headers is None or data is None:
            # the helpers already printed the underlying error
            print(f'Skipping {year}: could not parse table')
        else:
            # the second header row holds the column names; its first entry
            # is dropped because the data rows are built from td cells only
            df = pd.DataFrame(data, columns=headers.iloc[1, 1:])

            # save the data
            df.to_csv(f'{output_dir}/fantasy_{year}.csv', index=False)

    # wait between 1 and 5 seconds, also after a failed year, so the site is
    # not hit again immediately
    wait = np.random.randint(1, 6)
    time.sleep(wait)
