In [None]:
# import libraries for web scraping
import requests
from bs4 import BeautifulSoup
import selenium as se
import pandas as pd
import numpy as np
import time

# Build the pro-football-reference.com page url for every player in the
# StatHead export and save the list to scraping/player_urls.csv.

# load StatHead player ids
stathead = pd.read_csv('scraping/stat-head_player_data.csv')
template_url = 'https://www.pro-football-reference.com/players/'

# split the player names on the first space only, so everything after the
# first word of the name stays together in last_name
stathead[['first_name', 'last_name']] = stathead['Player'].str.split(' ', n=1, expand=True)

# build the player page urls: <template_url><first letter of last name>/<PlayerId>.htm
player_urls = []
skipped_players = []
for player, last_name, player_id in zip(stathead['Player'], stathead['last_name'], stathead['PlayerId']):
    # a single-word name leaves last_name missing/empty and a missing id cannot
    # form a url; skip those rows instead of crashing on last_name[0]
    if not isinstance(last_name, str) or not last_name or pd.isna(player_id):
        skipped_players.append(player)
        continue
    player_urls.append(f'{template_url}{last_name[0]}/{player_id}.htm')

if skipped_players:
    print(f'Skipped {len(skipped_players)} players without a last name or id: {skipped_players}')

# save the urls to disk (explicit dtype avoids a warning when the list is empty)
pd.Series(player_urls, dtype=object).to_csv('scraping/player_urls.csv', index=False)

For simplicity, we will only retrieve data on the top fantasy players from each season. Additionally, we will reduce the number of read operations by limiting the data we retrieve for each player. For example, we won't retrieve receiving stats for quarterbacks or passing stats for running backs.

Currently, we only have player IDs for players who were active in 2023. We need the IDs of all players, past and present.

In [None]:
import requests
from bs4 import BeautifulSoup
import selenium as se
import pandas as pd
import numpy as np
import time

# Scrape the alphabetical player index (one page per letter) and save every
# player's name and relative page url to player_urls_all.csv.
base_url = 'https://www.pro-football-reference.com/players/'
target_section_id = 'div_players'
outer_element = 'p'
inner_element = 'a'
target_attribute = 'href'
request_timeout = 30  # seconds; keeps a stalled connection from hanging the cell

# visit the letter pages in a random order
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
letters = np.random.permutation(list(letters))

# NOTE: an earlier version of this cell retried non-200 responses through a
# shuffled list of proxies read from proxies.csv; that code path is disabled.


def _scrape_index_page(url):
    """Return one {'player_name', 'player_url'} record per player link on an index page.

    Returns an empty list (after printing the reason) when the request fails,
    the response status is not 200, or the page has no element with id
    ``target_section_id``.
    """
    try:
        page = requests.get(url, timeout=request_timeout)
    except requests.RequestException as error:
        print(f'Failed to get {url}: {error}')
        return []
    if page.status_code != 200:
        print(f'Failed to get {url}: status {page.status_code}')
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    target_section = soup.find(id=target_section_id)
    if target_section is None:
        print(f'No element with id {target_section_id!r} found at {url}')
        return []

    records = []
    for outer in target_section.find_all(outer_element):
        link = outer.find(inner_element)
        if link is None:
            # an outer element without a link carries no player url; skip it
            continue
        records.append({
            'player_name': link.get_text(),
            'player_url': link.get(target_attribute),
        })
    return records


# collect flat records (one per player) instead of one list-valued row per letter
player_records = []
for letter in letters:
    url = base_url + letter
    print(url)
    player_records.extend(_scrape_index_page(url))
    # pause 10-14 seconds between requests
    wait = np.random.randint(10, 15)
    time.sleep(wait)

# build the dataframe once and save
player_info = pd.DataFrame(player_records, columns=['player_name', 'player_url'])
player_info.to_csv('player_urls_all.csv', index=False)


In [18]:
# Read the scraped name/url index back from disk, then keep only the rows
# whose player_name is exactly 'Josh Allen' as a sample to inspect.
player_urls = pd.read_csv('player_urls_all.csv')
is_josh_allen = player_urls['player_name'].eq('Josh Allen')
example = player_urls.loc[is_josh_allen]


From `example` it is clear that there are duplicates that we will need to handle. The first thing we can do is remove players who don't play the positions of interest (QB, RB, WR, TE) from `player_urls_all` and save the result to `player_urls_filtered`.

In [None]:
# Keep only the scraped player urls whose name appears in the 1970-2023 league
# data, tag each with the league position, and save the result.
import os
# NOTE(review): machine-specific absolute path; every relative path below is
# resolved against it, so it is left unchanged here.
os.chdir('/Users/ryan-saloma/Python Projects/fantasy_football/')
base_player_url = 'https://www.pro-football-reference.com'

# Load the player urls once (columns: player_name, player_url)
player_urls_all = pd.read_csv('scraping/player_urls_all.csv')
player_urls = player_urls_all

# Load league data from 1970 to 2023 and concatenate it in a single call
league_data = pd.concat(
    [pd.read_csv(f'data/league/clean/cleaned_fantasy_{year}.csv') for year in range(1970, 2024)],
    ignore_index=True,
)

# Get the unique (player, position) pairs
players = league_data.drop_duplicates(subset=['player', 'position'])[['player', 'position']]

# Match player names with player urls.
# Known problem: more than one player can have the same name, so a league
# player is matched to every url that carries that name.
# - dropna: merge would pair a missing name with a missing name, which the
#   previous equality-based match never did
# - an inner merge keeps the order of `players`, as the row-by-row loop did
matched_urls = (
    players.dropna(subset=['player'])
    .merge(player_urls, left_on='player', right_on='player_name', how='inner')
    [[*player_urls.columns, 'position']]
)

# Save the matched player urls
matched_urls.to_csv('scraping/player_urls_filtered.csv', index=False)

In [46]:
# Download the page of every filtered player url into scraping/player_pages/.
# The cell can be interrupted and re-run: pages already on disk are skipped.
import os

base_player_url = 'https://www.pro-football-reference.com'
player_pages_dir = 'scraping/player_pages/'
player_urls_filtered = pd.read_csv('scraping/player_urls_filtered.csv')['player_url']

# the same url can be listed more than once (e.g. once per matched position);
# fetch each page only once and ignore missing urls
urls_to_fetch = player_urls_filtered.dropna().drop_duplicates()

os.makedirs(player_pages_dir, exist_ok=True)

# make this work in batches to reduce the number of pages stored at any given time
for url in urls_to_fetch:
    page_path = player_pages_dir + url.split('/')[-1]
    if os.path.exists(page_path) and os.path.getsize(page_path) > 0:
        # already downloaded in an earlier run; no request, no wait
        continue

    try:
        player_page = requests.get(base_player_url + url, timeout=30)
    except requests.RequestException as error:
        # one failed request should not abort the whole download
        print(f'Failed to get page for {url}: {error}')
    else:
        if player_page.status_code == 200:
            # explicit encoding: the platform default may not be able to encode the page text
            with open(page_path, 'w', encoding='utf-8') as f:
                f.write(player_page.text)
        else:
            print(f'Failed to get page for {url}')

    # randint's upper bound is exclusive: (2, 4) waits 2 or 3 seconds,
    # whereas (2, 3) always returned 2
    wait = np.random.randint(2, 4)
    time.sleep(wait)

KeyboardInterrupt: 