## Part A: Data Collection 

Running back (RB) player game data for the 2014–2024 seasons is web-scraped from the Pro Football Reference (Stathead) site.


### Websites Used for Support
- [BrowserStack - Download File using Selenium](https://www.browserstack.com/guide/download-file-using-selenium-python)
- [GeeksForGeeks - Scrape and Save Table using Selenium](https://www.geeksforgeeks.org/scrape-and-save-table-data-in-csv-file-using-selenium-in-python/#)
- [RealPython - Modern Web Automation with Selenium](https://realpython.com/modern-web-automation-with-python-and-selenium/#locate-elements-in-the-dom) 
- [StackOverflow - Wait for file to be downloaded in Selenium](https://stackoverflow.com/questions/63637077/how-to-wait-for-a-file-to-be-downloaded-in-selenium-and-python-before-moving-for)

Semi-Automated Data Extraction:

Selenium opens Google Chrome,<br>
the user downloads the Excel file manually,<br>
and the file is then renamed according to the offset in its URL.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
import os, time

# File download paths.
# Chrome profile directory handed to undetected_chromedriver below.
# NOTE(review): the folder name suggests it holds a logged-in Stathead
# session that is reused between runs -- confirm.
selenium_profile_path = r"C:\SeleniumProfiles\StatheadSession"
# Downloads land in a "selenium_downloads" folder under the current working
# directory; it is created on first run.
download_dir = os.path.join(os.getcwd(), "selenium_downloads")
os.makedirs(download_dir, exist_ok=True)

# Chrome options setup.
options = uc.ChromeOptions()
options.user_data_dir = selenium_profile_path
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
# Chrome download preferences. Every download-related key lives under the
# "download." namespace; the previous bare "directory_upgrade" key is not a
# Chrome preference and was silently ignored.
prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}
options.add_experimental_option("prefs", prefs)

# headless=False: the workflow needs a visible window because the user clicks
# the site's download button by hand.
driver = uc.Chrome(options=options, user_data_dir=selenium_profile_path, headless=False)

# Detect if download is finished
def download_complete(directory=None):
    """Return True when no partial Chrome download is present in a directory.

    Chrome writes in-progress downloads with a ``.crdownload`` suffix, so the
    absence of such files is used as the "finished" signal.

    Args:
        directory: Folder to inspect. Defaults to the notebook-level
            ``download_dir`` when omitted, which keeps existing
            ``download_complete()`` calls working unchanged.

    Returns:
        bool: ``False`` while at least one ``.crdownload`` file exists,
        ``True`` otherwise. Note that ``True`` is also returned when no
        download has been started yet, so callers that need "a file has
        arrived" must additionally check for the new file themselves.
    """
    print("Checking if download is complete")
    target_dir = download_dir if directory is None else directory
    return not any(name.endswith(".crdownload") for name in os.listdir(target_dir))


max_rows = 71000  # Estimated total row count, based on the website query
# Query URL without the paging parameter; "&offset=N" is appended per page.
base_url = "https://stathead.com/football/player-game-finder.cgi?request=1&timeframe=seasons&match=player_game&qb_start_num_career_max=400&season_end=-1&rookie=N&team_game_num_season_min=1&weight_max=500&comp_type=reg&qb_start_num_career_min=1&player_game_num_career_min=1&draft_pick_type=overall&player_game_num_career_max=400&year_min=2014&year_max=2024&season_start=1&season_positions[]=rb&player_game_num_season_min=1&week_num_season_max=22&team_game_num_season_max=17&week_num_season_min=1&player_game_num_season_max=18&order_by=fantasy_points&cstat[1]=rush_att&ccomp[1]=gt&cval[1]=1"
print(f"File will populate here: {download_dir}")
files_preDownload = set(os.listdir(download_dir))
print(f"Content before download: {files_preDownload}")


def _wait_for_new_download(known_files, timeout=600, poll_interval=5):
    """Block until a new spreadsheet file has fully arrived in download_dir.

    The previous check only looked for the absence of ``.crdownload`` files,
    which is also true *before* the user has clicked "download"; the loop then
    skipped ahead without a file. This helper instead waits for a new
    ``.xls``/``.xlsx`` file to appear and for no partial download to remain.

    Args:
        known_files: Set of file names present before the page was opened.
        timeout: Maximum number of seconds to wait for the user's download.
        poll_interval: Seconds to sleep between directory checks.

    Returns:
        set[str]: Names of the new spreadsheet files, or an empty set if
        nothing arrived before the timeout.
    """
    deadline = time.monotonic() + timeout
    while True:
        appeared = set(os.listdir(download_dir)) - known_files
        spreadsheets = {f for f in appeared if f.endswith((".xls", ".xlsx"))}
        if spreadsheets and download_complete():
            return spreadsheets
        if time.monotonic() >= deadline:
            return set()
        time.sleep(poll_interval)


try:
    # Pages are requested in steps of 200 rows via the "offset" URL parameter.
    for offset in range(0, max_rows, 200):
        try:
            # Re-snapshot at the start of every page so a failure in the
            # previous iteration cannot leave a stale "before" listing.
            files_preDownload = set(os.listdir(download_dir))

            url = base_url + f"&offset={offset}"
            driver.get(url)
            print(f"🟢 Opened URL: {url}")
            time.sleep(10)

            # Wait for the user to click 'download' and for the file to land.
            new_xlsx_files = _wait_for_new_download(files_preDownload)

            # Rename file
            print("Prepping for post download workflow")
            if len(new_xlsx_files) == 1:
                original_name = new_xlsx_files.pop()
                new_name = f"Weekly-NFL-RB_stats({offset}).xlsx"
                os.rename(
                    os.path.join(download_dir, original_name),
                    os.path.join(download_dir, new_name)
                )
                print(f"Renamed: {original_name} → {new_name}")
            elif len(new_xlsx_files) > 1:
                print(f"Multiple new files detected: {new_xlsx_files}. Skipping rename")
            else:
                print("No new file detected")
        except Exception as e:
            # Best effort: report the problem and continue with the next page.
            print(f"An error occured: {e}")
finally:
    # Always close Chrome, including when the run is interrupted from the
    # keyboard (KeyboardInterrupt is not caught by `except Exception`).
    driver.quit()
    print("Complete, closing Chrome")


File will populate here: c:\Users\zacha\ENG296\ENG296_CapstoneProject\Clean-Copy\ENG296_CapstoneProject-main\selenium_downloads
Content before download: {'Trial1'}
🟢 Opened URL: https://stathead.com/football/player-game-finder.cgi?request=1&timeframe=seasons&match=player_game&qb_start_num_career_max=400&season_end=-1&rookie=N&team_game_num_season_min=1&weight_max=500&comp_type=reg&qb_start_num_career_min=1&player_game_num_career_min=1&draft_pick_type=overall&player_game_num_career_max=400&year_min=2014&year_max=2024&season_start=1&season_positions[]=rb&player_game_num_season_min=1&week_num_season_max=22&team_game_num_season_max=17&week_num_season_min=1&player_game_num_season_max=18&order_by=fantasy_points&cstat[1]=rush_att&ccomp[1]=gt&cval[1]=1&offset=0
Checking if download is complete
Prepping for post download workflow
✅ Renamed: sportsref_download.xls → Weekly-NFL-RB_stats(0).xlsx
🟢 Opened URL: https://stathead.com/football/player-game-finder.cgi?request=1&timeframe=seasons&match=pl

KeyboardInterrupt: 

Statistical Summary of a downloaded weekly stats file

In [14]:
import pandas as pd
import openpyxl 
# Pick one of the renamed download files and locate it in the download folder.
excelName = 'Weekly-NFL-RB_stats(9800).xlsx'
stats_path = os.path.join(download_dir, excelName)

# The file is parsed with pandas' HTML table reader; the first table found is
# the one summarised.
_html_tables = pd.read_html(stats_path)
sample_df = _html_tables[0]

# Descriptive statistics (count, mean, std, quartiles, ...) of that table.
summary = sample_df.describe()
print(summary)

      Unnamed: 0_level_0 Unnamed: 2_level_0 Unnamed: 3_level_0  \
                      Rk             FantPt                Att   
count         200.000000         200.000000         200.000000   
mean         9900.500000           2.334500           5.030000   
std            57.879185           0.048699           3.226632   
min          9801.000000           2.200000           1.000000   
25%          9850.750000           2.300000           2.000000   
50%          9900.500000           2.300000           4.500000   
75%          9950.250000           2.400000           7.000000   
max         10000.000000           2.400000          20.000000   

      Unnamed: 5_level_0 Unnamed: 6_level_0     Rushing              \
                      G#               Week         Att         Yds   
count         200.000000         200.000000  200.000000  200.000000   
mean            8.650000           9.170000    5.030000   16.345000   
std             4.885369           5.284004    3.226632