# NHL Stats Preparation
## Collecting NHL Players' Stats from EliteProspects
This step gathers player performance data across leagues and seasons for downstream NHL prediction.

- Collect league-wide player stats for a given season from
https://eliteprospects.com/league/{league}/stats/{season}?page={n}

- Extract metadata, including individual player profile links.

- Build a unique player list from seasons 2000–2025 by storing profile links in a set.

- Scrape each player’s page to retrieve their first 5 years of NHL stats, filtered by seasons where they played more than 30 games.

- Gather pre-NHL stats for each player (junior, college, international, etc.).

### Import Libraries

In [1]:
import eliteprospects_scraper_api as ep
import pandas as pd
import os
import time
import random

### Collect NHL Season Rosters and Player Metadata


In [2]:
# Scrape one roster CSV per NHL season (2000-2001 through 2024-2025).
# The whole step is skipped once the merged metadata file exists.
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # Create the output folder up front: to_csv raises if it is missing, and
    # that would otherwise only surface after the first season was scraped.
    os.makedirs('./data/nhl/players', exist_ok=True)

    # Collect players from 2000-2025 season
    for i in range(0, 25):
        # Season label such as '2000-2001' (two-digit, zero-padded year suffixes)
        season = f'20{i:02d}-20{i + 1:02d}'
        print(f'Scraping {season}')
        nhl_players = ep.get_season_roster("nhl", season)
        nhl_players.to_csv(f'./data/nhl/players/nhl_players_{season}.csv', index=False, encoding='utf-8-sig')
        print(f'Finished scraping {season}')
else:
    print('NHL players metadata already exists. Skipping scraping.')

NHL players metadata already exists. Skipping scraping.


In [3]:
# Build the de-duplicated player metadata table from the per-season roster
# CSVs, or load it from the cached CSV when that file already exists.
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # Collect players' metadata from 2000-2025 season
    season_metadata = []
    for i in range(0, 25):
        season = f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}'
        nhl_players = pd.read_csv(f'./data/nhl/players/nhl_players_{season}.csv')
        # Gather the per-season frames in a list and concatenate once below,
        # instead of re-copying the growing frame on every iteration.
        season_metadata.append(ep.get_players_metadata(nhl_players))
        print(f'Finished Merging Metadata for {season}')

    # Remove duplicates by profile link rather than by name: two different
    # players can share a name, and de-duplicating on 'playername' would
    # silently drop one of them. The first occurrence of each link is kept.
    # NOTE(review): assumes a player's link is identical across seasons — confirm.
    nhl_players_metadata = (
        pd.concat(season_metadata)
        .drop_duplicates(subset=['link'])
        .reset_index(drop=True)
    )

    # Persist the merged table so the existence checks that guard the
    # scraping/merging cells can actually find it on the next run.
    nhl_players_metadata.to_csv('./data/nhl/nhl_players_metadata.csv', index=False, encoding='utf-8-sig')
else:
    print('NHL players metadata already exists. Skipping merging.')
    nhl_players_metadata = pd.read_csv('./data/nhl/nhl_players_metadata.csv')


NHL players metadata already exists. Skipping merging.


In [4]:
# Preview the merged metadata table (the notebook renders a truncated head/tail view)
nhl_players_metadata

Unnamed: 0,playername,fw_def,link
0,Jaromír Jágr,FW,https://www.eliteprospects.com/player/8627/jar...
1,Joe Sakic,FW,https://www.eliteprospects.com/player/8862/joe...
2,Patrik Elias,FW,https://www.eliteprospects.com/player/8698/pat...
3,Alexei Kovalev,FW,https://www.eliteprospects.com/player/8670/ale...
4,Jason Allison,FW,https://www.eliteprospects.com/player/9064/jas...
...,...,...,...
3923,Nikita Grebyonkin,FW,https://www.eliteprospects.com/player/652832/n...
3924,Noah Östlund,FW,https://www.eliteprospects.com/player/535699/n...
3925,Matt Stienburg,FW,https://www.eliteprospects.com/player/267112/m...
3926,Nikita Prishchepov,FW,https://www.eliteprospects.com/player/691679/n...


### Collect Stats for Each Player in the 2000–2025 Metadata CSV

In [5]:
# Define output file
# CSV that accumulates the stats of every successfully scraped player
output_path = './data/nhl/stats/nhl_players_stats.csv'
# CSV that records the metadata of players whose scrape failed on every attempt
failed_path = './data/nhl/stats/failed_players.csv'

In [6]:
# Define function to get player stats with retries and start, end
# Use ep.get_single_player_stats()

def _append_to_csv(frame, path):
    """Append ``frame`` to the CSV at ``path``; the header is written only when the file is new."""
    # Create the target folder if needed so a missing directory is not
    # mistaken for a scraping failure.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    frame.to_csv(path, mode='a', header=not os.path.exists(path), index=False, encoding='utf-8-sig')


def get_player_stats_with_retries(start = 0, end = None, retries = 3):
    """Scrape stats for the players at positions ``start`` (inclusive) to ``end`` (exclusive).

    Each player's stats are appended to ``output_path``. A player whose scrape
    raises on every attempt has their metadata row appended to ``failed_path``.

    Parameters
    ----------
    start : int
        Position of the first player in ``nhl_players_metadata``. When it is 0,
        any existing ``output_path`` and ``failed_path`` files are deleted first.
    end : int or None
        Position one past the last player. ``None`` (the default) or any value
        beyond the table length means "up to the last player".
    retries : int
        Maximum number of scrape attempts per player.

    Notes
    -----
    Assumes ``ep.get_single_player_stats`` returns a pandas DataFrame — TODO confirm.
    """
    # Resolve the upper bound at call time so it always reflects the current
    # metadata table rather than its length when this function was defined.
    total_players = len(nhl_players_metadata)
    if end is None or end > total_players:
        end = total_players
    print(f'Collecting stats for players {start} to {end}')

    # Delete files if it's the first player
    if start == 0:
        if os.path.exists(output_path):
            os.remove(output_path)
            print(f'Deleted {output_path} to start fresh')
        if os.path.exists(failed_path):
            os.remove(failed_path)
            print(f'Deleted {failed_path} to start fresh')

    for i in range(start, end):
        # Print divider for every 10 players
        if i % 10 == 0:
            print('----------------------------------------------------------------')

        # Get player metadata (positional lookup, matching the positional bounds above)
        player_metadata_row = nhl_players_metadata.iloc[i]
        player_name = player_metadata_row['playername']
        player_url = player_metadata_row['link']

        for attempt in range(retries):
            try:
                print(f"Collecting stats for {player_name} at {player_url} (Attempt {attempt + 1})")
                player_stats = ep.get_single_player_stats(player_metadata_row)

                # Write player_stats to CSV
                _append_to_csv(player_stats, output_path)
                print(f'Successfully scraped stats for {player_name}')
                break
            except Exception as e:
                # Deliberately broad: any scraping or driver error triggers a retry.
                print(f"Attempt {attempt + 1} failed for {player_name}: {e}")
                if attempt < retries - 1:
                    kill_zombie_chrome()
                    time.sleep(random.uniform(1, 3))  # Wait before retrying
                else:
                    # Log failed players to a separate CSV file. The Series is
                    # turned into a one-row frame first; Series.to_csv would
                    # write each field on its own line instead of one row per player.
                    _append_to_csv(player_metadata_row.to_frame().T, failed_path)
                    print(f'Failed to scrape stats for {player_name} after {retries} attempts')

        # Sleep for 1-10 seconds before scraping the next player
        time.sleep(random.uniform(1, 10))


In [7]:
def kill_zombie_chrome():
    """Force-terminate leftover Chrome and chromedriver processes.

    Runs two ``taskkill`` shell commands through ``os.system``; their exit
    codes are ignored. ``taskkill`` is a Windows command and the Chrome image
    name is matched as a whole, so every running ``chrome.exe`` is targeted.
    """
    kill_commands = (
        "taskkill /F /IM chrome.exe /T",
        "taskkill /F /IM chromedriver.exe /T",
    )
    for command in kill_commands:
        os.system(command)

In [8]:
def get_player_stats_with_batch_size(start, end, batch_size=10):
    """Scrape players ``start`` (inclusive) to ``end`` (exclusive) in batches.

    After every batch, leftover Chrome processes are killed and the function
    sleeps for a random 1-5 minutes before continuing.

    Parameters
    ----------
    start : int
        Position of the first player to scrape.
    end : int
        Position one past the last player to scrape; values beyond the length
        of ``nhl_players_metadata`` are clamped to it.
    batch_size : int
        Number of players per batch.
    """
    # Clamp to the table length so an oversized `end` does not produce empty
    # batches that still sleep for minutes.
    end = min(end, len(nhl_players_metadata))

    for batch_start in range(start, end, batch_size):
        # The last batch may be shorter: never run past the requested end.
        batch_end = min(batch_start + batch_size, end)
        get_player_stats_with_retries(batch_start, batch_end)
        print(f'Finished scraping stats for players {batch_start} to {batch_end}')

        time_sleep = random.uniform(60, 300)
        time_sleep_in_minutes = time_sleep / 60
        kill_zombie_chrome()
        print(f"Sleeping for {time_sleep_in_minutes:.2f} minutes before next batch...")
        time.sleep(time_sleep)

In [None]:
# Scrape players 0-99 in batches of 10.
# NOTE: because this run starts at index 0, get_player_stats_with_retries first deletes
# any existing stats and failed-player CSVs, so re-running this cell restarts the scrape.
get_player_stats_with_batch_size(0, 100, 10)

In [None]:
# Scrape players 110-199 in batches of 10.
# NOTE(review): this call starts at 110, not 100 — confirm players 100-109 were
# already scraped in an earlier run; otherwise they are missing from the output.
get_player_stats_with_batch_size(110, 200, 10)

Collecting stats for players 110 to 120
----------------------------------------------------------------
Collecting stats for Ulf Dahlén at https://www.eliteprospects.com/player/692/ulf-dahlen (Attempt 1)
Collecting regular season stats from https://www.eliteprospects.com/player/692/ulf-dahlen
Collecting postseason stats from https://www.eliteprospects.com/player/692/ulf-dahlen
Successfully scraped stats for Ulf Dahlén
Collecting stats for Éric Desjardins at https://www.eliteprospects.com/player/9133/eric-desjardins (Attempt 1)
Collecting regular season stats from https://www.eliteprospects.com/player/9133/eric-desjardins
Collecting postseason stats from https://www.eliteprospects.com/player/9133/eric-desjardins


In [None]:
# Scrape players 200-299 in batches of 10
get_player_stats_with_batch_size(200, 300, 10)

In [None]:
# Scrape players 300-399 in batches of 10
get_player_stats_with_batch_size(300, 400, 10)

In [None]:
# Scrape players 400-499 in batches of 10
get_player_stats_with_batch_size(400, 500, 10)

In [None]:
# Scrape players 500-599 in batches of 10
get_player_stats_with_batch_size(500, 600, 10)

In [None]:
# Scrape players 600-699 in batches of 10
get_player_stats_with_batch_size(600, 700, 10)

In [None]:
# Scrape players 700-799 in batches of 10
get_player_stats_with_batch_size(700, 800, 10)

In [None]:
# Scrape players 800-899 in batches of 10
get_player_stats_with_batch_size(800, 900, 10)

In [None]:
# Scrape players 900-999 in batches of 10
get_player_stats_with_batch_size(900, 1000, 10)

In [None]:
# Scrape players 1000-1099 in batches of 10
get_player_stats_with_batch_size(1000, 1100, 10)

In [None]:
# Scrape players 1100-1199 in batches of 10
get_player_stats_with_batch_size(1100, 1200, 10)

In [None]:
# Scrape players 1200-1299 in batches of 10
get_player_stats_with_batch_size(1200, 1300, 10)

In [None]:
# Scrape players 1300-1399 in batches of 10
get_player_stats_with_batch_size(1300, 1400, 10)

In [None]:
# Scrape players 1400-1499 in batches of 10
get_player_stats_with_batch_size(1400, 1500, 10)

In [None]:
# Scrape players 1500-1599 in batches of 10
get_player_stats_with_batch_size(1500, 1600, 10)

In [None]:
# Scrape players 1600-1699 in batches of 10
get_player_stats_with_batch_size(1600, 1700, 10)

In [None]:
# Scrape players 1700-1799 in batches of 10
get_player_stats_with_batch_size(1700, 1800, 10)

In [None]:
# Scrape players 1800-1899 in batches of 10
get_player_stats_with_batch_size(1800, 1900, 10)

In [None]:
# Scrape players 1900-1999 in batches of 10
get_player_stats_with_batch_size(1900, 2000, 10)

In [None]:
# Scrape players 2000-2099 in batches of 10
get_player_stats_with_batch_size(2000, 2100, 10)

In [None]:
# Scrape players 2100-2199 in batches of 10
get_player_stats_with_batch_size(2100, 2200, 10)

In [None]:
# Scrape players 2200-2299 in batches of 10
get_player_stats_with_batch_size(2200, 2300, 10)

In [None]:
# Scrape players 2300-2399 in batches of 10
get_player_stats_with_batch_size(2300, 2400, 10)

In [None]:
# Scrape players 2400-2499 in batches of 10
get_player_stats_with_batch_size(2400, 2500, 10)

In [None]:
# Scrape players 2500-2599 in batches of 10
get_player_stats_with_batch_size(2500, 2600, 10)

In [None]:
# Scrape players 2600-2699 in batches of 10
get_player_stats_with_batch_size(2600, 2700, 10)

In [None]:
# Scrape players 2700-2799 in batches of 10
get_player_stats_with_batch_size(2700, 2800, 10)

In [None]:
# Scrape players 2800-2899 in batches of 10
get_player_stats_with_batch_size(2800, 2900, 10)

In [None]:
# Scrape players 2900-2999 in batches of 10
get_player_stats_with_batch_size(2900, 3000, 10)

In [None]:
# Scrape players 3000-3099 in batches of 10
get_player_stats_with_batch_size(3000, 3100, 10)

In [None]:
# Scrape players 3100-3199 in batches of 10
get_player_stats_with_batch_size(3100, 3200, 10)

In [None]:
# Scrape players 3200-3299 in batches of 10
get_player_stats_with_batch_size(3200, 3300, 10)

In [None]:
# Scrape players 3300-3399 in batches of 10
get_player_stats_with_batch_size(3300, 3400, 10)

In [None]:
# Scrape players 3400-3499 in batches of 10
get_player_stats_with_batch_size(3400, 3500, 10)

In [None]:
# Scrape players 3500-3599 in batches of 10
get_player_stats_with_batch_size(3500, 3600, 10)

In [None]:
# Scrape players 3600-3699 in batches of 10
get_player_stats_with_batch_size(3600, 3700, 10)

In [None]:
# Scrape players 3700-3799 in batches of 10
get_player_stats_with_batch_size(3700, 3800, 10)

In [None]:
# Scrape players 3800-3899 in batches of 10
get_player_stats_with_batch_size(3800, 3900, 10)

In [None]:
# Scrape the remaining players (3900 through the last row of nhl_players_metadata) in batches of 10
get_player_stats_with_batch_size(3900, len(nhl_players_metadata), 10)