# NHL Stats Preparation
## Collecting NHL Players' Stats from EliteProspects
This step gathers player performance data across leagues and seasons for downstream NHL prediction.

- Collect league-wide player stats for a given season from
https://eliteprospects.com/league/{league}/stats/{season}?page={n}

- Extract metadata, including individual player profile links.

- Build a unique player list from seasons 2000–2025 by storing profile links in a set.

- Scrape each player’s page to retrieve their first 5 years of NHL stats, filtered by seasons where they played more than 30 games.

- Gather pre-NHL stats for each player (junior, college, international, etc.).

### Import Libraries

In [1]:
import eliteprospects_scraper_api as ep
import pandas as pd
import os
import time
import random

### Collect NHL Season Rosters and Build the Unique Player Metadata List (2000–2025)


In [2]:
# Scrape one roster CSV per NHL season (2000-2001 through 2024-2025).
# The whole step is skipped once the merged metadata file exists. Inside the
# step, seasons whose CSV is already on disk are skipped, so an interrupted run
# can be resumed without re-requesting seasons that were already collected.
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs('./data/nhl/players', exist_ok=True)
    # Collect players from 2000-2025 season
    for i in range(0, 25):
        season = f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}'
        season_path = f'./data/nhl/players/nhl_players_{season}.csv'
        if os.path.exists(season_path):
            print(f'{season} already scraped. Skipping.')
            continue
        print(f'Scraping {season}')
        nhl_players = ep.get_season_roster("nhl", season)
        # Write to a temporary name first and rename afterwards, so a crash
        # mid-write never leaves a truncated file that the skip check above
        # would later mistake for a finished season.
        partial_path = season_path + '.tmp'
        nhl_players.to_csv(partial_path, index=False, encoding='utf-8-sig')
        os.replace(partial_path, season_path)
        print(f'Finished scraping {season}')
else:
    print('NHL players metadata already exists. Skipping scraping.')

NHL players metadata already exists. Skipping scraping.


In [3]:
# Build (or load) the table of unique NHL players across all scraped seasons.
metadata_path = './data/nhl/nhl_players_metadata.csv'
if not os.path.exists(metadata_path):
    # Collect players' metadata from 2000-2025 season
    season_frames = []
    for i in range(0, 25):
        season = f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}'
        nhl_players = pd.read_csv(f'./data/nhl/players/nhl_players_{season}.csv')
        season_frames.append(ep.get_players_metadata(nhl_players))
        print(f'Finished Merging Metadata for {season}')

    # Concatenate once instead of growing the frame inside the loop, then
    # remove duplicates. The profile link is the unique key: two different
    # players can share the same name, and de-duplicating on 'playername'
    # would silently drop one of them.
    # NOTE(review): assumes ep.get_players_metadata() returns a 'link' column,
    # as shown in the cached metadata table -- confirm.
    nhl_players_metadata = (
        pd.concat(season_frames, ignore_index=True)
        .drop_duplicates(subset=['link'])
        .reset_index(drop=True)
    )

    # Persist the result so the existence checks in this notebook can
    # actually skip the scraping/merging on the next run.
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    nhl_players_metadata.to_csv(metadata_path, index=False, encoding='utf-8-sig')
else:
    print('NHL players metadata already exists. Skipping merging.')
    nhl_players_metadata = pd.read_csv(metadata_path)


NHL players metadata already exists. Skipping merging.


In [4]:
# Show the player metadata table that the previous cell built or loaded.
nhl_players_metadata

Unnamed: 0,playername,fw_def,link
0,Jaromír Jágr,FW,https://www.eliteprospects.com/player/8627/jar...
1,Joe Sakic,FW,https://www.eliteprospects.com/player/8862/joe...
2,Patrik Elias,FW,https://www.eliteprospects.com/player/8698/pat...
3,Alexei Kovalev,FW,https://www.eliteprospects.com/player/8670/ale...
4,Jason Allison,FW,https://www.eliteprospects.com/player/9064/jas...
...,...,...,...
3923,Nikita Grebyonkin,FW,https://www.eliteprospects.com/player/652832/n...
3924,Noah Östlund,FW,https://www.eliteprospects.com/player/535699/n...
3925,Matt Stienburg,FW,https://www.eliteprospects.com/player/267112/m...
3926,Nikita Prishchepov,FW,https://www.eliteprospects.com/player/691679/n...


### Collect Stats for Each Player Listed in the 2000–2025 Metadata CSV

In [5]:
# Define output files used by the stats-scraping functions below.
# output_path: CSV that successfully scraped player stats are appended to.
output_path = './data/nhl/stats/nhl_players_stats.csv'
# failed_path: CSV that logs the metadata of players whose scrape failed on every retry.
failed_path = './data/nhl/stats/failed_players.csv'

In [6]:
# Define function to get player stats with retries and start, end
# Use ep.get_single_player_stats()

def _append_to_csv(frame, path):
    """Append `frame` to the CSV at `path`, writing the header only when the file is new."""
    target_dir = os.path.dirname(path)
    if target_dir:
        # to_csv does not create missing directories
        os.makedirs(target_dir, exist_ok=True)
    write_header = not os.path.exists(path)
    frame.to_csv(path, mode='a', header=write_header, index=False, encoding='utf-8-sig')


def get_player_stats_with_retries(start = 0, end = None, retries = 3):
    """Scrape stats for the players in rows [start, end) of nhl_players_metadata.

    Each player's stats are appended to `output_path`. A player whose scrape
    fails on every attempt is appended as a single row to `failed_path`.

    Args:
        start: Positional index of the first player to scrape. When it is 0,
            any existing `output_path` / `failed_path` files are deleted first
            so the run starts fresh.
        end: Positional index one past the last player to scrape. None (the
            default) or a value beyond the table means "through the last
            player". Resolved at call time, so it follows the current
            nhl_players_metadata.
        retries: Maximum number of attempts per player.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    total_players = len(nhl_players_metadata)
    if end is None or end > total_players:
        end = total_players
    print(f'Collecting stats for players {start} to {end}')

    # Delete files if it's the first player
    if start == 0:
        if os.path.exists(output_path):
            os.remove(output_path)
            print(f'Deleted {output_path} to start fresh')
        if os.path.exists(failed_path):
            os.remove(failed_path)
            print(f'Deleted {failed_path} to start fresh')

    for i in range(start, end):
        # Print divider for every 10 players
        if i % 10 == 0:
            print('----------------------------------------------------------------')

        # Get player metadata
        player_metadata = nhl_players_metadata.iloc[i]
        player_name = player_metadata['playername']
        player_url = player_metadata['link']

        for attempt in range(retries):
            try:
                print(f"Collecting stats for {player_name} at {player_url} (Attempt {attempt + 1})")
                player_stats = ep.get_single_player_stats(player_metadata)

                # Write player_stats to CSV
                _append_to_csv(player_stats, output_path)
                print(f'Successfully scraped stats for {player_name}')
                break
            except Exception as e:
                # Deliberately broad: one bad player must not abort the whole batch.
                print(f"Attempt {attempt + 1} failed for {player_metadata['playername']}: {e}")
                if attempt < retries - 1:
                    time.sleep(random.uniform(1, 3))  # Wait before retrying
                else:
                    # Log failed players to a separate CSV file.
                    # player_metadata is a Series; Series.to_csv would write one
                    # field per line, so convert it to a one-row DataFrame first
                    # to get a single record with proper columns.
                    _append_to_csv(player_metadata.to_frame().T, failed_path)
                    print(f'Failed to scrape stats for {player_name} after {retries} attempts')

        # Sleep for 1-10 seconds before scraping the next player
        time.sleep(random.uniform(1, 10))


In [None]:
# Trial run: scrape players 0-99, ten at a time, pausing 5-10 minutes
# (a random duration) after every batch.
for batch_start in range(0, 100, 10):
    batch_end = batch_start + 10
    get_player_stats_with_retries(batch_start, batch_end)
    print(f'Finished scraping stats for players {batch_start} to {batch_end}')

    pause_seconds = random.uniform(300, 600)
    print(f"Sleeping for {pause_seconds / 60:.2f} minutes before next batch...")
    time.sleep(pause_seconds)

Collecting stats for players 0 to 10
Deleted ./data/nhl/stats/nhl_players_stats.csv to start fresh
----------------------------------------------------------------
Collecting stats for Jaromír Jágr at https://www.eliteprospects.com/player/8627/jaromir-jagr (Attempt 1)
Collecting regular season stats from https://www.eliteprospects.com/player/8627/jaromir-jagr
Collecting postseason stats from https://www.eliteprospects.com/player/8627/jaromir-jagr
Successfully scraped stats for Jaromír Jágr
Collecting stats for Joe Sakic at https://www.eliteprospects.com/player/8862/joe-sakic (Attempt 1)
Collecting regular season stats from https://www.eliteprospects.com/player/8862/joe-sakic
Collecting postseason stats from https://www.eliteprospects.com/player/8862/joe-sakic
Successfully scraped stats for Joe Sakic
Collecting stats for Patrik Elias at https://www.eliteprospects.com/player/8698/patrik-elias (Attempt 1)
Collecting regular season stats from https://www.eliteprospects.com/player/8698/patr

In [None]:
def get_player_stats_with_batch_size(start, end, batch_size=10):
    """Scrape players [start, end) in consecutive batches, pausing between batches.

    Each batch is delegated to get_player_stats_with_retries(). After every
    batch (including the last one) the function sleeps for a random 5-10
    minutes.

    Args:
        start: Positional index of the first player to scrape.
        end: Positional index one past the last player to scrape.
        batch_size: Maximum number of players per batch; must be at least 1.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}')

    for batch_start in range(start, end, batch_size):
        # Clamp the final batch so it never runs past the requested `end`
        # when (end - start) is not a multiple of batch_size.
        batch_end = min(batch_start + batch_size, end)
        get_player_stats_with_retries(batch_start, batch_end)
        print(f'Finished scraping stats for players {batch_start} to {batch_end}')
        time_sleep = random.uniform(300, 600)
        time_sleep_in_minutes = time_sleep / 60
        print(f"Sleeping for {time_sleep_in_minutes:.2f} minutes before next batch...")
        time.sleep(time_sleep)


In [None]:
# Scrape players 100-299 (start=100, end=300) in batches of 10
get_player_stats_with_batch_size(100, 300, 10)