# NHL Stats Preparation
## Collecting NHL Players' Stats from EliteProspects
This step gathers player performance data across leagues and seasons for downstream NHL prediction.

- Collect league-wide player stats for a given season from
https://eliteprospects.com/league/{league}/stats/{season}?page={n}

- Extract metadata, including individual player profile links.

- Build a unique player list from seasons 2000–2025 by storing profile links in a set.

- Scrape each player’s page to retrieve their first 5 years of NHL stats, filtered by seasons where they played more than 30 games.

- Gather pre-NHL stats for each player (junior, college, international, etc.).

### Import Libraries

In [1]:
import eliteprospects_scraper_api as ep
import pandas as pd
import os
import time
import random

### Collecting NHL Players' Stats from EliteProspects


In [2]:
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # Collect players from 2000-2025 season
    for i in range(0, 25):
        season = f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}'
        print(f'Scraping {season}')
        nhl_players = ep.get_season_roster("nhl", season)
        nhl_players.to_csv(f'./data/nhl/players/nhl_players_{season}.csv', index=False, encoding='utf-8-sig')
        print(f'Finished scraping {season}')
else:
    print('NHL players metadata already exists. Skipping scraping.')

NHL players metadata already exists. Skipping scraping.


In [3]:
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # Collect players' metadata from 2000-2025 season
    nhl_players_metadata = pd.DataFrame()
    for i in range(0, 25):
        season = f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}'
        nhl_players = pd.read_csv(f'./data/nhl/players/nhl_players_{season}.csv')
        nhl_players_metadata = pd.concat([nhl_players_metadata, ep.get_players_metadata(nhl_players)]).reset_index(drop=True)
        # Remove duplicates
        nhl_players_metadata = nhl_players_metadata.drop_duplicates(subset=['playername']).reset_index(drop=True)
        print(f'Finished Merging Metadata for {season}')
else:
    print('NHL players metadata already exists. Skipping merging.')
    nhl_players_metadata = pd.read_csv('./data/nhl/nhl_players_metadata.csv')


NHL players metadata already exists. Skipping merging.


### Define Function to Get Player Stats

In [4]:
# Define output file
output_path = './data/nhl/stats/nhl_players_stats.csv'

In [5]:
def get_players_stats_by_batch(players_to_scrape):
    curr_len = len(players_to_scrape)
    fail_count = 0

    for i in range(curr_len):
        player_metadata = players_to_scrape.iloc[i]
        player_name = player_metadata['playername']
        player_url = player_metadata['link']
        print(f"\n [{i + 1}] Collecting stats for {player_name} at {player_url}")

        try:
            player_stats = ep.get_player_stats(player_metadata)

            # Write to CSV file
            if os.path.exists(output_path):
                player_stats.to_csv(output_path, mode='a', header=False, index=False, encoding='utf-8-sig')
            else:
                player_stats.to_csv(output_path, index=False, encoding='utf-8-sig')
            print(f'Successfully scraped stats for {player_name}')
            
            # Print Fail Rate
            print(f'Failed rate: {fail_count / (i + 1):.2f}')

            # Add random sleep to prevent getting blocked
            if i < curr_len - 1:
                sleep_time = random.uniform(10, 120)
                print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked")
                time.sleep(sleep_time) 
        except Exception as e:
            print(f"Failed to get stats for {player_name}: {e}")

            fail_count += 1

            if i < curr_len - 1:
                # Sleep for 15-60 seconds before trying the next player
                sleep_time = random.uniform(15, 60)
                print(f"Sleeping for {sleep_time / 60:.2f} seconds before trying the next player")
                time.sleep(sleep_time)

### Fetch Players' Stats by Batch

In [11]:
# Scrape players from 650-700
curr_batch_metadata = nhl_players_metadata[650:700]
get_players_stats_by_batch(curr_batch_metadata)


 [1] Collecting stats for Ron Sutter at https://www.eliteprospects.com/player/29015/ron-sutter
Collecting Regular Season + Postseason stats for Ron Sutter at https://www.eliteprospects.com/player/29015/ron-sutter
Successfully scraped stats for Ron Sutter
Failed rate: 0.00
Sleep for 1.62 minutes to prevent getting blocked

 [2] Collecting stats for Alexei Ponikarovsky at https://www.eliteprospects.com/player/8658/alexei-ponikarovsky
Collecting Regular Season + Postseason stats for Alexei Ponikarovsky at https://www.eliteprospects.com/player/8658/alexei-ponikarovsky
Successfully scraped stats for Alexei Ponikarovsky
Failed rate: 0.00
Sleep for 0.89 minutes to prevent getting blocked

 [3] Collecting stats for Jamie Allison at https://www.eliteprospects.com/player/9063/jamie-allison
Collecting Regular Season + Postseason stats for Jamie Allison at https://www.eliteprospects.com/player/9063/jamie-allison
Successfully scraped stats for Jamie Allison
Failed rate: 0.00
Sleep for 1.43 minutes

In [16]:
# Scrape players from 700-750
curr_batch_metadata = nhl_players_metadata[700:750]
get_players_stats_by_batch(curr_batch_metadata)


 [1] Collecting stats for Mark Greig at https://www.eliteprospects.com/player/54582/mark-greig
Collecting Regular Season + Postseason stats for Mark Greig at https://www.eliteprospects.com/player/54582/mark-greig
Successfully scraped stats for Mark Greig
Failed rate: 0.00
Sleep for 0.63 minutes to prevent getting blocked

 [2] Collecting stats for Dmitry Afanasenkov at https://www.eliteprospects.com/player/8634/dmitry-afanasenkov
Collecting Regular Season + Postseason stats for Dmitry Afanasenkov at https://www.eliteprospects.com/player/8634/dmitry-afanasenkov
Successfully scraped stats for Dmitry Afanasenkov
Failed rate: 0.00
Sleep for 1.49 minutes to prevent getting blocked

 [3] Collecting stats for Antti Aalto at https://www.eliteprospects.com/player/2703/antti-aalto
Collecting Regular Season + Postseason stats for Antti Aalto at https://www.eliteprospects.com/player/2703/antti-aalto
Successfully scraped stats for Antti Aalto
Failed rate: 0.00
Sleep for 1.91 minutes to prevent get

In [None]:
# Scrape players from 750-800
curr_batch_metadata = nhl_players_metadata[750:800]
get_players_stats_by_batch(curr_batch_metadata)


 [1] Collecting stats for Sean Haggerty at https://www.eliteprospects.com/player/31356/sean-haggerty
Collecting Regular Season + Postseason stats for Sean Haggerty at https://www.eliteprospects.com/player/31356/sean-haggerty
Successfully scraped stats for Sean Haggerty
Failed rate: 0.00
Sleep for 0.64 minutes to prevent getting blocked

 [2] Collecting stats for Don MacLean at https://www.eliteprospects.com/player/8953/don-maclean
Collecting Regular Season + Postseason stats for Don MacLean at https://www.eliteprospects.com/player/8953/don-maclean
Successfully scraped stats for Don MacLean
Failed rate: 0.00
Sleep for 0.97 minutes to prevent getting blocked

 [3] Collecting stats for Ed Ward at https://www.eliteprospects.com/player/568/ed-ward
Collecting Regular Season + Postseason stats for Ed Ward at https://www.eliteprospects.com/player/568/ed-ward
Successfully scraped stats for Ed Ward
Failed rate: 0.00
Sleep for 1.56 minutes to prevent getting blocked

 [4] Collecting stats for Da

In [14]:
# Scrape players from 800-850
curr_batch_metadata = nhl_players_metadata[800:850]
get_players_stats_by_batch(curr_batch_metadata)


 [1] Collecting stats for Paul Kruse at https://www.eliteprospects.com/player/27139/paul-kruse
Collecting Regular Season + Postseason stats for Paul Kruse at https://www.eliteprospects.com/player/27139/paul-kruse
Successfully scraped stats for Paul Kruse
Failed rate: 0.00
Sleep for 1.11 minutes to prevent getting blocked

 [2] Collecting stats for Sean Selmser at https://www.eliteprospects.com/player/15579/sean-selmser
Collecting Regular Season + Postseason stats for Sean Selmser at https://www.eliteprospects.com/player/15579/sean-selmser
Successfully scraped stats for Sean Selmser
Failed rate: 0.00
Sleep for 0.92 minutes to prevent getting blocked

 [3] Collecting stats for Jody Shelley at https://www.eliteprospects.com/player/8722/jody-shelley
Collecting Regular Season + Postseason stats for Jody Shelley at https://www.eliteprospects.com/player/8722/jody-shelley
Successfully scraped stats for Jody Shelley
Failed rate: 0.00
Sleep for 1.61 minutes to prevent getting blocked

 [4] Coll

In [15]:
# Scrape players from 850-900
curr_batch_metadata = nhl_players_metadata[850:900]
get_players_stats_by_batch(curr_batch_metadata)


 [1] Collecting stats for David Emma at https://www.eliteprospects.com/player/24188/david-emma
Collecting Regular Season + Postseason stats for David Emma at https://www.eliteprospects.com/player/24188/david-emma
Error scraping David Emma: HTTPConnectionPool(host='localhost', port=60201): Read timed out. (read timeout=120)
Failed to get stats for David Emma: 'NoneType' object has no attribute 'to_csv'
Sleeping for 0.33 seconds before trying the next player

 [2] Collecting stats for Bryan Allen at https://www.eliteprospects.com/player/8767/bryan-allen
Collecting Regular Season + Postseason stats for Bryan Allen at https://www.eliteprospects.com/player/8767/bryan-allen
Successfully scraped stats for Bryan Allen
Failed rate: 0.50
Sleep for 1.14 minutes to prevent getting blocked

 [3] Collecting stats for Matt Higgins at https://www.eliteprospects.com/player/13398/matt-higgins
Collecting Regular Season + Postseason stats for Matt Higgins at https://www.eliteprospects.com/player/13398/ma

In [None]:
# Scrape players from 900-950
curr_batch_metadata = nhl_players_metadata[900:950]
get_players_stats_by_batch(curr_batch_metadata)

In [None]:
# Scrape players from 950-1000
curr_batch_metadata = nhl_players_metadata[950:1000]
get_players_stats_by_batch(curr_batch_metadata)

### Fetch Players' Stats for Players Not Scraped

In [17]:
# Get unique players from nhl_players_stats.csv
if os.path.exists(output_path):
    nhl_players_stats = pd.read_csv(output_path)
    unique_players = set(nhl_players_stats['playername'].unique())
else:
    unique_players = set()

In [18]:
# Find players in nhl_players_metadata 0-1000 not in unique_players
subset = nhl_players_metadata[0:900]
players_to_scrape = subset[~subset['playername'].isin(unique_players)].reset_index(drop=True)
players_to_scrape

Unnamed: 0,playername,fw_def,link


In [19]:
curr_len = len(players_to_scrape)
# curr_len = 10

for i in range(curr_len):
    player_metadata = players_to_scrape.iloc[i]
    player_name = player_metadata['playername']
    player_url = player_metadata['link']
    print(f"\nCollecting stats for {player_name} at {player_url}")

    try:
        player_stats = ep.get_player_stats(player_metadata)

        # Write to CSV file
        if os.path.exists(output_path):
            player_stats.to_csv(output_path, mode='a', header=False, index=False, encoding='utf-8-sig')
        else:
            player_stats.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f'Successfully scraped stats for {player_name}')

        # Add random sleep to prevent getting blocked
        if i < curr_len - 1:
            sleep_time = random.uniform(10, 120)
            print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked")
            time.sleep(sleep_time)
    except Exception as e:
        print(f"Failed to get stats for {player_name}: {e}")


        if i < curr_len - 1:
            # Sleep for 15-60 seconds before trying the next player
            sleep_time = random.uniform(15, 60)
            print(f"Sleeping for {sleep_time / 60:.2f} seconds before trying the next player")
            time.sleep(sleep_time)

In [20]:
# Check how many distinct players are in the output_path
nhl_players_stats = pd.read_csv(output_path)
unique_players = set(nhl_players_stats['playername'].unique())
print(f'Number of unique players: {len(unique_players)}')

Number of unique players: 900


In [21]:
# Remove Duplicate rows in nhl_players_stats.csv
nhl_players_stats = pd.read_csv(output_path)
nhl_players_stats = nhl_players_stats.drop_duplicates().reset_index(drop=True)
nhl_players_stats.to_csv(output_path, index=False, encoding='utf-8-sig')