# NHL Stats Preparation
## Collecting NHL Players' Stats from EliteProspects
This step gathers player performance data across leagues and seasons for downstream NHL prediction.

- Collect league-wide player stats for a given season from
https://eliteprospects.com/league/{league}/stats/{season}?page={n}

- Extract metadata, including individual player profile links.

- Build a unique player list from seasons 2000–2025 by storing profile links in a set.

- Scrape each player’s page to retrieve their first 5 years of NHL stats, keeping only the seasons in which they played more than 30 games.

- Gather pre-NHL stats for each player (junior, college, international, etc.).

### Import Libraries

In [1]:
import eliteprospects_scraper_api as ep
import pandas as pd
import os
import time
import random

### Collecting NHL Players' Stats from EliteProspects


In [2]:
# Season rosters are only (re-)scraped while the merged metadata cache is missing.
if not os.path.exists('./data/nhl/nhl_players_metadata.csv'):
    # DataFrame.to_csv does not create missing parent folders, so make sure the
    # per-season roster directory exists before the first (slow) scrape finishes.
    os.makedirs('./data/nhl/players', exist_ok=True)

    # Collect players for the 2000-2001 through 2024-2025 seasons
    for i in range(0, 25):
        season = f'20{i:02d}-20{i + 1:02d}'
        print(f'Scraping {season}')
        nhl_players = ep.get_season_roster("nhl", season)
        # One CSV per season; utf-8-sig keeps accented player names readable in Excel.
        nhl_players.to_csv(f'./data/nhl/players/nhl_players_{season}.csv', index=False, encoding='utf-8-sig')
        print(f'Finished scraping {season}')
else:
    print('NHL players metadata already exists. Skipping scraping.')

NHL players metadata already exists. Skipping scraping.


In [3]:
# Location of the merged, de-duplicated player list (also used as the "already done" marker).
metadata_path = './data/nhl/nhl_players_metadata.csv'

if not os.path.exists(metadata_path):
    # Collect players' metadata for every season roster scraped above.
    season_frames = []
    for i in range(0, 25):
        season = f'20{i:02d}-20{i + 1:02d}'
        nhl_players = pd.read_csv(f'./data/nhl/players/nhl_players_{season}.csv')
        season_frames.append(ep.get_players_metadata(nhl_players))
        print(f'Finished Merging Metadata for {season}')

    # Concatenate once and keep the first occurrence of each player; this gives the
    # same rows as de-duplicating after every season but without the repeated copies.
    # NOTE(review): de-duplicating on 'playername' drops distinct players who share a
    # name. Later cells also key on 'playername', so the key is left unchanged here.
    nhl_players_metadata = (
        pd.concat(season_frames)
        .drop_duplicates(subset=['playername'])
        .reset_index(drop=True)
    )

    # Persist the result: without this write the existence checks in this notebook
    # never succeed and the merge is repeated on every run.
    nhl_players_metadata.to_csv(metadata_path, index=False, encoding='utf-8-sig')
    print(f'Saved merged metadata to {metadata_path}')
else:
    print('NHL players metadata already exists. Skipping merging.')
    nhl_players_metadata = pd.read_csv(metadata_path)


NHL players metadata already exists. Skipping merging.


### Collect Stats for Each Player Listed in the 2000–2025 Metadata CSV

In [4]:
# Destination files for the scraping run:
#   output_path -> stats of every successfully scraped player
#   failed_path -> metadata of players whose scrape raised an error
output_path, failed_path = (
    './data/nhl/stats/nhl_players_stats.csv',
    './data/nhl/stats/failed_players.csv',
)

In [12]:
def _append_rows_to_csv(frame, path):
    """Append ``frame`` to the CSV at ``path``; the header is written only when the file is new."""
    # DataFrame.to_csv does not create missing parent folders.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    frame.to_csv(path, mode='a', header=not os.path.exists(path), index=False, encoding='utf-8-sig')


def get_players_stats_by_batch(players_metadata, start=0, end=None):
    """Scrape stats for a slice of players and append them to ``output_path``.

    Rows ``start`` (inclusive) to ``end`` (exclusive) of ``players_metadata`` are
    processed in order. Each row is passed to ``ep.get_player_stats``; the result is
    appended to ``output_path``. When scraping or writing raises, the player's
    metadata row is appended to ``failed_path`` instead and the loop continues.
    Random pauses are inserted between players to avoid being blocked.

    Parameters
    ----------
    players_metadata : pandas.DataFrame
        One row per player; must provide the ``playername`` and ``link`` columns.
    start : int, default 0
        Position of the first player to scrape. ``start == 0`` is treated as a fresh
        run: existing ``output_path`` / ``failed_path`` files are deleted first.
    end : int or None, default None
        Position one past the last player to scrape. ``None`` means "through the
        last row of ``players_metadata``"; larger values are clamped to its length.
    """
    # Resolve the upper bound against the frame actually passed in (not a global).
    total = len(players_metadata)
    end = total if end is None else min(end, total)

    # Delete files if it's the first player
    if start == 0:
        for path in (output_path, failed_path):
            if os.path.exists(path):
                os.remove(path)
                print(f'Deleted {path} to start fresh')

    # Count to calculate the failed rate
    count = 0

    for i in range(start, end):
        # Print divider for every 10 players
        if i % 10 == 0 and i != start:
            print('----------------------------------------------------------------')
            # Sleep for 3-10 minutes before trying the next player
            sleep_time = random.uniform(180, 600)
            print(f"Sleep for {sleep_time / 60:.2f} minutes before trying the next player")
            time.sleep(sleep_time)

        player_metadata = players_metadata.iloc[i]
        player_name = player_metadata['playername']
        player_url = player_metadata['link']
        print(f"Collecting stats for {player_name} at {player_url}")

        # No point in waiting once the final player of the batch has been handled.
        is_last = i == end - 1

        try:
            # presumably returns a DataFrame of the player's season rows - TODO confirm
            player_stats = ep.get_player_stats(player_metadata)

            # Write to CSV file
            _append_rows_to_csv(player_stats, output_path)
            print(f'Successfully scraped stats for {player_name}')

            if not is_last:
                # Add random sleep to prevent getting blocked 0.5-3 minutes
                sleep_time = random.uniform(30, 180)
                print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked")
                time.sleep(sleep_time)
        except Exception as e:
            count += 1
            print(f"Failed to get stats for {player_name}: {e}")

            # Write failed player to CSV file. The row is a Series, and Series.to_csv
            # would emit one field per line, so turn it into a one-row frame first.
            _append_rows_to_csv(player_metadata.to_frame().T, failed_path)

            print(f'Failed to scrape stats for {player_name}. Logged to {failed_path}')

            # Print fail rate
            print(f'Failed rate: {count / (i + 1 - start):.2f}')

            if not is_last:
                # Sleep for 2-5 minutes before trying the next player
                sleep_time = random.uniform(120, 300)
                print(f"Sleep for {sleep_time / 60:.2f} minutes before trying the next player")
                time.sleep(sleep_time)



### Fetch Players' Stats by Batch

In [None]:
# Scrape the players at positions 200-299 of the metadata table.
get_players_stats_by_batch(nhl_players_metadata, start=200, end=300)

----------------------------------------------------------------
Collecting stats for Karel Rachunek at https://www.eliteprospects.com/player/9629/karel-rachunek

Collecting Regular Season + Postseason stats for Karel Rachunek at https://www.eliteprospects.com/player/9629/karel-rachunek
Successfully scraped stats for Karel Rachunek
Sleep for 0.33 minutes to prevent getting blocked
Collecting stats for Brad Isbister at https://www.eliteprospects.com/player/8705/brad-isbister

Collecting Regular Season + Postseason stats for Brad Isbister at https://www.eliteprospects.com/player/8705/brad-isbister
Successfully scraped stats for Brad Isbister
Sleep for 0.30 minutes to prevent getting blocked
Collecting stats for Adam Deadmarsh at https://www.eliteprospects.com/player/21314/adam-deadmarsh

Collecting Regular Season + Postseason stats for Adam Deadmarsh at https://www.eliteprospects.com/player/21314/adam-deadmarsh
Successfully scraped stats for Adam Deadmarsh
Sleep for 2.52 minutes to preve

### Fetch failed players

In [28]:
# Names of the players that already have rows in the stats CSV
# (stays empty when nothing has been scraped yet).
unique_players = set()
if os.path.exists(output_path):
    nhl_players_stats = pd.read_csv(output_path)
    unique_players = set(nhl_players_stats['playername'].unique())

In [29]:
# Among the first 300 metadata rows, keep the players whose name does not
# appear in the stats file yet - these still need to be scraped.
subset = nhl_players_metadata.iloc[:300]
already_scraped = subset['playername'].isin(unique_players)
players_to_scrape = subset.loc[~already_scraped].reset_index(drop=True)
players_to_scrape

Unnamed: 0,playername,fw_def,link
0,Shaun Van Allen,FW,https://www.eliteprospects.com/player/42653/sh...


In [None]:
# Retry every player that is still missing from the stats file, one at a time.
curr_len = len(players_to_scrape)

for i in range(curr_len):
    player_metadata = players_to_scrape.iloc[i]
    player_name, player_url = player_metadata['playername'], player_metadata['link']
    print(f"Collecting stats for {player_name} at {player_url}")

    # Pauses are only useful while another player is still waiting in the queue.
    more_players_left = i < curr_len - 1

    try:
        player_stats = ep.get_player_stats(player_metadata)

        # Append to the stats CSV; a header is written only when the file is created here.
        stats_file_exists = os.path.exists(output_path)
        player_stats.to_csv(
            output_path,
            mode='a' if stats_file_exists else 'w',
            header=not stats_file_exists,
            index=False,
            encoding='utf-8-sig',
        )
        print(f'Successfully scraped stats for {player_name}\n')

        # Random pause so consecutive requests do not get the scraper blocked
        if more_players_left:
            sleep_time = random.uniform(10, 180)
            print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked")
            time.sleep(sleep_time)
    except Exception as e:
        print(f"Failed to get stats for {player_name}: {e}")

        # Back off for 1-3 minutes before moving on to the next player
        if more_players_left:
            print("Sleeping for 1-3 minutes before trying the next player")
            time.sleep(random.uniform(60, 180))

Collecting stats for Shaun Van Allen at https://www.eliteprospects.com/player/42653/shaun-van-allen

Collecting Regular Season + Postseason stats for Shaun Van Allen at https://www.eliteprospects.com/player/42653/shaun-van-allen
Successfully scraped stats for Shaun Van Allen

Sleep for 2.52 minutes to prevent getting blocked
