# NHL Stats Preparation
## Collecting NHL Players' Stats from EliteProspects
This step gathers player performance data across leagues and seasons for downstream NHL prediction.

- Collect league-wide player stats for a given season from
https://eliteprospects.com/league/{league}/stats/{season}?page={n}

- Extract metadata, including individual player profile links.

- Build a unique player list from seasons 2000–2025 by storing profile links in a set.

- Scrape each player’s page to retrieve their first 5 years of NHL stats, filtered by seasons where they played more than 30 games.

- Gather pre-NHL stats for each player (junior, college, international, etc.).

### Import Libraries

In [None]:
import eliteprospects_scraper_api as ep
import pandas as pd
import time

### Collecting NHL Season Rosters and Player Metadata from EliteProspects


In [None]:
# Scrape the NHL roster of every season from 2000-2001 through 2024-2025
# and store each season in its own CSV file under ./data/nhl/players/.
for i in range(25):
    # Season label in EliteProspects form, e.g. '2004-2005'
    season = f'20{i:02d}-20{i + 1:02d}'
    print(f'Scraping {season}')
    nhl_players = ep.getSeasonRoster("nhl", season)
    nhl_players.to_csv(
        f'./data/nhl/players/nhl_players_{season}.csv',
        encoding='utf-8-sig',
        index=False,
    )
    print(f'Finished scraping {season}')

In [None]:
# Build one metadata table from the per-season roster files (2000-2001
# through 2024-2025), keeping a single row per distinct playername.
# NOTE(review): de-duplicating on playername would collapse two different
# players who share a name - confirm whether a profile link column exists
# and should be the key instead.
nhl_players_metadata = pd.DataFrame()
for i in range(25):
    season = f'20{i:02d}-20{i + 1:02d}'
    nhl_players = pd.read_csv(f'./data/nhl/players/nhl_players_{season}.csv')
    # Append this season's metadata, then drop names already collected
    nhl_players_metadata = (
        pd.concat([nhl_players_metadata, ep.getPlayersMetadata(nhl_players)])
        .reset_index(drop=True)
        .drop_duplicates(subset=['playername'])
        .reset_index(drop=True)
    )
    print(f'Finished Merging Metadata for {season}')


In [None]:
# Persist the merged metadata table so later cells can reload it from disk
nhl_players_metadata.to_csv(
    './data/nhl/nhl_players_metadata.csv',
    encoding='utf-8-sig',
    index=False,
)

In [None]:
# Display the merged metadata table as this cell's output
nhl_players_metadata

### Collect Stats for Each Player Listed in the 2000–2025 Metadata CSV File

In [None]:
# CSV file that accumulates the scraped stats of every batch
output_path = './data/nhl/stats/nhl_players_stats.csv'

# Batching state shared by the scraping cells below:
#   curr_batch_start - index of the first metadata row of the next batch
#   batch_size       - number of players requested per batch
curr_batch_start = 0
batch_size = 5

In [None]:
# Helper for getStatsByBatch: write rows to a CSV without repeating the header
def _writeCsv(df, path, overwrite=False):
    """Write ``df`` to the CSV at ``path``.

    The file is (re)created with a header row when ``overwrite`` is True or
    when it does not exist yet / is empty; otherwise the rows are appended
    without a header. The parent directory is created if it is missing.
    """
    import os  # local import: only this helper needs it

    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    is_new = not os.path.exists(path) or os.path.getsize(path) == 0
    if overwrite or is_new:
        df.to_csv(path, mode='w', index=False, encoding='utf-8-sig')
    else:
        df.to_csv(path, mode='a', header=False, index=False, encoding='utf-8-sig')


# Define get stats function by the given batch size
def getStatsByBatch(start, batch_size, metadata_df=None, first_batch=False):
    """Scrape stats for one slice of players and save them to ``output_path``.

    Rows ``start`` .. ``start + batch_size - 1`` of ``metadata_df`` (clamped
    to the end of the frame) are passed to ``ep.getPlayersStats`` and the
    result is written to the notebook-level ``output_path`` CSV. If anything
    in that step raises, the batch's metadata rows are written to
    ``./data/nhl/stats/failed_batches.csv`` so they can be retried later; the
    exception is reported with ``print`` and not propagated.

    Args:
        start: Positional index of the first row of the batch.
        batch_size: Maximum number of rows in the batch.
        metadata_df: DataFrame of player metadata to slice. When omitted, the
            notebook-level ``nhl_players_metadata`` frame is used, so the
            two-argument calls in the scraping cells keep working.
        first_batch: When True, ``output_path`` is overwritten and starts
            with a header row; otherwise rows are appended (a header is only
            written if the file is new or empty).

    Returns:
        None.
    """
    if metadata_df is None:
        metadata_df = nhl_players_metadata

    # Clamp the end of the slice to the length of the dataframe
    end = min(start + batch_size, len(metadata_df))
    batch = metadata_df.iloc[start:end]

    # Nothing to scrape (start is past the end, or the frame is empty):
    # skip the request instead of logging an empty "failed" batch.
    if batch.empty:
        print(f'No rows in batch starting at {start}; skipping')
        return

    print(f'Getting stats for batch {start} to {end - 1}')

    try:
        # Attempt to get player stats for this batch and save them
        nhl_players_stats = ep.getPlayersStats(batch)
        _writeCsv(nhl_players_stats, output_path, overwrite=first_batch)

    except Exception as e:
        # Best-effort scraping: any failure is recorded and the run goes on,
        # so one bad batch does not stop the whole collection.
        print(f"Error processing batch {start}-{end - 1}: {e}")
        _writeCsv(batch.reset_index(drop=True), "./data/nhl/stats/failed_batches.csv")
        print(f"\nSaved rows for batch {start} to {end - 1} failed player metadata rows to 'failed_batches.csv'")

    return

In [None]:
# Scrape stats for metadata rows 0-99, batch_size rows at a time.
# The metadata table to slice is passed explicitly, and the very first
# batch (start 0) begins a fresh CSV with a header row so the file can
# later be read back by column name.
while curr_batch_start < 100:
    getStatsByBatch(
        curr_batch_start,
        batch_size,
        nhl_players_metadata,
        first_batch=(curr_batch_start == 0),
    )
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(20)

In [None]:
# Scrape stats for metadata rows up to index 199, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 200:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(20)

In [None]:
# Scrape stats for metadata rows up to index 499, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 500:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 999, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 1000:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 1499, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 1500:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 1999, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 2000:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 2499, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 2500:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 2999, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 3000:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for metadata rows up to index 3499, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < 3500:
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Scrape stats for all remaining metadata rows, resuming from wherever
# curr_batch_start was left. The metadata table to slice is passed explicitly.
while curr_batch_start < len(nhl_players_metadata):
    getStatsByBatch(curr_batch_start, batch_size, nhl_players_metadata)
    curr_batch_start += batch_size
    # Pause between batches before issuing the next request
    time.sleep(10)

In [None]:
# Load the metadata rows of every batch that failed during scraping.
# The failure log only exists if at least one batch failed; when it is
# missing, fall back to an empty table (with the 'playername' column the
# following cells expect) so the retry cells simply have nothing to do.
try:
    failed_batches = pd.read_csv('./data/nhl/stats/failed_batches.csv')
except FileNotFoundError:
    failed_batches = pd.DataFrame(columns=['playername'])
failed_batches

In [None]:
# The failure log may contain repeated header lines, which are read back as
# data rows whose playername is literally 'playername'. Drop those rows and
# renumber the index.
failed_batches = failed_batches.loc[
    failed_batches['playername'].ne('playername')
].reset_index(drop=True)
failed_batches

In [None]:
# Reset the batching state before retrying the failed batches:
#   curr_batch_start - restart from the first row
#   batch_size       - number of players requested per batch
curr_batch_start = 0
batch_size = 5

In [None]:
# Retry every row recorded in failed_batches, batch_size rows at a time,
# pausing 10 seconds after each batch.
while curr_batch_start < len(failed_batches):
    getStatsByBatch(start=curr_batch_start, batch_size=batch_size, metadata_df=failed_batches)
    curr_batch_start = curr_batch_start + batch_size
    time.sleep(10)

In [None]:
# Reload the saved metadata and stats tables from disk for the coverage
# check in the next cells; playername is read as a string in the stats file.
nhl_players_metadata = pd.read_csv('./data/nhl/nhl_players_metadata.csv')
nhl_players_stats = pd.read_csv(
    './data/nhl/stats/nhl_players_stats.csv',
    low_memory=False,
    dtype={'playername': str},
)

In [None]:
# Show the metadata rows whose playername never appears in the scraped
# stats (an empty result means every player is covered)
nhl_players_metadata.loc[
    ~nhl_players_metadata['playername'].isin(nhl_players_stats['playername'])
]

In [None]:
# Keep the metadata rows of players that are still missing from the stats
# file so they can be scraped again
failed_players = nhl_players_metadata.loc[
    ~nhl_players_metadata['playername'].isin(nhl_players_stats['playername'])
]
failed_players

In [None]:
# Reset the batching state before retrying the missing players:
#   curr_batch_start - restart from the first row
#   batch_size       - number of players requested per batch
curr_batch_start = 0
batch_size = 5

In [None]:
# Retry every row in failed_players, batch_size rows at a time,
# pausing 10 seconds after each batch.
while curr_batch_start < len(failed_players):
    getStatsByBatch(start=curr_batch_start, batch_size=batch_size, metadata_df=failed_players)
    curr_batch_start = curr_batch_start + batch_size
    time.sleep(10)

In [None]:
# Reload the saved metadata table from disk before the final coverage check
nhl_players_metadata = pd.read_csv(
    filepath_or_buffer='./data/nhl/nhl_players_metadata.csv',
)

In [None]:
# Reload the scraped stats, then force playername to plain strings in both
# tables so the two key columns have the same type when compared
nhl_players_stats = pd.read_csv(
    './data/nhl/stats/nhl_players_stats.csv',
    low_memory=False,
    dtype={'playername': str},
)
for table in (nhl_players_stats, nhl_players_metadata):
    table['playername'] = table['playername'].astype(str)