# NHL Official Data Preparation
## Collecting NHL Players' Stats from Official NHL Website
1. Collect Metadata(player_name, player_link) from NHL Official Website by Season and Team
2. Collect Stats from Each Player's Page

### Import Libraries

In [1]:
import nhl_scraper_api as nhl_scraper
import pandas as pd
import os
import time
import random
import glob

### Define Valid Teams and Valid Seasons

In [2]:
valid_teams = [
    "bruins", "sabres", "redwings", "panthers", "canadiens",
    "senators", "lightning", "mapleleafs", "hurricanes", "bluejackets",
    "devils", "islanders", "rangers", "flyers", "penguins",
    "capitals", "blackhawks", "avalanche", "stars", "wild",
    "predators", "blues", "jets", "ducks", "flames",
    "oilers", "kings", "sharks", "kraken", "canucks",
    "goldenknights", "utah"
]

# Valid seasons from 2000-2025 in YYYY-YYYY format
valid_seasons = [f'20{str(i).zfill(2)}-20{str(i + 1).zfill(2)}' for i in range(0, 25)]

### Validate Team Links' Format

In [3]:
nhl_scraper.print_team_links('2024-2025')

----------------------------------------------------------------
0: https://www.nhl.com/bruins/stats/20242025
1: https://www.nhl.com/sabres/stats/20242025
2: https://www.nhl.com/redwings/stats/20242025
3: https://www.nhl.com/panthers/stats/20242025
4: https://www.nhl.com/canadiens/stats/20242025
----------------------------------------------------------------
5: https://www.nhl.com/senators/stats/20242025
6: https://www.nhl.com/lightning/stats/20242025
7: https://www.nhl.com/mapleleafs/stats/20242025
8: https://www.nhl.com/hurricanes/stats/20242025
9: https://www.nhl.com/bluejackets/stats/20242025
----------------------------------------------------------------
10: https://www.nhl.com/devils/stats/20242025
11: https://www.nhl.com/islanders/stats/20242025
12: https://www.nhl.com/rangers/stats/20242025
13: https://www.nhl.com/flyers/stats/20242025
14: https://www.nhl.com/penguins/stats/20242025
----------------------------------------------------------------
15: https://www.nhl.com/capit

### Collect Metadata(player_name, player_link) from NHL Official Website by Season and Team

#### Test API By Collecting 2024-2025 Avalanche Players' Metadata

In [4]:
avalanche_2425_metadata = nhl_scraper.get_player_by_team("avalanche", "2024-2025")

Collecting data from https://www.nhl.com/avalanche/stats/20242025
Mackenzie Blackwood is a goalie.
Scott Wedgewood is a goalie.
Trent Miner is a goalie.


In [5]:
avalanche_2425_metadata

Unnamed: 0,player_name,player_pos,player_link,player_image
0,Nathan MacKinnon,C,https://www.nhl.com/player/8477492,https://assets.nhle.com/mugs/nhl/20242025/COL/...
1,Cale Makar,D,https://www.nhl.com/player/8480069,https://assets.nhle.com/mugs/nhl/20242025/COL/...
2,Martin Necas,C,https://www.nhl.com/player/8480039,https://assets.nhle.com/mugs/nhl/20242025/COL/...
3,Brock Nelson,C,https://www.nhl.com/player/8475754,https://assets.nhle.com/mugs/nhl/20242025/COL/...
4,Artturi Lehkonen,LW,https://www.nhl.com/player/8477476,https://assets.nhle.com/mugs/nhl/20242025/COL/...
5,Devon Toews,D,https://www.nhl.com/player/8478038,https://assets.nhle.com/mugs/nhl/20242025/COL/...
6,Valeri Nichushkin,RW,https://www.nhl.com/player/8477501,https://assets.nhle.com/mugs/nhl/20242025/COL/...
7,Ross Colton,C,https://www.nhl.com/player/8479525,https://assets.nhle.com/mugs/nhl/20242025/COL/...
8,Samuel Girard,D,https://www.nhl.com/player/8479398,https://assets.nhle.com/mugs/nhl/20242025/COL/...
9,Logan O'Connor,RW,https://www.nhl.com/player/8481186,https://assets.nhle.com/mugs/nhl/20242025/COL/...


#### Collect All Teams' Metadata for All Seasons

In [3]:
def get_all_teams_metadata(curr_team, driver, wait):
    # Print a divider after each team
    print("================================================================")

    # Print team name
    print(f"Collecting metadata for {curr_team}")

    for i, season in enumerate(valid_seasons):
        # Define output path and ensure directory exists
        output_dir = f'./data/nhl/official/teams/{curr_team}'
        os.makedirs(output_dir, exist_ok=True)  # Creates the folder if it doesn't exist

        curr_team_output_path = f'{output_dir}/{curr_team}_{season}.csv'

        # skip scraping if the file exists
        if os.path.exists(curr_team_output_path):
            print(f'{curr_team_output_path} already exists. Skipping scraping.')
            continue

        # Print Divider
        if i % 5 == 0:
            print("----------------------------------------------------------------")

        curr_team_metadata = nhl_scraper.get_player_by_team_with_reusable_driver(curr_team, season, driver, wait)
        if curr_team_metadata is None:
            print(f'Failed to scrape {curr_team} for {season}')
            continue

        # Write to CSV
        curr_team_metadata.to_csv(curr_team_output_path, index=False, encoding='utf-8-sig')
        print(f'Finished scraping {curr_team} for {season}')

        # Add random sleep to prevent getting blocked
        sleep_time = random.uniform(10, 30)
        print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked\n")
        time.sleep(sleep_time)

##### Collecting All Teams' Metadata

In [5]:
from selenium.webdriver.support.wait import WebDriverWait
import undetected_chromedriver as uc

# Setup Chrome Driver ONCE
chrome_options = uc.ChromeOptions()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = uc.Chrome(version_main=138, options=chrome_options)
wait = WebDriverWait(driver, 15)

In [6]:
for team in valid_teams:
    get_all_teams_metadata(team, driver, wait)

driver.quit()

Collecting metadata for bruins
----------------------------------------------------------------
Collecting data from https://www.nhl.com/bruins/stats/20002001
Byron Dafoe is a goalie.
Peter Skudra is a goalie.
Andrew Raycroft is a goalie.
John Grahame is a goalie.
Kay Whitmore is a goalie.
Finished scraping bruins for 2000-2001
Sleep for 0.33 minutes to prevent getting blocked

Collecting data from https://www.nhl.com/bruins/stats/20012002
Byron Dafoe is a goalie.
John Grahame is a goalie.
Andrew Raycroft is a goalie.
Finished scraping bruins for 2001-2002
Sleep for 0.22 minutes to prevent getting blocked

Collecting data from https://www.nhl.com/bruins/stats/20022003
Steve Shields is a goalie.
John Grahame is a goalie.
Jeff Hackett is a goalie.
Andrew Raycroft is a goalie.
Tim Thomas is a goalie.
Finished scraping bruins for 2002-2003
Sleep for 0.42 minutes to prevent getting blocked

Collecting data from https://www.nhl.com/panthers/stats/20132014
Tim Thomas is a goalie.
Scott Clemme

#### Get unique players' metadata from all teams and seasons

In [7]:
# Get unique players' metadata from all teams and seasons
nhl_players_metadata = pd.DataFrame()

# Read in all csv files in ./data/nhl/official/teams/ and merge them
players_files = glob.glob('./data/nhl/official/teams/**/*.csv', recursive=True)

for player_file in players_files:
    curr_team_metadata = pd.read_csv(player_file)
    nhl_players_metadata = pd.concat([nhl_players_metadata, curr_team_metadata]).reset_index(drop=True)
    nhl_players_metadata = nhl_players_metadata.drop_duplicates(subset=['player_name']).reset_index(drop=True)
    print(f'Finished merging {player_file}')

# Write to CSV
nhl_players_metadata.to_csv('./data/nhl/official/nhl_players_metadata.csv', index=False, encoding='utf-8-sig')

Finished merging ./data/nhl/official/teams\avalanche\avalanche_2000-2001.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2001-2002.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2002-2003.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2003-2004.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2005-2006.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2006-2007.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2007-2008.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2008-2009.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2009-2010.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2010-2011.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2011-2012.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2012-2013.csv
Finished merging ./data/nhl/official/teams\avalanche\avalanche_2013-2014.csv

In [52]:
# Read in nhl_players_metadata.csv
nhl_players_metadata_official = pd.read_csv('./data/nhl/official/nhl_players_metadata.csv')

In [53]:
len(nhl_players_metadata_official)

4295

##### Exclude Goalies from the nhl_players_metadata_official

In [54]:
# Exclude rows where the position is G -> Exclude Goalies
nhl_skaters_metadata_official = nhl_players_metadata_official[nhl_players_metadata_official['player_pos'] != 'G']

# Write to CSV
nhl_skaters_metadata_official.to_csv('./data/nhl/official/nhl_skaters_metadata_official.csv', index=False, encoding='utf-8-sig')

In [55]:
len(nhl_skaters_metadata_official)

3864

#### Remove Accent Characters from EP Metadata

In [56]:
from unidecode import unidecode

# Load eliteprospects meta
nhl_skaters_metadata_ep = pd.read_csv('./data/nhl/nhl_players_metadata.csv')

# Apply unidecode to all string columns
for col in nhl_players_metadata_ep.columns:
    if nhl_players_metadata_ep[col].dtype == 'object':
        nhl_players_metadata_ep[col] = nhl_players_metadata_ep[col].apply(unidecode)

# Save the cleaned CSV
nhl_skaters_metadata_ep.to_csv('./data/nhl/nhl_skaters_metadata_accent_cleaned.csv', index=False, encoding='utf-8-sig')

#### Merged EP Metadata with Official Metadata and Skip the Missing Players

In [57]:
# Merge the two dataframes on player_name -> keep only players present in the official dataset
nhl_skaters_metadata_official_ep_merge = pd.merge(nhl_skaters_metadata_official, nhl_skaters_metadata_ep, on='player_name', how='left')

# Rename columns
nhl_skaters_metadata_official_ep_merge = nhl_skaters_metadata_official_ep_merge.rename(columns={
        'player_link': 'player_link_official',
        'link': 'player_link_ep'
    }
)

# Select only the columns we need
nhl_skaters_metadata_official_ep_merge = nhl_skaters_metadata_official_ep_merge[['player_name', 'player_pos', 'player_link_official', 'player_link_ep', 'player_image']]


In [58]:
# Write to CSV
nhl_skaters_metadata_official_ep_merge.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge.csv', index=False, encoding='utf-8-sig')

#### Find all players without player_link_ep


In [65]:
# Find all the missing player-rows in official data after merging
players_missing_ep_link = nhl_skaters_metadata_official_ep_merge[nhl_skaters_metadata_official_ep_merge['player_link_ep'].isnull()]

In [66]:
players_missing_ep_link

Unnamed: 0,player_name,player_pos,player_link_official,player_link_ep,player_image
5,Ray Bourque,D,https://www.nhl.com/player/8445621,,https://assets.nhle.com/mugs/nhl/20002001/COL/...
10,Greg De Vries,D,https://www.nhl.com/player/8460254,,https://assets.nhle.com/mugs/nhl/20002001/COL/...
14,Stephane Yelle,C,https://www.nhl.com/player/8459122,,https://assets.nhle.com/mugs/nhl/20002001/COL/...
16,Eric Messier,LW,https://www.nhl.com/player/8460012,,https://assets.nhle.com/mugs/nhl/20002001/COL/...
19,Steve Reinprecht,C,https://www.nhl.com/player/8468432,,https://assets.nhle.com/mugs/nhl/20002001/COL/...
...,...,...,...,...,...
3816,J.J. Daigneault,D,https://www.nhl.com/player/8446286,,https://assets.nhle.com/mugs/nhl/20002001/MIN/...
3841,Kristopher Foucault,LW,https://www.nhl.com/player/8475219,,https://assets.nhle.com/mugs/nhl/20112012/MIN/...
3845,Michael Keranen,LW,https://www.nhl.com/player/8477923,,https://assets.nhle.com/mugs/nhl/20152016/MIN/...
3848,Patrick Cannone,C,https://www.nhl.com/player/8476219,,https://assets.nhle.com/mugs/nhl/20162017/MIN/...


In [64]:
# Write to CSV
players_missing_ep_link.to_csv('./data/nhl/missing_players_in_official_after_merged.csv', index=False, encoding='utf-8-sig')

#### Manually Add the Missing Players' EP Links
- Later on, we can use the EP Links to match the players' stats from EP to the official data

In [67]:
# Read in manually added missing players' EP links
nhl_skaters_metadata_official_ep_merge_complete = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete.csv')

In [69]:
# Find all rows missing player_link_ep
players_missing_ep_link = nhl_skaters_metadata_official_ep_merge_complete[nhl_skaters_metadata_official_ep_merge_complete['player_link_ep'].isnull()]

players_missing_ep_link

Unnamed: 0,player_name,player_pos,player_link_official,player_link_ep,player_image


##### Find players which is in nhl_skaters_metadata_official_ep_merge_complete, but not in nhl_players_metadata by player_link_ep

In [122]:
# Find players which are in nhl_skaters_metadata_official_ep_merge_complete, but not in nhl_players_metadata by player_link_ep by player_link_ep
# Load eliteprospects meta
nhl_players_metadata_ep = pd.read_csv('./data/nhl/nhl_players_metadata.csv')
nhl_skaters_metadata_official_ep_merge_complete = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_2.csv')

# Find players which are in nhl_skaters_metadata_official_ep_merge_complete, but not in nhl_players_metadata by player_link_ep
players_missing_after_manual_merge = nhl_players_metadata_ep[~nhl_players_metadata_ep['link'].isin(nhl_skaters_metadata_official_ep_merge_complete['player_link_ep'])]

In [123]:
print(f"nhl_players_metadata_ep: {len(nhl_players_metadata_ep)}")
print(f"nhl_skaters_metadata_official_ep_merge_complete: {len(nhl_skaters_metadata_official_ep_merge_complete)}")

nhl_players_metadata_ep: 3928
nhl_skaters_metadata_official_ep_merge_complete: 3932


In [124]:
players_missing_after_manual_merge

Unnamed: 0,player_name,fw_def,link
1215,Pavel Vorobyov,FW,https://www.eliteprospects.com/player/9561/pav...
1271,Brennan Evans,DEF,https://www.eliteprospects.com/player/10638/br...


In [127]:
# Remove duplicates based on player_link_ep for nhl_skaters_metadata_official_ep_merge_complete_2
nhl_skaters_metadata_official_ep_merge_complete = nhl_skaters_metadata_official_ep_merge_complete.drop_duplicates(subset=['player_link_ep']).reset_index(drop=True)

In [128]:
len(nhl_skaters_metadata_official_ep_merge_complete)

3930

In [129]:
# Write to CSV
nhl_skaters_metadata_official_ep_merge_complete.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv', index=False, encoding='utf-8-sig')

##### Update player_image URL to the latest
- original: https://assets.nhle.com/mugs/nhl/{season}/{team}/{player_id}.png
- latest: https://assets.nhle.com/mugs/nhl/latest/{player_id}

In [136]:
# Load nhl_skaters_metadata_official_ep_merge_complete_final.csv
nhl_skaters_metadata_official_ep_merge_complete_final = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv')

# Update player_image URL to the latest
nhl_skaters_metadata_official_ep_merge_complete_final['player_image'] = nhl_skaters_metadata_official_ep_merge_complete_final['player_link_official'].apply(lambda x: f"https://assets.nhle.com/mugs/nhl/latest/{x.split('/')[-1]}.png")

In [137]:
nhl_skaters_metadata_official_ep_merge_complete_final.head(10)

Unnamed: 0,player_name,player_pos,player_link_official,player_link_ep,player_image
0,Joe Sakic,C,https://www.nhl.com/player/8451101,https://www.eliteprospects.com/player/8862/joe...,https://assets.nhle.com/mugs/nhl/latest/845110...
1,Peter Forsberg,C,https://www.nhl.com/player/8458520,https://www.eliteprospects.com/player/710/pete...,https://assets.nhle.com/mugs/nhl/latest/845852...
2,Milan Hejduk,RW,https://www.nhl.com/player/8460577,https://www.eliteprospects.com/player/8603/mil...,https://assets.nhle.com/mugs/nhl/latest/846057...
3,Alex Tanguay,LW,https://www.nhl.com/player/8467338,https://www.eliteprospects.com/player/8779/ale...,https://assets.nhle.com/mugs/nhl/latest/846733...
4,Chris Drury,C,https://www.nhl.com/player/8460562,https://www.eliteprospects.com/player/8804/chr...,https://assets.nhle.com/mugs/nhl/latest/846056...
5,Ray Bourque,D,https://www.nhl.com/player/8445621,https://www.eliteprospects.com/player/19141/ra...,https://assets.nhle.com/mugs/nhl/latest/844562...
6,Shjon Podein,LW,https://www.nhl.com/player/8457704,https://www.eliteprospects.com/player/3644/shj...,https://assets.nhle.com/mugs/nhl/latest/845770...
7,Adam Deadmarsh,RW,https://www.nhl.com/player/8459436,https://www.eliteprospects.com/player/21314/ad...,https://assets.nhle.com/mugs/nhl/latest/845943...
8,Martin Skoula,D,https://www.nhl.com/player/8467343,https://www.eliteprospects.com/player/8598/mar...,https://assets.nhle.com/mugs/nhl/latest/846734...
9,Ville Nieminen,LW,https://www.nhl.com/player/8466210,https://www.eliteprospects.com/player/2661/vil...,https://assets.nhle.com/mugs/nhl/latest/846621...


In [139]:
# Write to CSV to update image
nhl_skaters_metadata_official_ep_merge_complete_final.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv', index=False, encoding='utf-8-sig')

#### Collect Stats from Each Player's Page