# NHL Official Data Preparation
## Collecting NHL Players' Stats from Official NHL Website
1. Collect Metadata(player_name, player_link) from NHL Official Website by Season and Team
2. Collect Stats from Each Player's Page

### Import Libraries

In [1]:
import nhl_scraper_api as nhl_scraper
import pandas as pd
import os
import time
import random
import glob

### Define Valid Teams and Valid Seasons

In [None]:
# Team slugs accepted by the scraper, in the order they will be processed
valid_teams = [
    "bruins", "sabres", "redwings", "panthers",
    "canadiens", "senators", "lightning", "mapleleafs",
    "hurricanes", "bluejackets", "devils", "islanders",
    "rangers", "flyers", "penguins", "capitals",
    "blackhawks", "avalanche", "stars", "wild",
    "predators", "blues", "jets", "ducks",
    "flames", "oilers", "kings", "sharks",
    "kraken", "canucks", "goldenknights", "utah",
]

# Season labels in YYYY-YYYY format, from 2000-2001 through 2024-2025
valid_seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2000, 2025)]

### Validate Team Links' Format

In [None]:
# Print the team links for the 2024-2025 season so their format can be checked by eye
nhl_scraper.print_team_links('2024-2025')

### Collect Metadata(player_name, player_link) from NHL Official Website by Season and Team

#### Test API By Collecting 2024-2025 Avalanche Players' Metadata

In [None]:
# Smoke test: fetch the player metadata for a single team/season pair
avalanche_2425_metadata = nhl_scraper.get_player_by_team("avalanche", "2024-2025")

In [None]:
avalanche_2425_metadata

#### Collect All Teams' Metadata for All Seasons

In [None]:
def get_all_teams_metadata(curr_team, driver, wait):
    """Scrape one team's player metadata for every season in ``valid_seasons``.

    Each season is written to its own CSV under
    ``./data/nhl/official/teams/<curr_team>/``. A season whose CSV already
    exists is skipped, so an interrupted run can simply be started again.

    Args:
        curr_team: Team name used for scraping and in the output paths.
        driver: Reusable browser driver handed through to the scraper.
        wait: Wait object handed through to the scraper.
    """
    # Team banner: a divider line followed by the team name
    print("================================================================")
    print(f"Collecting metadata for {curr_team}")

    for season_idx, season in enumerate(valid_seasons):
        # Make sure the team's folder exists before building the file path
        team_dir = f'./data/nhl/official/teams/{curr_team}'
        os.makedirs(team_dir, exist_ok=True)
        season_csv = f'{team_dir}/{curr_team}_{season}.csv'

        # Already scraped on a previous run -> nothing to do for this season
        if os.path.exists(season_csv):
            print(f'{season_csv} already exists. Skipping scraping.')
            continue

        # Lighter divider on every fifth season index that actually gets scraped
        if season_idx % 5 == 0:
            print("----------------------------------------------------------------")

        roster = nhl_scraper.get_player_by_team_with_reusable_driver(curr_team, season, driver, wait)

        if roster is not None:
            roster.to_csv(season_csv, index=False, encoding='utf-8-sig')
            print(f'Finished scraping {curr_team} for {season}')

            # Random pause (drawn in seconds, reported in minutes) to avoid being blocked
            pause_seconds = random.uniform(10, 30)
            print(f"Sleep for {pause_seconds / 60:.2f} minutes to prevent getting blocked\n")
            time.sleep(pause_seconds)
        else:
            # The scraper signals failure with None; move on without pausing
            print(f'Failed to scrape {curr_team} for {season}')

##### Collecting All Teams' Metadata

In [None]:
from selenium.webdriver.support.wait import WebDriverWait
import undetected_chromedriver as uc

# Build the Chrome options ONCE; the flags are applied in the order listed
chrome_options = uc.ChromeOptions()
for chrome_flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# One shared browser session, plus a wait object bound to it (timeout value 15)
driver = uc.Chrome(options=chrome_options, version_main=138)
wait = WebDriverWait(driver, 15)

In [None]:
# Scrape every team with the shared driver. The finally clause guarantees the
# browser is shut down even when a scrape raises part-way through the run.
try:
    for team in valid_teams:
        get_all_teams_metadata(team, driver, wait)
finally:
    driver.quit()

#### Get unique players' metadata from all teams and seasons

In [None]:
# Get unique players' metadata from all teams and seasons

# Collect every per-team/per-season CSV under ./data/nhl/official/teams/.
# The list is sorted so that the "first occurrence wins" de-duplication below
# gives the same result on every run (glob order is otherwise arbitrary).
players_files = sorted(glob.glob('./data/nhl/official/teams/**/*.csv', recursive=True))

# Read all files first and concatenate once, instead of re-concatenating and
# re-deduplicating the growing frame after every file.
team_frames = []
for player_file in players_files:
    team_frames.append(pd.read_csv(player_file))
    print(f'Finished merging {player_file}')

if team_frames:
    # NOTE(review): de-duplicating on player_name alone collapses distinct
    # players who share a name - confirm that is acceptable, or switch the key
    # to player_link.
    nhl_players_metadata = (
        pd.concat(team_frames, ignore_index=True)
        .drop_duplicates(subset=['player_name'])
        .reset_index(drop=True)
    )
else:
    # No input files found: keep the original behavior of writing an empty table
    nhl_players_metadata = pd.DataFrame()

# Write to CSV
nhl_players_metadata.to_csv('./data/nhl/official/nhl_players_metadata.csv', index=False, encoding='utf-8-sig')

In [None]:
# Read in nhl_players_metadata.csv
nhl_players_metadata_official = pd.read_csv('./data/nhl/official/nhl_players_metadata.csv')

In [None]:
len(nhl_players_metadata_official)

##### Exclude Goalies from the nhl_players_metadata_official

In [None]:
# Keep every row whose position is not 'G', i.e. drop the goalies
is_not_goalie = nhl_players_metadata_official['player_pos'].ne('G')
nhl_skaters_metadata_official = nhl_players_metadata_official[is_not_goalie]

# Persist the skaters-only table
nhl_skaters_metadata_official.to_csv(
    './data/nhl/official/nhl_skaters_metadata_official.csv',
    index=False,
    encoding='utf-8-sig',
)

In [None]:
len(nhl_skaters_metadata_official)

#### Remove Accent Characters from EP Metadata

In [None]:
from unidecode import unidecode

# Load the EliteProspects (EP) metadata
nhl_players_metadata_ep = pd.read_csv('./data/nhl/nhl_players_metadata.csv')

# Apply unidecode to every string column. Only str cells are converted;
# anything else (e.g. NaN for a missing value) is passed through unchanged.
for col in nhl_players_metadata_ep.columns:
    if nhl_players_metadata_ep[col].dtype == 'object':
        nhl_players_metadata_ep[col] = nhl_players_metadata_ep[col].apply(
            lambda value: unidecode(value) if isinstance(value, str) else value
        )

# The accent-cleaned frame is the one that must be saved and used by the
# merge on player_name further down. Previously the untouched copy was
# written out, so the cleaning never took effect.
nhl_skaters_metadata_ep = nhl_players_metadata_ep.copy()

# Save the cleaned CSV
nhl_skaters_metadata_ep.to_csv('./data/nhl/nhl_skaters_metadata_accent_cleaned.csv', index=False, encoding='utf-8-sig')

#### Merge EP Metadata with Official Metadata and Skip the Missing Players

In [None]:
# Left-join the EP metadata onto the official skaters by player_name, so every
# row of the official dataset is kept whether or not EP has a match
official_ep_joined = nhl_skaters_metadata_official.merge(
    nhl_skaters_metadata_ep,
    how='left',
    on='player_name',
)

# Give the two link columns source-specific names
link_column_names = {
    'player_link': 'player_link_official',
    'link': 'player_link_ep',
}

# Columns carried forward, in output order
wanted_columns = ['player_name', 'player_pos', 'player_link_official', 'player_link_ep', 'player_image']

nhl_skaters_metadata_official_ep_merge = official_ep_joined.rename(columns=link_column_names)[wanted_columns]


In [None]:
# Write to CSV
nhl_skaters_metadata_official_ep_merge.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge.csv', index=False, encoding='utf-8-sig')

#### Find all players without player_link_ep


In [None]:
# Find all the missing player-rows in official data after merging
players_missing_ep_link = nhl_skaters_metadata_official_ep_merge[nhl_skaters_metadata_official_ep_merge['player_link_ep'].isnull()]

In [None]:
players_missing_ep_link

In [None]:
# Write to CSV
players_missing_ep_link.to_csv('./data/nhl/missing_players_in_official_after_merged.csv', index=False, encoding='utf-8-sig')

#### Manually Add the Missing Players' EP Links
- Later on, we can use the EP Links to match the players' stats from EP to the official data

In [None]:
# Read in manually added missing players' EP links
nhl_skaters_metadata_official_ep_merge_complete = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete.csv')

In [None]:
# Find all rows missing player_link_ep
players_missing_ep_link = nhl_skaters_metadata_official_ep_merge_complete[nhl_skaters_metadata_official_ep_merge_complete['player_link_ep'].isnull()]

players_missing_ep_link

##### Find players that are in nhl_players_metadata (EP), but not in nhl_skaters_metadata_official_ep_merge_complete, matched by player_link_ep

In [None]:
# Find EP players whose link does not appear as player_link_ep in the manually completed merge file
# Load the EliteProspects metadata and the second revision of the manually completed merge
nhl_players_metadata_ep = pd.read_csv('./data/nhl/nhl_players_metadata.csv')
nhl_skaters_metadata_official_ep_merge_complete = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_2.csv')

# Keep the rows of nhl_players_metadata_ep whose 'link' is NOT among the merge file's player_link_ep values
players_missing_after_manual_merge = nhl_players_metadata_ep[~nhl_players_metadata_ep['link'].isin(nhl_skaters_metadata_official_ep_merge_complete['player_link_ep'])]

In [None]:
print(f"nhl_players_metadata_ep: {len(nhl_players_metadata_ep)}")
print(f"nhl_skaters_metadata_official_ep_merge_complete: {len(nhl_skaters_metadata_official_ep_merge_complete)}")

In [None]:
players_missing_after_manual_merge

In [None]:
# Remove duplicates based on player_link_ep for nhl_skaters_metadata_official_ep_merge_complete_2
# (the first row for each link is kept, then the index is renumbered).
# NOTE(review): pandas treats missing values as equal to each other here, so if
# any rows still lack player_link_ep only one of them would survive - confirm
# that none are missing before relying on this.
nhl_skaters_metadata_official_ep_merge_complete = nhl_skaters_metadata_official_ep_merge_complete.drop_duplicates(subset=['player_link_ep']).reset_index(drop=True)

In [None]:
len(nhl_skaters_metadata_official_ep_merge_complete)

In [None]:
# Write to CSV
nhl_skaters_metadata_official_ep_merge_complete.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv', index=False, encoding='utf-8-sig')

##### Update player_image URL to the latest
- original: https://assets.nhle.com/mugs/nhl/{season}/{team}/{player_id}.png
- latest: https://assets.nhle.com/mugs/nhl/latest/{player_id}.png

In [None]:
# Reload the final merged skaters table from disk
nhl_skaters_metadata_official_ep_merge_complete_final = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv')

# Rebuild player_image from the last path segment of each official player link
official_links = nhl_skaters_metadata_official_ep_merge_complete_final['player_link_official']
nhl_skaters_metadata_official_ep_merge_complete_final['player_image'] = official_links.map(
    lambda link: f"https://assets.nhle.com/mugs/nhl/latest/{link.rsplit('/', 1)[-1]}.png"
)

In [None]:
nhl_skaters_metadata_official_ep_merge_complete_final.head(10)

In [None]:
# Write to CSV to update image
nhl_skaters_metadata_official_ep_merge_complete_final.to_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv', index=False, encoding='utf-8-sig')

#### Collect Stats from Each Player's Page

##### Initiate the Chrome Driver

In [2]:
from selenium.webdriver.support.wait import WebDriverWait
import undetected_chromedriver as uc

# Build the Chrome options ONCE; the flags are applied in the order listed
chrome_options = uc.ChromeOptions()
for chrome_flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# One shared browser session, plus a wait object bound to it (timeout value 15)
driver = uc.Chrome(options=chrome_options, version_main=138)
wait = WebDriverWait(driver, 15)

##### Test get_player_stats API

In [3]:
# Load CSV File
nhl_skaters_metadata_official_ep_merge_complete_final = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv')

In [4]:
# Get Cale Makar's Metadata which is at index 233
cale_makar_metadata = nhl_skaters_metadata_official_ep_merge_complete_final.iloc[233]
cale_makar_metadata

player_name                                                    Cale Makar
player_pos                                                              D
player_link_official                   https://www.nhl.com/player/8480069
player_link_ep          https://www.eliteprospects.com/player/199655/c...
player_image            https://assets.nhle.com/mugs/nhl/latest/848006...
Name: 233, dtype: object

In [5]:
# Passed in to the get stats function
cale_makar_stats = nhl_scraper.get_player_stats_with_reusable_driver(cale_makar_metadata, driver, wait)

Collecting Cale Makar's stats from https://www.nhl.com/player/8480069
Scraping 'All Leagues' regular season stats for Cale Makar
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Cale Makar
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats


In [6]:
driver.quit()

In [7]:
cale_makar_stats

Unnamed: 0,player_name,season,league,team,gp_regular,g_regular,a_regular,p_regular,plus_minus_regular,pim_regular,...,ppg_playoffs,ppp_playoffs,shg_playoffs,shp_playoffs,toi_per_game_playoffs,gwg_playoffs,otg_playoffs,sog_playoffs,shooting_pct_playoffs,fo_pct_playoffs
0,Cale Makar,2011-12,HCBAA,NWCAA Bruins Bantam AA,28,4,16,20,0.0,18,...,,,,,,,,,,
1,Cale Makar,2012-13,AMBHL,Calgary Flames Bantam AAA,33,3,19,22,0.0,22,...,,,,,,,,,,
2,Cale Makar,2013-14,AMHL,Calgary Flames Midget AAA,6,0,1,1,0.0,4,...,,,,,,,,,,
3,Cale Makar,2013-14,AMMHL,NWCAA Stampeders Minor Midget,36,9,19,28,0.0,35,...,,,,,,,,,,
4,Cale Makar,2014-15,AJHL,Brooks Bandits,3,1,4,5,0.0,4,...,,,,,,,,,,
5,Cale Makar,2014-15,AMHL,Calgary Flames Midget AAA,34,7,16,23,0.0,14,...,,,,,,,,,,
6,Cale Makar,2015-16,AJHL,Brooks Bandits,54,10,45,55,,28,...,,,,,,,,,,
7,Cale Makar,2015-16,WJAC-19,Canada West U19,4,1,0,1,,0,...,,,,,,,,,,
8,Cale Makar,2016-17,AJHL,Brooks Bandits,54,24,51,75,,18,...,,,,,,,,,,
9,Cale Makar,2016-17,WJAC-19,Canada West U19,4,4,4,8,,0,...,,,,,,,,,,


### Collect Stats for All Players

In [30]:
# Load CSV File
nhl_skaters_metadata_official_ep_merge_complete_final = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv')

#### Initiate Reusable Driver

In [31]:
from selenium.webdriver.support.wait import WebDriverWait
import undetected_chromedriver as uc

# Build the Chrome options ONCE; the flags are applied in the order listed
chrome_options = uc.ChromeOptions()
for chrome_flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
    chrome_options.add_argument(chrome_flag)

# One shared browser session, plus a wait object bound to it (timeout value 15)
driver = uc.Chrome(options=chrome_options, version_main=138)
wait = WebDriverWait(driver, 15)

#### Define Output File

In [32]:
official_stats_output_path = './data/nhl/official/stats/nhl_players_official_stats.csv'

#### Define Function to Get Player Stats in Batches

In [33]:
def get_players_stats_by_batch(players_to_scrape, driver, wait, output_path):
    """Scrape stats for each player in a batch and append them to one CSV.

    Players are processed in row order. A failure for one player is reported
    and counted but does not stop the batch. After every player except the
    last, the function sleeps for a random interval to avoid being blocked.

    Args:
        players_to_scrape: DataFrame with at least the ``player_name`` and
            ``player_link_official`` columns; each row is handed to the scraper.
        driver: Reusable browser driver handed through to the scraper.
        wait: Wait object handed through to the scraper.
        output_path: CSV file to append to. It is created (with a header row)
            if it does not exist yet, along with its parent directory.
    """
    curr_len = len(players_to_scrape)
    fail_count = 0

    # Create the output directory up front; otherwise every write would fail
    # and each successfully scraped player would be reported as a failure.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, (_, player_metadata) in enumerate(players_to_scrape.iterrows()):
        player_name = player_metadata['player_name']
        player_url = player_metadata['player_link_official']
        print(f"\n [{i + 1}] Collecting stats for {player_name} at {player_url}")

        try:
            player_stats = nhl_scraper.get_player_stats_with_reusable_driver(player_metadata, driver, wait)
            if player_stats is None:
                # Surface a clear message instead of an AttributeError on None
                raise ValueError("scraper returned no stats")

            # Append to the CSV; the header is only written when the file is new
            write_header = not os.path.exists(output_path)
            player_stats.to_csv(output_path, mode='a', header=write_header, index=False, encoding='utf-8-sig')
        except Exception as e:
            # Deliberately broad: one bad player must not abort the whole batch
            succeeded = False
            fail_count += 1
            print(f"Failed to get stats for {player_name}: {e}")
        else:
            succeeded = True
            print(f'Successfully scraped stats for {player_name}')

        # Report the running failure rate after every player, not only on success
        print(f'Failed rate: {fail_count / (i + 1):.2f}')

        # No pause is needed after the final player of the batch
        if i >= curr_len - 1:
            continue

        if succeeded:
            # Add random sleep to prevent getting blocked
            sleep_time = random.uniform(10, 120)
            print(f"Sleep for {sleep_time / 60:.2f} minutes to prevent getting blocked")
        else:
            # Sleep for 15-60 seconds before trying the next player
            sleep_time = random.uniform(15, 60)
            print(f"Sleeping for {sleep_time:.0f} seconds before trying the next player")
        time.sleep(sleep_time)

##### Collect Stats for All Players

In [12]:
# Scrape 0 to 10 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[0:10]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting facts for Joe Sakic at https://www.nhl.com/player/8451101
Collecting Joe Sakic's stats from https://www.nhl.com/player/8451101
Scraping 'All Leagues' regular season stats for Joe Sakic
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Joe Sakic
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped facts for Joe Sakic
Failed rate: 0.00
Sleep for 1.72 minutes to prevent getting blocked

 [2] Collecting facts for Peter Forsberg at https://www.nhl.com/player/8458520
Collecting Peter Forsberg's stats from https://www.nhl.com/player/8458520
Scraping 'All Leagues' regular season stats for Peter Forsberg
Successfully located dropdown button
Successfully clicked dropdown bu

In [None]:
# Scrape 10 to 40 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[10:40]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)

In [12]:
# Scrape 40 to 45 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[40:45]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Jaroslav Obsut at https://www.nhl.com/player/8462220
Collecting Jaroslav Obsut's stats from https://www.nhl.com/player/8462220
Scraping 'All Leagues' regular season stats for Jaroslav Obsut
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Jaroslav Obsut
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Jaroslav Obsut
Failed rate: 0.00
Sleep for 0.65 minutes to prevent getting blocked

 [2] Collecting stats for Steve Moore at https://www.nhl.com/player/8467379
Collecting Steve Moore's stats from https://www.nhl.com/player/8467379
Scraping 'All Leagues' regular season stats for Steve Moore
Successfully located dropdown button
Successfully cli

In [13]:
# Scrape 45 to 50 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[45:50]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Serge Aubin at https://www.nhl.com/player/8460649
Collecting Serge Aubin's stats from https://www.nhl.com/player/8460649
Scraping 'All Leagues' regular season stats for Serge Aubin
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Serge Aubin
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Serge Aubin
Failed rate: 0.00
Sleep for 0.80 minutes to prevent getting blocked

 [2] Collecting stats for Jeff Shantz at https://www.nhl.com/player/8458972
Collecting Jeff Shantz's stats from https://www.nhl.com/player/8458972
Scraping 'All Leagues' regular season stats for Jeff Shantz
Successfully located dropdown button
Successfully clicked dropdown b

In [16]:
# Scrape 50 to 100 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[50:100]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Chris McAllister at https://www.nhl.com/player/8462068
Collecting Chris McAllister's stats from https://www.nhl.com/player/8462068
Scraping 'All Leagues' regular season stats for Chris McAllister
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Chris McAllister
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Chris McAllister
Failed rate: 0.00
Sleep for 1.57 minutes to prevent getting blocked

 [2] Collecting stats for Steve Brule at https://www.nhl.com/player/8459566
Collecting Steve Brule's stats from https://www.nhl.com/player/8459566
Scraping 'All Leagues' regular season stats for Steve Brule
Successfully located dropdown button
Succes

In [17]:
# Scrape 100 to 200 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[100:200]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for David Jones at https://www.nhl.com/player/8470877
Collecting David Jones's stats from https://www.nhl.com/player/8470877
Scraping 'All Leagues' regular season stats for David Jones
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for David Jones
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for David Jones
Failed rate: 0.00
Sleep for 0.94 minutes to prevent getting blocked

 [2] Collecting stats for Wyatt Smith at https://www.nhl.com/player/8466357
Collecting Wyatt Smith's stats from https://www.nhl.com/player/8466357
Scraping 'All Leagues' regular season stats for Wyatt Smith
Successfully located dropdown button
Successfully clicked dropdown b

In [18]:
# Scrape 200 to 300 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[200:300]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Rene Bourque at https://www.nhl.com/player/8471504
Collecting Rene Bourque's stats from https://www.nhl.com/player/8471504
Scraping 'All Leagues' regular season stats for Rene Bourque
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Rene Bourque
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Rene Bourque
Failed rate: 0.00
Sleep for 0.47 minutes to prevent getting blocked

 [2] Collecting stats for Sven Andrighetto at https://www.nhl.com/player/8477413
Collecting Sven Andrighetto's stats from https://www.nhl.com/player/8477413
Scraping 'All Leagues' regular season stats for Sven Andrighetto
Successfully located dropdown button
Successfull

In [19]:
# Scrape 300 to 400 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[300:400]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Caleb Jones at https://www.nhl.com/player/8478452
Collecting Caleb Jones's stats from https://www.nhl.com/player/8478452
Scraping 'All Leagues' regular season stats for Caleb Jones
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Caleb Jones
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Caleb Jones
Failed rate: 0.00
Sleep for 1.44 minutes to prevent getting blocked

 [2] Collecting stats for Yakov Trenin at https://www.nhl.com/player/8478508
Collecting Yakov Trenin's stats from https://www.nhl.com/player/8478508
Scraping 'All Leagues' regular season stats for Yakov Trenin
Successfully located dropdown button
Successfully clicked dropdow

In [20]:
# Scrape 400 to 500 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[400:500]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Dustin Byfuglien at https://www.nhl.com/player/8470834
Collecting Dustin Byfuglien's stats from https://www.nhl.com/player/8470834
Scraping 'All Leagues' regular season stats for Dustin Byfuglien
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Dustin Byfuglien
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Dustin Byfuglien
Failed rate: 0.00
Sleep for 1.37 minutes to prevent getting blocked

 [2] Collecting stats for Brandon Bochenski at https://www.nhl.com/player/8469666
Collecting Brandon Bochenski's stats from https://www.nhl.com/player/8469666
Scraping 'All Leagues' regular season stats for Brandon Bochenski
Successfully located drop

In [21]:
# Scrape 500 to 600 players' official stats
curr_players_metadata = nhl_skaters_metadata_official_ep_merge_complete_final[500:600]
get_players_stats_by_batch(curr_players_metadata, driver, wait, official_stats_output_path)


 [1] Collecting stats for Klas Dahlbeck at https://www.nhl.com/player/8476403
Collecting Klas Dahlbeck's stats from https://www.nhl.com/player/8476403
Scraping 'All Leagues' regular season stats for Klas Dahlbeck
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Klas Dahlbeck
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Klas Dahlbeck
Failed rate: 0.00
Sleep for 0.85 minutes to prevent getting blocked

 [2] Collecting stats for Michael Paliotta at https://www.nhl.com/player/8476394
Collecting Michael Paliotta's stats from https://www.nhl.com/player/8476394
Scraping 'All Leagues' regular season stats for Michael Paliotta
Successfully located dropdown button
Succes

### Fetch Missing Players' Stats

In [34]:
# Read in nhl_skaters_metadata_official_ep_merge_complete_final.csv
nhl_skaters_metadata_official_ep_merge_complete_final = pd.read_csv('./data/nhl/nhl_skaters_metadata_official_ep_merge_complete_final.csv')

In [35]:
# Names of the players that already have rows in the stats file.
# Start from an empty set so a missing file simply means "nobody scraped yet".
unique_players = set()
if os.path.exists(official_stats_output_path):
    # low_memory=False makes pandas read the file in one pass
    nhl_players_official_stats = pd.read_csv(official_stats_output_path, low_memory=False)
    unique_players.update(nhl_players_official_stats['player_name'].unique())

In [36]:
# From the first 600 metadata rows, keep the players whose name is not yet in unique_players
subset = nhl_skaters_metadata_official_ep_merge_complete_final.iloc[0:600]
already_scraped = subset['player_name'].isin(unique_players)
players_to_scrape = subset.loc[~already_scraped].reset_index(drop=True)
players_to_scrape

Unnamed: 0,player_name,player_pos,player_link_official,player_link_ep,player_image


In [25]:
# Fetch Missing Players' Stats
get_players_stats_by_batch(players_to_scrape, driver, wait, official_stats_output_path)


 [1] Collecting stats for Greg Zanon at https://www.nhl.com/player/8468636
Collecting Greg Zanon's stats from https://www.nhl.com/player/8468636
Scraping 'All Leagues' regular season stats for Greg Zanon
Successfully located dropdown button
Successfully clicked dropdown button
Successfully located all leagues option
Successfully clicked all leagues option
Successfully scraped regular season stats
Scraping 'playoff stats' for Greg Zanon
Successfully located game-type dropdown button
Successfully clicked playoffs dropdown button
Successfully selected 'Playoffs' option
Successfully scraped playoff stats
Successfully scraped stats for Greg Zanon
Failed rate: 0.00
Sleep for 0.77 minutes to prevent getting blocked

 [2] Collecting stats for Jonathan Drouin at https://www.nhl.com/player/8477494
Collecting Jonathan Drouin's stats from https://www.nhl.com/player/8477494
Scraping 'All Leagues' regular season stats for Jonathan Drouin
Successfully located dropdown button
Successfully clicked dro

In [37]:
# Calculate the unique players in official_stats_output_path.
# Re-read the file first: the in-memory frame was loaded before the missing
# players were scraped, so it would not contain the rows appended since then
# (and the de-dupe + overwrite below would otherwise throw those rows away).
nhl_players_official_stats = pd.read_csv(official_stats_output_path, low_memory=False)
unique_players = set(nhl_players_official_stats['player_name'].unique())
len(unique_players)

600

In [39]:
# Remove duplicate rows: two rows are duplicates when the identifying columns
# and every regular-season and playoff stat column all match
stat_names = [
    'gp', 'g', 'a', 'p', 'plus_minus', 'pim', 'ppg', 'ppp',
    'shg', 'shp', 'toi_per_game', 'gwg', 'otg', 'sog',
    'shooting_pct', 'fo_pct',
]
dedupe_columns = ['player_name', 'season', 'team', 'league']
dedupe_columns += [f'{stat}_regular' for stat in stat_names]
dedupe_columns += [f'{stat}_playoffs' for stat in stat_names]

nhl_players_official_stats = nhl_players_official_stats.drop_duplicates(subset=dedupe_columns).reset_index(drop=True)

In [41]:
# Write to CSV
nhl_players_official_stats.to_csv(official_stats_output_path, index=False, encoding='utf-8-sig')

### Close the Driver

In [14]:
driver.quit()