In [1]:
# Enable auto-reloading of modules: with autoreload mode 2, edits made to
# imported project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Project scraper module, plus polars (data frames) and tqdm (progress bars).
# NOTE(review): `tqdm` is not referenced in the cells visible here — confirm it is still needed.
from nhl_scraper import scraper
import polars as pl
from tqdm import tqdm

# Shell escape: runs `playwright install` every time this cell is executed.
# NOTE(review): consider running this once from a terminal instead of on every run of the notebook.
!playwright install

# Classes used directly below for URL generation, API scraping and roster parsing.
from nhl_scraper.scraper import NHLGameURLGenerator, NHLPlayByPlayParser, NHLDataScraper, NHLBaseScraper

# NOTE: this rebinds the name `scraper` from the imported module to a Scraper
# instance, so the module is no longer reachable under that name afterwards.
# Re-running this cell still works because the import at the top of the cell
# rebinds `scraper` to the module before this line executes.
scraper = scraper.Scraper()


Welcome to the NHL Scraper!

My name is Max and I'm the developer of this project and hockey fan. I hope you enjoy using this tool!

This is a work in progress and not all functionalities are available yet.

*** IMPORTANT ***

Make sure to run the following command in the terminal (it might take a while, but it will ensure that the playwright browser is installed):

$ playwright install
or
$ playwright install chromium

***

If you have any questions, please contact me on X: @woumaxx[https://x.com/woumaxx]

Thank you for using the NHL Scraper!





In [10]:
game_id = 2024021197

data = await scraper.scrape_toi(game_ids=[game_id,])

toi_df = data['TOI_combined']

url = NHLGameURLGenerator(game_id).get_api_url()
base_scraper = NHLBaseScraper()  # Create an instance
data = await NHLDataScraper.scrape_api(base_scraper, url=url)
rosters = NHLPlayByPlayParser.process_rosters(base_scraper, data)
rosters = (rosters
           .with_columns(pl.col("Home/Away").replace_strict({0: "V", 1: "H"}).alias("H/V"),
                        pl.col("sweaterNumber").cast(pl.Int32, strict=False).alias("#"),)
           .rename({'Team': 'abbrev'})
           .select(['teamId', 'playerId', 'positionCode', 'headshot',  'fullName',  'firstName.default', 'lastName.default', 'abbrev','H/V', '#'])
        )

toi_df = toi_df.join(rosters, on=["#", "H/V"], how='left')

toi_df = toi_df.with_columns(
    # Start_Seconds calculation
    ((pl.col("Start Elapsed Time - Legacy").str.split(":").list.get(0).cast(pl.Int64) * 60) +
     (pl.col("Start Elapsed Time - Legacy").str.split(":").list.get(1).cast(pl.Int64)) +
     ((pl.col("Period").cast(pl.Int64) - 1) * 1200)).alias("Start_Seconds"),
    
    # End_Seconds calculation
    ((pl.col("End Elapsed Time - Legacy").str.split(":").list.get(0).cast(pl.Int64) * 60) +
     (pl.col("End Elapsed Time - Legacy").str.split(":").list.get(1).cast(pl.Int64)) +
     ((pl.col("Period").cast(pl.Int64) - 1) * 1200)).alias("End_Seconds"),
)



# Game duration in seconds, taken as the latest shift end. Every per-second
# list built below has exactly this length (indices 0 .. max_seconds - 1).
max_seconds = toi_df["End_Seconds"].max()
if max_seconds is None:
    # max() yields None when there are no non-null shift end times; fail with
    # a clear message instead of an opaque TypeError from range(None).
    raise ValueError("toi_df has no End_Seconds values; cannot build per-second matrices")

# One row per game second; per-team skater counts are appended as columns.
strength_matrix = pl.Series("second", range(max_seconds)).to_frame()

# Count skaters on the ice for each team at each second (goalies excluded)
for team in ['H', 'V']:  # Home and Visiting teams
    # NOTE(review): rows whose positionCode is null (no roster match in the
    # left join) also fail this comparison and are dropped — confirm intended.
    team_players = toi_df.filter(
        (pl.col("H/V") == team) &
        (pl.col("positionCode") != "G")
    )

    # Team abbreviation (a single game is expected to have one per side)
    team_abbrev = team_players["abbrev"].unique()[0]

    # Skater count per second, starting at zero
    players_on = [0] * max_seconds

    # Only the shift bounds are needed, so iterate over those two columns
    # rather than looking up column positions for every row.
    shift_bounds = team_players.select(["Start_Seconds", "End_Seconds"])
    for start, end in shift_bounds.iter_rows():
        # Half-open interval: the second a shift ends belongs to whoever
        # comes on next, so line changes are not double counted.
        for second in range(start, end):
            players_on[second] += 1

    # Add the count and the (constant) abbreviation columns for this team
    strength_matrix = strength_matrix.with_columns(
        pl.Series(name=f"team_{team}", values=players_on),
        pl.lit(team_abbrev).alias(f"team_{team}_abbrev"),
    )

# Strength labels from each team's perspective, e.g. "<home>v<visitor>",
# plus a combined label that includes both team abbreviations.
home_count = pl.col("team_H").cast(pl.Utf8)
visitor_count = pl.col("team_V").cast(pl.Utf8)
versus = pl.lit("v")
space = pl.lit(" ")

strength_matrix = strength_matrix.with_columns(
    pl.concat_str([home_count, versus, visitor_count]).alias("strength_H"),
    pl.concat_str([visitor_count, versus, home_count]).alias("strength_V"),
    pl.concat_str([
        pl.col("team_H_abbrev"),
        space,
        home_count,
        versus,
        visitor_count,
        space,
        pl.col("team_V_abbrev"),
    ]).alias("strength_with_teams"),
)



# Distinct player ids appearing in the shift table. Nulls (shifts with no
# roster match after the left join) are dropped so they do not produce an
# all-zero "None" column, and ids are sorted so column order is deterministic.
players = toi_df["playerId"].drop_nulls().unique().sort().to_list()

# One 0/1 list per player with one entry per game second (length max_seconds).
on_ice_by_player = {player: [0] * max_seconds for player in players}

# Single pass over all shifts instead of re-filtering the table per player.
shift_rows = toi_df.select(["playerId", "Start_Seconds", "End_Seconds"])
for player, start, end in shift_rows.iter_rows():
    on_ice = on_ice_by_player.get(player)
    if on_ice is None:
        # Shift without a known player id: nothing to mark.
        continue
    # Half-open interval [start, end): mark every second of the shift.
    for second in range(start, end):
        on_ice[second] = 1

# Build the frame once: one column per player (named by id), then the
# "second" index column last.
ice_time_matrix = pl.DataFrame(
    [
        pl.Series(name=str(player), values=on_ice)
        for player, on_ice in on_ice_by_player.items()
    ]
    + [pl.Series("second", range(max_seconds))]
)

ice_time_matrix

Scraping NHL TOI Data:   0%|          | 0/1 [00:00<?, ?it/s]

Scraping NHL TOI Data: 100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


8475233,8475848,8476469,8476875,8476981,8477496,8477507,8477887,8477956,8477989,8478133,8478401,8478450,8478470,8478851,8479339,8479369,8479543,8479968,8479987,8479999,8480018,8480035,8480070,8480280,8480880,8481043,8481540,8481556,8481593,8481618,8482087,8482177,8482476,8482511,8482763,8483457,8483515,second
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,0
0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,1
0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,2
0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,3
0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1,1,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,3595
0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,3596
0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,3597
0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,3598
