## Import Things

In [1]:
import pandas as pd
import requests
import json

pd.set_option('display.max_columns', 500)

In [2]:
# Create function to print json nicely
def jprint(obj):
    # create a formatted string of the Python JSON object
    text = json.dumps(obj, sort_keys=True, indent=4)
    print(text)

## Get Data From NHL API

### Functions to Get Data for a Season

In [3]:
# Get skaters for a specific season
def get_skaters(season):
    url = f'https://api.nhle.com/stats/rest/en/skater/bios?limit=-1&start=0&cayenneExp=seasonId={season}'
    skaters_request = requests.get(url)
    skaters_json = skaters_request.json()
    skaters_df = pd.json_normalize(skaters_json, "data")
    return(skaters_df)

In [4]:
# Get goalies for a specific season
def get_goalies(season):
    url = f'https://api.nhle.com/stats/rest/en/goalie/bios?limit=-1&start=0&cayenneExp=seasonId={season}'
    goalies_request = requests.get(url)
    goalies_json = goalies_request.json()
    goalies_df = pd.json_normalize(goalies_json, "data")
    return(goalies_df)

In [5]:
# Get teams
def get_teams():
    url = f'https://api.nhle.com/stats/rest/en/team'
    teams_request = requests.get(url)
    teams_json = teams_request.json()
    teams_df = pd.json_normalize(teams_json, "data")
    return(teams_df)

In [6]:
# Get all games for a specified season
def get_schedule(season, teams_df):
    # Initialize schedule list
    schedule = []

    # Loop through all teams to get schedules
    for triCode in teams_df["triCode"]:
        url = f'https://api-web.nhle.com/v1/club-schedule-season/{triCode}/{season}'
        schedule_request = requests.get(url)
        schedule_json = schedule_request.json()
        schedule_df = pd.json_normalize(schedule_json, "games")
        schedule.append(schedule_df)

    # Concatenate schedules and remove duplicates
    schedule = pd.concat(schedule, ignore_index = True)
    schedule = schedule.drop_duplicates(subset = ["id"])

    return(schedule)

In [7]:
# Get all play-by-play for a specified season
def get_pbp(season, schedule):
    # Initialize play-by-play list
    pbp = []
    count = 0

    # Loop through all games to get play-by-play for the specified season
    for Game in schedule["id"]:
        url = f'https://api-web.nhle.com/v1/gamecenter/{Game}/play-by-play'
        game_request = requests.get(url)
        game_json = game_request.json()
        game_df = pd.json_normalize(game_json, "plays")
        game_df["gameId"] = Game
        pbp.append(game_df)

        print("Pbp Game: ", count)
        count +=1

    # Concatenate play-by-play data
    pbp = pd.concat(pbp, ignore_index = True)

    return(pbp)

In [8]:
# Get all shifts for a specified season
def get_shifts(season, schedule):
    # Initialize shifts list
    shifts = []
    count = 0

    # Loop through all games to get shifts for the specified season
    for Game in schedule["id"]:
        url = f'https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={Game}'
        game_request = requests.get(url)
        shifts_json = game_request.json()
        shifts_df = pd.json_normalize(shifts_json, "data")
        shifts.append(shifts_df)

        print("Shifts Game: ", count)
        count += 1

    # Concatenate shifts data
    shifts = pd.concat(shifts, ignore_index=True)

    return(shifts)

### Get four seasons worth of data

In [9]:
seasons = ["20212022", "20222023", "20232024", "20242025"]

In [10]:
# Initialize lists for the four years of data
skaters = []
goalies = []
schedule = []
pbp = []
shifts = []

# Get teams
teams = get_teams()

# Loop through seasons
for i in range(len(seasons)):
    skaters.append(get_skaters(seasons[i]))
    goalies.append(get_goalies(seasons[i]))
    schedule.append(get_schedule(seasons[i], teams))
    pbp.append(get_pbp(seasons[i], schedule[i]))
    shifts.append(get_shifts(seasons[i], schedule[i]))
    print(seasons[i], " is Done!")

Pbp Game:  0
Pbp Game:  1
Pbp Game:  2
Pbp Game:  3
Pbp Game:  4
Pbp Game:  5
Pbp Game:  6
Pbp Game:  7
Pbp Game:  8
Pbp Game:  9
Pbp Game:  10
Pbp Game:  11
Pbp Game:  12
Pbp Game:  13
Pbp Game:  14
Pbp Game:  15
Pbp Game:  16
Pbp Game:  17
Pbp Game:  18
Pbp Game:  19
Pbp Game:  20
Pbp Game:  21
Pbp Game:  22
Pbp Game:  23
Pbp Game:  24
Pbp Game:  25
Pbp Game:  26
Pbp Game:  27
Pbp Game:  28
Pbp Game:  29
Pbp Game:  30
Pbp Game:  31
Pbp Game:  32
Pbp Game:  33
Pbp Game:  34
Pbp Game:  35
Pbp Game:  36
Pbp Game:  37
Pbp Game:  38
Pbp Game:  39
Pbp Game:  40
Pbp Game:  41
Pbp Game:  42
Pbp Game:  43
Pbp Game:  44
Pbp Game:  45
Pbp Game:  46
Pbp Game:  47
Pbp Game:  48
Pbp Game:  49
Pbp Game:  50
Pbp Game:  51
Pbp Game:  52
Pbp Game:  53
Pbp Game:  54
Pbp Game:  55
Pbp Game:  56
Pbp Game:  57
Pbp Game:  58
Pbp Game:  59
Pbp Game:  60
Pbp Game:  61
Pbp Game:  62
Pbp Game:  63
Pbp Game:  64
Pbp Game:  65
Pbp Game:  66
Pbp Game:  67
Pbp Game:  68
Pbp Game:  69
Pbp Game:  70
Pbp Game:  71
Pb

  shifts = pd.concat(shifts, ignore_index=True)


In [11]:
skaters = pd.concat(skaters, ignore_index = True)
skaters = skaters.drop_duplicates(subset = ["playerId"])
print(len(skaters))
skaters.head()

1367


Unnamed: 0,assists,birthCity,birthCountryCode,birthDate,birthStateProvinceCode,currentTeamAbbrev,currentTeamName,draftOverall,draftRound,draftYear,firstSeasonForGameType,gamesPlayed,goals,height,isInHallOfFameYn,lastName,nationalityCode,playerId,points,positionCode,shootsCatches,skaterFullName,weight
0,11,North Chelmsford,USA,1996-10-28,MA,VGK,Vegas Golden Knights,2.0,1.0,2015.0,20152016,34,14,74,N,Eichel,USA,8478403,25,C,R,Jack Eichel,206
1,1,Ajax,CAN,2001-01-15,ON,WSH,Washington Capitals,25.0,1.0,2019.0,20212022,4,0,72,N,McMichael,CAN,8481580,1,C,L,Connor McMichael,180
2,28,Grand Rapids,USA,1985-07-30,MN,,,61.0,2.0,2004.0,20072008,72,2,71,N,Goligoski,USA,8471274,30,D,L,Alex Goligoski,173
3,11,Peterborough,CAN,1999-02-16,ON,PHI,Philadelphia Flyers,10.0,1.0,2017.0,20172018,63,10,73,N,Tippett,CAN,8480015,21,R,R,Owen Tippett,210
4,0,Portland,USA,1993-02-26,OR,COL,Colorado Avalanche,,,,20182019,8,0,72,N,MacDonald,USA,8479439,0,D,L,Jacob MacDonald,204


In [12]:
goalies = pd.concat(goalies, ignore_index = True)
goalies = goalies.drop_duplicates(subset = ["playerId"])
print(len(goalies))
goalies.head()

164


Unnamed: 0,birthCity,birthCountryCode,birthDate,birthStateProvinceCode,currentTeamAbbrev,draftOverall,draftRound,draftYear,firstSeasonForGameType,gamesPlayed,goalieFullName,height,isInHallOfFameYn,lastName,losses,nationalityCode,otLosses,playerId,shootsCatches,shutouts,ties,weight,wins
0,Sherwood Park,CAN,1998-08-13,AB,,48.0,2.0,2016.0,20182019,45,Carter Hart,74,N,Hart,24,CAN,7,8479394,L,1,,196,13
1,Rosenheim,DEU,1991-11-25,,SEA,112.0,4.0,2010.0,20122013,55,Philipp Grubauer,73,N,Grubauer,31,DEU,5,8475831,L,2,,188,18
2,Waterloo,USA,1994-10-19,IA,PHI,129.0,5.0,2013.0,20182019,37,Cal Petersen,74,N,Petersen,14,USA,2,8477361,R,3,,185,20
3,Fredericton,CAN,1990-08-07,NB,NJD,34.0,2.0,2008.0,20122013,35,Jake Allen,74,N,Allen,20,CAN,4,8474596,L,2,,197,9
4,Ruse,BGR,1996-02-10,,SJS,,,,20212022,2,Alexandar Georgiev,73,N,Georgiev,1,RUS,0,8480382,L,0,,178,0


In [13]:
schedule = pd.concat(schedule, ignore_index = True)
print(len(schedule))
schedule.head()

6028


Unnamed: 0,id,season,gameType,gameDate,neutralSite,startTimeUTC,easternUTCOffset,venueUTCOffset,venueTimezone,gameState,gameScheduleState,tvBroadcasts,threeMinRecap,condensedGame,gameCenterLink,venue.default,awayTeam.id,awayTeam.commonName.default,awayTeam.placeName.default,awayTeam.placeNameWithPreposition.default,awayTeam.placeNameWithPreposition.fr,awayTeam.abbrev,awayTeam.logo,awayTeam.darkLogo,awayTeam.awaySplitSquad,awayTeam.score,homeTeam.id,homeTeam.commonName.default,homeTeam.placeName.default,homeTeam.placeNameWithPreposition.default,homeTeam.placeNameWithPreposition.fr,homeTeam.abbrev,homeTeam.logo,homeTeam.darkLogo,homeTeam.homeSplitSquad,homeTeam.score,periodDescriptor.periodType,periodDescriptor.maxRegulationPeriods,gameOutcome.lastPeriodType,winningGoalie.playerId,winningGoalie.firstInitial.default,winningGoalie.lastName.default,winningGoalScorer.playerId,winningGoalScorer.firstInitial.default,winningGoalScorer.lastName.default,venue.fr,homeTeam.commonName.fr,awayTeam.commonName.fr,winningGoalie.lastName.cs,winningGoalie.lastName.sk,winningGoalScorer.lastName.cs,winningGoalScorer.lastName.fi,winningGoalScorer.lastName.sk,winningGoalie.lastName.fi,awayTeam.placeName.fr,venue.es,winningGoalie.lastName.de,winningGoalie.lastName.sv,homeTeam.placeName.fr,specialEvent.parentId,specialEvent.name.default,specialEvent.name.fr,threeMinRecapFr,winningGoalScorer.lastName.sv,seriesUrl,seriesStatus.round,seriesStatus.seriesAbbrev,seriesStatus.seriesTitle,seriesStatus.seriesLetter,seriesStatus.neededToWin,seriesStatus.topSeedWins,seriesStatus.bottomSeedWins,seriesStatus.gameNumberOfSeries,winningGoalScorer.lastName.de,winningGoalScorer.lastName.es,winningGoalie.lastName.es,venue.cs,venue.fi,venue.sk,winningGoalScorer.lastName.fr,ticketsLink,ticketsLinkFr,awayTeam.radioLink,homeTeam.radioLink,awayTeam.hotelLink,awayTeam.hotelDesc,homeTeam.hotelLink,homeTeam.hotelDesc,awayTeam.airlineLink,awayTeam.airlineDesc,homeTeam.airlineLink,homeTeam.airlineDesc,condensedGameFr,specialEvent.lightLogoUrl.default,alternateBroadcasts,specialEvent.lightLogoUrl.fr
0,2021010001,20212022,1,2021-09-25,False,2021-09-25T23:00:00Z,-04:00,-04:00,America/Toronto,FINAL,OK,"[{'id': 324, 'market': 'N', 'countryCode': 'US...",/video/recap-mtl-1-tor-4-326349014,/video/mtl-tor-326347426,/gamecenter/mtl-vs-tor/2021/09/25/2021010001,Scotiabank Arena,8,Canadiens,Montréal,Montréal,de Montréal,MTL,https://assets.nhle.com/logos/nhl/svg/MTL_ligh...,https://assets.nhle.com/logos/nhl/svg/MTL_dark...,False,1.0,10,Maple Leafs,Toronto,Toronto,de Toronto,TOR,https://assets.nhle.com/logos/nhl/svg/TOR_ligh...,https://assets.nhle.com/logos/nhl/svg/TOR_dark...,False,4.0,REG,3,REG,8474636.0,M.,Hutchinson,8475166.0,J.,Tavares,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2021010012,20212022,1,2021-09-27,False,2021-09-27T23:00:00Z,-04:00,-04:00,America/Montreal,FINAL,OK,"[{'id': 324, 'market': 'N', 'countryCode': 'US...",/video/recap-mtl-5-tor-2-326401358,/video/tor-mtl-326401422,/gamecenter/tor-vs-mtl/2021/09/27/2021010012,Centre Bell,10,Maple Leafs,Toronto,Toronto,de Toronto,TOR,https://assets.nhle.com/logos/nhl/svg/TOR_ligh...,https://assets.nhle.com/logos/nhl/svg/TOR_dark...,False,2.0,8,Canadiens,Montréal,Montréal,de Montréal,MTL,https://assets.nhle.com/logos/nhl/svg/MTL_ligh...,https://assets.nhle.com/logos/nhl/svg/MTL_dark...,False,5.0,REG,3,REG,8474596.0,J.,Allen,8479543.0,M.,Pezzetta,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2021010042,20212022,1,2021-10-01,False,2021-10-01T23:00:00Z,-04:00,-04:00,US/Eastern,FINAL,OK,"[{'id': 131, 'market': 'A', 'countryCode': 'CA...",/video/recap-ott-7-mtl-2-326511640,/video/mtl-ott-326511164,/gamecenter/mtl-vs-ott/2021/10/01/2021010042,Canadian Tire Centre,8,Canadiens,Montréal,Montréal,de Montréal,MTL,https://assets.nhle.com/logos/nhl/svg/MTL_ligh...,https://assets.nhle.com/logos/nhl/svg/MTL_dark...,False,2.0,9,Senators,Ottawa,Ottawa,d'Ottawa,OTT,https://assets.nhle.com/logos/nhl/svg/OTT_ligh...,https://assets.nhle.com/logos/nhl/svg/OTT_dark...,False,7.0,REG,3,REG,8476899.0,M.,Murray,8477426.0,N.,Paul,Centre Canadian Tire,Sénateurs,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2021010048,20212022,1,2021-10-02,False,2021-10-02T23:00:00Z,-04:00,-04:00,America/Montreal,FINAL,OK,"[{'id': 33, 'market': 'H', 'countryCode': 'CA'...",/video/recap-mtl-2-ott-1-326538178,/video/ott-mtl-326538554,/gamecenter/ott-vs-mtl/2021/10/02/2021010048,Centre Bell,9,Senators,Ottawa,Ottawa,d'Ottawa,OTT,https://assets.nhle.com/logos/nhl/svg/OTT_ligh...,https://assets.nhle.com/logos/nhl/svg/OTT_dark...,False,1.0,8,Canadiens,Montréal,Montréal,de Montréal,MTL,https://assets.nhle.com/logos/nhl/svg/MTL_ligh...,https://assets.nhle.com/logos/nhl/svg/MTL_dark...,False,2.0,REG,3,REG,8480051.0,C.,Primeau,8480018.0,N.,Suzuki,,,Sénateurs,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2021010067,20212022,1,2021-10-05,False,2021-10-05T23:00:00Z,-04:00,-04:00,America/Toronto,FINAL,OK,"[{'id': 131, 'market': 'A', 'countryCode': 'CA...",/video/recap-tor-6-mtl-2-326607746,/video/mtl-tor-326608736,/gamecenter/mtl-vs-tor/2021/10/05/2021010067,Scotiabank Arena,8,Canadiens,Montréal,Montréal,de Montréal,MTL,https://assets.nhle.com/logos/nhl/svg/MTL_ligh...,https://assets.nhle.com/logos/nhl/svg/MTL_dark...,False,2.0,10,Maple Leafs,Toronto,Toronto,de Toronto,TOR,https://assets.nhle.com/logos/nhl/svg/TOR_ligh...,https://assets.nhle.com/logos/nhl/svg/TOR_dark...,False,6.0,REG,3,REG,8475852.0,P.,Mrazek,8481624.0,I.,Mikheyev,,,,Mrázek,Mrázek,Michejev,Mihejev,Michejev,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
pbp = pd.concat(pbp, ignore_index = True)
print(len(pbp))
pbp.head()

1884099


Unnamed: 0,eventId,timeInPeriod,timeRemaining,situationCode,homeTeamDefendingSide,typeCode,typeDescKey,sortOrder,periodDescriptor.number,periodDescriptor.periodType,periodDescriptor.maxRegulationPeriods,details.eventOwnerTeamId,details.losingPlayerId,details.winningPlayerId,details.xCoord,details.yCoord,details.zoneCode,details.hittingPlayerId,details.hitteePlayerId,details.shotType,details.shootingPlayerId,details.goalieInNetId,details.awaySOG,details.homeSOG,details.reason,details.typeCode,details.descKey,details.duration,details.committedByPlayerId,details.drawnByPlayerId,details.blockingPlayerId,details.playerId,details.secondaryReason,details.scoringPlayerId,details.scoringPlayerTotal,details.assist1PlayerId,details.assist1PlayerTotal,details.assist2PlayerId,details.assist2PlayerTotal,details.awayScore,details.homeScore,details.discreteClip,details.servedByPlayerId,gameId,periodDescriptor.otPeriods,pptReplayUrl,details.highlightClipSharingUrl,details.highlightClipSharingUrlFr,details.highlightClip,details.highlightClipFr,details.discreteClipFr
0,51.0,00:00,20:00,1551,right,520.0,period-start,8.0,1.0,REG,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
1,53.0,00:00,20:00,1551,right,502.0,faceoff,9.0,1.0,REG,3.0,10.0,8480018.0,8475166.0,0.0,0.0,N,,,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
2,101.0,00:15,19:45,1551,right,503.0,hit,10.0,1.0,REG,3.0,10.0,,,-96.0,26.0,O,8475166.0,8481014.0,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
3,54.0,00:17,19:43,1551,right,506.0,shot-on-goal,11.0,1.0,REG,3.0,10.0,,,-49.0,7.0,O,,,snap,8480043.0,8480051.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
4,55.0,00:27,19:33,1551,right,507.0,missed-shot,12.0,1.0,REG,3.0,10.0,,,-31.0,39.0,O,,,wrist,8480043.0,8480051.0,,,wide-of-net,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,


In [15]:
shifts = pd.concat(shifts, ignore_index = True)
print(len(shifts))
shifts.head()

4166972


Unnamed: 0,id,detailCode,duration,endTime,eventDescription,eventDetails,eventNumber,firstName,gameId,hexValue,lastName,period,playerId,shiftNumber,startTime,teamAbbrev,teamId,teamName,typeCode
0,11054626,0.0,00:37,02:31,,,105.0,Jason,2021020003,#00205B,Spezza,1,8469455.0,1,01:54,TOR,10,Toronto Maple Leafs,517
1,11054627,0.0,00:46,05:30,,,116.0,Jason,2021020003,#00205B,Spezza,1,8469455.0,2,04:44,TOR,10,Toronto Maple Leafs,517
2,11054628,0.0,00:39,06:53,,,118.0,Jason,2021020003,#00205B,Spezza,1,8469455.0,3,06:14,TOR,10,Toronto Maple Leafs,517
3,11054629,0.0,01:31,09:44,,,126.0,Jason,2021020003,#00205B,Spezza,1,8469455.0,4,08:13,TOR,10,Toronto Maple Leafs,517
4,11054630,0.0,00:25,13:33,,,138.0,Jason,2021020003,#00205B,Spezza,1,8469455.0,5,13:08,TOR,10,Toronto Maple Leafs,517


In [16]:
skaters.to_csv('skaters.csv', index=False) 
goalies.to_csv('goalies.csv', index=False) 
schedule.to_csv('schedule.csv', index=False)
pbp.to_csv('pbp.csv', index=False) 
shifts.to_csv('shifts.csv', index=False) 

In [123]:
pd.read_html("https://www.nhl.com/scores/htmlreports/20242025/PL030234.HTM")[0]

HTTPError: HTTP Error 403: Forbidden

In [121]:
!pip install lxml

Collecting lxml
  Downloading lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.5 kB)
Downloading lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.4.0


In [122]:
import lxml

In [223]:
url = 'https://www.nhl.com/scores/htmlreports/20242025/PL030234.HTM'

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

dfs = pd.read_html(r.text)

Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0m

In [228]:
soup

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Play By Play</title>
</head>
<style type="text/css">
				@media screen
				{
				     .print-class { display: block;}
				}
				

				div.page
				{
					page-break-after: always;
					pabe-break-inside: avoid;
				}
				body {border:solid; border-width: 0;}
				p, td {font-family: arial,verdana; font-size: 9px;}
				.tablewidth{width:650px;}
				.heading {font-weight:bold;}
				.goal{font-weight: bold;font-size:11px;}
				.penalty{font-style: italic;font-size:11px;}
				.border {border:1px solid black;border-collapse: collapse;}
				.noborder {border:0px solid black;border-collapse: collapse;}
				.tborder{border-top:1px solid black;}
				.bborder{border-bottom:1px solid black;}
				.lborder{border-left:1px solid black;}
				.rborder{border-right:1px solid black;}
				.oddColor{background-color: #E7E7E7;}
				.evenColor{background-color: #FFFFFF;}
				.bold{font-weight:bold;}
				.italicize{font-

In [229]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.nhl.com/scores/htmlreports/20242025/PL030234.HTM'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(r.content, 'html.parser')

# Find all "page" segments that contain nested tables
pages = soup.find_all('div', class_='page')

# List to hold all dataframes
all_dfs = []

# Loop through each page, extract tables, and use pd.read_html
for page in pages:
    # Convert this section of HTML back to string
    html_str = str(page)
    
    # Use pd.read_html to extract all tables from this chunk
    try:
        dfs = pd.read_html(html_str)
        for df in dfs:
            # Only keep tables with event-like structure (usually >= 8 columns)
            if df.shape[1] >= 8:
                all_dfs.append(df)
    except ValueError:
        # No tables found in this chunk
        continue

# Combine all dataframes
if all_dfs:
    final_df = pd.concat(all_dfs, ignore_index=True)
else:
    final_df = pd.DataFrame()

# Optional: Rename columns if you want to label them
final_df.columns = [
    'PER', 'STR', 'TIME', 'EVENT', 'DETAILS', 'EV_TEAM',
    'ON_ICE_1', 'ON_ICE_2', 'ON_ICE_3', 'ON_ICE_4', 'ON_ICE_5', 'ON_ICE_6',
    'ON_ICE_1_opp', 'ON_ICE_2_opp', 'ON_ICE_3_opp', 'ON_ICE_4_opp',
    'ON_ICE_5_opp', 'ON_ICE_6_opp'
][:final_df.shape[1]]  # Trim header list to match actual columns

# Output
print(final_df.head())
print(f"Total events: {len(final_df)}")


Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.
[0mPassing literal html to 'read_html' is deprecated an

                                                 PER  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                  #   
3                                                  1   
4                                                  2   

                                                 STR  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                Per   
3                                                  1   
4                                                  1   

                                                TIME  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                Str   
3                                                NaN   
4                                             

In [232]:
final_df.head(100)

Unnamed: 0,PER,STR,TIME,EVENT,DETAILS,EV_TEAM,ON_ICE_1,ON_ICE_2,ON_ICE_3,ON_ICE_4,ON_ICE_5
0,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,,,
1,,,,,,,,,,,
2,#,Per,Str,Time: Elapsed Game,Event,Description,WPG On Ice,DAL On Ice,,,
3,1,1,,0:00 20:00,PGSTR,,,,,,
4,2,1,,0:00 20:00,PGEND,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,
96,#,Per,Str,Time: Elapsed Game,Event,Description,WPG On Ice,DAL On Ice,,,
97,33,1,,5:07 14:53,STOP,PUCK IN NETTING,22 C 36 C 73 L 4 D 54 D 37 G,53 C 96 R 21 L 6 D 28 D 29 G,,,
98,34,1,EV,5:07 14:53,FAC,DAL won Def. Zone - WPG #7 NAMESTNIKOV vs DAL ...,7 C 91 C 62 R 4 D 54 D 37 G,53 C 96 R 21 L 46 D 55 D 29 G,,,


In [224]:
dfs[0]

Unnamed: 0,0,1,2,3,4,5,6,7
0,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...,VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...
1,,,,,,,,
2,#,Per,Str,Time: Elapsed Game,Event,Description,WPG On Ice,DAL On Ice
3,1,1,,0:00 20:00,PGSTR,,,
4,2,1,,0:00 20:00,PGEND,,,
5,3,1,,0:00 20:00,ANTHEM,,,
6,4,1,,0:00 20:00,PSTR,Period Start- Local time: 7:12 CDT,22 C 36 C 73 L 2 D 44 D 37 G,53 C 96 R 21 L 4 D 55 D 29 G
7,5,1,EV,0:00 20:00,FAC,WPG won Neu. Zone - WPG #36 BARRON vs DAL #53 ...,22 C 36 C 73 L 2 D 44 D 37 G,53 C 96 R 21 L 4 D 55 D 29 G
8,6,1,EV,0:19 19:41,HIT,"WPG #36 BARRON HIT DAL #55 HARLEY, Neu. Zone",22 C 36 C 73 L 2 D 44 D 37 G,53 C 96 R 21 L 4 D 55 D 29 G
9,7,1,EV,0:22 19:38,GIVE,"WPG GIVEAWAY - #2 DEMELO, Neu. Zone",22 C 36 C 73 L 2 D 44 D 37 G,53 C 96 R 21 L 4 D 55 D 29 G


In [182]:
dfs[0].iloc[35][5]

'© Copyright 2025, National Hockey League  2025-05-14 10.51.54'

In [202]:
import requests
import pandas as pd

url_template = 'https://www.nhl.com/scores/htmlreports/20242025/PL030234.HTM?page={}'
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}

# Initialize an empty list to store dataframes
dfs_list = []

# Start with the first page
page_number = 1
while True:
    url = url_template.format(page_number)
    r = requests.get(url, headers=header)
    r.raise_for_status()  # Ensure we notice bad responses

    # Parse the HTML to extract tables
    dfs = pd.read_html(r.text)
    if not dfs:
        break  # No more tables, exit the loop

    # Append the dataframe to the list
    dfs_list.append(dfs[0])

    # Check if the current page contains fewer rows than expected (indicating it's the last page)
    if len(dfs[0]) < 50:  # Adjust this number based on the typical number of events per page
        break

    page_number += 1

# Concatenate all dataframes into a single dataframe
all_events_df = pd.concat(dfs_list, ignore_index=True)

# Display the first few rows to verify
print(all_events_df.head())


  dfs = pd.read_html(r.text)


                                                   0  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                  #   
3                                                  1   
4                                                  2   

                                                   1  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                Per   
3                                                  1   
4                                                  1   

                                                   2  \
0  VISITOR 1 WINNIPEG JETS Game 11 Away Game 5 Pl...   
1                                                NaN   
2                                                Str   
3                                                NaN   
4                                             

In [221]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Get HTML content
url = 'https://www.nhl.com/scores/htmlreports/20242025/PL030234.HTM'
headers = {"User-Agent": "Mozilla/5.0"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')

# Step 2: Find all play-by-play "pages"
pages = soup.find_all('div', class_='page')

# Step 3: Extract events from each table on each "page"
all_events = []

for page in pages:
    table = page.find('table')
    if not table:
        continue

    rows = table.find_all('tr')
    for row in rows:
        cols = [td.get_text(strip=True) for td in row.find_all(['td', 'th'])]
        # Only keep rows that look like event rows (usually 8 or more columns)
        if len(cols) >= 8:
            all_events.append(cols)

# Step 4: Convert to DataFrame
df = pd.DataFrame(all_events)

# Optional: Set likely column names
column_names = [
    'PER', 'STR', 'TIME', 'EVENT', 'DETAILS', 'EV_TEAM',
    'ON_ICE_1', 'ON_ICE_2', 'ON_ICE_3', 'ON_ICE_4', 'ON_ICE_5', 'ON_ICE_6',
    'ON_ICE_1_opp', 'ON_ICE_2_opp', 'ON_ICE_3_opp', 'ON_ICE_4_opp',
    'ON_ICE_5_opp', 'ON_ICE_6_opp'
]

df.columns = column_names[:len(df.columns)]

# Final check
print(df.head())
print(f"Total events found: {len(df)}")


ValueError: Length mismatch: Expected axis has 54 elements, new values have 18 elements

In [222]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
0,VISITOR1WINNIPEG JETSGame 11 Away Game 5Play B...,VISITOR1WINNIPEG JETSGame 11 Away Game 5,VISITOR,1,,1,,WINNIPEG JETSGame 11 Away Game 5,"Play By PlayTuesday, May 13, 2025Attendance 18...",Play By Play,,,"Tuesday, May 13, 2025","Attendance 18,532 at American Airlines Center",Start 7:12 CDT; End 9:43 CDT,Game 0234,Final,HOME3DALLAS STARSGame 11 Home Game 6,HOME,3,,3,,DALLAS STARSGame 11 Home Game 6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,VISITOR1WINNIPEG JETSGame 11 Away Game 5,VISITOR,1,,1,,WINNIPEG JETSGame 11 Away Game 5,"Play By PlayTuesday, May 13, 2025Attendance 18...",Play By Play,,,"Tuesday, May 13, 2025","Attendance 18,532 at American Airlines Center",Start 7:12 CDT; End 9:43 CDT,Game 0234,Final,HOME3DALLAS STARSGame 11 Home Game 6,HOME,3,,3,,DALLAS STARSGame 11 Home Game 6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,#,Per,Str,Time:ElapsedGame,Event,Description,WPG On Ice,DAL On Ice,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1,1,,0:0020:00,PGSTR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2,1,,0:0020:00,PGEND,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,24C,24,C,,64C,64,C,,96R,96,R,,46D,46,D,,55D,55,D,,29G,29,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1036,337,3,,20:000:00,PEND,Period End- Local time: 9:43 CDT,7C22C91C62R27L44D,7C,7,C,,22C,22,C,,91C,91,C,,62R,62,R,,27L,27,L,,44D,44,D,24C64C96R46D55D29G,24C,24,C,,64C,64,C,,96R,96,R,,46D,46,D,,55D,55,D,,29G,29,G
1037,7C,7,C,,22C,22,C,,91C,91,C,,62R,62,R,,27L,27,L,,44D,44,D,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1038,24C,24,C,,64C,64,C,,96R,96,R,,46D,46,D,,55D,55,D,,29G,29,G,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [219]:
pages

NameError: name 'pages' is not defined