# Functions to scrape NHL Games

In [1]:
import pandas as pd
import functools
import requests
# Sample live-feed endpoint for one game; not referenced by the functions
# visible in this notebook, kept as a quick reference for the URL layout.
example_game_url = "http://statsapi.web.nhl.com/api/v1/game/2017020100/feed/live"
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: show the value of every top-level expression in a
# cell rather than only the last one.
InteractiveShell.ast_node_interactivity = "all"

# function to pull the result of a single game. Collected from the NHL statsapi
def get_game_result(game_id):
    """
    Fetch the score summary of one game from the NHL stats API live feed.

    Arguments:
        game_id - Game ID for a particular game. Character. Example: "2017020001"
                  for the 2017/2018 season
    Returns:
        dict with the keys gameID, season, gameType, homeTeamId, homeTeamName,
        awayTeamId, awayTeamName, homeGoals, awayGoals, homeTeamWin and venue.
        If the request fails or the response lacks an expected field,
        "Game not found" is printed and None is returned.
    Example:
        import pandas as pd
        import requests

        pd.DataFrame([get_game_result("2017020001")])
    """
    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    game_link = f"/api/v1/game/{game_id}/feed/live"
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(base_api + game_link, timeout=30)
        response.raise_for_status()
        game = response.json()

        game_info = game['gameData']
        teams = game_info['teams']
        score = game['liveData']['linescore']['teams']
        home_goals = score['home']['goals']
        away_goals = score['away']['goals']

        data = {}
        data['gameID'] = game['gamePk']
        data['season'] = game_info['game']['season']
        data['gameType'] = game_info['game']['type']
        data['homeTeamId'] = teams['home']['id']
        data['homeTeamName'] = teams['home']['name']
        data['awayTeamId'] = teams['away']['id']
        data['awayTeamName'] = teams['away']['name']
        data['homeGoals'] = home_goals
        data['awayGoals'] = away_goals
        data['homeTeamWin'] = home_goals > away_goals
        data['venue'] = game_info['venue']['name']
        return data
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Only network/HTTP failures and malformed payloads are treated as
        # "not found"; KeyboardInterrupt and real bugs still propagate.
        print("Game not found")
        return None


def get_season_games(season):
    """
    Yield the score summary of every regular season game in a season.

    Arguments:
        season - Start of a particular season. Character. Example: "2017"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one per game that could be fetched, with the keys
        gameID, season, gameType, homeTeamId, homeTeamName, awayTeamId,
        awayTeamName, homeGoals, awayGoals, homeTeamWin and venue.
        Games that cannot be fetched or parsed are skipped silently.
    Raises:
        ValueError if season cannot be converted to an integer.
    Example:
        import pandas as pd
        import requests
        # takes a few minutes to run
        pd.DataFrame(get_season_games("2019"))
    """
    # Regular season game numbers run from 0001 to (teams * 82 / 2).
    # Over-estimating is harmless because unknown IDs are skipped below;
    # under-estimating silently drops games.
    season_start = int(season)
    if season_start >= 2021:
        last_game = 1312  # 32 teams
    elif season_start >= 2017:
        last_game = 1271  # 31 teams
    else:
        last_game = 1230  # 30 teams

    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    for game_number in range(1, last_game + 1):
        game_id = str(game_number).zfill(4)
        # "02" is the game-type segment this function always requests.
        game_link = f"/api/v1/game/{season}02{game_id}/feed/live/"
        try:
            # A timeout keeps one stalled request from hanging the whole scrape.
            response = requests.get(base_api + game_link, timeout=30)
            response.raise_for_status()
            game = response.json()

            game_info = game['gameData']
            teams = game_info['teams']
            score = game['liveData']['linescore']['teams']
            home_goals = score['home']['goals']
            away_goals = score['away']['goals']

            data = {}
            data['gameID'] = game['gamePk']
            data['season'] = game_info['game']['season']
            data['gameType'] = game_info['game']['type']
            data['homeTeamId'] = teams['home']['id']
            data['homeTeamName'] = teams['home']['name']
            data['awayTeamId'] = teams['away']['id']
            data['awayTeamName'] = teams['away']['name']
            data['homeGoals'] = home_goals
            data['awayGoals'] = away_goals
            data['homeTeamWin'] = home_goals > away_goals
            data['venue'] = game_info['venue']['name']
            yield data
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Named exceptions only: a bare except around a yield would also
            # swallow GeneratorExit and KeyboardInterrupt, making the scrape
            # impossible to stop or close cleanly.
            continue

# def get_game()

def get_game_officials(season):
    """
    Yield the officials listed in the boxscore of every regular season game.

    This function was previously defined under the name get_team_stats_season,
    which a later definition of the same name replaced, so it could never be
    called; it now carries the name its own example uses.

    Arguments:
        season - Start of a particular season. Character. Example: "2017"
                 for the 2017/2018 season
    Returns:
        Generator of dicts with the keys game, officialName and officialType,
        one per official per game. Games that cannot be fetched or parsed are
        skipped silently.
    Raises:
        ValueError if season cannot be converted to an integer.
    Example:
        import pandas as pd
        import requests
        # takes a few minutes to run
        pd.DataFrame(get_game_officials("2019"))
    """
    # Regular season game numbers run from 0001 to (teams * 82 / 2).
    # Over-estimating is harmless because unknown IDs are skipped below;
    # under-estimating silently drops games.
    season_start = int(season)
    if season_start >= 2021:
        last_game = 1312  # 32 teams
    elif season_start >= 2017:
        last_game = 1271  # 31 teams
    else:
        last_game = 1230  # 30 teams

    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    for game_number in range(1, last_game + 1):
        game_id = str(game_number).zfill(4)
        game_link = f"/api/v1/game/{season}02{game_id}/feed/live/"
        try:
            # A timeout keeps one stalled request from hanging the whole scrape.
            response = requests.get(base_api + game_link, timeout=30)
            response.raise_for_status()
            game = response.json()
            for official in game['liveData']['boxscore']['officials']:
                data = {}
                data['game'] = game['gamePk']
                data['officialName'] = official['official']['fullName']
                data['officialType'] = official['officialType']
                yield data
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Named exceptions only, so GeneratorExit and KeyboardInterrupt
            # raised at the yield are not swallowed.
            continue

def get_team_stats_season(season):
    """
    Yield team-level boxscore stats for every regular season game in a season.

    Arguments:
        season - Start of a particular season. Character. Example: "2017"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one row for home and one for away per game. Each
        row has game, homeAway, homeTeamWin, periodsPlayed, headCoach, teamId
        and teamName, followed by every entry of that team's teamSkaterStats.
        Games that cannot be fetched or parsed are skipped silently (rows
        already yielded for that game are kept).
    Raises:
        ValueError if season cannot be converted to an integer.
    Example:
        import pandas as pd
        import requests
        # takes a few minutes to run
        pd.DataFrame(get_team_stats_season("2019"))
    """
    # Regular season game numbers run from 0001 to (teams * 82 / 2).
    # Over-estimating is harmless because unknown IDs are skipped below;
    # under-estimating silently drops games.
    season_start = int(season)
    if season_start >= 2021:
        last_game = 1312  # 32 teams
    elif season_start >= 2017:
        last_game = 1271  # 31 teams
    else:
        last_game = 1230  # 30 teams

    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    for game_number in range(1, last_game + 1):
        game_id = str(game_number).zfill(4)
        game_link = f"/api/v1/game/{season}02{game_id}/feed/live/"
        try:
            # A timeout keeps one stalled request from hanging the whole scrape.
            response = requests.get(base_api + game_link, timeout=30)
            response.raise_for_status()
            game = response.json()

            linescore = game['liveData']['linescore']
            for team in ['home', 'away']:
                box_team = game['liveData']['boxscore']['teams'][team]
                data = {}
                data['game'] = game['gamePk']
                data['homeAway'] = team
                data['homeTeamWin'] = (
                    linescore['teams']['home']['goals']
                    > linescore['teams']['away']['goals']
                )
                data['periodsPlayed'] = linescore['currentPeriod']
                # The first listed coach is taken as the head coach.
                data['headCoach'] = box_team['coaches'][0]['person']['fullName']
                data['teamId'] = game['gameData']['teams'][team]['id']
                data['teamName'] = game['gameData']['teams'][team]['name']
                data.update(box_team['teamStats']['teamSkaterStats'])
                yield data
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Named exceptions only, so GeneratorExit and KeyboardInterrupt
            # raised at the yield are not swallowed.
            continue

def get_team_stats_game(game):
    """
    Yield team-level boxscore stats for a single game.

    Arguments:
        game - Game ID for a particular game. Character. Example: "2017020001"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one row for home and one for away. Each row has
        game, homeAway, homeTeamWin, periodsPlayed, headCoach, teamId and
        teamName, followed by every entry of that team's teamSkaterStats.
        If the request fails or the response lacks an expected field,
        "Game not found" is printed and the generator ends.
    Example:
        import pandas as pd
        import requests

        pd.DataFrame(get_team_stats_game("2017020001"))
    """
    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    game_link = f"/api/v1/game/{game}/feed/live/"
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(base_api + game_link, timeout=30)
        response.raise_for_status()
        feed = response.json()

        linescore = feed['liveData']['linescore']
        for team in ['home', 'away']:
            box_team = feed['liveData']['boxscore']['teams'][team]
            data = {}
            data['game'] = feed['gamePk']
            data['homeAway'] = team
            data['homeTeamWin'] = (
                linescore['teams']['home']['goals']
                > linescore['teams']['away']['goals']
            )
            data['periodsPlayed'] = linescore['currentPeriod']
            # The first listed coach is taken as the head coach.
            data['headCoach'] = box_team['coaches'][0]['person']['fullName']
            data['teamId'] = feed['gameData']['teams'][team]['id']
            data['teamName'] = feed['gameData']['teams'][team]['name']
            data.update(box_team['teamStats']['teamSkaterStats'])
            yield data
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Named exceptions only: a bare except here also caught GeneratorExit,
        # printing a bogus "Game not found" whenever the caller stopped early.
        print("Game not found")

def get_player_stats_game(game):
    """
    Yield per-skater boxscore stats for a single game.

    Arguments:
        game - Game ID for a particular game. Character. Example: "2017020001"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one per boxscore player whose position code is
        neither "N/A" nor "G". Each row has game, playerId, fullName,
        position, homeAway, teamId and teamName, plus every entry of the
        player's skaterStats when that is present as a dict.
        If the request fails or the response lacks an expected field,
        "Game not found" is printed and the generator ends.
    Example:
        import pandas as pd
        import requests

        pd.DataFrame(get_player_stats_game("2017020001"))
    """
    base_api = "https://statsapi.web.nhl.com"
    game_link = f"/api/v1/game/{game}/feed/live/"
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(base_api + game_link, timeout=30)
        response.raise_for_status()
        feed = response.json()
        for team in ['home', 'away']:
            roster = feed['liveData']['boxscore']['teams'][team]['players']
            for player in roster:
                info = roster[player]
                position = info['position']['code']
                if position in ("N/A", "G"):
                    continue
                data = {}
                data['game'] = feed['gamePk']
                data['playerId'] = info['person']['id']
                data['fullName'] = info['person']['fullName']
                data['position'] = position
                data['homeAway'] = team
                data['teamId'] = feed['gameData']['teams'][team]['id']
                data['teamName'] = feed['gameData']['teams'][team]['name']
                # Look the stats up with .get: indexing a player that has no
                # skaterStats entry raised KeyError and abandoned every
                # remaining player of the game, defeating the dict check.
                stats = info.get('stats')
                skater_stats = stats.get('skaterStats') if isinstance(stats, dict) else None
                if isinstance(skater_stats, dict):
                    data.update(skater_stats)
                yield data
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Named exceptions only, so GeneratorExit and KeyboardInterrupt raised
        # at the yield are not reported as a missing game.
        print("Game not found")

def get_player_stats_season(season):
    """
    Yield per-skater boxscore stats for every regular season game in a season.

    Arguments:
        season - Start of a particular season. Character. Example: "2017"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one per boxscore player per game whose position
        code is neither "N/A" nor "G". Each row has game, playerId, fullName,
        position, homeAway, teamId and teamName, plus every entry of the
        player's skaterStats when that is present as a dict.
        For each game that cannot be fetched or parsed, "Game not found" is
        printed and the scrape moves on to the next game.
    Raises:
        ValueError if season cannot be converted to an integer.
    Example:
        import pandas as pd
        import requests
        # takes a few minutes to run
        pd.DataFrame(get_player_stats_season("2019"))
    """
    # Regular season game numbers run from 0001 to (teams * 82 / 2).
    # Over-estimating only costs extra "Game not found" lines;
    # under-estimating silently drops games.
    season_start = int(season)
    if season_start >= 2021:
        last_game = 1312  # 32 teams
    elif season_start >= 2017:
        last_game = 1271  # 31 teams
    else:
        last_game = 1230  # 30 teams

    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    for game_number in range(1, last_game + 1):
        game_id = str(game_number).zfill(4)
        game_link = f"/api/v1/game/{season}02{game_id}/feed/live/"
        try:
            # A timeout keeps one stalled request from hanging the whole scrape.
            response = requests.get(base_api + game_link, timeout=30)
            response.raise_for_status()
            game = response.json()
            for team in ['home', 'away']:
                roster = game['liveData']['boxscore']['teams'][team]['players']
                for player in roster:
                    info = roster[player]
                    position = info['position']['code']
                    if position in ("N/A", "G"):
                        continue
                    data = {}
                    data['game'] = game['gamePk']
                    data['playerId'] = info['person']['id']
                    data['fullName'] = info['person']['fullName']
                    data['position'] = position
                    data['homeAway'] = team
                    data['teamId'] = game['gameData']['teams'][team]['id']
                    data['teamName'] = game['gameData']['teams'][team]['name']
                    # Look the stats up with .get: indexing a player that has
                    # no skaterStats entry raised KeyError and abandoned every
                    # remaining player of the game, defeating the dict check.
                    stats = info.get('stats')
                    skater_stats = stats.get('skaterStats') if isinstance(stats, dict) else None
                    if isinstance(skater_stats, dict):
                        data.update(skater_stats)
                    yield data
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Named exceptions only, so GeneratorExit and KeyboardInterrupt
            # raised at the yield are not swallowed.
            print("Game not found")

def get_goalie_stats_game(game):
    """
    Yield per-goalie boxscore stats for a single game.

    Arguments:
        game - Game ID for a particular game. Character. Example: "2017020001"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one per boxscore player whose position code is
        "G". Each row has game, playerId, fullName, position, homeAway,
        teamId and teamName, plus every entry of the player's goalieStats
        when that is present as a dict.
        If the request fails or the response lacks an expected field,
        "Game not found" is printed and the generator ends.
    Example:
        import pandas as pd
        import requests

        pd.DataFrame(get_goalie_stats_game("2017020001"))
    """
    base_api = "https://statsapi.web.nhl.com"
    game_link = f"/api/v1/game/{game}/feed/live/"
    try:
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(base_api + game_link, timeout=30)
        response.raise_for_status()
        feed = response.json()
        for team in ['home', 'away']:
            roster = feed['liveData']['boxscore']['teams'][team]['players']
            for player in roster:
                info = roster[player]
                position = info['position']['code']
                if position != "G":
                    continue
                data = {}
                data['game'] = feed['gamePk']
                data['playerId'] = info['person']['id']
                data['fullName'] = info['person']['fullName']
                data['position'] = position
                data['homeAway'] = team
                data['teamId'] = feed['gameData']['teams'][team]['id']
                data['teamName'] = feed['gameData']['teams'][team]['name']
                # Look the stats up with .get: indexing a goalie that has no
                # goalieStats entry raised KeyError and abandoned every
                # remaining player of the game, defeating the dict check.
                stats = info.get('stats')
                goalie_stats = stats.get('goalieStats') if isinstance(stats, dict) else None
                if isinstance(goalie_stats, dict):
                    data.update(goalie_stats)
                yield data
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Named exceptions only, so GeneratorExit and KeyboardInterrupt raised
        # at the yield are not reported as a missing game.
        print("Game not found")

def get_goalie_stats_season(season):
    """
    Yield per-goalie boxscore stats for every regular season game in a season.

    Arguments:
        season - Start of a particular season. Character. Example: "2017"
                 for the 2017/2018 season
    Returns:
        Generator of dicts, one per boxscore player per game whose position
        code is "G". Each row has game, playerId, fullName, position,
        homeAway, teamId and teamName, plus every entry of the player's
        goalieStats when that is present as a dict.
        For each game that cannot be fetched or parsed, "Game not found" is
        printed and the scrape moves on to the next game.
    Raises:
        ValueError if season cannot be converted to an integer.
    Example:
        import pandas as pd
        import requests
        # takes a few minutes to run
        pd.DataFrame(get_goalie_stats_season("2019"))
    """
    # Regular season game numbers run from 0001 to (teams * 82 / 2).
    # Over-estimating only costs extra "Game not found" lines;
    # under-estimating silently drops games.
    season_start = int(season)
    if season_start >= 2021:
        last_game = 1312  # 32 teams
    elif season_start >= 2017:
        last_game = 1271  # 31 teams
    else:
        last_game = 1230  # 30 teams

    # setting the base API url for use throughout
    base_api = "https://statsapi.web.nhl.com"
    for game_number in range(1, last_game + 1):
        game_id = str(game_number).zfill(4)
        game_link = f"/api/v1/game/{season}02{game_id}/feed/live/"
        try:
            # A timeout keeps one stalled request from hanging the whole scrape.
            response = requests.get(base_api + game_link, timeout=30)
            response.raise_for_status()
            game = response.json()
            for team in ['home', 'away']:
                roster = game['liveData']['boxscore']['teams'][team]['players']
                for player in roster:
                    info = roster[player]
                    position = info['position']['code']
                    if position != "G":
                        continue
                    data = {}
                    data['game'] = game['gamePk']
                    data['playerId'] = info['person']['id']
                    data['fullName'] = info['person']['fullName']
                    data['position'] = position
                    data['homeAway'] = team
                    data['teamId'] = game['gameData']['teams'][team]['id']
                    data['teamName'] = game['gameData']['teams'][team]['name']
                    # Look the stats up with .get: indexing a goalie that has
                    # no goalieStats entry raised KeyError and abandoned every
                    # remaining player of the game, defeating the dict check.
                    stats = info.get('stats')
                    goalie_stats = stats.get('goalieStats') if isinstance(stats, dict) else None
                    if isinstance(goalie_stats, dict):
                        data.update(goalie_stats)
                    yield data
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
            # Named exceptions only, so GeneratorExit and KeyboardInterrupt
            # raised at the yield are not swallowed.
            print("Game not found")

In [266]:
import time
# Wall-clock timing of a full-season skater scrape.
start = time.time()

# Exhaust the generator into a DataFrame: one row per yielded dict.
players_2017 = pd.DataFrame(get_player_stats_season("2017"))

end = time.time()
print("Scraping data took:", end - start, "seconds")


Scraping data took: 253.84007215499878 seconds


In [288]:
import time
# Wall-clock timing of a single-game goalie scrape, for comparison with the
# full-season run.
start = time.time()

# Exhaust the generator into a DataFrame: one row per yielded dict.
goalies_2010 = pd.DataFrame(get_goalie_stats_game("2010020100"))

end = time.time()
print("Scraping data took:", end - start, "seconds")


Scraping data took: 0.13594388961791992 seconds


In [2]:
# Download one game's live feed and keep the decoded JSON in `game` so the
# cells below can explore its structure interactively.
game_url = requests.get("https://statsapi.web.nhl.com/api/v1/game/2017020100/feed/live/")
# Fail loudly on an HTTP error status instead of decoding an error body.
game_url.raise_for_status()
game = game_url.json()


In [271]:
# helper to inspect how the JSON is nested, one key per printed line
def print_game_keys(game, level=0):
    """
    Print the keys of a nested dict as an indented outline.

    Arguments:
        game - dict to walk; values that are dicts are descended into,
               any other value only has its key printed
        level - number of spaces of padding for this depth; each nested
                dict is printed with 4 more
    Returns:
        None. The outline is written to standard output.
    """
    padding = " " * level
    nested_padding = " " * (level + 4)
    for name in game.keys():
        value = game[name]
        if not isinstance(value, dict):
            print(padding, name)
            continue
        # Nested dict: open a brace, recurse one level deeper, then close it.
        print(padding, name, " ", "{")
        print_game_keys(value, level + 4)
        print(nested_padding, "}")

# Outline the boxscore entry of one away-team player (looked up by its
# "ID<playerId>" key) to see which fields a player record carries.
print_game_keys(game['liveData']["boxscore"]['teams']['away']['players']['ID8475660'])

 person   {
     id
     fullName
     link
     shootsCatches
     rosterStatus
     }
 jerseyNumber
 position   {
     code
     name
     type
     abbreviation
     }
 stats   {
     goalieStats   {
         timeOnIce
         assists
         goals
         pim
         shots
         saves
         powerPlaySaves
         shortHandedSaves
         evenSaves
         shortHandedShotsAgainst
         evenShotsAgainst
         powerPlayShotsAgainst
         decision
         savePercentage
         powerPlaySavePercentage
         evenStrengthSavePercentage
         }
     }


In [4]:
# Look up the dateTime field stored under gameData.datetime for this game.
game['gameData']["datetime"]["dateTime"]

'2017-10-20T00:30:00Z'