# Mine Game Data

In [1]:
import json
import requests
import pprint
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.linear_model import LinearRegression



##### Get games for a date

In [2]:
def get_schedule_for_date(date):
    """Fetch the list of games scheduled on ``date``.

    Requests the schedule endpoint for ``date`` and scans the returned
    ``gameWeek`` entries for the one whose ``date`` field equals the argument.

    Args:
        date: Date string, compared verbatim against each entry's ``date``.

    Returns:
        The ``games`` list of the first matching entry, or an empty list when
        no entry matches.
    """
    base_url = 'https://api-web.nhle.com/v1'
    response = requests.get(url=f'{base_url}/schedule/{date}')
    payload = response.json()

    # Lazily walk the week so the scan stops at the first matching day.
    games_for_day = (day['games'] for day in payload['gameWeek'] if day['date'] == date)
    return next(games_for_day, [])

def get_game_data_for_schedule(schedule):
    """Flatten raw schedule entries into one summary dict per game.

    Args:
        schedule: Iterable of game dicts as returned by the schedule endpoint.

    Returns:
        A list with one dict per game holding ids, scores, current period,
        state, recap link (empty string when absent) and venue name.
    """
    def summarize(game):
        away = game['awayTeam']
        home = game['homeTeam']
        return {
            'gameId': game['id'],
            'dateTime': game['startTimeUTC'],
            'awayTeamId': away['id'],
            'homeTeamId': home['id'],
            'awayTeamScore': away['score'],
            'homeTeamScore': home['score'],
            'currentPeriod': game['periodDescriptor']['number'],
            'gameState': game['gameState'],
            'threeMinRecap': game.get('threeMinRecap', ''),
            'venue': game['venue']['default']
        }

    return [summarize(game) for game in schedule]

def extract_team_game_stats_to_dict(d):
    """Index the boxscore's team stat rows by category.

    Args:
        d: Boxscore payload containing ``d['summary']['teamGameStats']``.

    Returns:
        Dict mapping each row's ``category`` to ``{'away': ..., 'home': ...}``.
        If a category repeats, the last row wins.
    """
    return {
        stat['category']: {'away': stat['awayValue'], 'home': stat['homeValue']}
        for stat in d['summary']['teamGameStats']
    }

def _team_boxscore(gameId, d, team_game_stats, side):
    """Build the boxscore row for one side (``'away'`` or ``'home'``) of a game."""
    team = d[f'{side}Team']
    # 'powerPlay' is a "goals/opportunities" string, e.g. "1/3".
    power_play = team_game_stats['powerPlay'][side].split('/')
    return {
        'gameId': gameId,
        'isHome': side == 'home',
        'teamId': team['id'],
        'goals': team['score'],
        'pim': team_game_stats['pim'][side],
        'shots': team_game_stats['sog'][side],
        'powerPlayPercentage': team_game_stats['powerPlayPctg'][side] * 100,
        'powerPlayGoals': int(power_play[0]),
        'powerPlayOpportunities': int(power_play[1]),
        'faceOffWinPercentage': round(team_game_stats['faceoffWinningPctg'][side] * 100, 1),
        'blocked': team_game_stats['blockedShots'][side],
        'takeaways': team_game_stats['takeaways'][side],
        'giveaways': team_game_stats['giveaways'][side],
        'hits': team_game_stats['hits'][side],
    }


def get_boxscores(gameId):
    """Download a game's boxscore and return one row per team.

    Args:
        gameId: Game identifier used in the boxscore URL.

    Returns:
        A two-element list: the away team's row first, then the home team's.
        Each row carries goals, penalty minutes, shots, power-play and
        face-off percentages, blocks, takeaways, giveaways and hits.

    Raises:
        KeyError: If the payload lacks one of the expected stat categories.
    """
    r = requests.get(url=f'https://api-web.nhle.com/v1/gamecenter/{gameId}/boxscore')
    d = r.json()

    team_game_stats = extract_team_game_stats_to_dict(d)

    # Both sides share one layout; only the 'away'/'home' lookup key differs.
    return [_team_boxscore(gameId, d, team_game_stats, side) for side in ('away', 'home')]

##### Get game data

Play-by-play data for a single game, e.g., shots, faceoffs, penalties, and the players in the game.

In [3]:
def get_game_data(gameId):
    """Download the play-by-play payload for ``gameId`` and return the parsed JSON."""
    endpoint = f'https://api-web.nhle.com/v1/gamecenter/{gameId}/play-by-play'
    return requests.get(url=endpoint).json()

In [4]:
def get_all_shots(d):
    """Return every shot-like play (blocked, missed, on goal, goal) from ``d['plays']``.

    The plays are returned unmodified and in their original order.
    """
    shot_kinds = {'blocked-shot', 'missed-shot', 'shot-on-goal', 'goal'}
    return [play for play in d['plays'] if play['typeDescKey'] in shot_kinds]

def calc_gametime_second(period, time):
    """Convert a period number and an ``MM:SS`` clock string to game seconds.

    Each period before ``period`` contributes 1200 seconds.
    """
    minutes = int(time[:2])
    seconds = int(time[3:])
    elapsed_before_period = 1200 * (period - 1)
    return elapsed_before_period + 60 * minutes + seconds

def get_teamIds_in_shots(shots_raw):
    """Collect the distinct ``eventOwnerTeamId`` values found in the shot events.

    Events whose ``details`` lack ``eventOwnerTeamId`` are ignored.

    Returns:
        List of unique team ids (order follows set iteration).
    """
    unique_ids = {
        shot['details']['eventOwnerTeamId']
        for shot in shots_raw
        if 'eventOwnerTeamId' in shot['details']
    }
    return list(unique_ids)

def return_other_item_in_list(item, item_list):
    """Return the first element of ``item_list`` that differs from ``item``.

    Returns ``None`` when every element equals ``item`` or the list is empty.
    """
    return next((candidate for candidate in item_list if item != candidate), None)

def extract_shot_data(shots_raw, gameId):
    """Normalize raw shot events into flat records.

    Args:
        shots_raw: Shot-like play dicts (blocked, missed, on goal, goal).
        gameId: Game identifier; its first four digits become ``seasonId``.

    Returns:
        One dict per successfully parsed event. Events that raise while being
        parsed are reported with ``print`` and left out of the result.
    """
    # the two teams that appear as event owners among the shots
    teamIds = get_teamIds_in_shots(shots_raw)
    seasonId = int(str(gameId)[:4])

    shots_clean = []
    for event in shots_raw:
        try:
            kind = event['typeDescKey']
            details = event['details']
            owner_teamId = details['eventOwnerTeamId']
            other_teamId = return_other_item_in_list(owner_teamId, teamIds)

            # a blocked shot is owned by the defending team, any other shot by the shooter's
            shooting_teamId = other_teamId if kind == 'blocked-shot' else owner_teamId

            # goals name the shooter under a different key
            shooter_KEY = 'scoringPlayerId' if kind == 'goal' else 'shootingPlayerId'

            period = event['periodDescriptor']['number']
            clock = event['timeInPeriod']

            record = {
                'seasonId': seasonId,
                'gameId': gameId,
                'time': calc_gametime_second(period, clock),
                'period': period,
                'periodTime': clock,
                'x': details.get('xCoord', None),
                'y': details.get('yCoord', None),
                'shooterId': details.get(shooter_KEY, 0),
                'blockerId': details.get('blockingPlayerId', 0),
                'goalieId': details.get('goalieInNetId', 0),
                'teamId': shooting_teamId,
                'isGoal': kind == 'goal',
                'isBlocked': kind == 'blocked-shot',
                'type': kind,
                'shot_type': details.get('shotType', 'unknown')
            }
            shots_clean.append(record)
        except Exception as e:
            print(f"Could not parse {event}; Exception: {e}")
    return shots_clean

In [5]:
# gameId = 2023030144
# d = get_game_data(gameId)

In [6]:
# shots_raw = get_all_shots(d)
# shots_clean = extract_shot_data(shots_raw, gameId)

In [7]:
# pd.DataFrame(shots_clean)

In [8]:
def extract_faceoffs(d):
    """Extract one record per faceoff from a play-by-play payload.

    Coordinates are negated in even-numbered periods so that locations from
    all periods share one orientation.

    Args:
        d: Play-by-play payload with ``id`` and ``plays``.

    Returns:
        List of dicts with game id, winning/losing player ids, winning team
        id, normalized ``x``/``y`` and the game time in seconds.
    """
    gameId = d['id']
    faceoff_plays = [play for play in d['plays'] if play['typeDescKey'] == 'faceoff']

    faceoff_data = []
    for play in faceoff_plays:
        details = play['details']
        winner_id = details['winningPlayerId']
        loser_id = details['losingPlayerId']
        winning_team_id = details['eventOwnerTeamId']

        x = details['xCoord']
        y = details['yCoord']
        period = play['periodDescriptor']['number']
        if period % 2 == 0:  # e.g., 2nd period, 1st ot period, 3rd ot period
            x, y = -1 * x, -1 * y

        faceoff_data.append({
            'gameId': gameId,
            'playerWinId': winner_id,
            'playerLoseId': loser_id,
            'teamWinId': winning_team_id,
            'x': x,
            'y': y,
            'time': calc_gametime_second(play['periodDescriptor']['number'], play['timeInPeriod'])
        })
    return faceoff_data

In [19]:
# penalties = [play for play in d['plays'] if play['typeDescKey']=='penalty']

In [25]:
def extract_penalties(d):
    """Extract one record per penalty from a play-by-play payload.

    Coordinates are negated in even-numbered periods so that locations from
    all periods share one orientation.

    Args:
        d: Play-by-play payload with ``id`` and ``plays``.

    Returns:
        List of dicts with the game id, penalized team, committing and drawing
        player ids (0 when absent), normalized ``x``/``y``, duration (``pim``),
        severity code, penalty type, and ``time`` in elapsed game seconds.

    Raises:
        KeyError: If a penalty event lacks coordinates, duration, type code,
            description key or ``timeInPeriod``.
    """
    gameId = d['id']
    penalties = [play for play in d['plays'] if play['typeDescKey'] == 'penalty']

    penalty_data = []
    for pen in penalties:
        details = pen['details']
        period = pen['periodDescriptor']['number']

        # get penaltyOn/drewBy player and team
        penaltyOnId = details.get('committedByPlayerId', 0)
        penaltyDrewById = details.get('drawnByPlayerId', 0)
        penaltyOnTeamId = details['eventOwnerTeamId']

        # normalized penalty locations
        x = details['xCoord']
        y = details['yCoord']
        if period % 2 == 0:  # e.g., 2nd period, 1st ot period, 3rd ot period
            x = -1 * x
            y = -1 * y

        penalty_data.append({
            'gameId': gameId,
            'teamId': penaltyOnTeamId,
            'penaltyOnId': penaltyOnId,
            'penaltyDrewById': penaltyDrewById,
            'x': x,
            'y': y,
            'pim': details['duration'],
            'severity': details['typeCode'],
            'type': details['descKey'],
            # Use the elapsed clock ('timeInPeriod'), as shots and faceoffs do.
            # The previous 'timeRemaining' counts down, so it produced a
            # 'time' that did not line up with the other tables.
            # NOTE(review): rows already saved with the old value will not
            # de-duplicate against re-scraped ones -- confirm before re-running.
            'time': calc_gametime_second(period, pen['timeInPeriod'])
        })
    return penalty_data

##### Get ice time data
Functions to download and clean the shifts of individual players.

In [10]:
def extract_json_shifts(d):
    """Convert the shift-chart payload into flat shift records.

    Args:
        d: Shift-chart payload whose ``data`` entry lists individual shifts.

    Returns:
        List of dicts with ``gameId``, ``teamId``, ``playerId`` and the shift's
        ``start``/``end`` expressed in game seconds.
    """
    shift_records = []
    for raw in d['data']:
        period = raw['period']
        shift_records.append({
            'gameId': raw['gameId'],
            'teamId': raw['teamId'],
            'playerId': raw['playerId'],
            'start': calc_gametime_second(period, raw['startTime']),
            'end': calc_gametime_second(period, raw['endTime'])
        })
    return shift_records

def get_shifts_data(gameId):
    """Download the shift chart for ``gameId`` and return its flattened shift records."""
    shift_url = f'https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={gameId}'
    payload = requests.get(url=shift_url).json()
    return extract_json_shifts(payload)

def clean_raw_shifts(all_shifts):
    """Drop zero-length shifts and return the rest in a canonical order.

    Args:
        all_shifts: List of shift dicts with ``gameId``, ``teamId``,
            ``playerId``, ``start`` and ``end``.

    Returns:
        List of shift dicts sorted by game, team, start, end and player.
    """
    frame = pd.DataFrame(all_shifts)

    # shifts recorded with identical start/end times carry no ice time
    has_duration = frame['start'].ne(frame['end'])

    sort_keys = ['gameId', 'teamId', 'start', 'end', 'playerId']
    ordered = frame.loc[has_duration].sort_values(by=sort_keys)
    return ordered.to_dict(orient='records')

functions to group players who are on the ice together

In [11]:
def line_list_to_shift(line_list, start, end):
    """Summarize the players currently on the ice as one line shift.

    Args:
        line_list: Non-empty list of player shift dicts for one team.
        start: Start second of the interval.
        end: End second of the interval. When it is not after ``start`` (as
            happens where the data switches from one team to the next), the
            first player's own ``end`` is used instead.

    Returns:
        Dict with ``playerIds``, ``start``, ``end``, ``numPlayers`` and ``teamId``.
    """
    first_player = line_list[0]
    shift_end = first_player['end'] if end <= start else end
    player_ids = [entry['playerId'] for entry in line_list]
    return {
        'playerIds': player_ids,
        'start': start,
        'end': shift_end,
        'numPlayers': len(line_list),
        'teamId': first_player['teamId']
    }

def next_shift_end(line_list):
    """Return the earliest ``end`` second among the players in ``line_list``."""
    return min(entry['end'] for entry in line_list)

def group_shifts(shifts):
    """Merge individual player shifts into intervals with a fixed set of players.

    Walks the shift list in order, maintaining ``line_list`` (players on the
    ice) and emitting one line shift each time a player gets on or off.

    Args:
        shifts: Non-empty list of shift dicts providing ``start``, ``end``,
            ``playerId`` and ``teamId``. Presumably ordered as produced by
            ``clean_raw_shifts`` (by game, team, start, ...) -- the walk
            below depends on that order.

    Returns:
        List of line-shift dicts as built by ``line_list_to_shift``.
    """
    # get starting lines
    start_time = shifts[0]['start']
    line_list = []
    line_shifts = []
    i = 0  # index of the next shift not yet placed on the ice
    more_shifts = True

    while more_shifts and i < len(shifts): #and shifts[i]['teamId'] == 6:
        # add players starting shift
        while start_time == shifts[i]['start']:
            line_list.append(shifts[i])
            i += 1
            if i == len(shifts):
                i -= 1 # at the end of shifts data for this game
                more_shifts = False
                break

        # figure out the time when the next player is getting on/off the ice
        next_player_off = next_shift_end(line_list)
        next_player_on = shifts[i]['start']
        # When the next shift belongs to the following team its start is
        # earlier than start_time; line_list_to_shift then corrects the end.
        end_time = min([next_player_off, next_player_on])

        # new player getting on, save this line combination
        line_shifts.append(line_list_to_shift(line_list, start_time, end_time))

        # cut list to only contain players that remain on the ice after end_time
        line_list = [player for player in line_list if player['end'] > end_time]

        # if reached end of shifts for this game for this team, clear line_list
        # (assumes the next team's first shift starts at second 0 -- TODO confirm)
        if next_player_on == 0:
            line_list = []

        # update start_time with time that new player(s) are getting on the ice
        start_time = end_time
    
    return line_shifts

Functions to count the players on the ice each second, used later to compute the scenario (e.g., 5on5, 5on4, ...).

In [12]:
def _resolve_team_id(explicit, global_name):
    """Return ``explicit`` when given, else the module-level variable ``global_name``."""
    if explicit is not None:
        return explicit
    try:
        return globals()[global_name]
    except KeyError:
        raise NameError(f"name '{global_name}' is not defined") from None


def _frames_for_team(line_shifts, teamId, column):
    """Expand one team's line shifts into a DataFrame with one row per second."""
    frames = []
    last_ids = None
    last_time = None
    for shift in line_shifts:
        if shift['teamId'] != teamId:
            continue
        last_ids = shift['playerIds']
        for second in range(shift['start'], shift['end']):
            frames.append({column: last_ids, 'time': second})
            last_time = second

    # add last second (fixes overtime goals that don't get recorded)
    if last_time is not None:
        frames.append({column: last_ids, 'time': last_time + 1})

    # Explicit columns keep the merge key present even when there are no frames.
    return pd.DataFrame(frames, columns=['time', column])


def get_num_players_by_frame(line_shifts, awayTeamId=None, homeTeamId=None):
    """Build a per-second table of who is on the ice for each team.

    Args:
        line_shifts: Line-shift dicts with ``teamId``, ``playerIds``,
            ``start`` and ``end``.
        awayTeamId: Id of the away team. When omitted, the module-level
            ``awayTeamId`` is used, which keeps existing one-argument calls
            working.
        homeTeamId: Id of the home team; same fallback as ``awayTeamId``.

    Returns:
        DataFrame with ``time``, ``awayPlayerIds``, ``homePlayerIds``,
        ``awayNumPlayers`` and ``homeNumPlayers``, containing only the seconds
        for which both teams have a frame.

    Raises:
        NameError: If a team id is neither passed nor defined at module level.
    """
    awayTeamId = _resolve_team_id(awayTeamId, 'awayTeamId')
    homeTeamId = _resolve_team_id(homeTeamId, 'homeTeamId')

    away_frames = _frames_for_team(line_shifts, awayTeamId, 'awayPlayerIds')
    home_frames = _frames_for_team(line_shifts, homeTeamId, 'homePlayerIds')

    df = away_frames.merge(home_frames, on='time')
    df = df[['time', 'awayPlayerIds', 'homePlayerIds']]
    df['awayNumPlayers'] = df['awayPlayerIds'].str.len()
    df['homeNumPlayers'] = df['homePlayerIds'].str.len()
    return df

functions to serialize data

In [13]:
def serialize_ids(playerIds):
    """Join the ids, sorted ascending, into one underscore-separated string."""
    ordered = sorted(playerIds)
    return '_'.join(map(str, ordered))

def get_serialized_frames(df, d):
    """Turn per-second player lists into serialized line identifiers.

    Players are classified through ``d['rosterSpots']``: position code ``'D'``
    is a defenseman, ``'G'`` a goalie, and anything else a forward.

    Args:
        df: Per-second frame table with ``time``, ``awayPlayerIds``,
            ``homePlayerIds``, ``awayNumPlayers`` and ``homeNumPlayers``.
        d: Play-by-play payload providing ``rosterSpots`` and ``id``.

    Returns:
        DataFrame with one row per frame holding the serialized skater,
        forward, defense and goalie ids for both teams.
    """
    # bucket every rostered player by position
    by_position = {'D': set(), 'G': set()}
    forwards = set()
    for spot in d['rosterSpots']:
        by_position.get(spot['positionCode'], forwards).add(spot['playerId'])
    defensemen = by_position['D']
    goalies = by_position['G']

    gameId = d['id']

    def pick(playerIds, group):
        # serialized ids of the on-ice players that belong to `group`
        return serialize_ids([playerId for playerId in playerIds if playerId in group])

    serialized_shifts = []
    for frame in df.to_dict(orient='records'):
        away_ids = frame['awayPlayerIds']
        home_ids = frame['homePlayerIds']

        awayFwdIds = pick(away_ids, forwards)
        awayDefIds = pick(away_ids, defensemen)
        homeFwdIds = pick(home_ids, forwards)
        homeDefIds = pick(home_ids, defensemen)

        serialized_shifts.append({
            'gameId': gameId,
            'time': frame['time'],
            'awayNumPlayers': frame['awayNumPlayers'],
            'awaySkaterIds': awayFwdIds + '_' + awayDefIds,
            'awayFwdIds': awayFwdIds,
            'awayDefIds': awayDefIds,
            'awayGoalieIds': pick(away_ids, goalies),
            'homeSkaterIds': homeFwdIds + '_' + homeDefIds,
            'homeFwdIds': homeFwdIds,
            'homeDefIds': homeDefIds,
            'homeGoalieIds': pick(home_ids, goalies),
            'homeNumPlayers': frame['homeNumPlayers']
        })
    return pd.DataFrame(serialized_shifts)

functions to convert timeframes into shifts

In [14]:
def get_scenario_frames(df):
    """Label each frame with its strength scenario and return ``df``.

    Player counts include the goalie, so 6 versus 6 is ``'5on5'``. Count
    pairs not listed below are labelled ``'other'``. Any frame in which either
    team's serialized goalie id is shorter than 7 characters is relabelled
    ``'emptyNet'``. The ``scenario`` column is written onto ``df`` in place.
    """
    home_count = df['homeNumPlayers']
    away_count = df['awayNumPlayers']

    # (home players, away players) -> scenario label
    strength_table = [
        ((6, 6), '5on5'),
        ((6, 5), '5on4'),
        ((5, 6), '5on4'),
        ((5, 5), '4on4'),
        ((5, 4), '4on3'),
        ((4, 5), '4on3'),
        ((6, 4), '5on3'),
        ((4, 6), '5on3'),
        ((4, 4), '3on3'),
    ]
    conditions = [
        ((home_count == n_home) & (away_count == n_away)).to_numpy()
        for (n_home, n_away), _ in strength_table
    ]
    labels = [label for _, label in strength_table]
    df['scenario'] = np.select(conditions, labels, default='other')

    goalie_missing = (df['awayGoalieIds'].str.len() < 7) | (df['homeGoalieIds'].str.len() < 7)
    df['scenario'] = np.where(goalie_missing, 'emptyNet', df['scenario'])
    return df

def convert_frame_to_shifts(df, gameId, awayTeamId, homeTeamId):
    """Collapse per-frame line ids into contiguous line shifts.

    For each of the eight serialized-id columns (skater/fwd/def/goalie for
    away and home), consecutive frames with the same line id and the same
    scenario are merged into one shift.

    Args:
        df: Frame table containing the eight ``*Ids`` columns and ``scenario``.
        gameId: Game identifier copied into every shift.
        awayTeamId: Team id assigned to shifts from ``away*`` columns.
        homeTeamId: Team id assigned to shifts from ``home*`` columns.

    Returns:
        List of shift dicts with ``start``, ``end``, ``duration``, ``gameId``,
        ``teamId``, ``lineId``, ``lineType`` and ``scenario``. ``start`` and
        ``end`` are positions in the frame list, not values of ``time``.
    """
    timeframes = df.to_dict(orient='records')
    shifts = []
    cats = ['awaySkaterIds', 'awayFwdIds', 'awayDefIds', 'awayGoalieIds',
        'homeSkaterIds', 'homeFwdIds', 'homeDefIds', 'homeGoalieIds']

    for cat in cats:
        # the column name encodes both the team and the line type
        teamId = homeTeamId
        if 'away' in cat:
            teamId = awayTeamId

        lineType = 'skater'
        if 'Fwd' in cat:
            lineType = 'fwd'
        elif 'Def' in cat:
            lineType = 'def'
        elif 'Goalie' in cat:
            lineType = 'goalie'

        start = 0
        cur_line = timeframes[0][cat]
        cur_scenario = timeframes[0]['scenario']

        for i in range(1,len(timeframes)):
            # a shift ends when either the line or the scenario changes
            if timeframes[i][cat] != cur_line or timeframes[i]['scenario'] != cur_scenario:
                # empty line ids (nobody of this type on the ice) are not stored
                if cur_line != '':
                    shifts.append({
                        'start': start,
                        'end': i,
                        'duration': i-start,
                        'gameId': gameId,
                        'teamId': teamId,
                        'lineId': cur_line,
                        'lineType': lineType,
                        'scenario': cur_scenario
                    })

                start = i
                cur_line = timeframes[i][cat]
                cur_scenario = timeframes[i]['scenario']
        # Close the shift still open at the end of the frame list.
        # NOTE(review): `i` is the last loop index (len - 1) here and is unbound
        # when there is only one frame; this append also lacks the
        # `cur_line != ''` guard used above -- confirm both are intended.
        shifts.append({
            'start': start,
            'end': i,
            'duration': i-start,
            'gameId': gameId,
            'teamId': teamId,
            'lineId': cur_line,
            'lineType': lineType,
            'scenario': cur_scenario
        })
    return shifts

functions to get expected goals

In [15]:
def merge_and_get_expected_goals(df_shots, df_ice_frames):
    """Attach on-ice players to each shot and add an ``xgoals`` column.

    Shots are joined to the per-second ice frames on ``gameId`` and ``time``.
    A feature table is then derived per goalie (time since previous shot,
    running shot count, shot angle/distance and their change, shot-type
    dummies). The model load is currently commented out, so ``xgoals`` is
    set to 0 for every shot.

    Args:
        df_shots: Shot table with at least ``gameId``, ``time``, ``shooterId``,
            ``goalieId``, ``teamId``, ``x``, ``y``, ``type`` and ``shot_type``.
        df_ice_frames: Frame table with ``gameId`` and ``time``.

    Returns:
        The merged shot/frame table with ``xgoals`` added; all remaining
        missing values in the result are replaced with 0.
    """
    # merging
    df_shot_players = df_shots.merge(df_ice_frames, on=['gameId', 'time'])
    
    # copy shots dataframe
    df_pred = df_shot_players[['shooterId', 'goalieId', 'teamId', 'x', 'y', 'time', 'type', 'shot_type']]
    # drop shots without a goalie id (intended to filter empty net goals; this
    # removes every row whose goalieId is 0)
    df_pred = df_pred[df_pred['goalieId'] != 0]

    #### generate features ####

    # time difference between consecutive shots faced by the same goalie
    df_pred = df_pred.sort_values(by=['goalieId', 'time']) # sort by goalie id, then time
    # a goalie's first shot has no predecessor; its own time is used instead
    df_pred['time_diff'] = df_pred.groupby(['goalieId'])['time'].diff(1).fillna(df_pred['time'])

    # running count of shots faced by each goalie
    df_pred['tot_shots'] = 1
    df_pred['tot_shots'] = df_pred.groupby(['goalieId'])['tot_shots'].cumsum()

    # fix all shots to one side of ice (adjust x and y coordinates)
    df_pred.loc[df_pred['x'] < 0, 'y'] = df_pred['y'] * -1
    df_pred['x'] = df_pred['x'].abs()
    df_pred = df_pred[df_pred['x'].notna()]

    # calculate angle of shot compared to goal, in degrees
    # (x_goal is the goal line's x coordinate used by this model)
    x_goal = 89
    df_pred['shot_angle'] = np.where(df_pred['x'] != x_goal,
                                # when shot is from behind the net
                                np.where(df_pred['x'] > x_goal,
                                         np.where(df_pred['y'] >= 0,
                                                  round(90 + (90 - np.arctan(df_pred['y'] / (df_pred['x'] - x_goal)) * (180 / np.pi)), 2),
                                                  round(-90 - (90 + np.arctan(df_pred['y'] / (df_pred['x'] - x_goal)) * (180 / np.pi)), 2)
                                                 ),
                                         # when shot is in front of net
                                         round(np.arctan(df_pred['y'] / (x_goal - df_pred['x'])) * (180 / np.pi), 2)
                                        ),
                                # when shot is taken on the goal line
                                np.where(df_pred['y'] >= 0, 90, -90)
                               )

    # calculate difference in shot angle from the goalie's previous shot
    # (both branches are negated forms of each other, so this is |angle - prev|)
    df_pred['shot_angle_prev'] = df_pred.groupby(['goalieId'])['shot_angle'].shift(1).fillna(0)
    df_pred['shot_angle_diff'] = np.absolute(np.where(df_pred['shot_angle'] < df_pred['shot_angle_prev'],
                                                    df_pred['shot_angle'] - df_pred['shot_angle_prev'],
                                                    df_pred['shot_angle_prev'] - df_pred['shot_angle']
                                                )
                                       )


    # determine if goalie moved to his right since last shot to attempt to make save
    # (True when the angle increased relative to the previous shot)
    df_pred['goalie_move_right'] = np.where(df_pred['shot_angle'] > df_pred['shot_angle_prev'], True, False)


    # calculate shot distance to goal and its change since the previous shot
    df_pred['shot_dist'] = round(np.sqrt(np.square(df_pred['x'] - x_goal) + np.square(df_pred['y'])), 2)
    df_pred['shot_dist_prev'] = df_pred.groupby(['goalieId'])['shot_dist'].shift(1).fillna(0)
    df_pred['shot_dist_diff'] = df_pred['shot_dist'] - df_pred['shot_dist_prev']

    # rename vars to match the varnames expected by model
    df_pred.rename(columns={'x': 'x_loc', 'y': 'y_loc',
                           'time': 'time_seconds', 'time_diff': 'time_seconds_diff'}, inplace=True)

    # generate dummy vars for shot_type
    # NOTE(review): these labels are capitalized ('Wrist Shot'); confirm they
    # match the shotType values actually present in df_shots.
    df_pred['shot_type_Backhand'] = np.where(df_pred['shot_type']=='Backhand', 1, 0)
    df_pred['shot_type_Deflected'] = np.where(df_pred['shot_type']=='Deflected', 1, 0)
    df_pred['shot_type_Slap Shot'] = np.where(df_pred['shot_type']=='Slap Shot', 1, 0)
    df_pred['shot_type_Snap Shot'] = np.where(df_pred['shot_type']=='Snap Shot', 1, 0)
    df_pred['shot_type_Tip-In'] = np.where(df_pred['shot_type']=='Tip-In', 1, 0)
    df_pred['shot_type_Wrap-around'] = np.where(df_pred['shot_type']=='Wrap-around', 1, 0)
    df_pred['shot_type_Wrist Shot'] = np.where(df_pred['shot_type']=='Wrist Shot', 1, 0)

    # drop rows with missing values
    df_pred.dropna(axis=0, inplace=True) 

    #### delete columns that are unnecessary ####
    cols_to_drop = ['shooterId', 'goalieId', 'teamId', 'type', 
                    'shot_angle_prev', 'shot_dist_prev', 'shot_type']
    df_pred = df_pred.drop(cols_to_drop, axis=1)

    #### load model and generate predictions ####
    # model loading is disabled; xgoals is a constant 0 placeholder for now
#     with open('../models/expectedGoals_FINAL.sav', 'rb') as f:
#         loaded_model = pickle.load(f)
    df_pred['xgoals'] =  0 #loaded_model.predict(df_pred)
    
    # join predictions back by row index; shots dropped from df_pred get xgoals 0,
    # and fillna(0) also zeroes any other missing value in the merged table
    return df_shot_players.merge(df_pred[['xgoals']], left_index=True, right_index=True, how='left').fillna(0)

## Wrap everything together using functions above

In [16]:
def _merge_into_master(path, new_rows, subset=None):
    """Append ``new_rows`` to the JSON records file at ``path``, keeping the last duplicate."""
    # load
    with open(path, 'r') as f:
        master = json.load(f)

    # update: new rows go last so keep='last' prefers freshly scraped data
    df_master = pd.concat([pd.DataFrame(master), pd.DataFrame(new_rows)])
    df_master = df_master.drop_duplicates(subset=subset, keep='last')

    # write
    df_master.to_json(path, orient='records')


def update_master_files(all_games, all_shifts, all_shots, all_faceoffs, all_penalties, all_boxscores,
                        data_dir='../../backend/data/20232024'):
    """Merge newly scraped records into the season's master JSON files.

    Each master file is loaded, the new rows are appended, duplicates are
    removed (the newest row wins) and the file is written back.

    Args:
        all_games: Schedule rows; de-duplicated on all columns.
        all_shifts: Line-shift rows; de-duplicated on all columns.
        all_shots: Shot rows; unique per ``gameId`` and ``time``.
        all_faceoffs: Faceoff rows; unique per ``gameId`` and ``time``.
        all_penalties: Penalty rows; unique per ``gameId``, ``time``,
            ``penaltyOnId`` and ``type``.
        all_boxscores: Boxscore rows; unique per ``gameId`` and ``teamId``.
        data_dir: Directory holding the ``*Current.json`` files. Defaults to
            the 2023-2024 season directory that was previously hard-coded.

    Raises:
        FileNotFoundError: If a master file does not exist yet.
    """
    # (file name, new rows, columns identifying a duplicate or None for all)
    updates = [
        ('scheduleCurrent.json', all_games, None),
        ('shiftCurrent.json', all_shifts, None),
        ('shotCurrent.json', all_shots, ['gameId', 'time']),
        ('faceoffCurrent.json', all_faceoffs, ['gameId', 'time']),
        ('penaltyCurrent.json', all_penalties, ['gameId', 'time', 'penaltyOnId', 'type']),
        ('boxscoreCurrent.json', all_boxscores, ['gameId', 'teamId']),
    ]
    for filename, new_rows, subset in updates:
        _merge_into_master(f'{data_dir}/{filename}', new_rows, subset)
    
def overwrite_master_files(all_games, all_shifts, all_shots, all_faceoffs, all_penalties, all_boxscores,
                           data_dir='../../backend/data/20232024'):
    """Replace the season's master JSON files with the given records.

    Unlike ``update_master_files``, nothing is read back: whatever each file
    previously contained is discarded.

    Args:
        all_games: Rows written to ``scheduleCurrent.json``.
        all_shifts: Rows written to ``shiftCurrent.json``.
        all_shots: Rows written to ``shotCurrent.json``.
        all_faceoffs: Rows written to ``faceoffCurrent.json``.
        all_penalties: Rows written to ``penaltyCurrent.json``.
        all_boxscores: Rows written to ``boxscoreCurrent.json``.
        data_dir: Directory to write into. Defaults to the 2023-2024 season
            directory that was previously hard-coded.
    """
    outputs = [
        ('scheduleCurrent.json', all_games),
        ('shiftCurrent.json', all_shifts),
        ('shotCurrent.json', all_shots),
        ('faceoffCurrent.json', all_faceoffs),
        ('penaltyCurrent.json', all_penalties),
        ('boxscoreCurrent.json', all_boxscores),
    ]
    for filename, rows in outputs:
        pd.DataFrame(rows).to_json(f'{data_dir}/{filename}', orient='records')

In [26]:
# Accumulators for every record type collected across all scraped dates.
all_games = []
all_shifts = []
all_shots = []
all_faceoffs = []
all_penalties = []
all_boxscores = []

# Scrape each day in the range (end is exclusive: here the 22nd and 23rd).
# NOTE(review): the day is not zero-padded, so single-digit days would produce
# e.g. '2024-04-5' -- confirm the format the schedule endpoint expects.
for i in range(22,24):
    date = f'2024-04-{i}'
    print('Date: ' + date)
    
    # get schedule for this date
    schedule = get_schedule_for_date(date)
    games = get_game_data_for_schedule(schedule)
    all_games.extend(games)

    # download data
    for game in games:
        gameId = game['gameId']
        game_data = get_game_data(gameId)
        # These must stay module-level names: get_num_players_by_frame is
        # called below without team ids and relies on them being defined here.
        awayTeamId = game_data['awayTeam']['id']
        homeTeamId = game_data['homeTeam']['id']
        
        # get boxscores
        all_boxscores.extend(get_boxscores(gameId))
        
        # get faceoffs
        all_faceoffs.extend(extract_faceoffs(game_data))
        
        # get penalties
        all_penalties.extend(extract_penalties(game_data))

        # get shots
        df_shots = pd.DataFrame(extract_shot_data(get_all_shots(game_data), gameId))
        # the result of head() is discarded here (only displays as a cell's last expression)
        df_shots.head()

        # get timeframes and shifts (store shifts as table within db)
        line_shifts = group_shifts(clean_raw_shifts(get_shifts_data(gameId)))
        df_ice_frames = get_serialized_frames(get_num_players_by_frame(line_shifts), game_data)
        # get_scenario_frames adds the 'scenario' column to df_ice_frames in place
        shifts = convert_frame_to_shifts(get_scenario_frames(df_ice_frames), 
                                         gameId=gameId, 
                                         awayTeamId=awayTeamId, 
                                         homeTeamId=homeTeamId)

        # merge shots and timeframes data, generate xGoals
        df_shots_players = merge_and_get_expected_goals(df_shots, df_ice_frames)

        # store these data points in a list
        all_shifts.extend(shifts)
        all_shots.extend(df_shots_players.to_dict(orient='records'))
        print('scraped ' + str(gameId))
    print()

# save data: merge everything scraped above into the master JSON files
update_master_files(all_games, all_shifts, all_shots, all_faceoffs, all_penalties, all_boxscores)
print('successfully saved')

Date: 2024-04-22
scraped 2023030122
scraped 2023030142
scraped 2023030151
scraped 2023030181

Date: 2024-04-23
scraped 2023030132
scraped 2023030112
scraped 2023030162
scraped 2023030172

successfully saved
