# Mine Game Data

In [18]:
import json
import requests
import pprint
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.linear_model import LinearRegression

##### Get games for a date

In [2]:
def get_games_for_date(date):
    """Download the schedule for one date and return a summary of its games.

    Parameters
    ----------
    date : str
        Date interpolated verbatim into the schedule URL's ``date`` query
        parameter (the notebook passes e.g. ``'2022-05-10'``).

    Returns
    -------
    list of dict
        One dict per game with the keys ``gameId``, ``dateTime``,
        ``homeTeamId``, ``homeTeamScore``, ``awayTeamId`` and
        ``awayTeamScore``. Empty when the response lists no dates.

    Raises
    ------
    requests.HTTPError
        If the schedule endpoint answers with an error status.
    """
    r = requests.get(url=f'https://statsapi.web.nhl.com/api/v1/schedule?date={date}')
    # surface HTTP failures here instead of as an obscure KeyError further down
    r.raise_for_status()
    d = r.json()

    # guard against an empty 'dates' list (presumably a day without games),
    # which previously raised IndexError on d['dates'][0]
    dates = d.get('dates') or []
    if not dates:
        return []

    all_games = []
    for game in dates[0]['games']:
        home = game['teams']['home']
        away = game['teams']['away']
        all_games.append({
            'gameId': game['gamePk'],
            'dateTime': game['gameDate'],
            'homeTeamId': home['team']['id'],
            'homeTeamScore': home['score'],
            'awayTeamId': away['team']['id'],
            'awayTeamScore': away['score']
        })

    return all_games

##### Get game data

Download the live feed for a single game, which contains e.g. the shots and the players in the game.

In [3]:
def get_game_data(gameId):
    """Fetch the live feed of one game and return the decoded JSON payload."""
    feed_url = f'https://statsapi.web.nhl.com/api/v1/game/{gameId}/feed/live'
    response = requests.get(url=feed_url)
    return response.json()

def calc_gametime_second(period, time):
    """Convert a period number and a period clock into elapsed game seconds.

    Parameters
    ----------
    period : int
        1-based period number; every earlier period counts as 1200 seconds.
    time : str
        Clock within the period formatted as ``minutes:seconds``. The minutes
        part need not be zero-padded to two digits.

    Returns
    -------
    int
        Seconds elapsed since the start of the game.

    Raises
    ------
    ValueError
        If ``time`` is not two integer fields separated by a single colon.
    """
    # split on the separator rather than slicing fixed columns, so '5:07'
    # parses the same way as '05:07'
    minutes, seconds = time.split(':')
    time_in_seconds = int(minutes) * 60 + int(seconds)
    return (period - 1) * 1200 + time_in_seconds

def extract_shots(d):
    """Extract one record per shot attempt from a live-feed payload.

    Parameters
    ----------
    d : dict
        Decoded live feed as returned by ``get_game_data``. Reads
        ``d['liveData']['plays']['allPlays']`` and ``d['gameData']['teams']``.

    Returns
    -------
    list of dict
        One dict per 'Shot', 'Blocked Shot', 'Missed Shot' or 'Goal' event
        with the keys ``shooterId``, ``goalieId``, ``blockerId`` (0 when that
        role is absent), ``teamId``, ``x``, ``y`` (``None`` when the event has
        no coordinate), ``time`` (elapsed game seconds), ``type`` and
        ``shot_type`` ('none' when the event has no ``secondaryType``).
    """
    # built once instead of once per play
    shot_events = {'Shot', 'Blocked Shot', 'Missed Shot', 'Goal'}

    # get all shot events
    shots = [event for event in d['liveData']['plays']['allPlays']
             if event['result']['event'] in shot_events]

    # extract shots
    shot_data = []
    teamIds = [val['id'] for val in d['gameData']['teams'].values()]
    for shot in shots:
        shooter = 0
        goalie = 0
        blocker = 0
        for player in shot.get('players', []):
            if player['playerType'] == 'Shooter' or player['playerType'] == 'Scorer':
                shooter = player['player']['id']
            if player['playerType'] == 'Goalie':
                goalie = player['player']['id']
            if player['playerType'] == 'Blocker':
                blocker = player['player']['id']

        # get correct teamId...if blocked shot, flip teamId
        # (presumably the feed credits a blocked shot to the blocking team -- verify)
        teamId = shot['team']['id']
        if shot['result']['eventTypeId'] == 'BLOCKED_SHOT':
            if teamId == teamIds[0]:
                teamId = teamIds[1]
            else:
                teamId = teamIds[0]

        # normalized shot locations; an event without coordinates yields None
        # rather than a KeyError, and None is left untouched by the flip
        coordinates = shot.get('coordinates') or {}
        x = coordinates.get('x')
        y = coordinates.get('y')
        if shot['about']['period'] % 2 == 0: # e.g., 2nd period, 1st ot period, 3rd ot period
            if x is not None:
                x = -1*x
            if y is not None:
                y = -1*y

        # get shot type if shot was on net
        shot_type = 'none'
        if 'secondaryType' in shot['result']:
            shot_type = shot['result']['secondaryType']

        shot_data.append({
            'shooterId': shooter,
            'goalieId': goalie,
            'blockerId': blocker,
            'teamId': teamId,
            'x': x,
            'y': y,
            'time': calc_gametime_second(shot['about']['period'], shot['about']['periodTime']),
            'type': shot['result']['eventTypeId'],
            'shot_type': shot_type
        })

    return shot_data

##### Get ice time data
Functions to download and clean the individual shifts of each player.

In [4]:
def extract_json_shifts(d):
    """Flatten a shift-chart payload into one dict per shift.

    Reads every entry of ``d['data']`` and converts its period-relative
    ``startTime``/``endTime`` into elapsed game seconds.
    """
    flattened = []
    for entry in d['data']:
        period = entry['period']
        flattened.append({
            'gameId': entry['gameId'],
            'teamId': entry['teamId'],
            'playerId': entry['playerId'],
            'start': calc_gametime_second(period, entry['startTime']),
            'end': calc_gametime_second(period, entry['endTime'])
        })
    return flattened

def get_shifts_data(gameId):
    """Download the shift chart of one game and return its flattened shifts."""
    chart_url = f'https://api.nhle.com/stats/rest/en/shiftcharts?cayenneExp=gameId={gameId}'
    payload = requests.get(url=chart_url).json()
    return extract_json_shifts(payload)

def clean_raw_shifts(all_shifts):
    """Drop zero-length shifts and return the rest in a canonical order.

    Shifts whose ``start`` equals their ``end`` are removed; the remaining
    records are ordered by game, team, start, end and player.
    """
    ordering = ['gameId', 'teamId', 'start', 'end', 'playerId']
    cleaned = (
        pd.DataFrame(all_shifts)
        # some shifts are recorded with identical start/end times
        .loc[lambda frame: frame['start'] != frame['end']]
        .sort_values(ordering)
    )
    return cleaned.to_dict(orient='records')

functions to group players who are on the ice together

In [5]:
def line_list_to_shift(line_list, start, end):
    """Summarise the players currently on the ice as one grouped shift.

    When ``end`` does not lie after ``start`` (which happens when the data
    moves on from one team's shifts to the next team's), the end of the first
    player in ``line_list`` is used as the end instead.
    """
    first_player = line_list[0]
    shift_end = first_player['end'] if end <= start else end

    player_ids = []
    for player in line_list:
        player_ids.append(player['playerId'])

    return dict(
        playerIds=player_ids,
        start=start,
        end=shift_end,
        numPlayers=len(line_list),
        teamId=first_player['teamId'],
    )

def next_shift_end(line_list):
    """Return the earliest ``end`` among the players in ``line_list``."""
    return min(player['end'] for player in line_list)

def group_shifts(shifts):
    """Group individual player shifts into stretches with an unchanged on-ice group.

    Walks ``shifts`` with a single cursor ``i``. Players whose shift starts at
    the current ``start_time`` are added to ``line_list``; the group is then
    closed at the earlier of "first player leaves" and "next shift starts",
    stored via ``line_list_to_shift``, and the players whose shift has ended
    are removed before the next iteration.

    Parameters
    ----------
    shifts : list of dict
        Player shifts with at least ``start``, ``end``, ``playerId`` and
        ``teamId``. Must be non-empty (``shifts[0]`` is read unconditionally).
        The notebook passes the output of ``clean_raw_shifts``, which is
        sorted by game, team, start, end and player.

    Returns
    -------
    list of dict
        Grouped shifts as produced by ``line_list_to_shift``.
    """
    # get starting lines
    start_time = shifts[0]['start']
    line_list = []
    line_shifts = []
    i = 0

    # NOTE(review): the loop stops once the cursor reaches the last element --
    # confirm that the final shift is handled as intended.
    while i < len(shifts)-1: #and shifts[i]['teamId'] == 6:
        # add players starting shift
        while start_time == shifts[i]['start']:
            line_list.append(shifts[i])
            i += 1
            if i == len(shifts):
                i -= 1 # at the end of shifts data for this game
                break

        # figure out the time when the next player is getting on/off the ice
        next_player_off = next_shift_end(line_list)
        next_player_on = shifts[i]['start']
        end_time = min([next_player_off, next_player_on])

        # new player getting on, save this line combination
        # (line_list_to_shift replaces end_time when it is not after start_time)
        line_shifts.append(line_list_to_shift(line_list, start_time, end_time))

        # cut list to only contain players that remain on the ice after end_time
        line_list = [player for player in line_list if player['end'] > end_time]

        # if reached end of shifts for this game for this team, clear line_list
        # (a next shift starting at second 0 is used as the marker for this;
        # presumably that is the first shift of the following team -- verify)
        if next_player_on == 0:
            line_list = []

        # update start_time with time that new player(s) are getting on the ice
        start_time = end_time
        
    return line_shifts

functions to compute scenario (e.g., 5on5, 5on4, ...)

In [6]:
def _resolve_team_id(explicit, global_name):
    """Return ``explicit`` unless it is None, else the notebook global ``global_name``."""
    if explicit is not None:
        return explicit
    try:
        return globals()[global_name]
    except KeyError:
        raise NameError(
            f"name '{global_name}' is not defined; pass it explicitly") from None


def _expand_team_frames(line_shifts, teamId, column):
    """Expand one team's grouped shifts into one row per second on the ice."""
    frames = [
        {column: s['playerIds'], 'time': second}
        for s in line_shifts if s['teamId'] == teamId
        for second in range(s['start'], s['end'])
    ]
    # naming the columns keeps 'time' present even when the team has no frames,
    # so the merge below does not fail on a missing key
    return pd.DataFrame(frames, columns=[column, 'time'])


def get_num_players_by_frame(line_shifts, awayTeamId=None, homeTeamId=None):
    """Build a per-second table of the players each team has on the ice.

    Parameters
    ----------
    line_shifts : list of dict
        Grouped shifts with the keys ``teamId``, ``playerIds``, ``start`` and
        ``end`` (as produced by ``group_shifts``). Each shift covers the
        seconds ``start`` up to, but not including, ``end``.
    awayTeamId, homeTeamId : optional
        Team ids used to split ``line_shifts``. When left as None the notebook
        globals of the same name are used, which is what the function relied
        on before these parameters existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``awayPlayerIds``, ``homePlayerIds``,
        ``awayNumPlayers`` and ``homeNumPlayers``; one row per second for
        which both teams have a frame (inner join on ``time``).

    Raises
    ------
    NameError
        If a team id is neither passed nor defined as a global.
    """
    awayTeamId = _resolve_team_id(awayTeamId, 'awayTeamId')
    homeTeamId = _resolve_team_id(homeTeamId, 'homeTeamId')

    away_frames = _expand_team_frames(line_shifts, awayTeamId, 'awayPlayerIds')
    home_frames = _expand_team_frames(line_shifts, homeTeamId, 'homePlayerIds')

    df = away_frames.merge(home_frames, on='time')
    df = df[['time', 'awayPlayerIds', 'homePlayerIds']]
    # .str.len() yields the length of each player-id list
    df['awayNumPlayers'] = df['awayPlayerIds'].str.len()
    df['homeNumPlayers'] = df['homePlayerIds'].str.len()
    return df

functions to serialize data

In [7]:
def serialize_ids(playerIds):
    """Join the ids in ascending order into one underscore-separated string."""
    ordered = sorted(playerIds)
    return '_'.join(map(str, ordered))

def get_serialized_frames(df, d):
    """Split each frame's on-ice players by position and serialize the groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-second frames with ``time``, ``awayPlayerIds``, ``homePlayerIds``,
        ``awayNumPlayers`` and ``homeNumPlayers``.
    d : dict
        Live-feed payload; supplies ``gamePk`` and the primary position of
        every player under ``gameData.players``.

    Returns
    -------
    pandas.DataFrame
        One row per frame with the game id, the time, both player counts and
        six id strings (forwards, defensemen and goalies for each team).
    """
    # ids of the forwards / defensemen / goalies that appear in this game
    ids_by_position = {'Forward': set(), 'Defenseman': set(), 'Goalie': set()}
    for player in d['gameData']['players'].values():
        position = player['primaryPosition']['type']
        if position in ids_by_position:
            ids_by_position[position].add(player['id'])

    gameId = d['gamePk']

    def pick(player_ids, position):
        # keep only the on-ice players of the given position, then serialize
        pool = ids_by_position[position]
        return serialize_ids([pid for pid in player_ids if pid in pool])

    rows = []
    for frame in df.to_dict(orient='records'):
        away_ids = frame['awayPlayerIds']
        home_ids = frame['homePlayerIds']
        rows.append({
            'gameId': gameId,
            'time': frame['time'],
            'awayNumPlayers': frame['awayNumPlayers'],
            'awayFwdIds': pick(away_ids, 'Forward'),
            'awayDefIds': pick(away_ids, 'Defenseman'),
            'awayGoalieIds': pick(away_ids, 'Goalie'),
            'homeFwdIds': pick(home_ids, 'Forward'),
            'homeDefIds': pick(home_ids, 'Defenseman'),
            'homeGoalieIds': pick(home_ids, 'Goalie'),
            'homeNumPlayers': frame['homeNumPlayers']
        })
    return pd.DataFrame(rows)

functions to convert timeframes into shifts

In [27]:
def get_scenario_frames(df):
    """Label every frame with its strength scenario and return the frame table.

    The label is derived from the two player counts (which include the
    goalie, so 6 vs 6 is '5on5'); every combination not listed below falls
    back to '3on3'. Frames in which either goalie id string is shorter than
    7 characters are relabelled 'emptyNet'.

    The ``scenario`` column is written onto ``df`` itself, and that same
    object is returned.
    """
    home = df['homeNumPlayers']
    away = df['awayNumPlayers']

    # (home count, away count, label); first match wins
    strength_table = [
        (6, 6, '5on5'),
        (6, 5, '5on4'),
        (5, 6, '5on4'),
        (5, 5, '4on4'),
        (5, 4, '4on3'),
        (4, 5, '4on3'),
        (6, 4, '5on3'),
        (4, 6, '5on3'),
    ]
    conditions = [(home == n_home) & (away == n_away)
                  for n_home, n_away, _ in strength_table]
    labels = [label for _, _, label in strength_table]
    df['scenario'] = np.select(conditions, labels, default='3on3')

    goalie_missing = (df['awayGoalieIds'].str.len() < 7) | (df['homeGoalieIds'].str.len() < 7)
    df['scenario'] = np.where(goalie_missing, 'emptyNet', df['scenario'])
    return df

def convert_frame_to_shifts(df, gameId, awayTeamId, homeTeamId):
    """Collapse per-second frames into line shifts for every team and line type.

    For each of the six id columns (forwards, defensemen and goalies of the
    away and home team) consecutive frames with the same id string and the
    same scenario are merged into one shift.

    Parameters
    ----------
    df : pandas.DataFrame
        Frames with the six ``*Ids`` columns and a ``scenario`` column.
    gameId, awayTeamId, homeTeamId
        Copied into the emitted shift records.

    Returns
    -------
    list of dict
        Shifts with ``start``, ``end``, ``duration``, ``gameId``, ``teamId``,
        ``lineId``, ``lineType`` ('fwd', 'def' or 'goalie') and ``scenario``.
        ``start`` and ``end`` are row positions within ``df`` (``end`` is
        exclusive), so ``duration`` is the number of frames in the shift.
        Stretches with an empty line id are not emitted. Empty input gives an
        empty list.
    """
    timeframes = df.to_dict(orient='records')
    if not timeframes:
        return []

    num_frames = len(timeframes)
    shifts = []
    cats = ['awayFwdIds', 'awayDefIds', 'awayGoalieIds',
           'homeFwdIds', 'homeDefIds', 'homeGoalieIds']

    for cat in cats:
        teamId = awayTeamId if 'away' in cat else homeTeamId
        lineType = 'fwd'
        if 'Def' in cat:
            lineType = 'def'
        elif 'Goalie' in cat:
            lineType = 'goalie'

        start = 0
        cur_line = timeframes[0][cat]
        cur_scenario = timeframes[0]['scenario']

        # i == num_frames is a virtual frame one past the end: it closes the
        # trailing shift through the same path as every other shift, so its
        # end is exclusive and an empty trailing line is skipped as well
        for i in range(1, num_frames + 1):
            at_end = i == num_frames
            if not at_end \
                    and timeframes[i][cat] == cur_line \
                    and timeframes[i]['scenario'] == cur_scenario:
                continue

            if cur_line != '':
                shifts.append({
                    'start': start,
                    'end': i,
                    'duration': i-start,
                    'gameId': gameId,
                    'teamId': teamId,
                    'lineId': cur_line,
                    'lineType': lineType,
                    'scenario': cur_scenario
                })

            if not at_end:
                start = i
                cur_line = timeframes[i][cat]
                cur_scenario = timeframes[i]['scenario']
    return shifts

functions to get expected goals

In [21]:
def merge_and_get_expected_goals(df_shots, df_ice_frames,
                                 model_path='../models/expectedGoals_FINAL.sav'):
    """Attach on-ice information and an expected-goals value to every shot.

    Parameters
    ----------
    df_shots : pandas.DataFrame
        Shots with at least ``shooterId``, ``goalieId``, ``teamId``, ``x``,
        ``y``, ``time``, ``type`` and ``shot_type``.
    df_ice_frames : pandas.DataFrame
        Per-second frames; joined to the shots on ``time`` (inner join).
    model_path : str, optional
        Pickled model exposing ``predict``. Defaults to the path the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The merged shot/frame rows plus an ``xgoals`` column. Shots without a
        goalie (``goalieId == 0``) and shots dropped during feature
        generation get ``xgoals`` 0. Every remaining missing value in the
        frame is also filled with 0.
    """
    # merging
    df_shot_players = df_shots.merge(df_ice_frames, on='time')

    def without_predictions():
        # same shape as the regular result when no row can be scored
        return df_shot_players.assign(xgoals=0.0).fillna(0)

    # copy shots dataframe; the explicit copy makes the assignments below act
    # on an independent frame instead of a slice of df_shot_players
    df_pred = df_shot_players[['shooterId', 'goalieId', 'teamId', 'x', 'y', 'time', 'type', 'shot_type']]
    df_pred = df_pred[df_pred['goalieId'] != 0].copy() # filter empty net goals
    if df_pred.empty:
        return without_predictions()

    #### generate features ####

    # time difference between consecutive shots faced by the same goalie;
    # a goalie's first shot uses the elapsed game time instead
    df_pred = df_pred.sort_values(by=['goalieId', 'time'])
    df_pred['time_diff'] = df_pred.groupby(['goalieId'])['time'].diff(1).fillna(df_pred['time'])

    # running count of shots per goalie
    df_pred['tot_shots'] = 1
    df_pred['tot_shots'] = df_pred.groupby(['goalieId'])['tot_shots'].cumsum()

    # fix all shots to one side of ice (adjust x and y coordinates)
    df_pred.loc[df_pred['x'] < 0, 'y'] = df_pred['y'] * -1
    df_pred['x'] = df_pred['x'].abs()
    df_pred = df_pred[df_pred['x'].notna()].copy()

    # calculate angle of shot compared to goal
    x_goal = 89
    df_pred['shot_angle'] = np.where(df_pred['x'] != x_goal,
                                # when shot is from behind the net
                                np.where(df_pred['x'] > x_goal,
                                         np.where(df_pred['y'] >= 0,
                                                  round(90 + (90 - np.arctan(df_pred['y'] / (df_pred['x'] - x_goal)) * (180 / np.pi)), 2),
                                                  round(-90 - (90 + np.arctan(df_pred['y'] / (df_pred['x'] - x_goal)) * (180 / np.pi)), 2)
                                                 ),
                                         # when shot is in front of net
                                         round(np.arctan(df_pred['y'] / (x_goal - df_pred['x'])) * (180 / np.pi), 2)
                                        ),
                                # when shot is taken on the goal line
                                np.where(df_pred['y'] >= 0, 90, -90)
                               )

    # absolute change in shot angle since the goalie's previous shot
    df_pred['shot_angle_prev'] = df_pred.groupby(['goalieId'])['shot_angle'].shift(1).fillna(0)
    df_pred['shot_angle_diff'] = (df_pred['shot_angle'] - df_pred['shot_angle_prev']).abs()

    # determine if goalie moved to his right since last shot to attempt to make save
    df_pred['goalie_move_right'] = df_pred['shot_angle'] > df_pred['shot_angle_prev']

    # calculate shot distance to goal
    df_pred['shot_dist'] = round(np.sqrt(np.square(df_pred['x'] - x_goal) + np.square(df_pred['y'])), 2)
    df_pred['shot_dist_prev'] = df_pred.groupby(['goalieId'])['shot_dist'].shift(1).fillna(0)
    df_pred['shot_dist_diff'] = df_pred['shot_dist'] - df_pred['shot_dist_prev']

    # rename vars to match the varnames expected by model
    df_pred = df_pred.rename(columns={'x': 'x_loc', 'y': 'y_loc',
                                      'time': 'time_seconds', 'time_diff': 'time_seconds_diff'})

    # generate dummy vars for shot_type (column order is kept as before)
    for shot_type in ['Backhand', 'Deflected', 'Slap Shot', 'Snap Shot',
                      'Tip-In', 'Wrap-around', 'Wrist Shot']:
        df_pred[f'shot_type_{shot_type}'] = np.where(df_pred['shot_type'] == shot_type, 1, 0)

    # drop rows with missing values
    df_pred = df_pred.dropna(axis=0)

    #### delete columns that are unnecessary ####
    cols_to_drop = ['shooterId', 'goalieId', 'teamId', 'type',
                    'shot_angle_prev', 'shot_dist_prev', 'shot_type']
    df_pred = df_pred.drop(cols_to_drop, axis=1)
    if df_pred.empty:
        return without_predictions()

    #### load model and generate predictions ####
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point model_path at a trusted artifact.
    with open(model_path, 'rb') as f:
        loaded_model = pickle.load(f)
    df_pred['xgoals'] = loaded_model.predict(df_pred)

    return df_shot_players.merge(df_pred[['xgoals']], left_index=True, right_index=True, how='left').fillna(0)

## Wrap everything together using functions above

In [164]:
# Scrape every game played on `date`: shots, line shifts and expected goals.
date = '2022-05-10'
games = get_games_for_date(date)
all_shifts, all_shots = [], []

for game in games:
    gameId = game['gameId']
    game_data = get_game_data(gameId)

    # get_num_players_by_frame reads these two names from the notebook namespace
    awayTeamId, homeTeamId = (game_data['gameData']['teams'][side]['id']
                              for side in ('away', 'home'))

    # shots taken in this game
    df_shots = pd.DataFrame(extract_shots(game_data))

    # per-second on-ice frames and the line shifts derived from them
    line_shifts = group_shifts(clean_raw_shifts(get_shifts_data(gameId)))
    df_ice_frames = get_serialized_frames(get_num_players_by_frame(line_shifts), game_data)
    # get_scenario_frames adds the 'scenario' column to df_ice_frames itself,
    # so the merge below sees it as well
    shifts = convert_frame_to_shifts(
        get_scenario_frames(df_ice_frames),
        gameId=gameId,
        awayTeamId=awayTeamId,
        homeTeamId=homeTeamId,
    )

    # shots joined with the on-ice frames, plus xGoals
    df_shots_players = merge_and_get_expected_goals(df_shots, df_ice_frames)

    # accumulate the results of all games
    all_shifts.extend(shifts)
    all_shots.extend(df_shots_players.to_dict(orient='records'))
    print(f'scraped {gameId}')

scraped 2021030135
scraped 2021030125
scraped 2021030165
scraped 2021030185


In [165]:
# df_shots_players[df_shots_players['scenario']=='5on3'].sort_values('xgoals', ascending=False).head(40)

## Update master files

In [166]:
def _merge_into_master(path, new_records, subset=None):
    """Append ``new_records`` to the JSON records file at ``path`` and rewrite it.

    Rows are de-duplicated after appending, keeping the last occurrence;
    ``subset`` restricts the comparison to those columns (all columns when None).
    """
    with open(path, 'r') as f:
        master = json.load(f)

    df_master = pd.concat([pd.DataFrame(master), pd.DataFrame(new_records)])
    df_master = df_master.drop_duplicates(subset=subset, keep='last')
    df_master.to_json(path, orient='records')


def update_master_files(games, all_shifts, all_shots, data_dir='../../backend/data'):
    """Merge newly scraped games, shifts and shots into the master JSON files.

    Parameters
    ----------
    games : list of dict
        Schedule records, merged into ``scheduleCurrent.json``.
    all_shifts : list of dict
        Line shifts, merged into ``shiftCurrent.json``.
    all_shots : list of dict
        Shots, merged into ``shotCurrent.json``. Shots are de-duplicated on
        ``gameId`` and ``time`` only, so a re-scraped shot replaces the stored
        one; schedule and shift rows are de-duplicated on all columns.
    data_dir : str, optional
        Directory holding the three master files. Defaults to the location
        the notebook has always written to. The files must already exist.
    """
    _merge_into_master(f'{data_dir}/scheduleCurrent.json', games)
    _merge_into_master(f'{data_dir}/shiftCurrent.json', all_shifts)
    _merge_into_master(f'{data_dir}/shotCurrent.json', all_shots, subset=['gameId', 'time'])

In [167]:
# Merge the games, shifts and shots scraped above into the backend master JSON files.
update_master_files(games, all_shifts, all_shots)

In [156]:
# with open('../../backend/data/shotCurrent.json', 'r') as f:
#         master = json.load(f)

In [157]:
# df_tmp = pd.DataFrame(master)

In [159]:
# df_tmp[(df_tmp['gameId']==2021030184) & (df_tmp['awayFwdIds']=='8475169_8478402_8479977')]#['teamId'].value_counts()

In [146]:
# with open('../../backend/data/shiftCurrent.json', 'r') as f:
#         master = json.load(f)

In [147]:
# df_tmp = pd.DataFrame(master)

In [160]:
# df_tmp[(df_tmp['gameId']==2021030184) & (df_tmp['teamId']==22) & (df_tmp['lineType']=='fwd')]
# df_tmp[(df_tmp['gameId']==2021030184) & (df_tmp['lineId']=='8475169_8478402_8479977')]