# Changing on the Fly - Quantitative Analysis


In [1]:
import json
import pprint
import pandas as pd
import numpy as np
import math
import requests
pd.set_option('max_colwidth', 200)

## Single Game Demo
Show analysis process on a single game of data
### Create 'shifts' Dataset
##### Load Data
* Load json file containing data. 
* Using pandas, sort by start/end time of shifts and filter out bad data.
* Convert dataset back to a dict

In [188]:
with open('../data/shifts2020.json', 'r') as f:
    d = json.load(f)
d[0]

{'gameId': 2020020001,
 'teamId': 5,
 'playerId': 8471215,
 'start': 35,
 'end': 65}

In [189]:
# get shift for b's caps game
shifts = [shift for shift in d if shift['gameId']==2020020419]
df = pd.DataFrame(shifts)

# some shifts are recorded with identical start/end times, omit these rows
df = df[df['start'] != df['end']]

# sort df
df = df.sort_values(['gameId', 'teamId', 'start', 'end', 'playerId'])
df.head(10)

Unnamed: 0,gameId,teamId,playerId,start,end
7,2020020419,12,8473533,0,52
149,2020020419,12,8475855,0,52
738,2020020419,12,8480830,0,52
192,2020020419,12,8476462,0,63
321,2020020419,12,8476958,0,63
3,2020020419,12,8473503,0,1200
131,2020020419,12,8475799,52,105
510,2020020419,12,8478427,52,105
717,2020020419,12,8480039,52,105
426,2020020419,12,8477488,63,105


In [190]:
shifts = df.to_dict(orient='records')
shifts[0]

{'gameId': 2020020419,
 'teamId': 12,
 'playerId': 8473533,
 'start': 0,
 'end': 52}

##### Extract 'Grouped' Shift Data
* Helper functions
* Convert dataset from 1 row representing a single player's shift to 1 row representing a group of players' shift.
* Convert dataset to dataframe, display sample of each teams' shift data

In [191]:
def line_list_to_shift(line_list, start, end):
    # need this for when dataset changes from teamId_1 to teamId_2
    # ...corrects end time of final shift 'first' team's shift data
    if end <= start:
        end = line_list[0]['end']
    return {
        'playerIds': [player['playerId'] for player in line_list],
        'start': start,
        'end': end,
        'numPlayers': len(line_list),
        'teamId': line_list[0]['teamId']
    }

def next_shift_end(line_list):
    return min([player['end'] for player in line_list])

In [192]:
# get starting lines
start_time = shifts[0]['start']
line_list = []
line_shifts = []
i = 0

while i < len(shifts)-1: #and shifts[i]['teamId'] == 6:
    # add players starting shift
    while start_time == shifts[i]['start']:
        line_list.append(shifts[i])
        i += 1
        if i == len(shifts):
            i -= 1 # at the end of shifts data for this game
            break
    
    # figure out the time when the next player is getting on/off the ice
    next_player_off = next_shift_end(line_list)
    next_player_on = shifts[i]['start']
    end_time = min([next_player_off, next_player_on])

    # new player getting on, save this line combination
    line_shifts.append(line_list_to_shift(line_list, start_time, end_time))

    # cut list to only contain players that remain on the ice after end_time
    line_list = [player for player in line_list if player['end'] > end_time]
    
    # if reached end of shifts for this game for this team, clear line_list
    if next_player_on == 0:
        line_list = []

    # update start_time with time that new player(s) are getting on the ice
    start_time = end_time
    

In [193]:
df = pd.DataFrame(line_shifts)
display(df[df['teamId']==6].head())  # bruins
display(df[df['teamId']==15].head()) # capitals
display(df)

Unnamed: 0,playerIds,start,end,numPlayers,teamId


Unnamed: 0,playerIds,start,end,numPlayers,teamId


Unnamed: 0,playerIds,start,end,numPlayers,teamId
0,"[8473533, 8475855, 8480830, 8476462, 8476958, 8473503]",0,52,6,12
1,"[8476462, 8476958, 8473503, 8475799, 8478427, 8480039]",52,63,6,12
2,"[8473503, 8475799, 8478427, 8480039, 8477488, 8476869]",63,105,6,12
3,"[8473503, 8476869, 8476921, 8476934, 8477938, 8477998]",105,115,6,12
4,"[8473503, 8476921, 8476934, 8477938, 8477998, 8479402]",115,148,6,12
...,...,...,...,...,...
302,"[8479288, 8476887, 8478851, 8479602, 8479514, 8475714]",3510,3513,6,18
303,"[8479288, 8478851, 8479602, 8479514, 8475714, 8476393]",3513,3528,6,18
304,"[8479288, 8479602, 8479514, 8475714, 8476393, 8476988]",3528,3531,6,18
305,"[8479288, 8479514, 8475714, 8476393, 8476988, 8477433]",3531,3545,6,18


##### Filter 5on5 Timeframes
* Expand shifts dataset to have 1 row for each timeframe
* Using pandas, find all timeframes that do not exhibit 5on5 play, store in list
* Go back to original shift data, exclude/modify shifts that are not 5on5 play

In [194]:
timeframes = []
for row in line_shifts:
    timeframes.extend([{
        'time': t,
        'teamId': row['teamId'],
        'numPlayers': row['numPlayers'],
#         'playerIds': row['playerIds']
    } for t in range(row['start']+1, row['end']+1)]) 
    # we need the '+1' because the way NHL API tracks events such as shots/goals.
    # the API records a goal as the last second of the shift. For example, if a goal is
    # scored at 0:53 in period 1, the API will record the goal at 0:53 and the player's shift
    # ending at 0:53. Therefore, we need to record the next player's shift starting at 0:54
    ### if we care about faceoffs, the API will record the next faceoff at 0:53 .... this will
    ### require more attention


In [195]:
# convert timeframes to dataframe, grab unique teamIds
df_times = pd.DataFrame(timeframes)
team_ids = df_times['teamId'].unique()
display(df_times.head())
print('df length: ' + str(len(df_times)))

# pivot dataframe and exclude timeframes where both teams have 6 players on the ice
df_times = df_times.pivot(index='time', columns='teamId', values='numPlayers').reset_index()
df_times = df_times[~((df_times[team_ids[0]]==6) & (df_times[team_ids[1]]==6))]
display(df_times.head())
print('df length: ' + str(len(df_times)))

timeframes_non5on5 = df_times['time'].tolist()

Unnamed: 0,time,teamId,numPlayers
0,1,12,6
1,2,12,6
2,3,12,6
3,4,12,6
4,5,12,6


df length: 7200


teamId,time,12,18
439,440,6,5
440,441,6,5
441,442,6,5
442,443,6,5
443,444,6,5


df length: 405


In [196]:
t_non5on5 = set(timeframes_non5on5)
line_shifts_5on5 = []

for shift in line_shifts:
    # entire shift is non5on5 play: skip this record
    if shift['numPlayers'] != 6:
        continue
    
    # entire shift is 5on5 play: add to final list
    if shift['start'] not in t_non5on5 and shift['end'] not in t_non5on5:
        line_shifts_5on5.append(shift)
        
    # shift starts 5on5, ends non5on5: adjust END time and add to final list
    elif shift['start'] not in t_non5on5 and shift['end'] in t_non5on5:
        end_time_new = shift['end']
        while end_time_new in t_non5on5:
            end_time_new -= 1
        shift['end'] = end_time_new
        if shift['start'] != shift['end']:
            line_shifts_5on5.append(shift)
        
    # shift starts non5on5, ends 5on5: adjust START time and add to final list
    elif shift['start'] in t_non5on5 and shift['end'] not in t_non5on5:
        start_time_new = shift['start']
        while start_time_new in t_non5on5:
            start_time_new += 1
        shift['start'] = start_time_new
        line_shifts_5on5.append(shift)
        
# display what dataset looks like
display(pd.DataFrame(line_shifts_5on5))

Unnamed: 0,playerIds,start,end,numPlayers,teamId
0,"[8473533, 8475855, 8480830, 8476462, 8476958, 8473503]",0,52,6,12
1,"[8476462, 8476958, 8473503, 8475799, 8478427, 8480039]",52,63,6,12
2,"[8473503, 8475799, 8478427, 8480039, 8477488, 8476869]",63,105,6,12
3,"[8473503, 8476869, 8476921, 8476934, 8477938, 8477998]",105,115,6,12
4,"[8473503, 8476921, 8476934, 8477938, 8477998, 8479402]",115,148,6,12
...,...,...,...,...,...
282,"[8479288, 8476887, 8478851, 8479602, 8479514, 8475714]",3510,3513,6,18
283,"[8479288, 8478851, 8479602, 8479514, 8475714, 8476393]",3513,3528,6,18
284,"[8479288, 8479602, 8479514, 8475714, 8476393, 8476988]",3528,3531,6,18
285,"[8479288, 8479514, 8475714, 8476393, 8476988, 8477433]",3531,3545,6,18


### Define 'Changing on the Fly' 
In terms of the dataset, define what 'changing on the fly' looks like.

Let's begin by looking exclusively at forwards.

##### Load Rosters Data
We needs positional data to filter on the the desired position (forward or d)
* Load JSON file containing roster information
* Get all players with specified positionType (forward or d), store their playerIds with a set
* To increase ability to read data, create dict mapping playerIds to names

In [197]:
with open('../data/roster2020.json', 'r') as f:
    r = json.load(f)
r[0]

{'playerId': 8471233,
 'fullName': 'Travis Zajac',
 'position': 'C',
 'positionType': 'Forward'}

In [198]:
position_type = 'Forward' #'Defenseman' #'Goalie'
pIds_of_interest = set([
    player['playerId'] for player in r if player['positionType'] == position_type
])
print('Players in set: ' + str(len(pIds_of_interest)))

Players in set: 599


In [199]:
pId_name = {}
for player in r:
    pId_name[player['playerId']] = player['fullName']  

##### Filter 'Grouped-5on5' Shifts Data to only Forwards
* Go back to original shifts data, update playerIds to include only forwards

In [200]:
for shift in line_shifts_5on5:
    playerIds_new = [pId for pId in shift['playerIds'] if pId in pIds_of_interest]
    player_names = [pId_name[pId] for pId in playerIds_new]
    shift['playerIds'] = playerIds_new
    shift['players'] = player_names

display(pd.DataFrame(line_shifts_5on5).head(20))

Unnamed: 0,playerIds,start,end,numPlayers,teamId,players
0,"[8473533, 8475855, 8480830]",0,52,6,12,"[Jordan Staal, Jesper Fast, Andrei Svechnikov]"
1,"[8475799, 8478427, 8480039]",52,63,6,12,"[Nino Niederreiter, Sebastian Aho, Martin Necas]"
2,"[8475799, 8478427, 8480039]",63,105,6,12,"[Nino Niederreiter, Sebastian Aho, Martin Necas]"
3,"[8476921, 8476934, 8477998]",105,115,6,12,"[Jordan Martinook, Brock McGinn, Warren Foegele]"
4,"[8476921, 8476934, 8477998]",115,148,6,12,"[Jordan Martinook, Brock McGinn, Warren Foegele]"
5,"[8479987, 8476975, 8478904]",148,198,6,12,"[Morgan Geekie, Cedric Paquette, Steven Lorentz]"
6,"[8476975, 8478904, 8473533]",198,204,6,12,"[Cedric Paquette, Steven Lorentz, Jordan Staal]"
7,"[8473533, 8475855, 8480830]",204,248,6,12,"[Jordan Staal, Jesper Fast, Andrei Svechnikov]"
8,"[8473533, 8475799, 8480039]",248,257,6,12,"[Jordan Staal, Nino Niederreiter, Martin Necas]"
9,"[8475799, 8480039, 8478427]",257,310,6,12,"[Nino Niederreiter, Martin Necas, Sebastian Aho]"


##### Define Line Change
* Count the number of players changing on each shift
* Count the number of players changing since last full shift (last time 3 new players got on ice)
* Convert dataset to DataFrame and label shift as a full line change if...
    * if forwards: 3 players have changed, if D: 2 players have changed
    * if forwards: cumulativeSum % 3 == 0, if D: cumulativeSum % 2 == 0
    

In [201]:
# manually get the playerIds for the first shift of game
last_line = set(line_shifts_5on5[0]['playerIds'])
line_shifts_5on5[0]['numPlayersChanging'] = 0

for i in range(1, len(line_shifts_5on5)):
    # if start time of this shift does not match end time of previous shift, we have a pp/pk ending
    if line_shifts_5on5[i]['start'] != line_shifts_5on5[i-1]['end']:
        line_shifts_5on5[i]['numPlayersChanging'] = 0
        continue
    
    # get previous line combination
    last_line = set(line_shifts_5on5[i-1]['playerIds'])
    
    # count number of players that are different than previous line combination
    num_players_changing = 0
    for playerId in line_shifts_5on5[i]['playerIds']:
        if playerId not in last_line:
            num_players_changing += 1
    line_shifts_5on5[i]['numPlayersChanging'] = num_players_changing
    


In [202]:
cumsum = 0
line_shifts_5on5[0]['cumNumPlayersChanging'] = cumsum

for line in line_shifts_5on5[1:]:
    cumsum += line['numPlayersChanging']
    line['cumNumPlayersChanging'] = cumsum
    
    # if fresh full line change, reset cumulative sum 
    if (line['numPlayersChanging'] == 3) or (cumsum % 3 == 0):
        cumsum = 0

In [215]:
# convert to df
df_shifts = pd.DataFrame(line_shifts_5on5)

# define completeLine
df_shifts['completeLine'] = np.where(df_shifts['cumNumPlayersChanging'] % 3 == 0, True, 
                                  np.where(df_shifts['numPlayersChanging'] == 3, True, False)
                                  )

df_shifts

Unnamed: 0,playerIds,start,end,numPlayers,teamId,players,numPlayersChanging,cumNumPlayersChanging,completeLine
0,"[8473533, 8475855, 8480830]",0,52,6,12,"[Jordan Staal, Jesper Fast, Andrei Svechnikov]",0,0,True
1,"[8475799, 8478427, 8480039]",52,63,6,12,"[Nino Niederreiter, Sebastian Aho, Martin Necas]",3,3,True
2,"[8475799, 8478427, 8480039]",63,105,6,12,"[Nino Niederreiter, Sebastian Aho, Martin Necas]",0,0,True
3,"[8476921, 8476934, 8477998]",105,115,6,12,"[Jordan Martinook, Brock McGinn, Warren Foegele]",3,3,True
4,"[8476921, 8476934, 8477998]",115,148,6,12,"[Jordan Martinook, Brock McGinn, Warren Foegele]",0,0,True
...,...,...,...,...,...,...,...,...,...
282,"[8476887, 8479514, 8475714]",3510,3513,6,18,"[Filip Forsberg, Rem Pitlick, Calle Jarnkrok]",1,2,False
283,"[8479514, 8475714, 8476393]",3513,3528,6,18,"[Rem Pitlick, Calle Jarnkrok, Nick Cousins]",1,3,True
284,"[8479514, 8475714, 8476393]",3528,3531,6,18,"[Rem Pitlick, Calle Jarnkrok, Nick Cousins]",0,0,True
285,"[8479514, 8475714, 8476393]",3531,3545,6,18,"[Rem Pitlick, Calle Jarnkrok, Nick Cousins]",0,0,True


In [214]:
# df_shifts.iloc[90:140]
df_shifts[df_shifts['teamId']==12].iloc[100:][['start', 
                                               'end', 
                                               'players', 
                                               'numPlayersChanging', 
                                               'cumNumPlayersChanging', 
                                               'completeLine']]

Unnamed: 0,start,end,players,numPlayersChanging,cumNumPlayersChanging,completeLine
100,2937,2946,"[Brock McGinn, Warren Foegele, Jordan Martinook]",0,0,True
101,2946,2995,"[Sebastian Aho, Nino Niederreiter, Martin Necas]",3,3,True
102,2995,3000,"[Sebastian Aho, Nino Niederreiter, Martin Necas]",0,0,True
103,3000,3003,"[Nino Niederreiter, Martin Necas, Morgan Geekie]",1,1,False
104,3003,3038,"[Martin Necas, Morgan Geekie, Cedric Paquette]",1,2,False
105,3038,3051,"[Morgan Geekie, Cedric Paquette, Steven Lorentz]",1,3,True
106,3051,3070,"[Morgan Geekie, Cedric Paquette, Steven Lorentz]",0,0,True
107,3070,3073,"[Morgan Geekie, Cedric Paquette, Warren Foegele]",1,1,False
108,3073,3080,"[Warren Foegele, Jordan Martinook, Brock McGinn]",2,3,True
109,3080,3139,"[Jordan Martinook, Brock McGinn, Jesper Fast]",1,1,False
