# EDA
Let's analyze how goalies are affected after not seeing a shot in a while
* Does save percentage decrease?
* Does expected goals increase?

In [1]:
import os
import json
import pprint
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

### Read in data

In [2]:
year = '2019'
path = '../../nhl_overtime_eda/data/' + year + '_regSeasonPlays.json'

games = {}
with open(path) as f:
    games[year] = json.load(f)

In [3]:
# year = '2020'
# years = ['2015','2016','2017','2018','2019','2020']
years = ['2019']
games = {}
for year in years:
    path = '../../nhl_overtime_eda/data/' + year + '_regSeasonPlays.json'
    print(path)
    with open(path) as f:
        games[year] = json.load(f)

../../nhl_overtime_eda/data/2019_regSeasonPlays.json


In [4]:
# g = games['2019']['2019020001']

# get game data for 2019-20
game_data = games['2019']

In [5]:
# for game_id in game_data:
#     events = game_data[game_id]
#     break
    
# for event in events['events']:
#     print(event.get('result').get('strength').get('name'))

In [6]:
# for key in games['2019']['2019020001']:
#     print(key)
#     print(games['2019']['2019020001'][key])

In [7]:
# for i in games['2019']['2019020001']['events']:
#     pprint.pprint(i)

### Data mining

In [8]:
# # for game_id in game_data:
# for e in game_data['2019020001']['events']:
#     if e['result']['event'] == 'Shot':
#         pprint.pprint(e)
#         print()

In [9]:
# get information we care about: shots and goals
shots = []
for game_id in game_data:
    home_team = game_data[game_id]['home_team']
    away_team = game_data[game_id]['away_team']
    for event in game_data[game_id]['events']:
        # if event is shot or goal, save it
        if event['result']['event'] == 'Shot' or event['result']['event'] == 'Goal':
            shooter = ''
            shooter_id = 0
            goalie = ''
            goalie_id = 0

            # extract shooter/goalie
            for player in event['players']:
                if player['playerType'] == 'Shooter' or player['playerType'] == 'Scorer':
                    shooter = player['player']['fullName']
                    shooter_id = player['player']['id']
                elif player['playerType'] == 'Goalie':
                    goalie = player['player']['fullName']
                    goalie_id = player['player']['id']
                    
            # extract shooter/goalie team
            shooter_team = event['team']['name']
            goalie_team = home_team
            if goalie_team == shooter_team: goalie_team = away_team

            # extract strength for goal
            strength = 'unknown'
            try:
                strength = event['result']['strength']['name']
            except:
                pass

            shots.append({
                'game_id': game_id,
                'home_team': home_team,
                'away_team': away_team,
                'dateTime': event['about']['dateTime'],
                'period': event['about']['period'],
                'periodTime': event['about']['periodTime'],
                'x_loc': event['coordinates'].get('x', np.NaN),
                'y_loc': event['coordinates'].get('y', np.NaN),
                'shooter': shooter,
                'shooter_id': shooter_id,
                'shooter_team': shooter_team,
                'goalie': goalie,
                'goalie_id': goalie_id,
                'goalie_team': goalie_team,
                'is_goal': (event['result']['event'] == 'Goal'),
                'shot_type': event['result'].get('secondaryType'),                
                'strength': strength
            })

### Feature engineering
* time since last shot (real time and time in period) **records of actual time are inaccurate
* zone where shot came from (high slot, upper slot, etc)
* change of angle since last shot
* change in distance since last shot
* days rest for goaltender
* characteristics of shooter/goalie
    * handness
    * height
    * weight
    * age
* amount of travel since previous game

In [10]:
# convert to dataframe
df = pd.DataFrame(shots)
df = df[df['goalie'] != ''] # filter empty net goals
df = df[df['period'] != 5] # filter shootout attempts


# find total seconds between shots (periodTime)
df['time_seconds'] = df['periodTime'].str[:2].astype(int) * 60 + df['periodTime'].str[3:].astype(int)
df['time_seconds'] = np.where(df['period'] == 2, df['time_seconds'] + 1200, 
                                 np.where(df['period'] == 3, df['time_seconds'] + 2400, df['time_seconds']))
df = df.sort_values(by=['game_id', 'goalie', 'time_seconds']) # sort by game_id and goalie name
df['time_seconds_diff'] = df.groupby(['game_id', 'goalie'])['time_seconds'].diff(1).fillna(df['time_seconds'])

# get total shots
df['tot_shots'] = 1
df['tot_shots'] = df.groupby(['game_id', 'goalie'])['tot_shots'].cumsum()

# fix all shots to one side of ice (adjust x and y coordinates)
df.loc[df['x_loc'] < 0, 'y_loc'] = df['y_loc'] * -1
df['x_loc'] = df['x_loc'].abs()
df = df[df['x_loc'].notna()]

# calculate angle of shot compared to goal
x_goal = 89
df['shot_angle'] = np.where(df['x_loc'] != x_goal,
                            # when shot is from behind the net
                            np.where(df['x_loc'] > x_goal,
                                     np.where(df['y_loc'] >= 0,
                                              round(90 + (90 - np.arctan(df['y_loc'] / (df['x_loc'] - x_goal)) * (180 / np.pi)), 2),
                                              round(-90 - (90 + np.arctan(df['y_loc'] / (df['x_loc'] - x_goal)) * (180 / np.pi)), 2)
                                             ),
                                     # when shot is in front of net
                                     round(np.arctan(df['y_loc'] / (x_goal - df['x_loc'])) * (180 / np.pi), 2)
                                    ),
                            # when shot is taken on the goal line
                            np.where(df['y_loc'] >= 0, 90, -90)
                           )

# calculate difference in shot angle
df['shot_angle_prev'] = df.groupby(['game_id', 'goalie'])['shot_angle'].shift(1).fillna(0)
df['shot_angle_diff'] = np.absolute(np.where(df['shot_angle'] < df['shot_angle_prev'],
                                                df['shot_angle'] - df['shot_angle_prev'],
                                                df['shot_angle_prev'] - df['shot_angle']
                                            )
                                   )

# determine if goalie moved to his right since last shot to attempt to make save
df['goalie_move_right'] = np.where(df['shot_angle'] > df['shot_angle_prev'], True, False)

# calculate shot distance to goal
df['shot_dist'] = round(np.sqrt(np.square(df['x_loc'] - x_goal) + np.square(df['y_loc'])), 2)
df['shot_dist_prev'] = df.groupby(['game_id', 'goalie'])['shot_dist'].shift(1).fillna(0)
df['shot_dist_diff'] = df['shot_dist'] - df['shot_dist_prev']

# get days rest for goalie (if at begining of season, just mark days rest as 5)
df_goalies = df[['game_id', 'goalie', 'dateTime']].groupby(['goalie', 'game_id']).first()
df_goalies['dateTime_prev'] = df_goalies['dateTime'].shift(1).fillna('1970-01-01T04:00:00Z')
df_goalies['goalie_days_rest'] = (pd.to_datetime(df_goalies['dateTime']) - pd.to_datetime(df_goalies['dateTime_prev'])).dt.round(freq='D').dt.days
df_goalies['goalie_days_rest'] = np.where(df_goalies['goalie_days_rest'] > 200, 
                                          5, 
                                          np.where(df_goalies['goalie_days_rest'] < 0,
                                                   5,
                                                   df_goalies['goalie_days_rest']
                                                  )
                                         )
df = df.sort_values(by=['goalie', 'game_id']).merge(df_goalies[['goalie_days_rest']], how='left', on=['goalie', 'game_id'])

# get player characteristics (age, height, weight, handness) and merge with dataframe
## characteristics mined in 'player_miner.ipynb'
with open('../data/playerAttributes.json') as f:
    attributes = json.load(f)          # read file
df_atr = pd.DataFrame(attributes)      # convert to df
df_atr.sort_values('id', inplace=True) # sort by id

# merge skater attrs
df_atr_skaters = df_atr[(df_atr['type']=='Forward') | (df_atr['type']=='Defenseman')]    # filter by skaters
df_atr_skaters.columns = [f'shooter_{c}' for c in list(df_atr.columns)]                  # rename columns
df = df.sort_values('shooter_id').merge(df_atr_skaters, how='left', on='shooter_id')     # merge 
df['shooter_age'] = round((pd.to_datetime(df['dateTime'].str[:10]) - 
                           pd.to_datetime(df['shooter_birthDate'])).dt.days / 365, 1)      # calculate age

# merge goalie attrs
df_atr_goalies = df_atr[df_atr['type']=='Goalie']                                   # filter by goalies
df_atr_goalies.columns = [f'goalie_{c}' for c in list(df_atr.columns)]              # rename columns
df = df.sort_values('goalie_id').merge(df_atr_goalies, how='left', on='goalie_id')  # merge 
df['goalie_age'] = round((pd.to_datetime(df['dateTime'].str[:10]) - 
                          pd.to_datetime(df['goalie_birthDate'])).dt.days / 365, 1) # calculate age

# calculate circadian effects (number of timezones crossed since last game)
with open('../data/teamCircadian.json') as f:
    d = json.load(f)          # read file
df_circ = pd.DataFrame(d)      # convert to df
df_circ = df_circ[['game_id', 'team', 'offset_diff']]

# merge skater circadian
df_circ.columns = ['game_id', 'shooter_team', 'shooter_tzTraveled']
df = df.sort_values(['shooter_team', 'game_id']).merge(df_circ, how='left', on=['shooter_team', 'game_id'])

# merge goalie circadian
df_circ.columns = ['game_id', 'goalie_team', 'goalie_tzTraveled']
df = df.sort_values(['goalie_team', 'game_id']).merge(df_circ, how='left', on=['goalie_team', 'game_id'])

# generate dummy vars
df = pd.get_dummies(df, columns=['shot_type', 'shooter_type', 'shooter_shootsCatches', 'goalie_shootsCatches'])

df.head()


Unnamed: 0,game_id,home_team,away_team,dateTime,period,periodTime,x_loc,y_loc,shooter,shooter_id,...,shot_type_Snap Shot,shot_type_Tip-In,shot_type_Wrap-around,shot_type_Wrist Shot,shooter_type_Defenseman,shooter_type_Forward,shooter_shootsCatches_L,shooter_shootsCatches_R,goalie_shootsCatches_L,goalie_shootsCatches_R
0,2019020012,Anaheim Ducks,Arizona Coyotes,2019-10-04T03:26:26Z,2,12:03,73.0,-28.0,Jason Demers,8474218,...,0,0,0,1,1,0,0,1,1,0
1,2019020012,Anaheim Ducks,Arizona Coyotes,2019-10-04T04:14:09Z,3,07:28,78.0,-23.0,Michael Grabner,8473546,...,0,0,0,0,0,1,1,0,1,0
2,2019020012,Anaheim Ducks,Arizona Coyotes,2019-10-04T02:18:47Z,1,04:11,67.0,12.0,Michael Grabner,8473546,...,0,0,0,1,0,1,1,0,1,0
3,2019020012,Anaheim Ducks,Arizona Coyotes,2019-10-04T04:25:31Z,3,15:38,44.0,27.0,Nick Schmaltz,8477951,...,0,0,0,0,0,1,0,1,1,0
4,2019020012,Anaheim Ducks,Arizona Coyotes,2019-10-04T04:06:19Z,3,03:19,37.0,39.0,Oliver Ekman-Larsson,8475171,...,0,0,0,1,1,0,1,0,1,0


In [11]:
#### delete columns that are unnecessary ####
cols_to_drop = ['game_id', 'home_team', 'away_team',
                'dateTime', 'period', 'periodTime',
                'shooter_team', 'goalie_team', 'strength', 
                'shot_angle_prev', 'shot_dist_prev',
                'shooter_fullName', 'shooter_birthDate', 'shooter_nationality', 'shooter_primaryPosition',
                'goalie_fullName', 'goalie_birthDate', 'goalie_nationality', 
                'goalie_primaryPosition', 'goalie_type']
df = df.drop(cols_to_drop, axis=1)

#### delete rows with weird data ####
df = df[df['shooter_height'].isna() == False] # filter rows where it records a goalie shooting puck


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67646 entries, 0 to 67649
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   x_loc                    67646 non-null  float64
 1   y_loc                    67646 non-null  float64
 2   shooter                  67646 non-null  object 
 3   shooter_id               67646 non-null  int64  
 4   goalie                   67646 non-null  object 
 5   goalie_id                67646 non-null  int64  
 6   is_goal                  67646 non-null  bool   
 7   time_seconds             67646 non-null  int64  
 8   time_seconds_diff        67646 non-null  float64
 9   tot_shots                67646 non-null  int64  
 10  shot_angle               67646 non-null  float64
 11  shot_angle_diff          67646 non-null  float64
 12  goalie_move_right        67646 non-null  bool   
 13  shot_dist                67646 non-null  float64
 14  shot_dist_diff        

### EDA

In [13]:
# df_tmp = df[df['goalie']=='Frederik Andersen']
# game_ids = df_tmp['game_id'].unique().tolist()[0:5] # get list of five games

Timeline of shots faced vs goals scored

In [14]:
# for game_id in game_ids:
#     df_tmp2 = df_tmp[df_tmp['game_id']==game_id]
#     date = df_tmp2['dateTime'].iloc[0][0:10]
#     color = ['red' if goal else 'green' for goal in df_tmp2['is_goal'] ]

#     plt.scatter(df_tmp2['time_seconds'], df_tmp2['tot_shots'], c=color)
#     plt.plot(df_tmp2['time_seconds'], df_tmp2['tot_shots'], color='green')
#     plt.title(f'Fred Anderson Shots Faced on {date}')
#     plt.xlabel("Time in Seconds")
#     plt.ylabel("Total Shots")
#     plt.show()


Scatter plot of shot locations

In [15]:
# for game_id in game_ids:
#     df_tmp2 = df_tmp[df_tmp['game_id']==game_id]
#     date = df_tmp2['dateTime'].iloc[0][0:10]

#     # read img (retrieved from https://thewincolumn.ca/2021/01/15/r-tutorial-creating-an-nhl-rink-using-the-tidyverse/)
#     img = plt.imread("../data/images/nhl_rink_plot_output_light.png")
#     fig, ax = plt.subplots()
#     ax.imshow(img, extent=[0, 100, -42.5, 42.5])

#     # plot
#     goals = plt.scatter(df_tmp2[df_tmp2['is_goal']==True]['x_loc'], 
#                         df_tmp2[df_tmp2['is_goal']==True]['y_loc'], 
#                         c='red', 
#                         label='Goals')
#     non_goals = plt.scatter(df_tmp2[df_tmp2['is_goal']==False]['x_loc'], 
#                             df_tmp2[df_tmp2['is_goal']==False]['y_loc'], 
#                             c='green', 
#                             label='Saves')
#     plt.title(f'Fred Anderson Shots Faced on {date}')
#     plt.xlabel("Time in Seconds")
#     plt.ylabel("Total Shots")
#     ax.legend()
#     plt.show()

Change in angle of rebound vs Shot outcome

In [16]:
# df_rebounds = df[df['time_seconds_diff'] <= 2].copy() # get shots with previous shot taken in past 3 seconds
# # df_rebounds = df_rebounds[df_rebounds['goalie'] != ''] # filter empty netters

# fig, ax = plt.subplots()

# bins = list(np.linspace(0,200,11))
# non_goals = plt.hist(df_rebounds['shot_angle_diff'],
#                         color='#A2AAAD',
#                         label='Total Shots',
#                         bins=bins)
# goals = plt.hist(df_rebounds[df_rebounds['is_goal'] == True]['shot_angle_diff'],
#                 color='#C8102E',
#                 label='Goals',
#                 bins=bins)

# plt.title('Proportion of Goals Scored on Rebound Shots\n(Shots Taken Within 2 Seconds of Previous Shot)')
# plt.xlabel("Change in Angle Since Previous Shot")
# plt.ylabel("Count")
# ax.legend()

# # create labels
# rects = ax.patches
# labels = []
# for i in range(len(goals[0])):
#     labels.append(round(goals[0][i] / non_goals[0][i] * 100,1))


# # apply labels
# for rect, label in zip(rects, labels):
#     height = rect.get_height()
#     text_color = 'black'
#     ax.text(rect.get_x() + rect.get_width() / 2, height + 5, str(label)+'%',
#             ha='center', va='bottom', color=text_color)
    
# plt.show()

### Model Building

In [17]:
# import sys
# !{sys.executable} -m pip install seaborn

In [18]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns


##### Test train split

In [37]:
df_tmp = df.copy()
df_results = df_tmp[['shooter', 'shooter_id', 'goalie', 'goalie_id', 'is_goal']] # use this to score model performance
df_tmp.drop(['shooter', 'shooter_id', 'goalie', 'goalie_id'], axis=1, inplace=True)
y = df_tmp.pop('is_goal')
X = df_tmp
# X_normalized = preprocessing.normalize(X, norm='l2')

# test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

###### Model Training

In [38]:
# logistic regression
logit = LogisticRegression()
logit.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

##### Model Testing


In [83]:
# get probability of each shot resulting in goal
probs = logit.predict_proba(X_test)
predictions = pd.DataFrame(probs)

# get list of shots tested, add column of shot probability predictions
df_test = pd.DataFrame(y_test)
df_test['is_goal_probability'] = predictions[1].values
df_test.sort_index(inplace=True)

# join with original dataframe containing skater/goalie names, and shot results
df_test = df_results[df_results.index.isin(df_test.index)].join(df_test[['is_goal_probability']])

# get expected goals for each skater
skaters_results = df_test.groupby(['shooter_id', 'shooter'])[['is_goal', 'is_goal_probability']].sum().sort_values('is_goal', ascending=False)
skaters_results['error'] = skaters_results['is_goal_probability'] - skaters_results['is_goal']
skaters_rmse = np.sqrt(np.mean(np.square(skaters_results['error'])))
skaters_rmse

# get expected goals for each goalie
goalie_results = df_test.groupby(['goalie', 'goalie_id'])[['is_goal', 'is_goal_probability']].sum().sort_values('is_goal', ascending=False)
goalie_results['error'] = goalie_results['is_goal_probability'] - goalie_results['is_goal']
goalie_rmse = np.sqrt(np.mean(np.square(goalie_results['error'])))
goalie_rmse

4.87718929362568

In [84]:
goalie_results[goalie_results['is_goal'] >= 10]

Unnamed: 0_level_0,Unnamed: 1_level_0,is_goal,is_goal_probability,error
goalie,goalie_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Frederik Andersen,8475883,53,48.877737,-4.122263
Carey Price,8471679,49,47.732648,-1.267352
Marc-Andre Fleury,8470594,46,41.113889,-4.886111
Connor Hellebuyck,8476945,43,49.530791,6.530791
Braden Holtby,8474651,42,40.685484,-1.314516
...,...,...,...,...
Michael Hutchinson,8474636,15,14.254907,-0.745093
Cory Schneider,8471239,15,9.985036,-5.014964
Anton Khudobin,8471418,14,22.252780,8.252780
Curtis McElhinney,8470147,14,16.537756,2.537756
