### Import Things

In [1]:
import numpy as np
import pandas as pd
import warnings
import math
import random
from pandas.errors import SettingWithCopyWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, mean_squared_error

pd.set_option("display.max_columns", None)
warnings.simplefilter(action = "ignore", category = SettingWithCopyWarning)
random.seed(2025)

### Some Functions

In [2]:
# Keep only shots and goals function
def keep_SOG(df):
    df = df[(df["typeDescKey"] == "shot-on-goal") | (df["typeDescKey"] == "goal")]

    return(df)

In [3]:
# Fix pbp data types
def fix_pbp_types(pbp_df):
    pbp_df["situationCode"] = pbp_df["situationCode"].astype(int).astype(str).str.pad(width = 4, side = "left", fillchar = "0")
    pbp_df["gameId"] = pbp_df["gameId"].astype(str)
    pbp_df["periodDescriptor.number"] = pbp_df["periodDescriptor.number"].astype(int)
    pbp_df["details.eventOwnerTeamId"] = pbp_df["details.eventOwnerTeamId"].astype(int).astype(str)

    return(pbp_df)

In [4]:
# Fix schedule data types
def fix_schedule_types(schedule_df):
    schedule_df["id"] = schedule_df["id"].astype(str)
    schedule_df["awayTeam.id"] = schedule_df["awayTeam.id"].astype(str)
    schedule_df["homeTeam.id"] = schedule_df["homeTeam.id"].astype(str)

    return(schedule_df)

In [5]:
# Determine which direction the shot is being taken
def shot_side(row):
    if row["homeTeamShot"] == True:
        if row["homeTeamDefendingSide"] == "left":
            return "right"
        else:
            return "left"
    else:
        return row["homeTeamDefendingSide"]

In [6]:
# Determine shot distance
def get_shot_distance(row):
    return math.hypot(89 - row["adj.xCoord"], 0 - row["adj.yCoord"])

In [7]:
# Determine shot distance class
def get_shot_distance_class(df, model):
    # Create shot distance buckets
    if (model == "krzy05a") | (model == "krzy05b"):
        conditions = [
            df["shotDistance"] < 10,
            (df["shotDistance"] >= 10) & (df["shotDistance"] < 13),
            (df["shotDistance"] >= 13) & (df["shotDistance"] < 15),
            (df["shotDistance"] >= 15) & (df["shotDistance"] < 17),
            (df["shotDistance"] >= 17) & (df["shotDistance"] < 23),
            (df["shotDistance"] >= 23) & (df["shotDistance"] < 32),
            (df["shotDistance"] >= 32) & (df["shotDistance"] < 37),
            (df["shotDistance"] >= 37) & (df["shotDistance"] < 39),
            (df["shotDistance"] >= 39) & (df["shotDistance"] < 45),
            (df["shotDistance"] >= 45) & (df["shotDistance"] < 58),
            df["shotDistance"] >= 58
        ]

        choices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        df["shotDistanceClass"] = np.select(conditions, choices, default = None)
        return(df)
    elif model == "krzy06":
        conditions = [
            df["shotDistance"] < 12,
            (df["shotDistance"] >= 12) & (df["shotDistance"] < 13),
            (df["shotDistance"] >= 13) & (df["shotDistance"] < 16),
            (df["shotDistance"] >= 17) & (df["shotDistance"] < 18),
            (df["shotDistance"] >= 19) & (df["shotDistance"] < 21),
            (df["shotDistance"] >= 22) & (df["shotDistance"] < 32),
            (df["shotDistance"] >= 33) & (df["shotDistance"] < 35),
            (df["shotDistance"] >= 36) & (df["shotDistance"] < 37),
            (df["shotDistance"] >= 38) & (df["shotDistance"] < 40),
            (df["shotDistance"] >= 41) & (df["shotDistance"] < 44),
            (df["shotDistance"] >= 45) & (df["shotDistance"] < 52),
            (df["shotDistance"] >= 53) & (df["shotDistance"] < 59),
            df["shotDistance"] >= 60
        ]

        choices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

        df["shotDistanceClass"] = np.select(conditions, choices, default = None)
        return(df)

In [8]:
# Get Strength at time of event
def get_strength(row):
    if row["homeTeamShot"] == True:
        if row["home_players"] > row["away_players"]:
            return "PP"
        elif row["home_players"] < row["away_players"]:
            return "SH"
        else:
            return "EV"
    elif row["homeTeamShot"] == False:
        if row["home_players"] > row["away_players"]:
            return "SH"
        elif row["home_players"] < row["away_players"]:
            return "PP"
        else:
            return "EV"

In [9]:
# Return either pre-season, regular season or playoff games
def get_games(df, game_type):
    if game_type =="pre":
        return df.loc[df["gameId"].str[5] == "1"]
    elif game_type == "regular":
        return df.loc[df["gameId"].str[5] == "2"]
    elif game_type == "playoff":
        return df.loc[df["gameId"].str[5] == "3"]

In [32]:
# Get Schedule and fix data types
schedule = pd.read_csv("schedule.csv")
schedule = fix_schedule_types(schedule)

# Preprocess data function
def preprocess_data(df, model, gametype = "regular", schedule = schedule):
    # Get required game type
    df = get_games(df, gametype)

    # Keep only shot and goal events
    df = keep_SOG(df)

    # Fix data types
    df = fix_pbp_types(df)

    # Reduce columns
    keep_pbp = ["timeInPeriod", "timeRemaining", "situationCode", "typeDescKey", "periodDescriptor.number", "details.eventOwnerTeamId", "details.xCoord", "details.yCoord", "details.shotType", "gameId", "homeTeamDefendingSide", "lastEvent", "timeSinceLastEvent", "lastEventTeam"]
    keep_schedule = ["id", "awayTeam.id", "homeTeam.id"]
    df = pd.merge(df[keep_pbp], schedule[keep_schedule], left_on = "gameId", right_on = "id", how = "inner")

    # Drop rows with missing details.shotType, this is the only column containing NaN
    df = df[df["details.shotType"].notna()]

    #Create an flipped.xCoord and flipped.yCoord column, so shots are in one direction
    df["flipped.xCoord"] = df["details.xCoord"] * -1.0
    df["flipped.yCoord"] = df["details.yCoord"] * -1.0

    # Determine whether home team took the shot
    df["homeTeamShot"] = df["details.eventOwnerTeamId"].astype(int).astype(str) == df["homeTeam.id"].astype(int).astype(str)

    # Determine which direction the shot was taken
    df["shotSide"] = df.apply(shot_side, axis = 1)

    # Adjust shots taken towards the left net
    df["adj.xCoord"] = df.apply(lambda row: row["flipped.xCoord"] if row["shotSide"] == "left" else row["details.xCoord"], axis = 1)
    df["adj.yCoord"] = df.apply(lambda row: row["flipped.yCoord"] if row["shotSide"] == "left" else row["details.yCoord"], axis = 1)

    # Create shotDistance column
    df["shotDistance"] = df.apply(get_shot_distance, axis = 1)

    # Create adjDistance
    avg_distances = df.groupby(["homeTeam.id"], as_index = False)["shotDistance"].mean()
    avg_distances = avg_distances.rename(columns = {"shotDistance": "avgShotDistance"})
    df = pd.merge(df, avg_distances, left_on = "homeTeam.id", right_on = "homeTeam.id", how = "inner")
    df["adjDistance"] = df["shotDistance"] - df["avgShotDistance"]
    
    # Create shotDistanceClass column
    if model == "krzy09":
        pass
    else:
        df = get_shot_distance_class(df, model)

    # Create rebound column as defined by Ryder (2004)
    if model == "krzy09":
        df["Rebound"] = ((df["lastEvent"] == "shot-on-goal") & (df["timeSinceLastEvent"] >=0) & (df["timeSinceLastEvent"] <= 2))
    else:
        df["Rebound"] = ((df["lastEvent"] == "shot-on-goal") & (df["timeSinceLastEvent"] >=0) & (df["timeSinceLastEvent"] <= 2) & (df["shotDistance"] < 25))

    # Create a column to denote strength
    # Breakup code 
    df["home_goalie"] = df["situationCode"].str[0].astype(int)
    df["home_skaters"] = df["situationCode"].str[1].astype(int)
    df["away_goalie"] = df["situationCode"].str[2].astype(int)
    df["away_skaters"] = df["situationCode"].str[3].astype(int)

    # Calculate players on ice for each time
    df["home_players"] = df["home_goalie"] + df["home_skaters"]
    df["away_players"] = df["away_goalie"] + df["away_skaters"]

    df["Situation"] = df.apply(get_strength, axis = 1) 

    if model == "krzy06":
        condition = (
            ((df["typeDescKey"] == "shot-on-goal") | (df["typeDescKey"] == "goal")) &
            ((df["lastEvent"] == "takeaway") | (df["lastEvent"] == "giveaway"))
        )
        df["shotAfterTurnover"] = np.where(condition, "Yes", "No")
    elif model == "krzy09":
        condition = (
            ((df["typeDescKey"] == "shot-on-goal") | (df["typeDescKey"] == "goal")) &
            ((df["lastEvent"] == "giveaway") & (df["lastEventTeam"] != df["details.eventOwnerTeamId"]))
        )
        df["shotAfterOppGiveaway"] = np.where(condition, "Yes", "No")

    # Reduce to only neccessary columns for the model
    if model == "krzy06":
        keep_cols = ["typeDescKey", "shotDistanceClass", "details.shotType", "Rebound", "Situation", "shotAfterTurnover"]
        df = df[keep_cols]
    elif model == "krzy09":
        keep_cols = ["typeDescKey", "adjDistance", "details.shotType", "Rebound", "Situation", "shotAfterOppGiveaway"]
        df = df[keep_cols]
    else:
        keep_cols = ["typeDescKey", "shotDistanceClass", "details.shotType", "Rebound", "Situation"]
        df = df[keep_cols]

    # Position data to train model (one-hot)
    df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})
    df = df.rename(columns = {"typeDescKey": "goal"})
    df = pd.get_dummies(df, dtype = float)

    if model == "krzy05a": 
        df["Rebound"] = df["Rebound"].astype(float)
    elif model == "krzy06":
        df = df.join(pd.get_dummies(df["Rebound"], dtype = float, prefix = "Rebound")).drop("Rebound", axis = 1)

    if model == "krzy06":
        df["details.shotType_wrap-or-slap"] = df["details.shotType_wrap-around"] + df["details.shotType_slap"]
        df = df.drop(["details.shotType_wrap-around", "details.shotType_slap"], axis=1)
    elif model == "krzy09":
        df["details.shotType_wrap-or-slap"] = df["details.shotType_wrap-around"] + df["details.shotType_slap"]
        df["details.shotType_tip-or-deflection"] = df["details.shotType_tip-in"] + df["details.shotType_deflected"]
        df = df.drop(["details.shotType_wrap-around", "details.shotType_slap", "details.shotType_tip-in", "details.shotType_deflected"], axis=1)
    return(df)

### Get Data

In [11]:
skaters = pd.read_csv("skaters.csv")
goalies = pd.read_csv("goalies.csv")
pbp = pd.read_csv("pbp.csv")
pbp.head()

  pbp = pd.read_csv("pbp.csv")


Unnamed: 0,eventId,timeInPeriod,timeRemaining,situationCode,homeTeamDefendingSide,typeCode,typeDescKey,sortOrder,periodDescriptor.number,periodDescriptor.periodType,periodDescriptor.maxRegulationPeriods,details.eventOwnerTeamId,details.losingPlayerId,details.winningPlayerId,details.xCoord,details.yCoord,details.zoneCode,details.hittingPlayerId,details.hitteePlayerId,details.shotType,details.shootingPlayerId,details.goalieInNetId,details.awaySOG,details.homeSOG,details.reason,details.typeCode,details.descKey,details.duration,details.committedByPlayerId,details.drawnByPlayerId,details.blockingPlayerId,details.playerId,details.secondaryReason,details.scoringPlayerId,details.scoringPlayerTotal,details.assist1PlayerId,details.assist1PlayerTotal,details.assist2PlayerId,details.assist2PlayerTotal,details.awayScore,details.homeScore,details.discreteClip,details.servedByPlayerId,gameId,periodDescriptor.otPeriods,pptReplayUrl,details.highlightClipSharingUrl,details.highlightClipSharingUrlFr,details.highlightClip,details.highlightClipFr,details.discreteClipFr
0,51.0,00:00,20:00,1551.0,right,520.0,period-start,8.0,1.0,REG,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
1,53.0,00:00,20:00,1551.0,right,502.0,faceoff,9.0,1.0,REG,3.0,10.0,8480018.0,8475166.0,0.0,0.0,N,,,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
2,101.0,00:15,19:45,1551.0,right,503.0,hit,10.0,1.0,REG,3.0,10.0,,,-96.0,26.0,O,8475166.0,8481014.0,,,,,,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
3,54.0,00:17,19:43,1551.0,right,506.0,shot-on-goal,11.0,1.0,REG,3.0,10.0,,,-49.0,7.0,O,,,snap,8480043.0,8480051.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,
4,55.0,00:27,19:33,1551.0,right,507.0,missed-shot,12.0,1.0,REG,3.0,10.0,,,-31.0,39.0,O,,,wrist,8480043.0,8480051.0,,,wide-of-net,,,,,,,,,,,,,,,,,,,2021010001,,,,,,,


### Split Play-By-Play Into Train (2021-22, 2022-23, 2023-24) and Test (2024-25)

In [12]:
# Set "gameId" to a string
pbp["gameId"] = pbp["gameId"].apply(str)

# Split into training and testing events
train_pbp = pbp.loc[pbp.gameId.str.startswith(("2021", "2022", "2023"), na=False)]
test_pbp = pbp.loc[pbp.gameId.str.startswith("2024", na=False)]
print("Training events: ", len(train_pbp))
print("Testing events: ", len(test_pbp))

Training events:  1404747
Testing events:  479352


In [13]:
# Create timeInPeriodSeconds
train_pbp["timeInPeriodSeconds"] = pd.to_timedelta("00:" + train_pbp["timeInPeriod"]).dt.total_seconds().astype(int)
test_pbp["timeInPeriodSeconds"] = pd.to_timedelta("00:" + test_pbp["timeInPeriod"]).dt.total_seconds().astype(int)

In [14]:
# Create new columns to help define rebounds (lastEvent & timeSinceLastEvent)
train_pbp["lastEvent"] = train_pbp.groupby("gameId")["typeDescKey"].shift(1)
test_pbp["lastEvent"] = test_pbp.groupby("gameId")["typeDescKey"].shift(1)

train_pbp["lastEventTeam"] = train_pbp.groupby("gameId")["details.eventOwnerTeamId"].shift(1)
test_pbp["lastEventTeam"] = test_pbp.groupby("gameId")["details.eventOwnerTeamId"].shift(1)

train_pbp["timeInPeriodShifted"] = train_pbp.groupby("gameId")["timeInPeriodSeconds"].shift(1)
test_pbp["timeInPeriodShifted"] = test_pbp.groupby("gameId")["timeInPeriodSeconds"].shift(1)

train_pbp["timeSinceLastEvent"] = train_pbp["timeInPeriodSeconds"] - train_pbp["timeInPeriodShifted"]
test_pbp["timeSinceLastEvent"] = test_pbp["timeInPeriodSeconds"] - test_pbp["timeInPeriodShifted"]

### [Krzywicki (2005a)](https://www.hockeyanalytics.com/Research_files/Shot_Quality_Krzywicki.pdf)

In [15]:
# Variables
# - Distance (<10ft, 10ft-12ft, 13ft-14ft, 15ft-16ft, 17ft-22ft, 23ft-31ft, 32ft-36ft, 37ft-38ft
#             39ft-44ft, 45ft-57ft, >=58ft)
# - Shot Type (Wrap, Slap, Wrist, Snap, Backhand, Tip)
# - Rebound (Yes, No)
# - Situation (Even Strength, Shorthanded, Powerplay)

In [16]:
# Use preprocess function to get training and testing shots for Krzy05a
train_pbp_krzy05a = preprocess_data(train_pbp, "krzy05a", gametype = "regular")
test_pbp_krzy05a = preprocess_data(test_pbp, "krzy05a", gametype = "regular")
train_pbp_krzy05a.head()

  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})
  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})


Unnamed: 0,goal,Rebound,shotDistanceClass_0,shotDistanceClass_1,shotDistanceClass_2,shotDistanceClass_3,shotDistanceClass_4,shotDistanceClass_5,shotDistanceClass_6,shotDistanceClass_7,shotDistanceClass_8,shotDistanceClass_9,shotDistanceClass_10,details.shotType_backhand,details.shotType_bat,details.shotType_between-legs,details.shotType_cradle,details.shotType_deflected,details.shotType_poke,details.shotType_slap,details.shotType_snap,details.shotType_tip-in,details.shotType_wrap-around,details.shotType_wrist,Situation_EV,Situation_PP,Situation_SH
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
print("Bat: ", train_pbp_krzy05a["details.shotType_bat"].sum())
print("Between the legs: ", train_pbp_krzy05a["details.shotType_between-legs"].sum())
print("Cradle: ", train_pbp_krzy05a["details.shotType_cradle"].sum())
print("Deflected: ", train_pbp_krzy05a["details.shotType_deflected"].sum())
print("Poke: ", train_pbp_krzy05a["details.shotType_poke"].sum())
print("Total Shots: ", len(train_pbp_krzy05a))
print("---------------------------")
print("Percentage of New Categories: ", (train_pbp_krzy05a["details.shotType_bat"].sum() 
                                         + train_pbp_krzy05a["details.shotType_between-legs"].sum() 
                                         + train_pbp_krzy05a["details.shotType_cradle"].sum() 
                                         + train_pbp_krzy05a["details.shotType_deflected"].sum() 
                                         + train_pbp_krzy05a["details.shotType_poke"].sum())/len(train_pbp_krzy05a))

Bat:  432.0
Between the legs:  103.0
Cradle:  8.0
Deflected:  4323.0
Poke:  768.0
Total Shots:  246096
---------------------------
Percentage of New Categories:  0.02289350497366881


In [18]:
# Make X's and y's out of train/test
X_train_krzy05a = train_pbp_krzy05a.drop("goal", axis=1)
X_test_krzy05a = test_pbp_krzy05a.drop("goal", axis=1)
y_train_krzy05a = train_pbp_krzy05a["goal"]
y_test_krzy05a = test_pbp_krzy05a["goal"]

In [19]:
# Train model and make predictions
model_krzy05a = LogisticRegression(penalty = None).fit(X_train_krzy05a, y_train_krzy05a)
preds_krzy05a = model_krzy05a.predict_proba(X_test_krzy05a)

In [20]:
import math
# Intercept + 15ft + Rebound + ES + Wrist
# Krzy 
print(-2.2369+0.5174+1.3362-0.1244+0.0093)
print(1/(1+math.exp(-(-2.2369+0.5174+1.3362-0.1244+0.0093))))
# Mine
print(-1.547388+0.515755+0.110711-0.627137-0.087333)
print(1/(1+math.exp(-(-1.547388+0.515755+0.110711-0.627137-0.087333))))

-0.49839999999999995
0.37791674834455724
-1.635392
0.16309304954093873


In [21]:
testing = pd.DataFrame(columns = X_train_krzy05a.columns.tolist(), data = [[0] * len(X_train_krzy05a.columns.tolist())])
testing["shotDistanceClass_3"] = 1.0
testing["details.shotType_wrist"] = 1.0
testing["Situation_EV"] = 1.0
testing["Rebound"] = 1.0
model_krzy05a.predict_proba(testing)

array([[0.83668269, 0.16331731]])

In [20]:
# New
coeff_krzy05a = pd.DataFrame({"Variable": X_train_krzy05a.columns, "Coefficient": model_krzy05a.coef_[0]})
intercept_krzy05a = pd.DataFrame({"Variable": ["Intercept"], "Coefficient": model_krzy05a.intercept_})
pd.concat([intercept_krzy05a, coeff_krzy05a], ignore_index = True)

Unnamed: 0,Variable,Coefficient
0,Intercept,-1.525397
1,Rebound,0.101603
2,shotDistanceClass_0,0.86173
3,shotDistanceClass_1,0.652037
4,shotDistanceClass_2,0.610051
5,shotDistanceClass_3,0.520885
6,shotDistanceClass_4,0.416465
7,shotDistanceClass_5,0.074876
8,shotDistanceClass_6,-0.421265
9,shotDistanceClass_7,-0.666254


In [22]:
coeff_krzy05a = pd.DataFrame({"Variable": X_train_krzy05a.columns, "Coefficient": model_krzy05a.coef_[0]})
intercept_krzy05a = pd.DataFrame({"Variable": ["Intercept"], "Coefficient": model_krzy05a.intercept_})
pd.concat([intercept_krzy05a, coeff_krzy05a], ignore_index = True)

Unnamed: 0,Variable,Coefficient
0,Intercept,-1.547388
1,Rebound,0.110711
2,shotDistanceClass_0,0.862282
3,shotDistanceClass_1,0.655184
4,shotDistanceClass_2,0.60519
5,shotDistanceClass_3,0.515755
6,shotDistanceClass_4,0.417744
7,shotDistanceClass_5,0.079233
8,shotDistanceClass_6,-0.414564
9,shotDistanceClass_7,-0.667846


In [23]:
# Obtain model performance
bs_krzy05a = brier_score_loss(y_test_krzy05a, preds_krzy05a[:,1])
print("Brier Score: ", bs_krzy05a)

Brier Score:  0.09174032542920912


In [24]:
# Obtain model performance predicting shooting percentage everytime
shoot_per = sum(y_train_krzy05a)/len(y_train_krzy05a)
bs_shoot_per = brier_score_loss(y_test_krzy05a, [shoot_per] * len(y_test_krzy05a))
print("Brier Score: ", bs_shoot_per)

Brier Score:  0.09623833828472177


### [Krzywicki (2005b)](https://hockeyanalytics.com/Research_files/Playoff_Shot_Quality_2004_Krzywicki.pdf)

In [25]:
test_pbp_krzy05b = preprocess_data(test_pbp, "krzy05b", gametype = "playoff")
test_pbp_krzy05b.head()

  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})


Unnamed: 0,goal,Rebound,shotDistanceClass_0,shotDistanceClass_1,shotDistanceClass_2,shotDistanceClass_3,shotDistanceClass_4,shotDistanceClass_5,shotDistanceClass_6,shotDistanceClass_7,shotDistanceClass_8,shotDistanceClass_9,shotDistanceClass_10,details.shotType_backhand,details.shotType_bat,details.shotType_between-legs,details.shotType_deflected,details.shotType_poke,details.shotType_slap,details.shotType_snap,details.shotType_tip-in,details.shotType_wrap-around,details.shotType_wrist,Situation_EV,Situation_PP,Situation_SH
0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [26]:
# Get playoff testing shots
print("Testing Shots: ", len(test_pbp_krzy05b))

Testing Shots:  4392


In [27]:
# Add variable that doesn't exit in the testing data
test_pbp_krzy05b["details.shotType_cradle"] = 0.0
test_pbp_krzy05b = test_pbp_krzy05b[test_pbp_krzy05a.columns.tolist()]

# Make X's and y's out of train/test
X_test_krzy05b = test_pbp_krzy05b.drop("goal", axis=1)
y_test_krzy05b = test_pbp_krzy05b["goal"]

In [28]:
# Model used in Krzywicki (2005b) is the one created in Krzywicki (2005a)
model_krzy05b = model_krzy05a

In [29]:
# Make predictions
preds_krzy05b = model_krzy05b.predict_proba(X_test_krzy05b)

In [30]:
# Obtain model performance
bs_krzy05b = brier_score_loss(y_test_krzy05b, preds_krzy05b[:,1])
print("Brier Score: ", bs_krzy05b)

Brier Score:  0.09488614607906869


In [31]:
# Obtain model performance predicting shooting percentage everytime
bs_shoot_per_playoffs = brier_score_loss(y_test_krzy05b, [shoot_per] * len(y_test_krzy05b))
print("Brier Score: ", bs_shoot_per_playoffs)

Brier Score:  0.0990314981140374


### [Krzywicki (2006)](https://www.hockeyanalytics.com/Research_files/Shot_Quality_2006_Krzywicki.pdf)

In [15]:
# Variables
# - Distance (<12ft, 12ft, 13ft-16ft, 17ft-18ft, 19ft-21ft, 22ft-32ft, 33ft-35ft, 36ft-37ft
#             38ft-40ft, 41ft-44ft, 45ft-52ft, 53ft-59ft >=60ft)
# - Shot Type (Wrap or Slap, Wrist, Snap, Backhand, Tip)
# - Rebound (Yes, No)
# - Situation (Even Strength, Shorthanded, Powerplay)
# - Shot after turnover (Yes, No)

In [16]:
# Use preprocess function to get training and testing shots for Krzy06
train_pbp_krzy06 = preprocess_data(train_pbp, "krzy06", gametype = "regular")
test_pbp_krzy06 = preprocess_data(test_pbp, "krzy06", gametype = "regular")
train_pbp_krzy06.head()

  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})
  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})


Unnamed: 0,goal,shotDistanceClass_0,shotDistanceClass_1,shotDistanceClass_2,shotDistanceClass_3,shotDistanceClass_4,shotDistanceClass_5,shotDistanceClass_6,shotDistanceClass_7,shotDistanceClass_8,shotDistanceClass_9,shotDistanceClass_10,shotDistanceClass_11,shotDistanceClass_12,details.shotType_backhand,details.shotType_bat,details.shotType_between-legs,details.shotType_cradle,details.shotType_deflected,details.shotType_poke,details.shotType_snap,details.shotType_tip-in,details.shotType_wrist,Situation_EV,Situation_PP,Situation_SH,shotAfterTurnover_No,shotAfterTurnover_Yes,Rebound_False,Rebound_True,details.shotType_wrap-or-slap
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [18]:
print("Bat: ", train_pbp_krzy06["details.shotType_bat"].sum())
print("Between the legs: ", train_pbp_krzy06["details.shotType_between-legs"].sum())
print("Cradle: ", train_pbp_krzy06["details.shotType_cradle"].sum())
print("Deflected: ", train_pbp_krzy06["details.shotType_deflected"].sum())
print("Poke: ", train_pbp_krzy06["details.shotType_poke"].sum())
print("Total Shots: ", len(train_pbp_krzy06))
print("---------------------------")
print("Percentage of New Categories: ", (train_pbp_krzy06["details.shotType_bat"].sum() 
                                         + train_pbp_krzy06["details.shotType_between-legs"].sum() 
                                         + train_pbp_krzy06["details.shotType_cradle"].sum() 
                                         + train_pbp_krzy06["details.shotType_deflected"].sum() 
                                         + train_pbp_krzy06["details.shotType_poke"].sum())/len(train_pbp_krzy06))

Bat:  432.0
Between the legs:  103.0
Cradle:  8.0
Deflected:  4323.0
Poke:  768.0
Total Shots:  246096
---------------------------
Percentage of New Categories:  0.02289350497366881


In [19]:
# Make X's and y's out of train/test
X_train_krzy06 = train_pbp_krzy06.drop("goal", axis=1)
X_test_krzy06 = test_pbp_krzy06.drop("goal", axis=1)
y_train_krzy06 = train_pbp_krzy06["goal"]
y_test_krzy06 = test_pbp_krzy06["goal"]

In [20]:
# Train model and make predictions
model_krzy06 = LogisticRegression(penalty = None).fit(X_train_krzy06, y_train_krzy06)
preds_krzy06 = model_krzy06.predict_proba(X_test_krzy06)

In [21]:
coeff_krzy06 = pd.DataFrame({"Variable": X_train_krzy06.columns, "Coefficient": model_krzy06.coef_[0]})
intercept_krzy06 = pd.DataFrame({"Variable": ["Intercept"], "Coefficient": model_krzy06.intercept_})
pd.concat([intercept_krzy06, coeff_krzy06], ignore_index = True)

Unnamed: 0,Variable,Coefficient
0,Intercept,-0.843841
1,shotDistanceClass_0,0.932792
2,shotDistanceClass_1,0.821743
3,shotDistanceClass_2,0.772717
4,shotDistanceClass_3,0.727768
5,shotDistanceClass_4,0.60768
6,shotDistanceClass_5,0.323766
7,shotDistanceClass_6,-0.118845
8,shotDistanceClass_7,-0.326
9,shotDistanceClass_8,-0.461317


In [22]:
# Intercept + 15ft + Rebound + ES + Wrist + No Turnover
# Krzy 
print(-2.0671+0.4856+1.3382-0.1542+0.0127-0.0428)
print(1/(1+math.exp(-(-2.0671+0.4856+1.3382-0.1542+0.0127-0.0428))))
# Mine
print(-1.119431+0.779497-0.470588-0.507010-0.207333-0.571362)
print(1/(1+math.exp(-(-1.119431+0.779497-0.470588-0.507010-0.207333-0.571362))))

-0.42759999999999987
0.39469957479155254
-2.096227
0.10946407804007893


In [23]:
# Obtain model performance
bs_krzy06 = brier_score_loss(y_test_krzy06, preds_krzy06[:,1])
print("Brier Score: ", bs_krzy06)

Brier Score:  0.09262130341676479


In [24]:
# Obtain model performance predicting shooting percentage everytime
shoot_per06 = sum(y_train_krzy06)/len(y_train_krzy06)
bs_shoot_per06 = brier_score_loss(y_test_krzy06, [shoot_per06] * len(y_test_krzy06))
print("Brier Score: ", bs_shoot_per06)

Brier Score:  0.09623833828472177


### [Krzywicki (2009)](https://www.hockeyanalytics.com/Research_files/SQ-DistAdj-RS0809-Krzywicki.pdf)

In [33]:
preprocess_data(train_pbp, "krzy09", gametype = "regular")

  df["typeDescKey"] = df["typeDescKey"].replace({"shot-on-goal": 0, "goal": 1})


Unnamed: 0,goal,adjDistance,Rebound,details.shotType_backhand,details.shotType_bat,details.shotType_between-legs,details.shotType_cradle,details.shotType_poke,details.shotType_snap,details.shotType_wrist,Situation_EV,Situation_PP,Situation_SH,shotAfterOppGiveaway_No,shotAfterOppGiveaway_Yes,details.shotType_wrap-or-slap,details.shotType_tip-or-deflection
0,0,-12.692993,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0,-10.299849,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,-0.764500,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,-5.214952,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,26.454850,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246091,0,-13.695633,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
246092,0,26.212591,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
246093,0,-7.308730,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
246094,0,22.507432,False,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
