## Import Things

In [1]:
#!pip install xgboost
#!pip install networkx
#!pip install torch_geometric

In [1]:
# Import Packages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
import torch
import random
import math
import copy
import xgboost as xgb
import networkx as nx
import matplotlib.pyplot as plt
import torch_geometric
import os
import torch.nn.functional as F
from torch import nn
from torch.nn import Linear
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.loader import DataLoader
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
pd.set_option('display.max_columns', None)

# Set random seed
seed = 2024
random.seed(seed)

## Load MoneyPuck raw shot data and standardize structure

In [2]:
#Import the ten csv's downloaded from Moneypuck (https://moneypuck.com/data.htm)
#shots1415 = pd.read_csv("shots_2014_2015.csv")
#shots1516 = pd.read_csv("shots_2015_2016.csv")
#shots1617 = pd.read_csv("shots_2016_2017.csv")
#shots1718 = pd.read_csv("shots_2017_2018.csv")
#shots1819 = pd.read_csv("shots_2018_2019.csv")
shots1920 = pd.read_csv("shots_2019_2020.csv")
shots2021 = pd.read_csv("shots_2020_2021.csv")
shots2122 = pd.read_csv("shots_2021_2022.csv")
shots2223 = pd.read_csv("shots_2022_2023.csv")
shots2324 = pd.read_csv("shots_2023_2024.csv")

In [3]:
# Show differing number of columns
print(shots1920.shape)
print(shots2021.shape)
shots2021.columns.difference(shots1920.columns).tolist()

(104172, 124)
(78611, 137)


['gameOver',
 'homeTeamScore',
 'homeWinProbability',
 'penaltyLength',
 'playoffGame',
 'roadTeamCode',
 'roadTeamScore',
 'shotGoalProbability',
 'shotPlayContinued',
 'timeBetweenEvents',
 'timeLeft',
 'wentToOT',
 'wentToShootout']

In [4]:
# Drop columns from 2020-21 season that don't exist in the previous season
shots2021.drop(shots2021.columns.difference(shots1920.columns).tolist(), axis=1, inplace=True)
print(shots2021.shape)
print(shots1920.shape)

(78611, 124)
(104172, 124)


In [5]:
# concat all the shots dataframes together to simplify usage
pdList = []
pdList.extend(value for name, value in locals().items() if name.startswith('shots'))
print(f"num rows in seperate dataframes: {sum([len(x) for x in pdList])}")
_df = pd.concat(pdList, ignore_index=True)
print(f"num rows in combined dataframe:  {len(_df)}")

num rows in seperate dataframes: 548752
num rows in combined dataframe:  548752


## Cleanse raw data

In [6]:
# Ensure team abbreviations are consistent (ex. L.A -> LAK)
for (old, new) in [
    ('L.A', 'LAK'),
    ('N.J', 'NJD'),
    ('S.J', 'SJS'),
    ('T.B', 'TBL')
]:
    _df.loc[_df.homeTeamCode == old, "homeTeamCode"] = new
    _df.loc[_df.awayTeamCode == old, "awayTeamCode"] = new
    _df.loc[_df.teamCode == old, "teamCode"] = new

In [7]:
# Update "Sebastian Aho" -> "Sebastian Aho NYI"
_df.loc[(_df.shooterName == "Sebastian Aho") & (_df.teamCode == "NYI"), "shooterName"] = "Sebastian Aho NYI"

# Update "Matt Murray" -> "Matt Murray DAL" (only played 2022-23 and 2023-24, have to go game by game because nothing on goalie side indicates which it is)
_df.loc[(_df.goalieNameForShot == "Matt Murray") & (_df.game_id == 20979) & (_df.season == 2022), "goalieNameForShot"] = "Matt Murray DAL"
_df.loc[(_df.goalieNameForShot == "Matt Murray") & (_df.game_id == 21073) & (_df.season == 2022), "goalieNameForShot"] = "Matt Murray DAL"
_df.loc[(_df.goalieNameForShot == "Matt Murray") & (_df.game_id == 21161) & (_df.season == 2022), "goalieNameForShot"] = "Matt Murray DAL"
_df.loc[(_df.goalieNameForShot == "Matt Murray") & (_df.season == 2023), "goalieNameForShot"] = "Matt Murray DAL"

In [8]:
# Only keep data about shots and goals
_df = _df[_df.event != "MISS"]
print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 393996


In [9]:
# Remove columns that are unneeded, expected values and info about post shot activity
_df = _df.drop([
    "shotID",
    "goalieIdForShot",
    "id",
    "playerNumThatDidEvent",
    "event",
    "playerNumThatDidLastEvent",
    "shooterPlayerId",
    "xFroze",
    "xPlayContinuedInZone",
    "xPlayContinuedOutsideZone",
    "xPlayStopped",
    "xRebound",
    "xShotWasOnGoal", 
    "homeTeamWon",
    "shotAnglePlusRebound",
    "shotGeneratedRebound", 
    "shotGoalieFroze", 
    "shotPlayContinuedInZone",
    "shotPlayContinuedOutsideZone",
    "shotPlayStopped",
    "timeUntilNextEvent"
], axis=1, errors='ignore')
print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 393996


In [10]:
# Remove rows with missing data in max/min time on ice rows (MoneyPuck says that this is denoted when a max is set to 0 and a min set to 999)
times_to_exclude_min = [999]
times_to_exclude_max = [0]
_df = _df[
    (~_df.defendingTeamMaxTimeOnIce.isin(times_to_exclude_max)) &
    (~_df.defendingTeamMaxTimeOnIceOfDefencemen.isin(times_to_exclude_max)) & 
    (~_df.defendingTeamMaxTimeOnIceOfDefencemenSinceFaceoff.isin(times_to_exclude_max)) &
    (~_df.defendingTeamMaxTimeOnIceOfForwards.isin(times_to_exclude_max)) & 
    (~_df.defendingTeamMaxTimeOnIceOfForwardsSinceFaceoff.isin(times_to_exclude_max)) & 
    (~_df.defendingTeamMaxTimeOnIceSinceFaceoff.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIce.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIceOfDefencemen.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIceOfDefencemenSinceFaceoff.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIceOfForwards.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIceOfForwardsSinceFaceoff.isin(times_to_exclude_max)) & 
    (~_df.shootingTeamMaxTimeOnIceSinceFaceoff.isin(times_to_exclude_max)) &
    (~_df.defendingTeamMinTimeOnIce.isin(times_to_exclude_min)) &
    (~_df.defendingTeamMinTimeOnIceOfDefencemen.isin(times_to_exclude_min)) & 
    (~_df.defendingTeamMinTimeOnIceOfDefencemenSinceFaceoff.isin(times_to_exclude_min)) &
    (~_df.defendingTeamMinTimeOnIceOfForwards.isin(times_to_exclude_min)) & 
    (~_df.defendingTeamMinTimeOnIceOfForwardsSinceFaceoff.isin(times_to_exclude_min)) & 
    (~_df.defendingTeamMinTimeOnIceSinceFaceoff.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIce.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIceOfDefencemen.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIceOfDefencemenSinceFaceoff.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIceOfForwards.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIceOfForwardsSinceFaceoff.isin(times_to_exclude_min)) & 
    (~_df.shootingTeamMinTimeOnIceSinceFaceoff.isin(times_to_exclude_min))
]

print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 391465


In [11]:
# deal with NAs
for c in ["shotType", "playerPositionThatDidEvent", "shooterName", "shooterLeftRight"]:
    _df = _df[_df[c].notna()]
# Fill nan's when there's a goal on an empty net
_df.goalieNameForShot = _df.goalieNameForShot.fillna("Empty Net")
print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 387983


In [12]:
# Remove shots taken by other goaltender
_df = _df[_df["playerPositionThatDidEvent"] != "G"]
print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 387952


In [13]:
# Remove shots with too few or too many skaters on the ice
# There should be between 6 and 11 skaters (possibly 12, but both teams won't have goalies pulled simultaneously)
player_num_cols = ["shootingTeamForwardsOnIce", "shootingTeamDefencemenOnIce", "defendingTeamForwardsOnIce", "defendingTeamDefencemenOnIce"]
_df = _df[_df[player_num_cols].sum(axis=1).between(6, 11)]
print(f"num rows in combined dataframe: {len(_df)}")

num rows in combined dataframe: 387932


In [14]:
# Save cleansed data
_df.to_csv("cleansed_shot_data.csv", index = False)

## Encode Categorical Variables

In [15]:
# Import cleansed data
_df = pd.read_csv("cleansed_shot_data.csv", )
print(f"num rows in combined dataframe: {len(_df)}")
print(f"num cols in combined dataframe: {len(_df.columns)}")

num rows in combined dataframe: 387932
num cols in combined dataframe: 103


In [16]:
# Target Encoding
# https://maxhalford.github.io/blog/target-encoding/
def calc_smooth_mean(df, by, on, m):
    # Compute the global mean
    mean = df[on].mean()
    
    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[on].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']
    
    # Compute the 'smoothed' mean
    smooth = (counts * means + m * mean) / (counts + m)
    
    # Replace each value by the according smoothed mean
    return df[by].map(smooth)

In [17]:
# Create the one hot encoding for low cardinality categorical columns (if <10 class one-hot, otherwise target)
for c in [
    #"homeTeamCode", 32 classes
    #"awayTeamCode", 32 classes
    "team", # 2 classes
    "location", # 3 classes
    "shotType", # 7 classes
    #"lastEventCategory", 17 classes
    #"lastEventTeam", 30 classes
    ##"goalieNameForShot", 237 classes
    ##"shooterName", 2017 classes
    "shooterLeftRight", # 2 classes
    "playerPositionThatDidEvent", # 5 classes
    #"teamCode" 32 classes
]:
    col_enc_df = pd.get_dummies(_df[c], sparse=True).reindex(columns=_df[c].unique(), fill_value=False).astype(int).add_prefix(f"{c}_")
    _df = pd.concat([_df, col_enc_df], axis = 1)
    # Drop the unencoded column
    _df.drop([c], axis=1, inplace=True, errors='ignore')

print(f"num rows in encoded dataframe: {len(_df)}")
print(f"num cols in encoded dataframe: {len(_df.columns)}")

num rows in encoded dataframe: 387932
num cols in encoded dataframe: 116


In [18]:
# Calculate the smoothed target encoding for high cardinality categorical columns
for c in [
    "homeTeamCode", # 32 classes
    "awayTeamCode", # 32 classes
    # "team", 2 classes
    # "location", 3 classes
    # "shotType", 7 classes
    "lastEventCategory", # 17 classes
    "lastEventTeam", # 30 classes
    "goalieNameForShot", # 237 classes
    "shooterName", # 2017 classes
    # "shooterLeftRight", 2 classes
    # "playerPositionThatDidEvent", 5 classes
    "teamCode" # 32 classes
]:
    m = len(_df[c])
    _df[f'{c}_target'] = calc_smooth_mean(_df, by=c, on='goal', m=m)
    
print(f"num rows in encoded dataframe: {len(_df)}")
print(f"num cols in encoded dataframe: {len(_df.columns)}")

num rows in encoded dataframe: 387932
num cols in encoded dataframe: 123


In [19]:
# Save target encodings
_df[[
    "homeTeamCode",
    "awayTeamCode",
    # "team",
    # "location",
    # "shotType",
    "lastEventCategory",
    "lastEventTeam",
    "goalieNameForShot",
    "shooterName",
    # "shooterLeftRight",
    # "playerPositionThatDidEvent",
    "teamCode",
    "homeTeamCode_target",
    "awayTeamCode_target",
    # "team_target",
    # "location_target",
    # "shotType_target",
    "lastEventCategory_target",
    "lastEventTeam_target",
    "goalieNameForShot_target",
    "shooterName_target",
    # "shooterLeftRight_target",
    # "playerPositionThatDidEvent_target",
    "teamCode_target"
]].to_csv("target_encoding_lookups.csv", index = False)

In [20]:
# Drop encoded columns
_df_reduced = _df.drop([ "homeTeamCode",
    "awayTeamCode",
    "team",
    "location",
    "shotType",
    "lastEventCategory",
    "lastEventTeam",
    "goalieNameForShot",
    "shooterName",
    "shooterLeftRight",
    "playerPositionThatDidEvent",
    "teamCode",
    "game_id"], axis=1, inplace=False, errors='ignore')
print(f"num cols in encoded dataframe: {len(_df_reduced.columns)}")

num cols in encoded dataframe: 115


In [21]:
# Save encoded data
_df_reduced.to_csv("encoded_shot_data.csv", index = False)

## Create Training and Testing Datasets

In [22]:
# Import encoded data
_df_reduced = pd.read_csv("encoded_shot_data.csv")
print(f"num rows in encoded dataframe: {len(_df_reduced)}")
print(f"num cols in encoded dataframe: {len(_df_reduced.columns)}")

num rows in encoded dataframe: 387932
num cols in encoded dataframe: 115


In [23]:
# Create training and test sets
train_df = _df_reduced[_df_reduced["season"] != 2023]
test_df = _df_reduced[_df_reduced["season"] == 2023]

print(len(train_df))
print(len(test_df))

X_train = train_df.loc[:, ~train_df.columns.isin(["goal", "xGoal"])]
X_test = test_df.loc[:, ~test_df.columns.isin(["goal", "xGoal"])]
y_train = train_df["goal"]
y_test = test_df["goal"]

306283
81649


In [24]:
# Limit columns to those used in the graph neural network
limit_X_train = X_train[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                         "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]
limit_X_test = X_test[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target",
                      "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]

## Fit Logistic Regression

In [25]:
# Train Logistic Regression
lr = LogisticRegression(random_state = seed, max_iter = 500, verbose=1).fit(limit_X_train, y_train)

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  1.62777D+01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     48     57      1     0     0   7.173D-05   2.972D-01
  F =  0.29722251828284807     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [26]:
# Determine model performance
preds = lr.predict_proba(limit_X_test)[:,1]
print("Logistic Regression MSE: ", np.square(np.subtract(y_test,preds)).mean())
print("Logistic Regression Accuracy: ", sum(lr.predict(limit_X_test) == y_test)/len(y_test))

Logistic Regression MSE:  0.08655198899196545
Logistic Regression Accuracy:  0.8992639223995395


## Fit xGBoost

In [27]:
# Split data into folds by year for 4-fold cross-validation
#train1415_df = train_df[train_df["season"] == 2014]
#train1516_df = train_df[train_df["season"] == 2015]
#train1617_df = train_df[train_df["season"] == 2016]
#train1718_df = train_df[train_df["season"] == 2017]
#train1819_df = train_df[train_df["season"] == 2018]
train1920_df = train_df[train_df["season"] == 2019]
train2021_df = train_df[train_df["season"] == 2020]
train2122_df = train_df[train_df["season"] == 2021]
train2223_df = train_df[train_df["season"] == 2022]

#limit_X_train1415 = train1415_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
#                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen"]]
#limit_X_train1516 = train1516_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
#                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen"]]
#limit_X_train1617 = train1617_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
#                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen"]]
#limit_X_train1718 = train1718_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
#                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen"]]
#limit_X_train1819 = train1819_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
#                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen"]]
limit_X_train1920 = train1920_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]
limit_X_train2021 = train2021_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]
limit_X_train2122 = train2122_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]
limit_X_train2223 = train2223_df[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                                    "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]

#y_train1415 = train1415_df["goal"]
#y_train1516 = train1516_df["goal"]
#y_train1617 = train1617_df["goal"]
#y_train1718 = train1718_df["goal"]
#y_train1819 = train1819_df["goal"]
y_train1920 = train1920_df["goal"]
y_train2021 = train2021_df["goal"]
y_train2122 = train2122_df["goal"]
y_train2223 = train2223_df["goal"]

X_trains = [limit_X_train1920, limit_X_train2021, limit_X_train2122, limit_X_train2223]#[limit_X_train1415, limit_X_train1516, limit_X_train1617, limit_X_train1718, limit_X_train1819, limit_X_train1920, limit_X_train2021, limit_X_train2122, limit_X_train2223]
y_trains = [y_train1920, y_train2021, y_train2122, y_train2223]#[y_train1415, y_train1516, y_train1617, y_train1718, y_train1819, y_train1920, y_train2021, y_train2122, y_train2223]

In [28]:
# Intialize MSE and accuracy storage
mses = [[0 for _ in range(4)] for _ in range(32)]
accs = [[0 for _ in range(4)] for _ in range(32)]

# Loop through folds
for fold in range(0, len(X_trains)):
    # Setup data for that fold
    fold_inds = [i for i in range(len(X_trains)) if i != fold]
    fold_dfs = [X_trains[i] for i in fold_inds]
    fold_ys = [y_trains[i] for i in fold_inds]
    all_fold_dfs = pd.concat(fold_dfs)
    all_fold_ys = pd.concat(fold_ys)
    count = 0
    # Loop though possible max depths
    for m_depth in [1,2,3,4]:
        # Loop through possible learning rates
        for lrate in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]:
            # Fit classifier
            xgb_clf = xgb.XGBClassifier(learning_rate = lrate, max_depth = m_depth)
            xgb_clf.fit(all_fold_dfs, all_fold_ys)
            # Make predictions
            # Store performance metrics
            xgb_preds = xgb_clf.predict_proba(limit_X_test)[:,1]
            mses[count][fold] = np.square(np.subtract(y_test,xgb_preds)).mean()
            accs[count][fold] = sum(xgb_clf.predict(limit_X_test) == y_test)/len(y_test)
            print("Fold:", fold," Max Depth: ", m_depth, " Learning Rate: ", lrate, " MSE: ", mses[count][fold], "Accuracy: ", accs[count][fold])
            count = count + 1

Fold: 0  Max Depth:  1  Learning Rate:  0.05  MSE:  0.08240476876422415 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.1  MSE:  0.08171809288414028 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.15  MSE:  0.08147989649309799 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.2  MSE:  0.08138114568067864 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.25  MSE:  0.08134818559938653 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.3  MSE:  0.0813319311786151 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.35  MSE:  0.0813146565918271 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  1  Learning Rate:  0.4  MSE:  0.08132091674878733 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  2  Learning Rate:  0.05  MSE:  0.08160830119642921 Accuracy:  0.9049100417641367
Fold: 0  Max Depth:  2  Learning Rate:  0.1  MSE:  0.08127220500129456 Accuracy:  0.9049

In [29]:
# Choose hyperparameters that minimize average MSE across folds
avg_mses = [0] * len(mses)
for i in range(0,len(mses)):
    avg_mses[i] = sum(mses[i])/len(mses[i])

# Optimal is max depth = 3, lr = 0.15
print(min(range(len(avg_mses)), key=avg_mses.__getitem__))

18


In [31]:
# Fit optimal xGBoost model and check performance on testing data
opt_xgb_clf = xgb.XGBClassifier(learning_rate = 0.15, max_depth = 3)
opt_xgb_clf.fit(limit_X_train, y_train)

preds = opt_xgb_clf.predict_proba(limit_X_test)[:,1]
print("xGBoost MSE: ", np.square(np.subtract(y_test,preds)).mean())
print("xGBoost Accuracy: ", sum(opt_xgb_clf.predict(limit_X_test) == y_test)/len(y_test))

xGBoost MSE:  0.08122035048892716
xGBoost Accuracy:  0.9049222893115654


## Create two node graph data for graph neural network (GNN)

In [25]:
# Create class to generate ShotDataset
class ShotDataset(Dataset):
    def __init__(self, root, filename, transform=None, pre_transform=None, pre_filter=None):
        """
        root: where the dataset should be stored, folder is split into raw_dir (downloaded dataset)
        and processed_dir (processed data).
        """
        self.filename = filename
        super(ShotDataset, self).__init__(root, transform=None, pre_transform=None, pre_filter=None)

    @property
    def raw_file_names(self):
        """
        If this file exists in raw_dir, the download is not triggered.
        """
        return self.filename

    @property
    def processed_file_names(self):
        """
        Not implemented
        """
        return 'xxxxx.pt'

    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, shot in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # Get node features
            node_feats = self._get_node_features(shot)
            # Get edge features
            edge_feats = self._get_edge_features(shot)
            # Get adjacency info
            edge_index = self._get_adjacency_info(shot)
            # Get labels info
            label = self._get_labels(shot["goal"])

        
            # Create data object
            data = Data(x=node_feats, edge_index=edge_index, edge_attr=edge_feats,y=label)
            torch.save(data, os.path.join(self.processed_dir,f'shot_{index}.pt'))

    def _get_node_features(self, shot):
        """
        This will return a matrix/2d array of the shape [# of Nodes, Node Feature size]
        """
        # Initialize features storage
        all_node_feats = []
        shooter_feats = []
        goalie_feats = []

        # Get shooter features
        shooter_feats.append(shot["shooterName_target"])
        shooter_feats.append(shot["shotType_WRIST"])
        shooter_feats.append(shot["shotAngleReboundRoyalRoad"])
        all_node_feats.append(shooter_feats)

        # Get goalie features
        goalie_feats.append(shot["goalieNameForShot_target"])
        goalie_feats.append(shot["defendingTeamAverageTimeOnIce"])
        goalie_feats.append(shot["defendingTeamMaxTimeOnIceOfDefencemen"])
        all_node_feats.append(goalie_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, shot):
        """
        This will return a matrix/2d array of ths shape [# of Nodes, Node Feature size]
        """
        # Initialize feature storage
        all_edge_feats = []
        edge_feats = []
        # Get edge features
        edge_feats.append(shot["shotDistance"])
        all_edge_feats += [edge_feats, edge_feats]
        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, shot):
        # Create adjacency matrix
        edge_indices = []
        edge_indices += [[0,1],[1,0]]
        edge_indices = torch.tensor(edge_indices)
        return edge_indices

    def _get_labels(self, label):
        # Get data label (goal or no goal)
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'shot_{idx}.pt'))
        return data

In [26]:
# Small subset
# Save the data to the folder to create graphs
train_dataset = _df_reduced[_df_reduced["season"] != 2023]
test_dataset = _df_reduced[_df_reduced["season"] == 2023]
train_dataset.to_csv("COSC5P30/raw/train_dataset.csv", index = False)
test_dataset.to_csv("COSC5P30/raw/test_dataset.csv", index = False)

In [27]:
# Create the graphs
train_dat = ShotDataset(filename = "train_dataset.csv", root="COSC5P30/")
test_dat = ShotDataset(filename = "test_dataset.csv", root="COSC5P30/")

Processing...
100%|█████████████████████████████████| 306283/306283 [01:28<00:00, 3441.56it/s]
Done!
Processing...
100%|███████████████████████████████████| 81649/81649 [00:23<00:00, 3432.04it/s]
Done!


In [28]:
# Check what the data looks like
for p in train_dat[0]:
    print(p)

('x', tensor([[ 0.0975,  1.0000,  0.0000],
        [ 0.0973, 30.0000, 33.0000]]))
('edge_index', tensor([[0, 1],
        [1, 0]]))
('edge_attr', tensor([[39.6989],
        [39.6989]]))
('y', tensor([0]))


In [29]:
# Print information about the dataset
print(f'Dataset: {train_dat}:')
print('====================')
print(f'Number of graphs: {len(train_dat)}')
print(f'Number of features: {train_dat.num_features}')
#print(f'Number of classes: {train_dat.num_classes}')

data = train_dat[0]  # Get the first graph object.

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of undirected edges: {data.num_edges // 2}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: ShotDataset(306283):
Number of graphs: 306283
Number of features: 3

Data(x=[2, 3], edge_index=[2, 2], edge_attr=[2, 1], y=[1])
Number of nodes: 2
Number of undirected edges: 1
Average node degree: 1.00
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [30]:
print(f'Number of training graphs: {len(train_dat)}')
print(f'Number of test graphs: {len(test_dat)}')

Number of training graphs: 306283
Number of test graphs: 81649


In [31]:
# Create data loaders
train_loader = DataLoader(train_dat, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dat, batch_size=1024, shuffle=False)

## Fit two node GNN

In [32]:
# Create GCN
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(train_dat.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.bn = nn.BatchNorm1d(hidden_channels)
        self.lin = Linear(hidden_channels,1)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.bn(x)
        return torch.sigmoid(self.lin(x))

# choose either 64 or 128 hidden channels
model = GCN(hidden_channels=128)
print(model)

GCN(
  (conv1): GCNConv(3, 128)
  (conv2): GCNConv(128, 128)
  (conv3): GCNConv(128, 128)
  (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=128, out_features=1, bias=True)
)


In [33]:
# Select optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# create training function
def train():
    model.train()

    # Iterate through data loader
    for data in train_loader:  
        # Forward pass
        out = model(data.x, data.edge_index, data.batch)
        # Compute loss
        loss = criterion(out, data.y.unsqueeze(1).float())
        # Derive gradient
        loss.backward()
        # Update
        optimizer.step()
        optimizer.zero_grad()

# create testing function
def test(loader):
    model.eval()

    sq_error = 0
    correct = 0
    goals = 0

    # Iterate through data loader
    for data in loader:
        # Forward pass
        out = model(data.x, data.edge_index, data.batch)  
        # Make prediction
        pred = out
        # Compute metrics
        sq_error += (((pred - data.y.unsqueeze(1)))**2).sum()
        correct += (torch.round(pred) == data.y.unsqueeze(1)).sum()
        goals += sum(data.y.unsqueeze(1))
    return sq_error / len(loader.dataset), correct / len(loader.dataset), pred, data.y, goals


for epoch in range(0, 30):
    train()
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:06d}, Train Acc: {train_acc[1].item():.6f}, Test Acc: {test_acc[1].item():.6f}, Test MSE: {test_acc[0].item():.6f}')

Epoch: 000000, Train Acc: 0.891538, Test Acc: 0.888633, Test MSE: 0.110602
Epoch: 000001, Train Acc: 0.899622, Test Acc: 0.897427, Test MSE: 0.102481
Epoch: 000002, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100703
Epoch: 000003, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100648
Epoch: 000004, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100727
Epoch: 000005, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100726
Epoch: 000006, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100731
Epoch: 000007, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100731
Epoch: 000008, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100733
Epoch: 000009, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100733
Epoch: 000010, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100733
Epoch: 000011, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100735
Epoch: 000012, Train Acc: 0.901346, Test Acc: 0.899264, Test MSE: 0.100735
Epoch: 000013, Train Acc:

KeyboardInterrupt: 

## Create graph data for graph neural network (GNN) with all players on the ice

In [25]:
# create class to generate 12 node graphs
class MultiPlayerShotDataset(Dataset):
    def __init__(self, root, filename, transform=None, pre_transform=None, pre_filter=None):
        """
        root: where the dataset should be stored, folder is split into raw_dir (downloaded dataset)
        and processed_dir (processed data).
        """
        self.filename = filename
        super(MultiPlayerShotDataset, self).__init__(root, transform=None, pre_transform=None, pre_filter=None)

    @property
    def raw_file_names(self):
        """
        If this file exists in raw_dir, the download is not triggered.
        """
        return self.filename

    @property
    def processed_file_names(self):
        """
        Not implemented
        """
        return 'xxxxx.pt'

    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, shot in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # get number of players on the ice for shot
            n_forwards_shooting_team = int(shot["shootingTeamForwardsOnIce"])
            n_forwards_defending_team = int(shot["defendingTeamForwardsOnIce"])
            n_defencemen_shooting_team = int(shot["shootingTeamDefencemenOnIce"])
            n_defencemen_defending_team = int(shot["defendingTeamDefencemenOnIce"])
            n_players_on_ice = n_forwards_shooting_team + n_forwards_defending_team + n_defencemen_shooting_team + n_defencemen_defending_team
            # Get node features
            node_feats = self._get_node_features(shot, n_forwards_shooting_team, n_forwards_defending_team, n_defencemen_shooting_team, n_defencemen_defending_team)
            # Get edge features
            edge_feats = self._get_edge_features(shot, n_players_on_ice + 1)
            # Get adjacency info
            edge_index = self._get_adjacency_info(shot)
            # Get labels info
            label = self._get_labels(shot["goal"])

        
            # Create data object
            data = Data(x=node_feats, edge_index=edge_index.t().contiguous(), edge_attr=edge_feats,y=label)
            torch.save(data, os.path.join(self.processed_dir,f'shot_{index}.pt'))

    def _get_node_features(self, shot, n_forwards_shooting_team, n_forwards_defending_team, n_defencemen_shooting_team, n_defencemen_defending_team):
        """
        This will return a matrix/2d array of the shape [# of Nodes, Node Feature size]
        """
        # set goalie and shooter features
        all_node_feats = []
        shooter_feats = [shot["shooterName_target"], shot["shotType_WRIST"], shot["shotAngleReboundRoyalRoad"]]
        goalie_feats = [shot["goalieNameForShot_target"], shot["defendingTeamAverageTimeOnIce"], shot["defendingTeamMaxTimeOnIceOfDefencemen"]]
        all_node_feats.append(shooter_feats)
        all_node_feats.append(goalie_feats)

        # Create the four different categories of adjacent player, checking if shot was from a defencemen or not
        if shot["playerPositionThatDidEvent_D"] == 1:
            for i in range(n_defencemen_shooting_team - 1):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfDefencemen"], shot["shootingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for j in range(n_forwards_shooting_team):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfForwards"], shot["shootingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
            for k in range(n_defencemen_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfDefencemen"], shot["defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for l in range(n_forwards_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfForwards"], shot["defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
        elif shot["playerPositionThatDidEvent_D"] == 0:
            for i in range(n_defencemen_shooting_team):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfDefencemen"], shot["shootingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for j in range(n_forwards_shooting_team - 1):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfForwards"], shot["shootingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
            for k in range(n_defencemen_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfDefencemen"], shot["defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for l in range(n_forwards_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfForwards"], shot["defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)

        # Add padding if not maximum players
        n_player_nodes = n_defencemen_shooting_team + n_forwards_shooting_team + n_defencemen_defending_team + n_forwards_defending_team
        if n_player_nodes < 11:
            for i in range(11 - n_player_nodes):
                adj_player_feats = [0, 0, 0]
                all_node_feats.append(adj_player_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, shot, n_players_on_ice):
        """
        This will return a matrix/2d array of ths shape [# of Nodes, Node Feature size]
        """
        # Get edge features
        all_edge_feats = []
        edge_feats = [shot["shotDistance"]]
        all_edge_feats += [edge_feats, edge_feats]

        # Get random edge features for adjacent players
        for i in range(0, n_players_on_ice):
            for j in range(i+1, n_players_on_ice):
                if i == 0 and j == 1:
                    continue
                else:
                    random_distance = random.uniform(0, 75)
                    edge_feats = [random_distance]
                    all_edge_feats += [edge_feats, edge_feats]

        # Add padding if not maximum players
        if n_players_on_ice < 12:
            for i in range(0, n_players_on_ice):
                for j in range(n_players_on_ice, 12):
                    edge_feats = [0]
                    all_edge_feats += [edge_feats, edge_feats]
                
        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, shot):
        edge_indices = []

        # There are 12 fully connected nodes in the adjacency matrix
        for i in range(0, 12):
            for j in range(i+1, 12):
                edge_indices.append([i, j])
                edge_indices.append([j, i])
        edge_indices = torch.tensor(edge_indices, dtype=torch.long)
        return edge_indices

    def _get_labels(self, label):
        # Get label (goal or no goal)
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'shot_{idx}.pt'))
        return data

In [26]:
# Small subset of data to make graphs
train_dataset_adj = _df_reduced[_df_reduced["season"] != 2023]
test_dataset_adj = _df_reduced[_df_reduced["season"] == 2023]
train_dataset_adj.to_csv("COSC5P30/raw/train_dataset_adj.csv", index = False)
test_dataset_adj.to_csv("COSC5P30/raw/test_dataset_adj.csv", index = False)

In [27]:
# Create the graphs
train_dat_adj = MultiPlayerShotDataset(filename = "train_dataset_adj.csv", root="COSC5P30/")
test_dat_adj = MultiPlayerShotDataset(filename = "test_dataset_adj.csv", root="COSC5P30/")

Processing...
100%|█████████████████████████████████| 306283/306283 [02:09<00:00, 2363.23it/s]
Done!
Processing...
100%|███████████████████████████████████| 81649/81649 [00:33<00:00, 2414.25it/s]
Done!


In [28]:
# Check what the data looks like
for p in train_dat_adj[0]:
    print(p)

('x', tensor([[ 0.0975,  1.0000,  0.0000],
        [ 0.0973, 30.0000, 33.0000],
        [20.5000, 20.5000,  0.0000],
        [25.6667, 24.0000,  1.0000],
        [25.6667, 24.0000,  1.0000],
        [25.6667, 24.0000,  1.0000],
        [32.0000, 26.0000,  0.0000],
        [32.0000, 26.0000,  0.0000],
        [28.6667, 26.0000,  1.0000],
        [28.6667, 26.0000,  1.0000],
        [28.6667, 26.0000,  1.0000],
        [ 0.0000,  0.0000,  0.0000]]))
('edge_index', tensor([[ 0,  1,  0,  2,  0,  3,  0,  4,  0,  5,  0,  6,  0,  7,  0,  8,  0,  9,
          0, 10,  0, 11,  1,  2,  1,  3,  1,  4,  1,  5,  1,  6,  1,  7,  1,  8,
          1,  9,  1, 10,  1, 11,  2,  3,  2,  4,  2,  5,  2,  6,  2,  7,  2,  8,
          2,  9,  2, 10,  2, 11,  3,  4,  3,  5,  3,  6,  3,  7,  3,  8,  3,  9,
          3, 10,  3, 11,  4,  5,  4,  6,  4,  7,  4,  8,  4,  9,  4, 10,  4, 11,
          5,  6,  5,  7,  5,  8,  5,  9,  5, 10,  5, 11,  6,  7,  6,  8,  6,  9,
          6, 10,  6, 11,  7,  8,  7,  9,  7, 10

In [29]:
# Print information about the dataset
print(f'Dataset: {train_dat_adj}:')
print('====================')
print(f'Number of graphs: {len(train_dat_adj)}')
print(f'Number of features: {train_dat_adj.num_features}')
#print(f'Number of classes: {train_dat_adj.num_classes}')

data = train_dat_adj[0]  # Get the first graph object

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of undirected edges: {data.num_edges // 2}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: MultiPlayerShotDataset(306283):
Number of graphs: 306283
Number of features: 3

Data(x=[12, 3], edge_index=[2, 132], edge_attr=[132, 1], y=[1])
Number of nodes: 12
Number of undirected edges: 66
Average node degree: 11.00
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [30]:
print(f'Number of training graphs: {len(train_dat_adj)}')
print(f'Number of test graphs: {len(test_dat_adj)}')

Number of training graphs: 306283
Number of test graphs: 81649


In [31]:
# Create data loaders
train_loader_adj = DataLoader(train_dat_adj, batch_size=1024, shuffle=True)
test_loader_adj = DataLoader(test_dat_adj, batch_size=1024, shuffle=False)

## Fit GNN with adjacent players

In [32]:
# Create GCN
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(train_dat_adj.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.bn = nn.BatchNorm1d(hidden_channels)
        self.lin = Linear(hidden_channels,1)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.bn(x)
        return torch.sigmoid(self.lin(x))

# Choose between 64 and 128 hidden channels
model = GCN(hidden_channels=128)
print(model)

GCN(
  (conv1): GCNConv(3, 128)
  (conv2): GCNConv(128, 128)
  (conv3): GCNConv(128, 128)
  (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=128, out_features=1, bias=True)
)


In [33]:
# Set optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# Create training function
def train():
    model.train()

    # Iterate over data loader
    for data in train_loader_adj:  

        # Forward pass
        out = model(data.x, data.edge_index, data.batch)
        # Loss computation
        loss = criterion(out, data.y.unsqueeze(1).float())
        # Gradient
        loss.backward()
        # Update
        optimizer.step() 
        optimizer.zero_grad() 

# Create testing function
def test(loader):
    model.eval()

    sq_error = 0
    correct = 0
    goals = 0

    # Iterate over data loader
    for data in loader:
        # Forward pass
        out = model(data.x, data.edge_index, data.batch)  
        # Make predictions
        pred = out
        # Compute metrics
        sq_error += (((pred - data.y.unsqueeze(1)))**2).sum()
        correct += (torch.round(pred) == data.y.unsqueeze(1)).sum()
        goals += sum(data.y.unsqueeze(1))
    return sq_error / len(loader.dataset), correct / len(loader.dataset), pred, data.y, goals


for epoch in range(0, 30):
    train()
    train_acc = test(train_loader_adj)
    test_acc = test(test_loader_adj)
    print(f'Epoch: {epoch:06d}, Train Acc: {train_acc[1].item():.6f}, Test Acc: {test_acc[1].item():.6f}, Test MSE: {test_acc[0].item():.6f}')

Epoch: 000000, Train Acc: 0.799897, Test Acc: 0.788130, Test MSE: 0.194251
Epoch: 000001, Train Acc: 0.838966, Test Acc: 0.826281, Test MSE: 0.161137
Epoch: 000002, Train Acc: 0.861370, Test Acc: 0.848963, Test MSE: 0.139819
Epoch: 000003, Train Acc: 0.871083, Test Acc: 0.859251, Test MSE: 0.136242
Epoch: 000004, Train Acc: 0.882458, Test Acc: 0.871817, Test MSE: 0.125254
Epoch: 000005, Train Acc: 0.888887, Test Acc: 0.880060, Test MSE: 0.117743
Epoch: 000006, Train Acc: 0.890742, Test Acc: 0.882705, Test MSE: 0.114967
Epoch: 000007, Train Acc: 0.892116, Test Acc: 0.884800, Test MSE: 0.114600
Epoch: 000008, Train Acc: 0.892795, Test Acc: 0.885669, Test MSE: 0.113099
Epoch: 000009, Train Acc: 0.895195, Test Acc: 0.889490, Test MSE: 0.109617
Epoch: 000010, Train Acc: 0.896063, Test Acc: 0.890740, Test MSE: 0.108791
Epoch: 000011, Train Acc: 0.896390, Test Acc: 0.891230, Test MSE: 0.108385
Epoch: 000012, Train Acc: 0.898127, Test Acc: 0.893985, Test MSE: 0.105733
Epoch: 000013, Train Acc:

KeyboardInterrupt: 

## Try models with Synthetic Minority Oversampling Technique (SMOTE) on the training set (using 2022 for training and 2023 for testing due to time limitations)

## SMOTE Logistic Regression

In [32]:
# Create training and testing sets
train_df = _df_reduced[_df_reduced["season"] == 2022]
test_df = _df_reduced[_df_reduced["season"] == 2023]

print("Original Train Size:", len(train_df))
print("Original Test Size:", len(test_df))

X_train = train_df.loc[:, ~train_df.columns.isin(["goal", "xGoal"])]
X_test = test_df.loc[:, ~test_df.columns.isin(["goal", "xGoal"])]
y_train = train_df["goal"]
y_test = test_df["goal"]

# Perform SMOTE
smote = SMOTE(random_state = 12345)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("SMOTE Train Size:", len(y_train_smote))
print("SMOTE Test Size:", len(test_df)) # doesn't get oversampled

Original Train Size: 86240
Original Test Size: 81649
SMOTE Train Size: 155356
SMOTE Test Size: 81649


In [33]:
# Limit columns to those used in the graph neural network
limit_X_train_smote = X_train_smote[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target", 
                         "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]
limit_X_test_smote = X_test[["shooterName_target", "shotType_WRIST", "shotAngleReboundRoyalRoad", "goalieNameForShot_target",
                      "defendingTeamAverageTimeOnIce", "defendingTeamMaxTimeOnIceOfDefencemen", "shotDistance"]]

In [34]:
# Train Logistic Regression
lr = LogisticRegression(random_state = seed, max_iter = 500, verbose=1).fit(limit_X_train_smote, y_train_smote)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  3.15296D+00


 This problem is unconstrained.



At iterate   50    f=  5.89250D-01    |proj g|=  6.96236D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     50     60      1     0     0   6.962D-05   5.892D-01
  F =  0.58924964495572663     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [35]:
# Determine model performance
preds = lr.predict_proba(limit_X_test_smote)[:,1]
print("Logistic Regression MSE: ", np.square(np.subtract(y_test,preds)).mean())
print("Logistic Regression Accuracy: ", sum(lr.predict(limit_X_test_smote) == y_test)/len(y_test))

# Performance on training data
train_preds = lr.predict_proba(limit_X_train_smote)[:,1]
print("Train Logistic Regression MSE: ", np.square(np.subtract(y_train_smote,train_preds)).mean())
print("Train Logistic Regression Accuracy: ", sum(lr.predict(limit_X_train_smote) == y_train_smote)/len(y_train_smote))

Logistic Regression MSE:  0.20797548786088288
Logistic Regression Accuracy:  0.6652500336807554
Train Logistic Regression MSE:  0.20098287962261818
Train Logistic Regression Accuracy:  0.6946883287417287


## SMOTE xGBoost

In [36]:
# Split SMOTE data into four folds of size 155356/4 = 38839
X_trains_smote = [limit_X_train_smote[:38839], limit_X_train_smote[38839:77678], limit_X_train_smote[77678:116517], limit_X_train_smote[116517:]]
y_trains_smote = [y_train_smote[:38839], y_train_smote[38839:77678], y_train_smote[77678:116517], y_train_smote[116517:]]

In [37]:
# Intialize MSE and accuracy storage
mses = [[0 for _ in range(4)] for _ in range(32)]
accs = [[0 for _ in range(4)] for _ in range(32)]

# Loop though folds
for fold in range(0, len(X_trains_smote)):
    # Set up data for the fold
    fold_inds = [i for i in range(len(X_trains_smote)) if i != fold]
    fold_dfs = [X_trains_smote[i] for i in fold_inds]
    fold_ys = [y_trains_smote[i] for i in fold_inds]
    all_fold_dfs = pd.concat(fold_dfs)
    all_fold_ys = pd.concat(fold_ys)
    count = 0
    # Loop through possible max depths
    for m_depth in [6,7,8,9]:
        # Loop through possible learning rates
        for lrate in [0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]:
            # Fit classifier
            xgb_clf_smote = xgb.XGBClassifier(learning_rate = lrate, max_depth = m_depth)
            xgb_clf_smote.fit(all_fold_dfs, all_fold_ys)
            # Make predictions
            xgb_preds = xgb_clf_smote.predict_proba(limit_X_test)[:,1]
            # Compute metrics
            mses[count][fold] = np.square(np.subtract(y_test,xgb_preds)).mean()
            accs[count][fold] = sum(xgb_clf.predict(limit_X_test_smote) == y_test)/len(y_test)
            print("Fold:", fold," Max Depth: ", m_depth, " Learning Rate: ", lrate, " MSE: ", mses[count][fold], "Accuracy: ", accs[count][fold])
            count = count + 1

Fold: 0  Max Depth:  6  Learning Rate:  0.25  MSE:  0.1697770084181263 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.3  MSE:  0.16565650789742675 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.35  MSE:  0.16491887279236966 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.4  MSE:  0.1641637750175 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.45  MSE:  0.16250512550374238 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.5  MSE:  0.16355182622428813 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.55  MSE:  0.16369131265960163 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  6  Learning Rate:  0.6  MSE:  0.16170862783994053 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  7  Learning Rate:  0.25  MSE:  0.1657832962669558 Accuracy:  0.90470183345785
Fold: 0  Max Depth:  7  Learning Rate:  0.3  MSE:  0.16368638837375402 Accuracy:  0.90470183345785
Fold: 0  Ma

In [38]:
# Choose hyperparameters that minimize average MSE across folds
avg_mses = [0] * len(mses)
for i in range(0,len(mses)):
    avg_mses[i] = sum(mses[i])/len(mses[i])

# Optimal is max depth = 7, lr = 0.4
print(min(range(len(avg_mses)), key=avg_mses.__getitem__))

11


In [39]:
# Fit optimal xGBoost model and check performance on testing data
opt_xgb_clf_smote = xgb.XGBClassifier(learning_rate = 0.4, max_depth = 7)
opt_xgb_clf_smote.fit(limit_X_train_smote, y_train_smote)

preds = opt_xgb_clf_smote.predict_proba(limit_X_test_smote)[:,1]
print("xGBoost MSE: ", np.square(np.subtract(y_test,preds)).mean())
print("xGBoost Accuracy: ", sum(opt_xgb_clf_smote.predict(limit_X_test_smote) == y_test)/len(y_test))

# Check performance on training
preds = opt_xgb_clf_smote.predict_proba(limit_X_train_smote)[:,1]
print("Train xGBoost MSE: ", np.square(np.subtract(y_train_smote,preds)).mean())
print("Train xGBoost Accuracy: ", sum(opt_xgb_clf_smote.predict(limit_X_train_smote) == y_train_smote)/len(y_train_smote))

xGBoost MSE:  0.1383792211943327
xGBoost Accuracy:  0.8046883611556785
Train xGBoost MSE:  0.08109596267549896
Train xGBoost Accuracy:  0.8898787301423826


## SMOTE Create two node graph data for graph neural network (GNN)

In [87]:
# create class for two node graph generation
class ShotDataset(Dataset):
    def __init__(self, root, filename, transform=None, pre_transform=None, pre_filter=None):
        """
        root: where the dataset should be stored, folder is split into raw_dir (downloaded dataset)
        and processed_dir (processed data).
        """
        self.filename = filename
        super(ShotDataset, self).__init__(root, transform=None, pre_transform=None, pre_filter=None)

    @property
    def raw_file_names(self):
        """
        If this file exists in raw_dir, the download is not triggered.
        """
        return self.filename

    @property
    def processed_file_names(self):
        """
        Not implemented
        """
        return 'xxxxx.pt'

    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, shot in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # Get node features
            node_feats = self._get_node_features(shot)
            # Get edge features
            edge_feats = self._get_edge_features(shot)
            # Get adjacency info
            edge_index = self._get_adjacency_info(shot)
            # Get labels info
            label = self._get_labels(shot["goal"])

        
            # Create data object
            data = Data(x=node_feats, edge_index=edge_index, edge_attr=edge_feats,y=label)
            torch.save(data, os.path.join(self.processed_dir,f'shot_{index}.pt'))

    def _get_node_features(self, shot):
        """
        This will return a matrix/2d array of the shape [# of Nodes, Node Feature size]
        """
        # Initialize feature storage
        all_node_feats = []
        shooter_feats = []
        goalie_feats = []

        # Get shooter features
        shooter_feats.append(shot["shooterName_target"])
        shooter_feats.append(shot["shotType_WRIST"])
        shooter_feats.append(shot["shotAngleReboundRoyalRoad"])
        all_node_feats.append(shooter_feats)

        # Get goalie features
        goalie_feats.append(shot["goalieNameForShot_target"])
        goalie_feats.append(shot["defendingTeamAverageTimeOnIce"])
        goalie_feats.append(shot["defendingTeamMaxTimeOnIceOfDefencemen"])
        all_node_feats.append(goalie_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, shot):
        """
        This will return a matrix/2d array of ths shape [# of Nodes, Node Feature size]
        """
        # Get edge features
        all_edge_feats = []
        edge_feats = []
        edge_feats.append(shot["shotDistance"])
        all_edge_feats += [edge_feats, edge_feats]
        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, shot):
        # Create adjacency matrix
        edge_indices = []
        edge_indices += [[0,1],[1,0]]
        edge_indices = torch.tensor(edge_indices)
        return edge_indices

    def _get_labels(self, label):
        # Get labels (goal or no goal)
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'shot_{idx}.pt'))
        return data

In [88]:
# Put X and y back together for graph generation
X_train_smote["goal"] = y_train_smote
X_test["goal"] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["goal"] = y_test


In [89]:
# Small subset for SMOTE two node graph generation
X_train_smote.to_csv("COSC5P30/raw/train_dataset_smote.csv", index = False)
X_test.to_csv("COSC5P30/raw/test_dataset_smote.csv", index = False)

In [90]:
# Create the graphs
train_dat_smote = ShotDataset(filename = "train_dataset_smote.csv", root="COSC5P30/")
test_dat_smote = ShotDataset(filename = "test_dataset_smote.csv", root="COSC5P30/")

Processing...
100%|█████████████████████████████████| 155356/155356 [00:47<00:00, 3290.66it/s]
Done!
Processing...
100%|███████████████████████████████████| 81649/81649 [00:21<00:00, 3789.05it/s]
Done!


In [91]:
# Check what the data looks like
for p in train_dat_smote[0]:
    print(p)

('x', tensor([[ 0.0975,  1.0000,  0.0000],
        [ 0.0973, 30.0000, 33.0000]]))
('edge_index', tensor([[0, 1],
        [1, 0]]))
('edge_attr', tensor([[39.6989],
        [39.6989]]))
('y', tensor([0]))


In [92]:
# Print information about the dataset
print(f'Dataset: {train_dat_smote}:')
print('====================')
print(f'Number of graphs: {len(train_dat_smote)}')
print(f'Number of features: {train_dat_smote.num_features}')
#print(f'Number of classes: {train_dat_smote.num_classes}')

data = train_dat_smote[0]  # Get the first graph object

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of undirected edges: {data.num_edges // 2}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: ShotDataset(155356):
Number of graphs: 155356
Number of features: 3

Data(x=[2, 3], edge_index=[2, 2], edge_attr=[2, 1], y=[1])
Number of nodes: 2
Number of undirected edges: 1
Average node degree: 1.00
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [93]:
print(f'Number of training graphs: {len(train_dat_smote)}')
print(f'Number of test graphs: {len(test_dat_smote)}')

Number of training graphs: 155356
Number of test graphs: 81649


In [94]:
# Create data loader
train_loader_smote = DataLoader(train_dat_smote, batch_size=1024, shuffle=True)
test_loader_smote = DataLoader(test_dat_smote, batch_size=1024, shuffle=False)

## Fit two node SMOTE GNN

In [97]:

# Create GCNclass GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(train_dat_smote.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.bn = nn.BatchNorm1d(hidden_channels)
        self.lin = Linear(hidden_channels,1)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.bn(x)
        return torch.sigmoid(self.lin(x))

# choose either 64 or 128 hidden channels
model = GCN(hidden_channels=64)
print(model)

GCN(
  (conv1): GCNConv(3, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=64, out_features=1, bias=True)
)


In [98]:
# Set optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# Create training function
def train():
    model.train()

    # Iterate over data loader
    for data in train_loader_smote:  
        # Forward
        out = model(data.x, data.edge_index, data.batch)
        # Compute loss
        loss = criterion(out, data.y.unsqueeze(1).float())
        # Gradient
        loss.backward()
        # Update
        optimizer.step()
        optimizer.zero_grad()

# Create testing function
def test(loader):
    model.eval()

    sq_error = 0
    correct = 0
    goals = 0
    # Iterate over data loader
    for data in loader:
        # Forward
        out = model(data.x, data.edge_index, data.batch) 
        # Make prediction
        pred = out
        # Compute metrics
        sq_error += (((pred - data.y.unsqueeze(1)))**2).sum()
        correct += (torch.round(pred) == data.y.unsqueeze(1)).sum()
        goals += sum(data.y.unsqueeze(1))
     return sq_error / len(loader.dataset), correct / len(loader.dataset), pred, data.y, goals


for epoch in range(0, 30):
    train()
    train_acc = test(train_loader_smote)
    test_acc = test(test_loader_smote)
    print(f'Epoch: {epoch:06d}, Train Acc: {train_acc[1].item():.6f}, Test Acc: {test_acc[1].item():.6f}, Test MSE: {test_acc[0].item():.6f}')

Epoch: 000000, Train Acc: 0.541543, Test Acc: 0.611410, Test MSE: 0.247975
Epoch: 000001, Train Acc: 0.539297, Test Acc: 0.625960, Test MSE: 0.246215
Epoch: 000002, Train Acc: 0.536786, Test Acc: 0.654607, Test MSE: 0.232960
Epoch: 000003, Train Acc: 0.533941, Test Acc: 0.675893, Test MSE: 0.221482
Epoch: 000004, Train Acc: 0.530543, Test Acc: 0.699188, Test MSE: 0.205572
Epoch: 000005, Train Acc: 0.528618, Test Acc: 0.716273, Test MSE: 0.196426
Epoch: 000006, Train Acc: 0.525818, Test Acc: 0.740560, Test MSE: 0.183655
Epoch: 000007, Train Acc: 0.523424, Test Acc: 0.764345, Test MSE: 0.171649
Epoch: 000008, Train Acc: 0.519858, Test Acc: 0.795833, Test MSE: 0.155739
Epoch: 000009, Train Acc: 0.514090, Test Acc: 0.838663, Test MSE: 0.132843
Epoch: 000010, Train Acc: 0.512185, Test Acc: 0.850825, Test MSE: 0.126639
Epoch: 000011, Train Acc: 0.508670, Test Acc: 0.871560, Test MSE: 0.115424
Epoch: 000012, Train Acc: 0.512275, Test Acc: 0.851768, Test MSE: 0.124811
Epoch: 000013, Train Acc:

## Create graph data for SMOTE graph neural network (GNN) with all players on the ice

In [117]:
# Create class for 12 node SMOTE graph generation
class MultiPlayerShotDataset(Dataset):
    def __init__(self, root, filename, transform=None, pre_transform=None, pre_filter=None):
        """
        root: where the dataset should be stored, folder is split into raw_dir (downloaded dataset)
        and processed_dir (processed data).
        """
        self.filename = filename
        super(MultiPlayerShotDataset, self).__init__(root, transform=None, pre_transform=None, pre_filter=None)

    @property
    def raw_file_names(self):
        """
        If this file exists in raw_dir, the download is not triggered.
        """
        return self.filename

    @property
    def processed_file_names(self):
        """
        Not implemented
        """
        return 'xxxxx.pt'

    def download(self):
        pass

    def process(self):
        self.data = pd.read_csv(self.raw_paths[0])
        for index, shot in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            # Get number of players on the ice
            n_forwards_shooting_team = int(shot["shootingTeamForwardsOnIce"])
            n_forwards_defending_team = int(shot["defendingTeamForwardsOnIce"])
            n_defencemen_shooting_team = int(shot["shootingTeamDefencemenOnIce"])
            n_defencemen_defending_team = int(shot["defendingTeamDefencemenOnIce"])
            n_players_on_ice = n_forwards_shooting_team + n_forwards_defending_team + n_defencemen_shooting_team + n_defencemen_defending_team
            # Get node features
            node_feats = self._get_node_features(shot, n_forwards_shooting_team, n_forwards_defending_team, n_defencemen_shooting_team, n_defencemen_defending_team)
            # Get edge features
            edge_feats = self._get_edge_features(shot, n_players_on_ice + 1)
            # Get adjacency info
            edge_index = self._get_adjacency_info(shot)
            # Get labels info
            label = self._get_labels(shot["goal"])

        
            # Create data object
            data = Data(x=node_feats, edge_index=edge_index.t().contiguous(), edge_attr=edge_feats,y=label)
            torch.save(data, os.path.join(self.processed_dir,f'shot_{index}.pt'))

    def _get_node_features(self, shot, n_forwards_shooting_team, n_forwards_defending_team, n_defencemen_shooting_team, n_defencemen_defending_team):
        """
        This will return a matrix/2d array of the shape [# of Nodes, Node Feature size]
        """
        # Get shooter and goalie features
        all_node_feats = []
        shooter_feats = [shot["shooterName_target"], shot["shotType_WRIST"], shot["shotAngleReboundRoyalRoad"]]
        goalie_feats = [shot["goalieNameForShot_target"], shot["defendingTeamAverageTimeOnIce"], shot["defendingTeamMaxTimeOnIceOfDefencemen"]]
        all_node_feats.append(shooter_feats)
        all_node_feats.append(goalie_feats)

        # Get features for all four adjacent player categories, checking if a defencemen took the shot
        if shot["playerPositionThatDidEvent_D"] == 1:
            for i in range(n_defencemen_shooting_team - 1):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfDefencemen"], shot["shootingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for j in range(n_forwards_shooting_team):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfForwards"], shot["shootingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
            for k in range(n_defencemen_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfDefencemen"], shot["defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for l in range(n_forwards_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfForwards"], shot["defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
        elif shot["playerPositionThatDidEvent_D"] == 0:
            for i in range(n_defencemen_shooting_team):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfDefencemen"], shot["shootingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for j in range(n_forwards_shooting_team - 1):
                adj_player_feats = [shot["shootingTeamAverageTimeOnIceOfForwards"], shot["shootingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)
            for k in range(n_defencemen_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfDefencemen"], shot["defendingTeamAverageTimeOnIceOfDefencemenSinceFaceoff"], 0]
                all_node_feats.append(adj_player_feats)
            for l in range(n_forwards_defending_team):
                adj_player_feats = [shot["defendingTeamAverageTimeOnIceOfForwards"], shot["defendingTeamAverageTimeOnIceOfForwardsSinceFaceoff"], 1]
                all_node_feats.append(adj_player_feats)

        # Add padding if not maximum players
        n_player_nodes = n_defencemen_shooting_team + n_forwards_shooting_team + n_defencemen_defending_team + n_forwards_defending_team
        if n_player_nodes < 11:
            for i in range(11 - n_player_nodes):
                adj_player_feats = [0, 0, 0]
                all_node_feats.append(adj_player_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, shot, n_players_on_ice):
        """
        This will return a matrix/2d array of ths shape [# of Nodes, Node Feature size]
        """
        # Get edge feature for shooter and goalie
        all_edge_feats = []
        edge_feats = [shot["shotDistance"]]
        all_edge_feats += [edge_feats, edge_feats]

        # Get edge features for adjacent players
        for i in range(0, n_players_on_ice):
            for j in range(i+1, n_players_on_ice):
                if i == 0 and j == 1:
                    continue
                else:
                    random_distance = random.uniform(0, 75)
                    edge_feats = [random_distance]
                    all_edge_feats += [edge_feats, edge_feats]

        # Add padding if not maximum players
        if n_players_on_ice < 12:
            for i in range(0, n_players_on_ice):
                for j in range(n_players_on_ice, 12):
                    edge_feats = [0]
                    all_edge_feats += [edge_feats, edge_feats]
                
        all_edge_feats = np.asarray(all_edge_feats)
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, shot):
        # Create adjacency matrix
        edge_indices = []

        # There are 12 fully connected nodes
        for i in range(0, 12):
            for j in range(i+1, 12):
                edge_indices.append([i, j])
                edge_indices.append([j, i])
        edge_indices = torch.tensor(edge_indices, dtype=torch.long)
        return edge_indices

    def _get_labels(self, label):
        # Get labels (goal or no goal)
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        data = torch.load(os.path.join(self.processed_dir, f'shot_{idx}.pt'))
        return data

In [118]:
# Small subset for SMOTE 12 node graph generation
X_train_smote.to_csv("COSC5P30/raw/train_dataset_smote12.csv", index = False)
X_test.to_csv("COSC5P30/raw/test_dataset_smote12.csv", index = False)

In [119]:
# Create the graphs
train_dat_smote12 = MultiPlayerShotDataset(filename = "train_dataset_smote12.csv", root="COSC5P30/")
test_dat_smote12 = MultiPlayerShotDataset(filename = "test_dataset_smote12.csv", root="COSC5P30/")

Processing...
100%|█████████████████████████████████| 155356/155356 [01:03<00:00, 2449.05it/s]
Done!
Processing...
100%|███████████████████████████████████| 81649/81649 [00:33<00:00, 2471.84it/s]
Done!


In [120]:
# Check what the data looks like
for p in train_dat_smote12[0]:
    print(p)

('x', tensor([[ 0.0975,  1.0000,  0.0000],
        [ 0.0973, 30.0000, 33.0000],
        [20.5000, 20.5000,  0.0000],
        [25.6667, 24.0000,  1.0000],
        [25.6667, 24.0000,  1.0000],
        [25.6667, 24.0000,  1.0000],
        [32.0000, 26.0000,  0.0000],
        [32.0000, 26.0000,  0.0000],
        [28.6667, 26.0000,  1.0000],
        [28.6667, 26.0000,  1.0000],
        [28.6667, 26.0000,  1.0000],
        [ 0.0000,  0.0000,  0.0000]]))
('edge_index', tensor([[ 0,  1,  0,  2,  0,  3,  0,  4,  0,  5,  0,  6,  0,  7,  0,  8,  0,  9,
          0, 10,  0, 11,  1,  2,  1,  3,  1,  4,  1,  5,  1,  6,  1,  7,  1,  8,
          1,  9,  1, 10,  1, 11,  2,  3,  2,  4,  2,  5,  2,  6,  2,  7,  2,  8,
          2,  9,  2, 10,  2, 11,  3,  4,  3,  5,  3,  6,  3,  7,  3,  8,  3,  9,
          3, 10,  3, 11,  4,  5,  4,  6,  4,  7,  4,  8,  4,  9,  4, 10,  4, 11,
          5,  6,  5,  7,  5,  8,  5,  9,  5, 10,  5, 11,  6,  7,  6,  8,  6,  9,
          6, 10,  6, 11,  7,  8,  7,  9,  7, 10

In [121]:
# Print information about the dataset
print(f'Dataset: {train_dat_smote12}:')
print('====================')
print(f'Number of graphs: {len(train_dat_smote12)}')
print(f'Number of features: {train_dat_smote12.num_features}')
#print(f'Number of classes: {train_dat_adj.num_classes}')

data = train_dat_smote12[0]  # Get the first graph object

print()
print(data)
print('=============================================================')

# Gather some statistics about the first graph
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of undirected edges: {data.num_edges // 2}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: MultiPlayerShotDataset(155356):
Number of graphs: 155356
Number of features: 3

Data(x=[12, 3], edge_index=[2, 132], edge_attr=[132, 1], y=[1])
Number of nodes: 12
Number of undirected edges: 66
Average node degree: 11.00
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [122]:
print(f'Number of training graphs: {len(train_dat_smote12)}')
print(f'Number of test graphs: {len(test_dat_smote12)}')

Number of training graphs: 155356
Number of test graphs: 81649


In [123]:
# Creat data loader
train_loader_smote12 = DataLoader(train_dat_smote12, batch_size=1024, shuffle=True)
test_loader_smote12 = DataLoader(test_dat_smote12, batch_size=1024, shuffle=False)

## Fit SMOTE GNN with adjacent players

In [124]:
# Create GCN
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(train_dat_smote12.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)

        self.bn = nn.BatchNorm1d(hidden_channels)
        self.lin = Linear(hidden_channels,1)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.bn(x)
        return torch.sigmoid(self.lin(x))

# Choose between 64 and 128 hidden channels
model = GCN(hidden_channels=128)
print(model)

GCN(
  (conv1): GCNConv(3, 128)
  (conv2): GCNConv(128, 128)
  (conv3): GCNConv(128, 128)
  (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lin): Linear(in_features=128, out_features=1, bias=True)
)


In [125]:
# Set optimizer and criterion
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# Create training function
def train():
    model.train()

    # Iterate over data loader
    for data in train_loader_smote12:
        # Forward
        out = model(data.x, data.edge_index, data.batch)
        # Loss computation
        loss = criterion(out, data.y.unsqueeze(1).float())
        # Gradient
        loss.backward()
        # Update
        optimizer.step()
        optimizer.zero_grad()

# Create testing function
def test(loader):
    model.eval()

    sq_error = 0
    correct = 0
    goals = 0
    # Iterate over data loader
    for data in loader:
        # Forward
        out = model(data.x, data.edge_index, data.batch)  
        # Make predictions
        pred = out
        # Compute metrics
        sq_error += (((pred - data.y.unsqueeze(1)))**2).sum()
        correct += (torch.round(pred) == data.y.unsqueeze(1)).sum()
        goals += sum(data.y.unsqueeze(1))
    return sq_error / len(loader.dataset), correct / len(loader.dataset), pred, data.y, goals


for epoch in range(0, 30):
    train()
    train_acc = test(train_loader_smote12)
    test_acc = test(test_loader_smote12)
    print(f'Epoch: {epoch:06d}, Train Acc: {train_acc[1].item():.6f}, Test Acc: {test_acc[1].item():.6f}, Test MSE: {test_acc[0].item():.6f}')

Epoch: 000000, Train Acc: 0.533671, Test Acc: 0.668667, Test MSE: 0.224576
Epoch: 000001, Train Acc: 0.534900, Test Acc: 0.741748, Test MSE: 0.193008
Epoch: 000002, Train Acc: 0.540597, Test Acc: 0.754976, Test MSE: 0.186255
Epoch: 000003, Train Acc: 0.552422, Test Acc: 0.706549, Test MSE: 0.230756
Epoch: 000004, Train Acc: 0.549345, Test Acc: 0.739507, Test MSE: 0.205387
Epoch: 000005, Train Acc: 0.559399, Test Acc: 0.674656, Test MSE: 0.252732
Epoch: 000006, Train Acc: 0.541736, Test Acc: 0.771093, Test MSE: 0.182785
Epoch: 000007, Train Acc: 0.548250, Test Acc: 0.759409, Test MSE: 0.192915
Epoch: 000008, Train Acc: 0.531334, Test Acc: 0.809759, Test MSE: 0.149540
Epoch: 000009, Train Acc: 0.526945, Test Acc: 0.828841, Test MSE: 0.136673
Epoch: 000010, Train Acc: 0.524177, Test Acc: 0.842827, Test MSE: 0.125913
Epoch: 000011, Train Acc: 0.521911, Test Acc: 0.850372, Test MSE: 0.122494
Epoch: 000012, Train Acc: 0.521596, Test Acc: 0.854499, Test MSE: 0.120571
Epoch: 000013, Train Acc: