### 1. Imports, Configuration

In [361]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler 

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier

from scipy.stats import randint, loguniform


In [328]:
# Lift pandas' display limits so that wide frames and long cell values are
# rendered in full instead of being truncated.
for display_option in ("display.max_columns", "display.width", "display.max_colwidth"):
    pd.set_option(display_option, None)

### 2. Reading, splitting data

In [329]:
# Load every CSV under data/ and stack them into one frame.
#
# * sorted(): Path.iterdir() yields entries in arbitrary, filesystem-dependent
#   order. The row index created by ignore_index=True follows that order, and the
#   seeded train/test split further down selects rows by position, so an unsorted
#   listing makes the split differ between machines.
# * is_file()/suffix filter: skips directories (e.g. .ipynb_checkpoints) and stray
#   non-CSV files that pd.read_csv would fail on or misparse.
csv_paths = sorted(
    path
    for path in Path("data").iterdir()
    if path.is_file() and path.suffix.lower() == ".csv"
)
dfs = [pd.read_csv(path) for path in csv_paths]

df = pd.concat(dfs, ignore_index=True)

In [330]:
# Left-join playoff.csv onto the stats on (team, season); a left join keeps every
# stats row even when no matching playoff row exists.
# NOTE(review): assumes playoff.csv has at most one row per (team, season),
# otherwise stats rows would be duplicated — confirm.
df_playoff = pd.read_csv("playoff.csv")
df = pd.merge(df, df_playoff, how="left", on=["team", "season"])

In [331]:
# Snapshot of the concatenated, playoff-merged frame, written before the
# situation filter below is applied (so it still contains every situation).
df.to_csv("raw_data.csv", index=False) # kept as a separate fallback file

In [332]:
# Only the combined "all" situation rows (5v5 + 4v5 + 5v4 ...) are used for predictions.
is_all_situations = df["situation"].eq("all")
df = df.loc[is_all_situations]
df.shape

(554, 110)

In [333]:
# Seeded split so the same rows land in train/test on every run. test_size is not
# given, so scikit-learn's default applies (25% of the rows are held out).
# NOTE(review): no `stratify=` is passed, so the playoff class balance of the two
# splits is left to chance — confirm this is intended.
df_train, df_test = train_test_split(df, random_state=67)

### 3. EDA

In [334]:
# First five rows of the training split.
df_train.head()

Unnamed: 0,team,season,name,team.1,position,situation,games_played,xGoalsPercentage,corsiPercentage,fenwickPercentage,iceTime,xOnGoalFor,xGoalsFor,xReboundsFor,xFreezeFor,xPlayStoppedFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,flurryAdjustedxGoalsFor,scoreVenueAdjustedxGoalsFor,flurryScoreVenueAdjustedxGoalsFor,shotsOnGoalFor,missedShotsFor,blockedShotAttemptsFor,shotAttemptsFor,goalsFor,reboundsFor,reboundGoalsFor,freezeFor,playStoppedFor,playContinuedInZoneFor,playContinuedOutsideZoneFor,savedShotsOnGoalFor,savedUnblockedShotAttemptsFor,penaltiesFor,penalityMinutesFor,faceOffsWonFor,hitsFor,takeawaysFor,giveawaysFor,lowDangerShotsFor,mediumDangerShotsFor,highDangerShotsFor,lowDangerxGoalsFor,mediumDangerxGoalsFor,highDangerxGoalsFor,lowDangerGoalsFor,mediumDangerGoalsFor,highDangerGoalsFor,scoreAdjustedShotsAttemptsFor,unblockedShotAttemptsFor,scoreAdjustedUnblockedShotAttemptsFor,dZoneGiveawaysFor,xGoalsFromxReboundsOfShotsFor,xGoalsFromActualReboundsOfShotsFor,reboundxGoalsFor,totalShotCreditFor,scoreAdjustedTotalShotCreditFor,scoreFlurryAdjustedTotalShotCreditFor,xOnGoalAgainst,xGoalsAgainst,xReboundsAgainst,xFreezeAgainst,xPlayStoppedAgainst,xPlayContinuedInZoneAgainst,xPlayContinuedOutsideZoneAgainst,flurryAdjustedxGoalsAgainst,scoreVenueAdjustedxGoalsAgainst,flurryScoreVenueAdjustedxGoalsAgainst,shotsOnGoalAgainst,missedShotsAgainst,blockedShotAttemptsAgainst,shotAttemptsAgainst,goalsAgainst,reboundsAgainst,reboundGoalsAgainst,freezeAgainst,playStoppedAgainst,playContinuedInZoneAgainst,playContinuedOutsideZoneAgainst,savedShotsOnGoalAgainst,savedUnblockedShotAttemptsAgainst,penaltiesAgainst,penalityMinutesAgainst,faceOffsWonAgainst,hitsAgainst,takeawaysAgainst,giveawaysAgainst,lowDangerShotsAgainst,mediumDangerShotsAgainst,highDangerShotsAgainst,lowDangerxGoalsAgainst,mediumDangerxGoalsAgainst,highDangerxGoalsAgainst,lowDangerGoalsAgainst,mediumDangerGoalsAgainst,highDangerGoalsAgainst,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgain
st,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,penalitiesFor,penalitiesAgainst,playoff
1811,MTL,2019,MTL,MTL,Team Level,all,71,0.53,0.53,0.52,258596.0,2386.76,223.07,163.0,558.6,78.11,1309.56,969.79,213.98,224.14,215.04,2424.0,880.0,1148.0,4452.0,208.0,188.0,34.0,532.0,106.0,1053.0,1217.0,2216.0,3096.0,229.0,490.0,2140.0,1882.0,489.0,874.0,2434.0,626.0,244.0,64.85,77.88,80.34,66.0,67.0,75.0,4475.95,3304.0,3322.1,676.0,35.76,39.18,39.18,219.52,220.49,215.66,2142.02,201.0,150.35,501.51,71.42,1196.33,869.51,193.64,201.39,193.99,2210.0,782.0,928.0,3920.0,220.0,162.0,31.0,520.0,72.0,945.0,1073.0,1990.0,2772.0,205.0,442.0,2104.0,1693.0,531.0,911.0,2229.0,564.0,199.0,61.68,70.37,68.95,69.0,86.0,65.0,3920.91,2992.0,2993.71,686.0,33.29,31.16,31.21,202.96,203.28,198.78,,,0
2231,MIN,2022,MIN,MIN,Team Level,all,82,0.51,0.5,0.51,301108.0,2614.25,258.75,181.28,591.9,85.41,1428.64,1061.03,246.01,260.21,247.45,2535.0,1072.0,1187.0,4794.0,239.0,245.0,35.0,533.0,74.0,1159.0,1357.0,2296.0,3368.0,336.0,781.0,2181.0,1784.0,541.0,533.0,2572.0,775.0,260.0,73.51,94.96,90.27,60.0,98.0,81.0,4829.45,3607.0,3632.57,269.0,40.65,49.9,49.9,249.5,250.89,242.84,2532.86,248.36,173.68,585.17,84.28,1401.06,1008.46,235.35,247.3,234.4,2552.0,949.0,1318.0,4819.0,219.0,225.0,27.0,480.0,98.0,1128.0,1351.0,2333.0,3282.0,336.0,790.0,2393.0,1638.0,475.0,590.0,2535.0,720.0,246.0,74.37,86.14,87.86,69.0,84.0,66.0,4792.63,3501.0,3488.72,327.0,38.38,47.43,47.53,239.21,238.53,231.07,,,1
1046,OTT,2014,OTT,OTT,Team Level,all,82,0.52,0.51,0.5,301584.0,2523.5,225.45,168.13,582.83,81.27,1399.27,1056.04,217.68,225.92,218.15,2540.0,973.0,1300.0,4813.0,232.0,176.0,40.0,521.0,76.0,1134.0,1374.0,2308.0,3281.0,325.0,759.0,2421.0,2279.0,648.0,719.0,2691.0,606.0,216.0,73.86,72.94,78.64,69.0,76.0,87.0,4824.66,3513.0,3521.02,438.0,37.33,41.09,42.26,220.51,221.12,215.88,2552.27,207.73,169.79,595.31,83.24,1436.51,1077.43,201.63,207.44,201.36,2635.0,935.0,1045.0,4615.0,208.0,169.0,31.0,615.0,104.0,1151.0,1323.0,2427.0,3362.0,346.0,803.0,2599.0,2121.0,536.0,736.0,2814.0,563.0,193.0,75.1,68.11,64.53,78.0,65.0,65.0,4605.52,3570.0,3566.98,453.0,37.23,35.31,36.42,208.53,208.05,204.38,,,1
1146,TOR,2015,TOR,TOR,Team Level,all,82,0.51,0.52,0.51,299792.0,2658.51,254.52,184.12,612.18,89.44,1486.16,1075.64,243.61,253.16,242.32,2515.0,1188.0,1333.0,5036.0,192.0,196.0,41.0,633.0,105.0,1235.0,1342.0,2323.0,3511.0,324.0,694.0,2617.0,2119.0,669.0,626.0,2720.0,732.0,251.0,77.98,89.48,87.05,52.0,62.0,78.0,4973.34,3703.0,3669.07,411.0,41.24,41.81,44.8,250.9,249.31,242.91,2534.21,242.03,169.08,582.83,82.99,1405.96,1040.1,232.75,244.53,235.16,2503.0,1020.0,1035.0,4558.0,240.0,144.0,40.0,655.0,101.0,1140.0,1243.0,2263.0,3283.0,322.0,676.0,2556.0,2143.0,630.0,722.0,2585.0,735.0,203.0,76.66,88.48,76.88,81.0,86.0,73.0,4622.93,3523.0,3564.4,440.0,37.21,36.08,38.95,240.27,242.86,238.18,,,0
1076,BOS,2015,BOS,BOS,Team Level,all,82,0.48,0.5,0.5,298711.0,2560.21,222.66,171.53,601.5,84.87,1418.25,1057.18,214.58,223.68,215.47,2620.0,936.0,1136.0,4692.0,236.0,139.0,36.0,679.0,82.0,1103.0,1318.0,2384.0,3320.0,348.0,809.0,2592.0,2164.0,501.0,705.0,2714.0,658.0,184.0,75.57,79.84,67.26,72.0,87.0,77.0,4710.84,3556.0,3572.38,317.0,37.94,33.03,35.65,224.96,225.63,221.18,2528.12,237.95,172.27,572.95,82.95,1406.15,1055.73,230.38,238.92,231.35,2492.0,1036.0,1184.0,4712.0,228.0,136.0,45.0,597.0,104.0,1092.0,1371.0,2264.0,3300.0,309.0,745.0,2548.0,1969.0,616.0,859.0,2652.0,644.0,232.0,75.95,77.2,84.8,62.0,68.0,98.0,4703.85,3528.0,3525.32,428.0,38.02,37.5,38.94,237.03,237.87,232.98,,,0


In [335]:
# (rows, columns) of the training split.
df_train.shape

(415, 110)

In [336]:
# NOTE(review): this cell repeats the df_train.head() call made at the start of
# the EDA section and produces the same output; it can be deleted.
df_train.head()

Unnamed: 0,team,season,name,team.1,position,situation,games_played,xGoalsPercentage,corsiPercentage,fenwickPercentage,iceTime,xOnGoalFor,xGoalsFor,xReboundsFor,xFreezeFor,xPlayStoppedFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,flurryAdjustedxGoalsFor,scoreVenueAdjustedxGoalsFor,flurryScoreVenueAdjustedxGoalsFor,shotsOnGoalFor,missedShotsFor,blockedShotAttemptsFor,shotAttemptsFor,goalsFor,reboundsFor,reboundGoalsFor,freezeFor,playStoppedFor,playContinuedInZoneFor,playContinuedOutsideZoneFor,savedShotsOnGoalFor,savedUnblockedShotAttemptsFor,penaltiesFor,penalityMinutesFor,faceOffsWonFor,hitsFor,takeawaysFor,giveawaysFor,lowDangerShotsFor,mediumDangerShotsFor,highDangerShotsFor,lowDangerxGoalsFor,mediumDangerxGoalsFor,highDangerxGoalsFor,lowDangerGoalsFor,mediumDangerGoalsFor,highDangerGoalsFor,scoreAdjustedShotsAttemptsFor,unblockedShotAttemptsFor,scoreAdjustedUnblockedShotAttemptsFor,dZoneGiveawaysFor,xGoalsFromxReboundsOfShotsFor,xGoalsFromActualReboundsOfShotsFor,reboundxGoalsFor,totalShotCreditFor,scoreAdjustedTotalShotCreditFor,scoreFlurryAdjustedTotalShotCreditFor,xOnGoalAgainst,xGoalsAgainst,xReboundsAgainst,xFreezeAgainst,xPlayStoppedAgainst,xPlayContinuedInZoneAgainst,xPlayContinuedOutsideZoneAgainst,flurryAdjustedxGoalsAgainst,scoreVenueAdjustedxGoalsAgainst,flurryScoreVenueAdjustedxGoalsAgainst,shotsOnGoalAgainst,missedShotsAgainst,blockedShotAttemptsAgainst,shotAttemptsAgainst,goalsAgainst,reboundsAgainst,reboundGoalsAgainst,freezeAgainst,playStoppedAgainst,playContinuedInZoneAgainst,playContinuedOutsideZoneAgainst,savedShotsOnGoalAgainst,savedUnblockedShotAttemptsAgainst,penaltiesAgainst,penalityMinutesAgainst,faceOffsWonAgainst,hitsAgainst,takeawaysAgainst,giveawaysAgainst,lowDangerShotsAgainst,mediumDangerShotsAgainst,highDangerShotsAgainst,lowDangerxGoalsAgainst,mediumDangerxGoalsAgainst,highDangerxGoalsAgainst,lowDangerGoalsAgainst,mediumDangerGoalsAgainst,highDangerGoalsAgainst,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgain
st,scoreAdjustedUnblockedShotAttemptsAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,penalitiesFor,penalitiesAgainst,playoff
1811,MTL,2019,MTL,MTL,Team Level,all,71,0.53,0.53,0.52,258596.0,2386.76,223.07,163.0,558.6,78.11,1309.56,969.79,213.98,224.14,215.04,2424.0,880.0,1148.0,4452.0,208.0,188.0,34.0,532.0,106.0,1053.0,1217.0,2216.0,3096.0,229.0,490.0,2140.0,1882.0,489.0,874.0,2434.0,626.0,244.0,64.85,77.88,80.34,66.0,67.0,75.0,4475.95,3304.0,3322.1,676.0,35.76,39.18,39.18,219.52,220.49,215.66,2142.02,201.0,150.35,501.51,71.42,1196.33,869.51,193.64,201.39,193.99,2210.0,782.0,928.0,3920.0,220.0,162.0,31.0,520.0,72.0,945.0,1073.0,1990.0,2772.0,205.0,442.0,2104.0,1693.0,531.0,911.0,2229.0,564.0,199.0,61.68,70.37,68.95,69.0,86.0,65.0,3920.91,2992.0,2993.71,686.0,33.29,31.16,31.21,202.96,203.28,198.78,,,0
2231,MIN,2022,MIN,MIN,Team Level,all,82,0.51,0.5,0.51,301108.0,2614.25,258.75,181.28,591.9,85.41,1428.64,1061.03,246.01,260.21,247.45,2535.0,1072.0,1187.0,4794.0,239.0,245.0,35.0,533.0,74.0,1159.0,1357.0,2296.0,3368.0,336.0,781.0,2181.0,1784.0,541.0,533.0,2572.0,775.0,260.0,73.51,94.96,90.27,60.0,98.0,81.0,4829.45,3607.0,3632.57,269.0,40.65,49.9,49.9,249.5,250.89,242.84,2532.86,248.36,173.68,585.17,84.28,1401.06,1008.46,235.35,247.3,234.4,2552.0,949.0,1318.0,4819.0,219.0,225.0,27.0,480.0,98.0,1128.0,1351.0,2333.0,3282.0,336.0,790.0,2393.0,1638.0,475.0,590.0,2535.0,720.0,246.0,74.37,86.14,87.86,69.0,84.0,66.0,4792.63,3501.0,3488.72,327.0,38.38,47.43,47.53,239.21,238.53,231.07,,,1
1046,OTT,2014,OTT,OTT,Team Level,all,82,0.52,0.51,0.5,301584.0,2523.5,225.45,168.13,582.83,81.27,1399.27,1056.04,217.68,225.92,218.15,2540.0,973.0,1300.0,4813.0,232.0,176.0,40.0,521.0,76.0,1134.0,1374.0,2308.0,3281.0,325.0,759.0,2421.0,2279.0,648.0,719.0,2691.0,606.0,216.0,73.86,72.94,78.64,69.0,76.0,87.0,4824.66,3513.0,3521.02,438.0,37.33,41.09,42.26,220.51,221.12,215.88,2552.27,207.73,169.79,595.31,83.24,1436.51,1077.43,201.63,207.44,201.36,2635.0,935.0,1045.0,4615.0,208.0,169.0,31.0,615.0,104.0,1151.0,1323.0,2427.0,3362.0,346.0,803.0,2599.0,2121.0,536.0,736.0,2814.0,563.0,193.0,75.1,68.11,64.53,78.0,65.0,65.0,4605.52,3570.0,3566.98,453.0,37.23,35.31,36.42,208.53,208.05,204.38,,,1
1146,TOR,2015,TOR,TOR,Team Level,all,82,0.51,0.52,0.51,299792.0,2658.51,254.52,184.12,612.18,89.44,1486.16,1075.64,243.61,253.16,242.32,2515.0,1188.0,1333.0,5036.0,192.0,196.0,41.0,633.0,105.0,1235.0,1342.0,2323.0,3511.0,324.0,694.0,2617.0,2119.0,669.0,626.0,2720.0,732.0,251.0,77.98,89.48,87.05,52.0,62.0,78.0,4973.34,3703.0,3669.07,411.0,41.24,41.81,44.8,250.9,249.31,242.91,2534.21,242.03,169.08,582.83,82.99,1405.96,1040.1,232.75,244.53,235.16,2503.0,1020.0,1035.0,4558.0,240.0,144.0,40.0,655.0,101.0,1140.0,1243.0,2263.0,3283.0,322.0,676.0,2556.0,2143.0,630.0,722.0,2585.0,735.0,203.0,76.66,88.48,76.88,81.0,86.0,73.0,4622.93,3523.0,3564.4,440.0,37.21,36.08,38.95,240.27,242.86,238.18,,,0
1076,BOS,2015,BOS,BOS,Team Level,all,82,0.48,0.5,0.5,298711.0,2560.21,222.66,171.53,601.5,84.87,1418.25,1057.18,214.58,223.68,215.47,2620.0,936.0,1136.0,4692.0,236.0,139.0,36.0,679.0,82.0,1103.0,1318.0,2384.0,3320.0,348.0,809.0,2592.0,2164.0,501.0,705.0,2714.0,658.0,184.0,75.57,79.84,67.26,72.0,87.0,77.0,4710.84,3556.0,3572.38,317.0,37.94,33.03,35.65,224.96,225.63,221.18,2528.12,237.95,172.27,572.95,82.95,1406.15,1055.73,230.38,238.92,231.35,2492.0,1036.0,1184.0,4712.0,228.0,136.0,45.0,597.0,104.0,1092.0,1371.0,2264.0,3300.0,309.0,745.0,2548.0,1969.0,616.0,859.0,2652.0,644.0,232.0,75.95,77.2,84.8,62.0,68.0,98.0,4703.85,3528.0,3525.32,428.0,38.02,37.5,38.94,237.03,237.87,232.98,,,0


In [337]:
# Index range, dtype counts and memory usage of the training split.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 415 entries, 1811 to 2266
Columns: 110 entries, team to playoff
dtypes: float64(102), int64(3), object(5)
memory usage: 359.9+ KB


In [338]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df_train.describe()

Unnamed: 0,season,games_played,xGoalsPercentage,corsiPercentage,fenwickPercentage,iceTime,xOnGoalFor,xGoalsFor,xReboundsFor,xFreezeFor,xPlayStoppedFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,flurryAdjustedxGoalsFor,scoreVenueAdjustedxGoalsFor,flurryScoreVenueAdjustedxGoalsFor,shotsOnGoalFor,missedShotsFor,blockedShotAttemptsFor,shotAttemptsFor,goalsFor,reboundsFor,reboundGoalsFor,freezeFor,playStoppedFor,playContinuedInZoneFor,playContinuedOutsideZoneFor,savedShotsOnGoalFor,savedUnblockedShotAttemptsFor,penaltiesFor,penalityMinutesFor,faceOffsWonFor,hitsFor,takeawaysFor,giveawaysFor,lowDangerShotsFor,mediumDangerShotsFor,highDangerShotsFor,lowDangerxGoalsFor,mediumDangerxGoalsFor,highDangerxGoalsFor,lowDangerGoalsFor,mediumDangerGoalsFor,highDangerGoalsFor,scoreAdjustedShotsAttemptsFor,unblockedShotAttemptsFor,scoreAdjustedUnblockedShotAttemptsFor,dZoneGiveawaysFor,xGoalsFromxReboundsOfShotsFor,xGoalsFromActualReboundsOfShotsFor,reboundxGoalsFor,totalShotCreditFor,scoreAdjustedTotalShotCreditFor,scoreFlurryAdjustedTotalShotCreditFor,xOnGoalAgainst,xGoalsAgainst,xReboundsAgainst,xFreezeAgainst,xPlayStoppedAgainst,xPlayContinuedInZoneAgainst,xPlayContinuedOutsideZoneAgainst,flurryAdjustedxGoalsAgainst,scoreVenueAdjustedxGoalsAgainst,flurryScoreVenueAdjustedxGoalsAgainst,shotsOnGoalAgainst,missedShotsAgainst,blockedShotAttemptsAgainst,shotAttemptsAgainst,goalsAgainst,reboundsAgainst,reboundGoalsAgainst,freezeAgainst,playStoppedAgainst,playContinuedInZoneAgainst,playContinuedOutsideZoneAgainst,savedShotsOnGoalAgainst,savedUnblockedShotAttemptsAgainst,penaltiesAgainst,penalityMinutesAgainst,faceOffsWonAgainst,hitsAgainst,takeawaysAgainst,giveawaysAgainst,lowDangerShotsAgainst,mediumDangerShotsAgainst,highDangerShotsAgainst,lowDangerxGoalsAgainst,mediumDangerxGoalsAgainst,highDangerxGoalsAgainst,lowDangerGoalsAgainst,mediumDangerGoalsAgainst,highDangerGoalsAgainst,scoreAdjustedShotsAttemptsAgainst,unblockedShotAttemptsAgainst,scoreAdjustedUnblockedShotAttempt
sAgainst,dZoneGiveawaysAgainst,xGoalsFromxReboundsOfShotsAgainst,xGoalsFromActualReboundsOfShotsAgainst,reboundxGoalsAgainst,totalShotCreditAgainst,scoreAdjustedTotalShotCreditAgainst,scoreFlurryAdjustedTotalShotCreditAgainst,penalitiesFor,penalitiesAgainst,playoff
count,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,393.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,393.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,22.0,22.0,415.0
mean,2016.749398,75.233735,0.498843,0.499542,0.499373,274519.46988,2290.424747,214.467831,156.969036,526.062771,75.225084,1269.095855,942.051494,205.750964,214.995422,206.266675,2279.872289,910.163855,1088.568675,4278.604819,215.563855,179.026506,34.183133,505.209639,67.180723,1025.513253,1197.592771,2064.308434,2974.472289,291.977099,668.306024,2180.959036,1683.575904,509.053012,695.983133,2383.13494,598.768675,208.13253,68.752024,72.808434,72.906867,72.077108,73.975904,69.510843,4283.876193,3190.036145,3195.072867,384.66988,34.710048,39.113181,39.838289,208.932145,209.368723,204.351783,2294.475735,215.45494,157.454072,526.622506,75.392819,1271.516193,943.350434,206.694289,216.102506,207.324,2284.224096,911.677108,1091.368675,4287.26988,217.479518,179.807229,34.433735,505.4,66.898795,1027.339759,1199.028916,2066.744578,2978.421687,291.458015,666.53012,2185.378313,1682.250602,508.843373,695.720482,2384.828916,601.607229,209.46506,68.919711,73.188,73.347084,72.949398,74.544578,69.985542,4296.36012,3195.901205,3203.309614,382.672289,34.819181,39.273807,40.008145,209.861904,210.408651,205.361277,280.5,276.863636,0.508434
std,5.319715,13.164541,0.033015,0.028053,0.027959,47992.924366,439.913683,44.95389,30.07213,101.685485,14.563432,244.675024,186.376105,42.613621,45.283737,42.939025,456.018593,185.381633,231.88908,830.346402,46.326974,52.05118,9.979941,112.524187,34.350968,210.110482,233.01383,416.593815,576.969882,80.658843,203.655531,416.256889,399.78494,154.38911,209.507156,462.567265,132.125908,54.900519,13.411701,16.117039,19.121635,17.279566,18.394218,17.040283,838.804654,616.93942,622.316283,126.169981,6.673882,11.208283,11.08634,42.747584,43.071518,41.779493,433.50937,44.381106,29.967013,99.188333,14.282357,242.618922,184.230592,42.130146,44.834833,42.565506,453.23465,180.636427,236.805372,819.532186,47.554281,50.187104,9.862084,112.315696,34.613384,207.429233,232.187631,413.003009,568.796384,78.700504,198.213028,413.516274,386.354365,146.629053,194.945066,456.12175,129.139263,53.594047,13.350117,15.774179,18.744435,18.370577,17.897516,16.882652,829.783795,610.139643,616.470206,114.061529,6.634432,10.52255,10.381141,42.23313,42.621628,41.33533,21.014168,20.08149,0.500532
min,2008.0,39.0,0.4,0.38,0.39,143241.0,1113.78,107.97,78.0,260.38,38.17,596.53,416.24,103.62,107.64,103.34,1014.0,450.0,557.0,2121.0,102.0,72.0,7.0,199.0,1.0,522.0,522.0,902.0,1386.0,118.0,263.0,1012.0,708.0,164.0,219.0,1075.0,264.0,80.0,32.11,31.99,32.03,31.0,25.0,28.0,2128.93,1511.0,1506.58,127.0,17.57,15.16,15.69,100.97,100.85,98.81,1107.1,102.23,76.79,259.54,37.01,597.64,435.46,98.7,102.73,99.17,1001.0,435.0,544.0,2124.0,89.0,68.0,9.0,201.0,0.0,496.0,483.0,880.0,1383.0,121.0,259.0,1010.0,689.0,163.0,263.0,1120.0,274.0,80.0,32.81,33.76,27.35,29.0,25.0,25.0,2126.1,1517.0,1519.61,141.0,17.2,13.93,15.91,98.46,96.76,93.94,242.0,250.0,0.0
25%,2012.0,81.0,0.48,0.48,0.48,295638.5,2193.67,197.85,149.91,496.465,71.34,1210.515,897.815,190.455,197.75,190.6,2191.5,831.5,981.5,4062.0,196.0,142.5,28.0,459.0,50.5,932.5,1167.5,1987.5,2840.0,243.0,535.0,2088.0,1445.5,424.0,565.5,2288.0,539.0,176.0,64.8,65.755,61.375,61.0,63.0,60.0,4043.475,3057.0,3037.07,289.0,33.14,31.555,32.125,195.53,195.345,191.35,2219.34,195.905,150.615,502.955,72.26,1221.495,915.78,189.315,196.04,189.745,2214.0,820.5,967.5,4086.0,197.0,145.5,28.0,454.5,51.5,934.5,1161.0,2002.5,2879.5,250.0,551.5,2113.5,1490.5,425.0,572.5,2300.5,550.5,176.0,66.115,66.68,61.535,62.5,65.0,61.0,4111.145,3095.5,3092.005,297.0,33.265,32.46,33.0,196.33,196.305,192.365,265.0,260.5,0.0
50%,2017.0,82.0,0.5,0.5,0.5,298641.0,2414.71,222.35,165.95,552.08,79.24,1344.47,1001.88,213.98,223.44,214.71,2413.0,937.0,1128.0,4532.0,223.0,171.0,34.0,529.0,75.0,1068.0,1278.0,2181.0,3152.0,292.0,651.0,2329.0,1745.0,521.0,670.0,2519.0,619.0,204.0,72.94,75.21,72.25,73.0,75.0,71.0,4532.21,3380.0,3373.99,367.0,36.59,38.07,38.82,218.45,218.44,213.35,2428.76,224.72,166.05,554.83,79.73,1338.2,994.68,216.73,225.41,216.61,2416.0,947.0,1127.0,4558.0,222.0,174.0,35.0,528.0,74.0,1077.0,1273.0,2185.0,3135.0,292.0,645.0,2335.0,1735.0,523.0,676.0,2522.0,625.0,208.0,72.01,76.18,73.08,72.0,76.0,72.0,4553.46,3371.0,3365.68,366.0,36.62,38.08,39.17,219.55,219.51,214.98,280.0,277.0,1.0
75%,2021.5,82.0,0.52,0.52,0.52,299605.0,2577.285,240.655,175.885,592.055,84.975,1425.315,1063.54,231.815,241.34,232.055,2571.0,1029.5,1241.0,4819.5,245.0,210.5,40.5,583.5,89.0,1162.5,1342.0,2322.0,3336.0,346.0,792.5,2441.5,1955.5,605.5,804.5,2683.5,684.0,242.0,77.175,82.985,84.105,84.0,85.0,81.0,4845.915,3578.0,3595.58,460.0,38.92,46.145,46.8,234.085,234.665,229.05,2573.45,243.08,176.31,586.67,84.19,1425.175,1060.53,233.495,244.38,234.35,2571.0,1026.0,1251.0,4817.0,248.5,208.0,42.0,586.0,89.0,1168.0,1347.0,2326.5,3350.0,343.0,782.5,2455.5,1928.5,608.5,789.0,2664.0,689.0,240.5,77.67,83.675,84.71,84.0,86.0,80.0,4838.265,3588.5,3610.515,453.5,38.87,45.405,45.885,238.71,238.9,232.745,296.75,288.75,1.0
max,2025.0,82.0,0.59,0.6,0.59,306407.0,3026.48,319.35,216.43,744.55,103.59,1675.11,1221.49,302.83,321.53,305.0,3062.0,1527.0,1670.0,5760.0,337.0,360.0,65.0,741.0,187.0,1549.0,1594.0,2733.0,3963.0,525.0,1316.0,2747.0,2685.0,948.0,1414.0,3079.0,979.0,376.0,93.11,120.44,139.4,112.0,127.0,114.0,5806.33,4225.0,4287.03,772.0,48.88,76.23,76.78,303.4,305.69,296.92,3146.67,320.76,225.16,714.33,104.37,1752.87,1246.5,304.99,326.35,310.33,3208.0,1365.0,1673.0,5826.0,335.0,344.0,62.0,712.0,196.0,1528.0,1571.0,2873.0,4029.0,489.0,1219.0,2721.0,2537.0,947.0,1349.0,3105.0,929.0,383.0,93.9,115.13,133.02,126.0,126.0,114.0,5989.34,4364.0,4473.82,760.0,49.81,73.94,73.98,305.12,308.56,299.43,320.0,314.0,1.0


In [339]:
# Names of the text (object) columns. Inspect the training split, like every other
# EDA cell, rather than the unsplit frame that still contains the test rows.
df_train.select_dtypes(include=["object"]).columns.tolist()

['team', 'name', 'team.1', 'position', 'situation']

In [340]:
# Names of the integer columns. Inspect the training split, like every other EDA
# cell, rather than the unsplit frame that still contains the test rows.
df_train.select_dtypes(include=["int64"]).columns.tolist()

['season', 'games_played', 'playoff']

In [341]:
# Per-column flag: does the training split contain at least one missing value?
missing_flags = df_train.isna().any()
with pd.option_context('display.max_rows', None):  # print every column, untruncated
    print(missing_flags)

team                                         False
season                                       False
name                                         False
team.1                                       False
position                                     False
situation                                    False
games_played                                 False
xGoalsPercentage                             False
corsiPercentage                              False
fenwickPercentage                            False
iceTime                                      False
xOnGoalFor                                   False
xGoalsFor                                    False
xReboundsFor                                 False
xFreezeFor                                   False
xPlayStoppedFor                              False
xPlayContinuedInZoneFor                      False
xPlayContinuedOutsideZoneFor                 False
flurryAdjustedxGoalsFor                      False
scoreVenueAdjustedxGoalsFor    

In [342]:
# Per-column count of missing values in the training split.
missing_counts = df_train.isnull().sum()
with pd.option_context('display.max_rows', None):  # print every column, untruncated
    print(missing_counts)

team                                           0
season                                         0
name                                           0
team.1                                         0
position                                       0
situation                                      0
games_played                                   0
xGoalsPercentage                               0
corsiPercentage                                0
fenwickPercentage                              0
iceTime                                        0
xOnGoalFor                                     0
xGoalsFor                                      0
xReboundsFor                                   0
xFreezeFor                                     0
xPlayStoppedFor                                0
xPlayContinuedInZoneFor                        0
xPlayContinuedOutsideZoneFor                   0
flurryAdjustedxGoalsFor                        0
scoreVenueAdjustedxGoalsFor                    0
flurryScoreVenueAdju

The histogram cell below is commented out because its output is very long (one subplot per float column). Feel free to uncomment it, but the notebook sometimes misbehaves when rendering it (at least in VS Code).

In [343]:
# # this is without games_played normalization.
# cols_to_plot = df_train.select_dtypes(include=["float64"]).columns.tolist()
# fig, axes = plt.subplots(nrows=len(cols_to_plot), ncols=1, figsize=(10, 3*len(cols_to_plot)))

# for i, c in enumerate(cols_to_plot):
#     df_train.groupby("playoff")[c].plot.hist(
#         bins=50,
#         alpha=0.5, 
#         legend=True, 
#         density=True, 
#         ax=axes[i]
#     )
    
#     axes[i].set_title(f"Histogram of {c}")
#     axes[i].set_xlabel(c)

# plt.tight_layout()
# plt.show()


In [344]:
# Pairwise correlation (pandas default: Pearson) between three expected-outcome
# stats and the playoff label, to see how redundant the three are.
df_train[["xFreezeFor", "xPlayStoppedFor", "xPlayContinuedInZoneFor", "playoff"]].corr() 

Unnamed: 0,xFreezeFor,xPlayStoppedFor,xPlayContinuedInZoneFor,playoff
xFreezeFor,1.0,0.994001,0.976669,0.112922
xPlayStoppedFor,0.994001,1.0,0.986277,0.125181
xPlayContinuedInZoneFor,0.976669,0.986277,1.0,0.133454
playoff,0.112922,0.125181,0.133454,1.0


Let's include just one of these, since they are almost perfectly correlated with each other (r > 0.97). I'll choose xPlayContinuedInZoneFor because it has the highest correlation with playoff (only marginally, so I should not read too much into it). By extension, I'll also use the other xPlayContinued(In/Outside)Zone(For/Against) features.
<br>
<br>

In [345]:
# Pairwise correlation (pandas default: Pearson) of the xGoals share and the
# adjusted xGoals total with the playoff label.
df_train[["xGoalsPercentage", "flurryScoreVenueAdjustedxGoalsFor", "playoff"]].corr()

Unnamed: 0,xGoalsPercentage,flurryScoreVenueAdjustedxGoalsFor,playoff
xGoalsPercentage,1.0,0.343666,0.610125
flurryScoreVenueAdjustedxGoalsFor,0.343666,1.0,0.197803
playoff,0.610125,0.197803,1.0


In [346]:
# Pairwise correlation (pandas default: Pearson) of the Corsi/Fenwick shares and
# xOnGoalFor with the playoff label.
df_train[["corsiPercentage", "fenwickPercentage", "xOnGoalFor", "playoff"]].corr()

Unnamed: 0,corsiPercentage,fenwickPercentage,xOnGoalFor,playoff
corsiPercentage,1.0,0.964455,0.333351,0.434631
fenwickPercentage,0.964455,1.0,0.318613,0.476752
xOnGoalFor,0.333351,0.318613,1.0,0.130911
playoff,0.434631,0.476752,0.130911,1.0


Hmm... just Fenwick, or both? corsiPercentage and fenwickPercentage are highly correlated with each other (r ≈ 0.96). My gut says to try both first, so I'll do that. I want to learn more systematic, data-driven approaches for making these decisions rather than relying on my gut...
<br>
<br>

In [347]:
# Candidate features for the 2025 season only, to compare their magnitude with
# the other seasons.
preview_cols_2025 = [
    "xGoalsPercentage",
    "xGoalsFor",
    "xOnGoalFor",
    "xPlayContinuedInZoneFor",
    "xPlayContinuedOutsideZoneFor",
    "xPlayContinuedInZoneAgainst",
    "xPlayContinuedOutsideZoneAgainst",
    "flurryScoreVenueAdjustedxGoalsFor",
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
]
df_train.loc[df_train["season"] == 2025, preview_cols_2025].head()

Unnamed: 0,xGoalsPercentage,xGoalsFor,xOnGoalFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,xPlayContinuedInZoneAgainst,xPlayContinuedOutsideZoneAgainst,flurryScoreVenueAdjustedxGoalsFor,lowDangerxGoalsFor,mediumDangerxGoalsFor
2626,0.51,119.59,1271.8,686.13,496.12,637.67,472.58,112.91,36.78,37.11
2621,0.51,132.41,1310.37,702.01,511.82,700.09,500.51,126.21,40.74,41.35
2631,0.51,129.45,1357.5,733.88,522.22,644.06,462.66,124.77,40.97,43.64
2611,0.48,128.02,1312.02,710.66,504.2,706.48,502.84,121.41,39.56,43.13
2616,0.46,123.33,1324.86,727.08,512.48,758.93,542.46,119.39,37.16,42.03


In [348]:
# The same candidate features for every season except 2025, as the baseline
# for the magnitude comparison.
preview_cols_other = [
    "xGoalsPercentage",
    "xGoalsFor",
    "xOnGoalFor",
    "xPlayContinuedInZoneFor",
    "xPlayContinuedOutsideZoneFor",
    "xPlayContinuedInZoneAgainst",
    "xPlayContinuedOutsideZoneAgainst",
    "flurryScoreVenueAdjustedxGoalsFor",
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
]
df_train.loc[df_train["season"] != 2025, preview_cols_other].head()

Unnamed: 0,xGoalsPercentage,xGoalsFor,xOnGoalFor,xPlayContinuedInZoneFor,xPlayContinuedOutsideZoneFor,xPlayContinuedInZoneAgainst,xPlayContinuedOutsideZoneAgainst,flurryScoreVenueAdjustedxGoalsFor,lowDangerxGoalsFor,mediumDangerxGoalsFor
1811,0.53,223.07,2386.76,1309.56,969.79,1196.33,869.51,215.04,64.85,77.88
2231,0.51,258.75,2614.25,1428.64,1061.03,1401.06,1008.46,247.45,73.51,94.96
1046,0.52,225.45,2523.5,1399.27,1056.04,1436.51,1077.43,218.15,73.86,72.94
1146,0.51,254.52,2658.51,1486.16,1075.64,1405.96,1040.1,242.32,77.98,89.48
1076,0.48,222.66,2560.21,1418.25,1057.18,1406.15,1055.73,215.47,75.57,79.84


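All of these except xGoalsPercentage (and, of course, the Corsi and Fenwick percentages) are raw totals: the 2025 values are roughly half those of the other seasons, so they need adjusting for games played. (I will NOT be using xGoalsFor, as xGoalsPercentage is derived from it.)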
<br>
<br>

### 4. Feature Engineering

We want features that can be used regardless of games played. Only 3 in the current dataset fit this description ("xGoalsPercentage", "corsiPercentage", "fenwickPercentage"). We'll create some more ratios (per-game rates) using the relevant features and the EDA done above.

-> These initial relevant features were identified using histograms and corr(). Some decisions were also made using my (limited) domain knowledge/gut instinct.

Features I chose in the EDA above:
- corsiPercentage
- fenwickPercentage
- xGoalsPercentage
- xOnGoalFor
- xPlayContinuedInZoneFor
- xPlayContinuedOutsideZoneFor
- xPlayContinuedInZoneAgainst
- xPlayContinuedOutsideZoneAgainst
- flurryScoreVenueAdjustedxGoalsFor
- lowDangerShotsFor
- lowDangerxGoalsFor
- lowDangerGoalsFor
- lowDangerShotsAgainst
- lowDangerxGoalsAgainst
- lowDangerGoalsAgainst

In [349]:
def engineer_features(df):
    """Return the model-ready frame: percentage features, per-game rates, target.

    Every raw counting column listed in ``count_cols`` is divided by
    ``games_played`` and stored as ``<column>_PerGame``. The result keeps only
    the three percentage columns, the per-game columns and ``playoff``, in
    that order. The frame passed in is left untouched (work is done on a copy).
    """
    percentage_cols = ["corsiPercentage", "fenwickPercentage", "xGoalsPercentage"]
    count_cols = [
        "xOnGoalFor",
        "xPlayContinuedInZoneFor",
        "xPlayContinuedOutsideZoneFor",
        "xPlayContinuedInZoneAgainst",
        "xPlayContinuedOutsideZoneAgainst",
        "flurryScoreVenueAdjustedxGoalsFor",
        "lowDangerShotsFor",
        "lowDangerxGoalsFor",
        "lowDangerGoalsFor",
        "lowDangerShotsAgainst",
        "lowDangerxGoalsAgainst",
        "lowDangerGoalsAgainst",
    ]

    out = df.copy()
    per_game_cols = []
    for raw_col in count_cols:
        rate_col = f"{raw_col}_PerGame"
        # Normalise the season total by the number of games played.
        out[rate_col] = out[raw_col] / out["games_played"]
        per_game_cols.append(rate_col)

    return out[percentage_cols + per_game_cols + ["playoff"]]

In [350]:
# Apply the same feature engineering to both splits.
# NOTE: engineer_features drops the raw columns it reads, so running this cell
# a second time on the already-engineered frames raises a KeyError.
df_train, df_test = (engineer_features(split) for split in (df_train, df_test))

In [351]:
# Row/column counts of the engineered train and test frames, one per line.
print(df_train.shape, df_test.shape, sep="\n")

(415, 16)
(139, 16)


In [352]:
# Summary statistics of the engineered training frame (features and target).
df_train.describe()

Unnamed: 0,corsiPercentage,fenwickPercentage,xGoalsPercentage,xOnGoalFor_PerGame,xPlayContinuedInZoneFor_PerGame,xPlayContinuedOutsideZoneFor_PerGame,xPlayContinuedInZoneAgainst_PerGame,xPlayContinuedOutsideZoneAgainst_PerGame,flurryScoreVenueAdjustedxGoalsFor_PerGame,lowDangerShotsFor_PerGame,lowDangerxGoalsFor_PerGame,lowDangerGoalsFor_PerGame,lowDangerShotsAgainst_PerGame,lowDangerxGoalsAgainst_PerGame,lowDangerGoalsAgainst_PerGame,playoff
count,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0,415.0
mean,0.499542,0.499373,0.498843,30.430503,16.850833,12.49143,16.888999,12.512154,2.740694,31.64053,0.913229,0.960773,31.67392,0.915814,0.972173,0.508434
std,0.028053,0.027959,0.033015,2.166567,1.165354,0.885424,1.145318,0.832302,0.287476,2.304952,0.070897,0.16676,2.180172,0.07163,0.182931,0.500532
min,0.38,0.39,0.4,23.309512,12.859268,9.654878,13.570732,10.158902,2.007805,23.54878,0.636341,0.463415,25.060976,0.712073,0.573171,0.0
25%,0.48,0.48,0.48,28.804573,16.040188,11.932425,16.077012,11.91939,2.548171,30.036585,0.867853,0.841463,30.25588,0.864512,0.841463,0.0
50%,0.5,0.5,0.5,30.376463,16.878393,12.460854,16.924024,12.494756,2.712899,31.597561,0.912561,0.95122,31.560976,0.914167,0.95122,1.0
75%,0.52,0.52,0.52,31.819024,17.564512,13.09689,17.580854,13.061762,2.913171,33.060976,0.954878,1.073171,33.071797,0.963232,1.085366,1.0
max,0.6,0.59,0.59,37.434146,20.428171,14.89622,21.376463,15.20122,3.719512,38.073171,1.166829,1.682927,37.865854,1.145122,1.585366,1.0


The standard deviations are now reasonable numbers, so the features can be used for analysis regardless of games played!
(As an extra check, I also ran a session that kept `season` and compared the 2025 and non-2025 numbers; the values were in the same range.)

### 5. Preprocessing (and a DummyClassifier first)

In [353]:
# Separate the feature matrix from the "playoff" label for each split.
X_train, y_train = df_train.drop(columns="playoff"), df_train["playoff"]

X_test, y_test = df_test.drop(columns="playoff"), df_test["playoff"]

In [354]:
# Shapes of the feature matrices and label vectors, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(415, 15)
(415,)
(139, 15)
(139,)


In [356]:
# Baseline: cross-validate a DummyClassifier with default settings so later
# models have a reference score to beat.
dummy = DummyClassifier()
dummy_df = pd.DataFrame(
    cross_validate(estimator=dummy, X=X_train, y=y_train, return_train_score=True)
)
dummy_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.000615,0.000448,0.506024,0.509036
1,0.000597,0.000587,0.506024,0.509036
2,0.000671,0.000701,0.506024,0.509036
3,0.000555,0.000426,0.506024,0.509036
4,0.000418,0.00036,0.518072,0.506024


In [358]:
# Column dtypes and non-null counts, to decide which preprocessing is needed.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 415 entries, 1811 to 2266
Data columns (total 15 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   corsiPercentage                            415 non-null    float64
 1   fenwickPercentage                          415 non-null    float64
 2   xGoalsPercentage                           415 non-null    float64
 3   xOnGoalFor_PerGame                         415 non-null    float64
 4   xPlayContinuedInZoneFor_PerGame            415 non-null    float64
 5   xPlayContinuedOutsideZoneFor_PerGame       415 non-null    float64
 6   xPlayContinuedInZoneAgainst_PerGame        415 non-null    float64
 7   xPlayContinuedOutsideZoneAgainst_PerGame   415 non-null    float64
 8   flurryScoreVenueAdjustedxGoalsFor_PerGame  415 non-null    float64
 9   lowDangerShotsFor_PerGame                  415 non-null    float64
 10  lowDangerxGoalsFor_PerGame 

In [360]:
# True if any cell in the feature matrix is missing.
X_train.isna().to_numpy().any().item()

False

all numeric, no missing values, all i need is StandardScaler(). no column transformer needed.

In [None]:
# NOTE(review): these six imports duplicate ones already present in the
# notebook's top import cell; this cell is redundant on a fresh Run All.
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier

In [362]:
# Candidate classifiers keyed by a short label.
# Estimators with a random component are seeded explicitly so that a
# Restart & Run All reproduces the same scores; 67 mirrors the seed already
# used for train_test_split.
MODEL_SEED = 67

models = {
    "lr": LogisticRegression(max_iter=1000),
    "rbf_svm": SVC(),
    "random_forest": RandomForestClassifier(random_state=MODEL_SEED),
    "xgboost": XGBClassifier(random_state=MODEL_SEED),
    "lightgbm": LGBMClassifier(random_state=MODEL_SEED),
    # verbose=0 silences CatBoost's per-iteration training log.
    "catboost": CatBoostClassifier(verbose=0, random_state=MODEL_SEED)
}