In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import hvplot.pandas
from sklearn.cluster import KMeans

In [41]:
# Read the NHL team-season statistics CSV into a DataFrame and preview it
NHL_DATA_PATH = '../hockey_starting_data/nhl_analysis_79-11.csv'
nhl_df = pd.read_csv(NHL_DATA_PATH)
nhl_df.head()

Unnamed: 0,year,lgID,tmID,playoff,made_playoff,made_QF,made_CF,made_F,G,W,...,SHA,PKG,PKC,GF/gm,GA/gm,Goal_spread,PPG/gm,PPG_eff,PKG/gm,PK_eff
0,1979,NHL,ATF,PRE,1,0,0,0,80,35,...,7,52,272,3.52,3.36,0.16,0.64,0.236,0.65,0.191
1,1979,NHL,BOS,QF,1,1,0,0,80,46,...,4,53,312,3.88,2.92,0.96,0.75,0.244,0.66,0.17
2,1979,NHL,BUF,SF,1,1,1,0,80,47,...,4,43,252,3.98,2.51,1.47,0.84,0.244,0.54,0.171
3,1979,NHL,CHI,QF,1,1,0,0,80,34,...,9,56,293,3.01,3.12,-0.11,0.8,0.238,0.7,0.191
4,1979,NHL,COR,,0,0,0,0,80,19,...,3,52,233,2.92,3.85,-0.93,0.66,0.174,0.65,0.223


In [42]:
# Exploratory scatter: goal spread on the x-axis, points per game on the y-axis
nhl_df.hvplot.scatter(
    x="Goal_spread",
    y="Pts/gm",
)

In [43]:
# Narrow the full table to the playoff flag and the two metrics plotted above
selected_columns = ["made_playoff", "Pts/gm", "Goal_spread"]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,Pts/gm,Goal_spread
0,1,1.04,0.16
1,1,1.31,0.96
2,1,1.38,1.47
3,1,1.09,-0.11
4,0,0.64,-0.93


In [44]:
# Candidate cluster counts: range(1, 11) yields k = 1 through 10
k = list(range(1, 11))

# Fit one K-Means model per candidate k (fixed seed) and collect each fitted
# model's `inertia_`, in the same order as `k`.
# NOTE(review): the frame being clustered still contains the `made_playoff`
# column next to the two performance metrics — confirm that is intended.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1)
    .fit(nhl_playoff_predictor_df)
    .inertia_
    for n_clusters in k
]

In [45]:
# Pair every candidate k with the inertia measured for it
elbow_data_original = dict(k=k, inertia=inertia)

# Tabular form of the same data, used to draw the Elbow curve
df_elbow_original = pd.DataFrame(data=elbow_data_original)

In [46]:
# Line plot of inertia against k, with one x tick per candidate k
df_elbow_original.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

**Creating a model with 2 clusters**

In [47]:
# Two-cluster K-Means estimator with a fixed seed; it is not fitted in this cell
model = KMeans(
    n_clusters=2,
    random_state=1,
)
model

In [48]:
# Fit the 2-cluster model on the selected columns
model.fit(X=nhl_playoff_predictor_df)

In [49]:
# Assign every team-season to one of the two fitted clusters
playoff_prediction = model.predict(nhl_playoff_predictor_df)

# K-Means numbers its clusters arbitrarily, so do not assume that label 0 is
# always the "made playoffs" group. Instead, pick the cluster whose centroid
# has the higher points-per-game value and treat that one as the playoff
# cluster. The centroid columns follow the column order of the fitted frame.
pts_per_game_position = nhl_playoff_predictor_df.columns.get_loc("Pts/gm")
playoff_cluster = model.cluster_centers_[:, pts_per_game_position].argmax()

# 1 = predicted to make the playoffs, 0 = predicted to miss them
playoff_prediction_swapped = [1 if label == playoff_cluster else 0 for label in playoff_prediction]

In [50]:
# New frame: the selected columns plus the 0/1 prediction derived from the
# 2-cluster model (the source frame itself is left unmodified)
nhl_predictions_df = nhl_playoff_predictor_df.assign(
    playoff_prediction=playoff_prediction_swapped
)

nhl_predictions_df.head()


Unnamed: 0,made_playoff,Pts/gm,Goal_spread,playoff_prediction
0,1,1.04,0.16,1
1,1,1.31,0.96,1
2,1,1.38,1.47,1
3,1,1.09,-0.11,1
4,0,0.64,-0.93,0


In [31]:
# Swap the numeric 0/1 predictions for descriptive strings so the plot legend
# is self-explanatory; the column holds text after this cell runs
legend_labels = {0: '0 (Missed Playoffs)', 1: '1 (Made Playoffs)'}
labelled_prediction = nhl_predictions_df["playoff_prediction"].replace(legend_labels)
nhl_predictions_df["playoff_prediction"] = labelled_prediction

In [52]:
# Scatter of the 2-cluster result: one colour per predicted group, with the
# actual `made_playoff` flag available on hover for comparison
goal_spread1 = nhl_predictions_df.hvplot.scatter(
    x='Goal_spread',
    y='Pts/gm',
    by="playoff_prediction",
    hover_cols=['made_playoff'],
    legend='bottom_right',
)

# An assignment alone produces no cell output; end the cell with the plot
# object so the figure is actually rendered in the notebook
goal_spread1

**Testing the model with 4 clusters**

In [34]:
# Four-cluster K-Means estimator with its own seed; this rebinds `model`,
# so the 2-cluster estimator is no longer reachable under that name
model = KMeans(
    n_clusters=4,
    random_state=2,
)
model

In [35]:
# Fit the 4-cluster model on the same selected columns
model.fit(X=nhl_playoff_predictor_df)

In [36]:
# Cluster id assigned to each row by the 4-cluster model
playoff_prediction = model.predict(X=nhl_playoff_predictor_df)

In [37]:
# Rebuild the predictions frame from the selected columns and attach the
# cluster id from the 4-cluster model as a new `cluster` column
nhl_predictions_df = nhl_playoff_predictor_df.assign(cluster=playoff_prediction)

nhl_predictions_df.head()

Unnamed: 0,made_playoff,Pts/gm,Goal_spread,cluster
0,1,1.04,0.16,1
1,1,1.31,0.96,3
2,1,1.38,1.47,3
3,1,1.09,-0.11,1
4,0,0.64,-0.93,0


In [38]:
# Scatter of the 4-cluster result: one colour per cluster id, with the actual
# `made_playoff` flag available on hover
goal_spread2 = nhl_predictions_df.hvplot.scatter(
    x='Goal_spread',
    y='Pts/gm',
    by="cluster",
    hover_cols=['made_playoff'],
    legend='bottom_right',
)

# Hand the plot to hvplot.show; the recorded output of this cell shows a
# local server being launched to serve it
hvplot.show(goal_spread2)

Launching server at http://localhost:61668


<panel.io.server.Server at 0x12a545510>