In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import hvplot.pandas
from sklearn.cluster import KMeans



In [2]:
# Read the NHL team statistics CSV that sits one directory above the notebook,
# then preview the first five rows as a quick check that the load succeeded.
nhl_df = pd.read_csv(filepath_or_buffer='../nhl_analysis_79-11.csv')
nhl_df.head(n=5)

Unnamed: 0,year,lgID,tmID,playoff,made_playoff,made_QF,made_CF,made_F,G,W,...,PPC,SHA,PKG,PKC,GF/gm,GA/gm,PPG/gm,PPG_eff,PKG/gm,PK_eff
0,1979,NHL,ATF,PRE,1,0,0,0,80,35,...,216,7,52,272,3.52,3.36,0.64,0.236,0.65,0.191
1,1979,NHL,BOS,QF,1,1,0,0,80,46,...,246,4,53,312,3.88,2.92,0.75,0.244,0.66,0.17
2,1979,NHL,BUF,SF,1,1,1,0,80,47,...,275,4,43,252,3.98,2.51,0.84,0.244,0.54,0.171
3,1979,NHL,CHI,QF,1,1,0,0,80,34,...,269,9,56,293,3.01,3.12,0.8,0.238,0.7,0.191
4,1979,NHL,COR,,0,0,0,0,80,19,...,304,3,52,233,2.92,3.85,0.66,0.174,0.65,0.223


In [3]:
# Exploratory scatter plot of the raw data: the Pts/gm column (y axis)
# against the GF/gm column (x axis), one point per row of nhl_df.
nhl_df.hvplot(kind="scatter", x="GF/gm", y="Pts/gm")

In [4]:
# Narrow the full table down to the three columns the rest of the notebook
# works with: the made_playoff flag plus the two per-game statistics.
selected_columns = ["made_playoff", "Pts/gm", "GF/gm"]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,Pts/gm,GF/gm
0,1,1.04,3.52
1,1,1.31,3.88
2,1,1.38,3.98
3,1,1.09,3.01
4,0,0.64,2.92


In [5]:
# K-means with two clusters and a fixed seed so the run is reproducible.
# n_init is passed explicitly: leaving it at the default makes scikit-learn
# emit a FutureWarning (the default is changing from 10 to 'auto'), and pinning
# it to 10 keeps the number of centroid initialisations, and therefore the
# fitted clusters, the same across scikit-learn versions.
model = KMeans(n_clusters=2, n_init=10, random_state=1)
model

In [6]:
# Fit the clusterer on the selected columns. fit() returns the estimator, so
# as the last expression of the cell the fitted model is also displayed.
# NOTE(review): nhl_playoff_predictor_df still contains the made_playoff
# column, so the known outcome is one of the clustering inputs; confirm this
# is intended before reading the clusters as "predictions".
model.fit(X=nhl_playoff_predictor_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [7]:
# Assign every row to one of the fitted clusters (raw K-means cluster ids).
playoff_prediction = model.predict(nhl_playoff_predictor_df)

# K-means cluster ids are arbitrary, so a hard-coded 0 <-> 1 swap is only right
# for one particular fit. Instead, take the cluster whose members have the
# higher mean Pts/gm as the "made playoffs" cluster and map it to 1; every
# other row maps to 0.
# NOTE(review): this assumes playoff teams average more points per game than
# non-playoff teams; confirm that holds for this data.
mean_pts_by_cluster = nhl_playoff_predictor_df["Pts/gm"].groupby(playoff_prediction).mean()
playoff_cluster = mean_pts_by_cluster.idxmax()
playoff_prediction_swapped = [1 if label == playoff_cluster else 0 for label in playoff_prediction]

In [8]:
# Build a separate results frame: a copy of the clustering input with the
# aligned 0/1 cluster label appended as a new "playoff_prediction" column.
# The input frame itself is left unchanged.
nhl_predictions_df = nhl_playoff_predictor_df.assign(
    playoff_prediction=playoff_prediction_swapped
)

nhl_predictions_df.head()


Unnamed: 0,made_playoff,Pts/gm,GF/gm,playoff_prediction
0,1,1.04,3.52,1
1,1,1.31,3.88,1
2,1,1.38,3.98,1
3,1,1.09,3.01,1
4,0,0.64,2.92,0


In [9]:
# Human-readable legend text for the two prediction values.
legend_labels = {
    0: '0 (Missed Playoffs)',
    1: '1 (Made Playoffs)',
}

# Swap the numeric labels for the legend text, overwriting the column so the
# plot legend shows the descriptive strings instead of bare 0/1.
labelled_predictions = nhl_predictions_df["playoff_prediction"].replace(legend_labels)
nhl_predictions_df["playoff_prediction"] = labelled_predictions

In [10]:
# Same scatter as before, but grouped (and therefore coloured) by the
# predicted label. The actual made_playoff flag is added to the hover tooltip
# so each point can be checked against its cluster assignment.
nhl_predictions_df.hvplot(
    kind='scatter',
    x='GF/gm',
    y='Pts/gm',
    by="playoff_prediction",
    hover_cols=['made_playoff'],
    legend='bottom_right',
)

**Testing the model with 4 clusters**

In [11]:
# Second experiment: K-means with four clusters and a fixed seed.
# n_init is passed explicitly: leaving it at the default makes scikit-learn
# emit a FutureWarning (the default is changing from 10 to 'auto'), and pinning
# it to 10 keeps the number of centroid initialisations, and therefore the
# fitted clusters, the same across scikit-learn versions.
model = KMeans(n_clusters=4, n_init=10, random_state=2)
model

In [12]:
# Fit the four-cluster model on the same selected columns. fit() returns the
# estimator, so the fitted model is also what the cell displays.
# NOTE(review): nhl_playoff_predictor_df still contains the made_playoff
# column, so the known outcome is one of the clustering inputs; confirm this
# is intended.
model.fit(X=nhl_playoff_predictor_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [13]:
# Assign every row to one of the four fitted clusters. The values are raw
# K-means cluster ids, not playoff outcomes, despite the variable's name.
playoff_prediction = model.predict(X=nhl_playoff_predictor_df)

In [14]:
# Rebuild the results frame for the four-cluster run: a copy of the clustering
# input with the raw cluster id appended as the "playoff_prediction" column.
nhl_predictions_df = nhl_playoff_predictor_df.assign(
    playoff_prediction=playoff_prediction
)

nhl_predictions_df.head()

Unnamed: 0,made_playoff,Pts/gm,GF/gm,playoff_prediction
0,1,1.04,3.52,2
1,1,1.31,3.88,2
2,1,1.38,3.98,2
3,1,1.09,3.01,0
4,0,0.64,2.92,1


In [15]:
# Scatter of the four-cluster result, grouped (and coloured) by cluster id.
# The actual made_playoff flag is shown on hover for comparison.
nhl_predictions_df.hvplot(
    kind='scatter',
    x='GF/gm',
    y='Pts/gm',
    by="playoff_prediction",
    hover_cols=['made_playoff'],
    legend='bottom_right',
)