In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the NHL team-season statistics CSV and preview its first rows
nhl_csv_path = '../hockey_starting_data/nhl_analysis_79-11.csv'
nhl_df = pd.read_csv(nhl_csv_path)
nhl_df.head()

Unnamed: 0,tm_no,year,lgID,tmID,playoff,made_playoff,made_QF,made_CF,made_F,G,...,SHA,PKG,PKC,GF/gm,GA/gm,Goal_spread,PPG/gm,PPG_eff,PKG/gm,PK_eff
0,1,1979,NHL,ATF,PRE,1,0,0,0,80,...,7,52,272,3.52,3.36,0.16,0.64,0.236,0.65,0.191
1,2,1979,NHL,BOS,QF,1,1,0,0,80,...,4,53,312,3.88,2.92,0.96,0.75,0.244,0.66,0.17
2,3,1979,NHL,BUF,SF,1,1,1,0,80,...,4,43,252,3.98,2.51,1.47,0.84,0.244,0.54,0.171
3,4,1979,NHL,CHI,QF,1,1,0,0,80,...,9,56,293,3.01,3.12,-0.11,0.8,0.238,0.7,0.191
4,5,1979,NHL,COR,,0,0,0,0,80,...,3,52,233,2.92,3.85,-0.93,0.66,0.174,0.65,0.223


***Below we compare several feature sets to find the one that gives the best-performing machine learning model for predicting which teams qualify for the playoffs based on their season statistics.***

**Option #1: Use penalty minutes and power play/penalty kill goals**

In [3]:
# Target column first, followed by this option's feature columns
selected_columns = [
    "made_playoff",
    "PIM/gm",
    "PPG/gm",
    "PKG/gm",
]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,PIM/gm,PPG/gm,PKG/gm
0,1,13.1,0.64,0.65
1,1,18.3,0.75,0.66
2,1,12.1,0.84,0.54
3,1,16.6,0.8,0.7
4,0,12.8,0.66,0.65


In [4]:
# Define target and feature variables.
# drop(columns=...) returns a new frame, so nhl_playoff_predictor_df is left
# untouched and this cell can be re-run safely.
y = nhl_playoff_predictor_df["made_playoff"]
X = nhl_playoff_predictor_df.drop(columns=["made_playoff"])

# Splitting into Train and Test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Score the model (mean accuracy) on both the training and the testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7813559322033898
Testing Data Score: 0.7258883248730964


In [5]:
# Predict the class of every row in the testing data.
# The variable keeps its existing spelling because the following cell reads it.
predicitons = classifier.predict(X_test)

# Pair each prediction with its actual label, renumbering the rows from 0
results = pd.DataFrame(
    {
        "Prediction": predicitons,
        "Actual": y_test,
    }
)
results = results.reset_index(drop=True)

# Confusion matrix for the model: actual labels vs. predicted labels
confusion_matrix(y_true=y_test, y_pred=predicitons)

array([[ 34,  34],
       [ 20, 109]])

In [6]:
# Build the per-class classification report for the test predictions, then print it
target_names = ["0 (Did not make playoffs)", "1 (Made playoffs)"]
report_text = classification_report(y_test, predicitons, target_names=target_names)
print(report_text)

                           precision    recall  f1-score   support

0 (Did not make playoffs)       0.63      0.50      0.56        68
        1 (Made playoffs)       0.76      0.84      0.80       129

                 accuracy                           0.73       197
                macro avg       0.70      0.67      0.68       197
             weighted avg       0.72      0.73      0.72       197



**Option #2: Add goals for and against to the features used in Option #1**

In [7]:
# Target column first, followed by this option's feature columns
selected_columns = [
    "made_playoff",
    "GF/gm",
    "GA/gm",
    "PIM/gm",
    "PPG/gm",
    "PKG/gm",
]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,GF/gm,GA/gm,PIM/gm,PPG/gm,PKG/gm
0,1,3.52,3.36,13.1,0.64,0.65
1,1,3.88,2.92,18.3,0.75,0.66
2,1,3.98,2.51,12.1,0.84,0.54
3,1,3.01,3.12,16.6,0.8,0.7
4,0,2.92,3.85,12.8,0.66,0.65


In [8]:
# Define target and feature variables.
# drop(columns=...) returns a new frame, so nhl_playoff_predictor_df is left
# untouched and this cell can be re-run safely.
y = nhl_playoff_predictor_df["made_playoff"]
X = nhl_playoff_predictor_df.drop(columns=["made_playoff"])

# Splitting into Train and Test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Score the model (mean accuracy) on both the training and the testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8847457627118644
Testing Data Score: 0.883248730964467


In [9]:
# Predict the class of every row in the testing data.
# The variable keeps its existing spelling because the following cell reads it.
predicitons = classifier.predict(X_test)

# Pair each prediction with its actual label, renumbering the rows from 0
results = pd.DataFrame(
    {
        "Prediction": predicitons,
        "Actual": y_test,
    }
)
results = results.reset_index(drop=True)

# Confusion matrix for the model: actual labels vs. predicted labels
confusion_matrix(y_true=y_test, y_pred=predicitons)

array([[ 52,  16],
       [  7, 122]])

In [10]:
# Build the per-class classification report for the test predictions, then print it
target_names = ["0 (Did not make playoffs)", "1 (Made playoffs)"]
report_text = classification_report(y_test, predicitons, target_names=target_names)
print(report_text)

                           precision    recall  f1-score   support

0 (Did not make playoffs)       0.88      0.76      0.82        68
        1 (Made playoffs)       0.88      0.95      0.91       129

                 accuracy                           0.88       197
                macro avg       0.88      0.86      0.87       197
             weighted avg       0.88      0.88      0.88       197



**Option #3: Testing goals for and against without penalty statistics**

In [11]:
# Target column first, followed by this option's feature columns
selected_columns = [
    "made_playoff",
    "GF/gm",
    "GA/gm",
]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,GF/gm,GA/gm
0,1,3.52,3.36
1,1,3.88,2.92
2,1,3.98,2.51
3,1,3.01,3.12
4,0,2.92,3.85


In [12]:
# Define target and feature variables.
# drop(columns=...) returns a new frame, so nhl_playoff_predictor_df is left
# untouched and this cell can be re-run safely.
y = nhl_playoff_predictor_df["made_playoff"]
X = nhl_playoff_predictor_df.drop(columns=["made_playoff"])

# Splitting into Train and Test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Score the model (mean accuracy) on both the training and the testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8847457627118644
Testing Data Score: 0.8984771573604061


In [13]:
# Predict the class of every row in the testing data.
# The variable keeps its existing spelling because the following cell reads it.
predicitons = classifier.predict(X_test)

# Pair each prediction with its actual label, renumbering the rows from 0
results = pd.DataFrame(
    {
        "Prediction": predicitons,
        "Actual": y_test,
    }
)
results = results.reset_index(drop=True)

# Confusion matrix for the model: actual labels vs. predicted labels
confusion_matrix(y_true=y_test, y_pred=predicitons)

array([[ 53,  15],
       [  5, 124]])

In [14]:
# Build the per-class classification report for the test predictions, then print it
target_names = ["0 (Did not make playoffs)", "1 (Made playoffs)"]
report_text = classification_report(y_test, predicitons, target_names=target_names)
print(report_text)

                           precision    recall  f1-score   support

0 (Did not make playoffs)       0.91      0.78      0.84        68
        1 (Made playoffs)       0.89      0.96      0.93       129

                 accuracy                           0.90       197
                macro avg       0.90      0.87      0.88       197
             weighted avg       0.90      0.90      0.90       197



**Option #4: Use power play and penalty kill efficiency metrics instead of amounts per game**

In [15]:
# Target column first, followed by this option's feature columns
selected_columns = [
    "made_playoff",
    "GF/gm",
    "GA/gm",
    "PIM/gm",
    "PPG_eff",
    "PK_eff",
]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,GF/gm,GA/gm,PIM/gm,PPG_eff,PK_eff
0,1,3.52,3.36,13.1,0.236,0.191
1,1,3.88,2.92,18.3,0.244,0.17
2,1,3.98,2.51,12.1,0.244,0.171
3,1,3.01,3.12,16.6,0.238,0.191
4,0,2.92,3.85,12.8,0.174,0.223


In [16]:
# Define target and feature variables.
# drop(columns=...) returns a new frame, so nhl_playoff_predictor_df is left
# untouched and this cell can be re-run safely.
y = nhl_playoff_predictor_df["made_playoff"]
X = nhl_playoff_predictor_df.drop(columns=["made_playoff"])

# Splitting into Train and Test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Score the model (mean accuracy) on both the training and the testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8847457627118644
Testing Data Score: 0.8883248730964467


In [17]:
# Predict the class of every row in the testing data.
# The variable keeps its existing spelling because the following cell reads it.
predicitons = classifier.predict(X_test)

# Pair each prediction with its actual label, renumbering the rows from 0
results = pd.DataFrame(
    {
        "Prediction": predicitons,
        "Actual": y_test,
    }
)
results = results.reset_index(drop=True)

# Confusion matrix for the model: actual labels vs. predicted labels
confusion_matrix(y_true=y_test, y_pred=predicitons)

array([[ 52,  16],
       [  6, 123]])

In [18]:
# Build the per-class classification report for the test predictions, then print it
target_names = ["0 (Did not make playoffs)", "1 (Made playoffs)"]
report_text = classification_report(y_test, predicitons, target_names=target_names)
print(report_text)

                           precision    recall  f1-score   support

0 (Did not make playoffs)       0.90      0.76      0.83        68
        1 (Made playoffs)       0.88      0.95      0.92       129

                 accuracy                           0.89       197
                macro avg       0.89      0.86      0.87       197
             weighted avg       0.89      0.89      0.89       197



**Option #5: Same features as Option #4, but with penalty minutes incurred removed from the model**

In [19]:
# Target column first, followed by this option's feature columns
selected_columns = [
    "made_playoff",
    "GF/gm",
    "GA/gm",
    "PPG_eff",
    "PK_eff",
]
nhl_playoff_predictor_df = nhl_df.loc[:, selected_columns]
nhl_playoff_predictor_df.head()

Unnamed: 0,made_playoff,GF/gm,GA/gm,PPG_eff,PK_eff
0,1,3.52,3.36,0.236,0.191
1,1,3.88,2.92,0.244,0.17
2,1,3.98,2.51,0.244,0.171
3,1,3.01,3.12,0.238,0.191
4,0,2.92,3.85,0.174,0.223


In [20]:
# Define target and feature variables.
# drop(columns=...) returns a new frame, so nhl_playoff_predictor_df is left
# untouched and this cell can be re-run safely.
y = nhl_playoff_predictor_df["made_playoff"]
X = nhl_playoff_predictor_df.drop(columns=["made_playoff"])

# Splitting into Train and Test sets (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

# Score the model (mean accuracy) on both the training and the testing data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8864406779661017
Testing Data Score: 0.9035532994923858


In [21]:
# Predict the class of every row in the testing data.
# The variable keeps its existing spelling because the following cell reads it.
predicitons = classifier.predict(X_test)

# Pair each prediction with its actual label, renumbering the rows from 0
results = pd.DataFrame(
    {
        "Prediction": predicitons,
        "Actual": y_test,
    }
)
results = results.reset_index(drop=True)

# Confusion matrix for the model: actual labels vs. predicted labels
confusion_matrix(y_true=y_test, y_pred=predicitons)

array([[ 53,  15],
       [  4, 125]])

In [22]:
# Build the per-class classification report for the test predictions, then print it
target_names = ["0 (Did not make playoffs)", "1 (Made playoffs)"]
report_text = classification_report(y_test, predicitons, target_names=target_names)
print(report_text)

                           precision    recall  f1-score   support

0 (Did not make playoffs)       0.93      0.78      0.85        68
        1 (Made playoffs)       0.89      0.97      0.93       129

                 accuracy                           0.90       197
                macro avg       0.91      0.87      0.89       197
             weighted avg       0.91      0.90      0.90       197



**This model (Option #5) achieved the best testing accuracy (about 0.90) of all the attempts. We deliberately left out each team's 'Pts/gm' metric so that the model would not be influenced directly by wins and losses, but only indirectly through the goal and efficiency metrics.**