## Model Training

#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Load the feature table from the project-relative CSV file
csv_path = 'data/nhl_data_new_features.csv'
df = pd.read_csv(csv_path)

#### Show Top 5 Records

In [3]:
# Preview the first five rows to confirm the file parsed as expected
df.head(5)

Unnamed: 0,awayTeamCode,game_id,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,ANA,20001,LAK,1,0,2007,0,0,0,0,0,0,0,0,1,,
1,LAK,20002,ANA,1,0,2007,1,1,0,1,1,0,0,1,1,0.0,1.0
2,MTL,20003,CAR,0,0,2007,0,0,0,0,0,0,0,0,1,,
3,ANA,20004,DET,1,0,2007,0,2,0,0,1,1,0,1,0,,1.0
4,OTT,20005,TOR,0,0,2007,0,0,0,0,0,0,0,0,0,,


In [4]:
# Replace every missing value in the frame with 0 (rebinding `df` rather than
# mutating it in place).
# NOTE(review): in the preview above the missing values sit in the
# last_game_result_* columns, where 0.0 also occurs as a real value, so filled
# rows become indistinguishable from genuine zeros -- confirm this is intended.
df = df.fillna(0)

In [5]:
# Encode team codes as integers.
#
# A single encoder is fitted on the union of the home and away codes so that a
# given team always maps to the same integer in both encoded columns. Two
# independently fitted encoders only agree when both columns happen to contain
# exactly the same set of teams.
team_encoder = LabelEncoder()
team_encoder.fit(
    pd.concat([df['homeTeamCode'], df['awayTeamCode']], ignore_index=True)
)

# Keep the original names bound (both refer to the shared encoder) so any
# later cell that uses le_home / le_away keeps working.
le_home = team_encoder
le_away = team_encoder

# Add the integer-encoded columns next to the original string codes
df['homeTeamCode_encoded'] = le_home.transform(df['homeTeamCode'])
df['awayTeamCode_encoded'] = le_away.transform(df['awayTeamCode'])

#### Preparing X and Y variables

In [6]:
# Feature matrix: every column of df except game_id, the target (homeTeamWon)
# and the raw string team codes
non_feature_columns = ['game_id', 'homeTeamWon', 'homeTeamCode', 'awayTeamCode']
X = df.drop(columns=non_feature_columns)

In [7]:
# Inspect the last five feature rows, including the encoded team columns
X.tail(5)

Unnamed: 0,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away,homeTeamCode_encoded,awayTeamCode_encoded
21739,1,2023,102,101,61,41,66,35,6,7,1,0.0,1.0,11,12
21740,1,2023,103,102,61,42,67,35,5,8,1,0.0,1.0,11,12
21741,1,2023,103,104,67,36,62,42,7,5,1,0.0,1.0,12,11
21742,1,2023,105,104,63,42,67,37,5,6,1,1.0,0.0,11,12
21743,1,2023,105,106,67,38,64,42,6,6,1,0.0,1.0,12,11


In [8]:
# Target vector: the homeTeamWon column of df
target_column = 'homeTeamWon'
y = df[target_column]

In [9]:
# Display the target series as a quick sanity check
y

0        1
1        1
2        0
3        1
4        0
        ..
21739    0
21740    1
21741    0
21742    1
21743    1
Name: homeTeamWon, Length: 21744, dtype: int64

In [10]:
# Show the feature matrix dimensions as (rows, columns)
X.shape

(21744, 15)

#### Training and testing on base model

In [14]:
# Baseline feature set: only the four columns listed here
baseline_columns = ['isPlayoffGame', 'season', 'homeTeamCode_encoded', 'awayTeamCode_encoded']
X_b = X[baseline_columns]

# The baseline model predicts the same target as the full model
y_b = y

In [15]:
# Hold out 20% of the baseline rows for testing; the fixed random_state keeps
# the split reproducible. (train_test_split is imported in the top import cell.)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_b,
    y_b,
    test_size=0.2,
    random_state=42,
)
X_train_b.shape, X_test_b.shape

((17395, 4), (4349, 4))

#### Create an evaluation function that returns all metrics after model training

In [17]:
def evaluate_classification_model(y_true, y_pred):
    """Score a set of class predictions against the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, conf_matrix)`` as computed by the
        corresponding scikit-learn metric functions with their default settings.
    """
    # Scorers are listed in the order the values appear in the returned tuple
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [scorer(y_true, y_pred) for scorer in scorers]

    return (*scores, confusion_matrix(y_true, y_pred))

In [20]:
# Baseline classifier: a random forest with default hyperparameters and a fixed seed
model = RandomForestClassifier(random_state=42)
model.fit(X_train_b, y_train_b)

# Class predictions for the held-out baseline rows
y_pred = model.predict(X_test_b)

accuracy, precision, recall, f1, conf_matrix = evaluate_classification_model(y_test_b, y_pred)

# Report the metrics, one line per score, in a fixed order
print('----------------------------------')
print(model)
print('Model performance for Test set')
for template, score in (
    ("- Accuracy: {:.4f}", accuracy),
    ("- F1 Score: {:.4f}", f1),
    ("- Precision: {:.4f}", precision),
    ("- Recall: {:.4f}", recall),
):
    print(template.format(score))
print("-Confusion Matrix: ", conf_matrix)

print('=' * 35)
print('\n')

----------------------------------
RandomForestClassifier(random_state=42)
Model performance for Test set
- Accuracy: 0.5176
- F1 Score: 0.5555
- Precision: 0.5634
- Recall: 0.5478
-Confusion Matrix:  [[ 940 1016]
 [1082 1311]]




#### Modelling using the new features

In [25]:
# Hold out 10% of the full-feature rows for testing; the fixed random_state
# keeps the split reproducible. (train_test_split is imported in the top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
X_train.shape, X_test.shape

((19569, 15), (2175, 15))

In [27]:
# Candidate classifiers, kept in a list so they are trained and reported in the
# order written here. A set literal iterates in hash order, which for estimator
# objects does not follow the written order and can change between kernel
# sessions, so the report would not be reproducible.
# NOTE(review): the recorded run emitted an iteration-limit (convergence)
# warning for LogisticRegression; consider raising max_iter or scaling the
# features before trusting its scores.
models = [
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    RandomForestClassifier(
        n_estimators=50,        # Number of trees
        max_depth=5,            # Maximum depth
        min_samples_split=2,    # Minimum samples to split a node
        min_samples_leaf=10,    # Minimum samples in each leaf
        random_state=42),
]

for model in models:
    model.fit(X_train, y_train)  # Train on the full feature set

    # Predict binary win/loss outcomes for the test set
    y_pred = model.predict(X_test)

    accuracy, precision, recall, f1, conf_matrix = evaluate_classification_model(y_test, y_pred)

    # Report the metrics for this model
    print('----------------------------------')
    print(model)
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy))
    print("- F1 Score: {:.4f}".format(f1))
    print("- Precision: {:.4f}".format(precision))
    print("- Recall: {:.4f}".format(recall))
    print("-Confusion Matrix: ", conf_matrix)

    print('=' * 35)
    print('\n')

----------------------------------
RandomForestClassifier(random_state=42)
Model performance for Test set
- Accuracy: 0.5651
- F1 Score: 0.6276
- Precision: 0.5970
- Recall: 0.6614
-Confusion Matrix:  [[432 538]
 [408 797]]


----------------------------------
LogisticRegression()
Model performance for Test set
- Accuracy: 0.5793
- F1 Score: 0.6784
- Precision: 0.5884
- Recall: 0.8008
-Confusion Matrix:  [[295 675]
 [240 965]]




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


----------------------------------
RandomForestClassifier(max_depth=5, min_samples_leaf=10, n_estimators=50,
                       random_state=42)
Model performance for Test set
- Accuracy: 0.5867
- F1 Score: 0.7086
- Precision: 0.5814
- Recall: 0.9071
-Confusion Matrix:  [[ 183  787]
 [ 112 1093]]




### Results

In [1]:
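# Best model is the RandomForest with tuned parameters (F1 = 0.7086), compared with
# F1 = 0.6276 for the default RandomForest on the same features and test split.
# The baseline model (base features only) scored F1 = 0.5555, but on a different
# (20%) test split, so that figure is not directly comparable.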