## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

#### Import the CSV Data as Pandas DataFrame

In [2]:
df = pd.read_csv('data/nhl_data_new_features.csv')

#### Show Top 5 Records

In [3]:
df.head()

Unnamed: 0,awayTeamCode,game_id,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,MTL,20001,TOR,0,0,2015,0,0,0,0,0,0,0,0,1,,
1,NYR,20002,CHI,0,0,2015,0,0,0,0,0,0,0,0,0,,
2,VAN,20003,CGY,0,0,2015,0,0,0,0,0,0,0,0,0,,
3,SJS,20004,LAK,0,0,2015,0,0,0,0,0,0,0,0,0,,
4,WPG,20005,BOS,0,0,2015,0,0,0,0,0,0,0,0,1,,


In [4]:
# fill null values 
df.fillna(2,inplace=True)

In [5]:
df.drop('game_id',axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,awayTeamCode,homeTeamCode,homeTeamWon,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
0,MTL,TOR,0,0,2015,0,0,0,0,0,0,0,0,1,2.0,2.0
1,NYR,CHI,0,0,2015,0,0,0,0,0,0,0,0,0,2.0,2.0
2,VAN,CGY,0,0,2015,0,0,0,0,0,0,0,0,0,2.0,2.0
3,SJS,LAK,0,0,2015,0,0,0,0,0,0,0,0,0,2.0,2.0
4,WPG,BOS,0,0,2015,0,0,0,0,0,0,0,0,1,2.0,2.0


#### Preparing X and Y variables

In [7]:
# Define features (X) and target variable (y)
X = df.drop(columns=['homeTeamWon'])
y = df['homeTeamWon']

In [8]:
X.tail()

Unnamed: 0,awayTeamCode,homeTeamCode,isPlayoffGame,season,total_games_played_by_home,total_games_played_by_away,total_wins_home,total_losses_home,total_wins_away,total_losses_away,last_10_games_win_home,last_10_games_win_away,last_meeting_result,last_game_result_home,last_game_result_away
11710,FLA,EDM,1,2023,102,101,61,41,66,35,6,7,1,0.0,1.0
11711,FLA,EDM,1,2023,103,102,61,42,67,35,5,8,1,0.0,1.0
11712,EDM,FLA,1,2023,103,104,67,36,62,42,7,5,1,0.0,1.0
11713,FLA,EDM,1,2023,105,104,63,42,67,37,5,6,1,1.0,0.0
11714,EDM,FLA,1,2023,105,106,67,38,64,42,6,6,1,0.0,1.0


In [9]:
X.shape

(11715, 15)

#### Training and testing

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [21]:
# Define the column transformer for preprocessing
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['awayTeamCode', 'homeTeamCode']),
    ],
    remainder='passthrough'  # Leave the other columns unchanged
)

In [24]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Store the results
results = {}

for model_name, model in models.items():
    # Create a pipeline with the ColumnTransformer and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('classifier', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Store the results
    results[model_name] = {
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm
    }

# Print the results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")


Model: Random Forest
Accuracy: 0.6041
F1 Score: 0.6003
Precision: 0.6153
Recall: 0.6938
Confusion Matrix:
[[273 272]
 [192 435]]

Model: Logistic Regression
Accuracy: 0.5973
F1 Score: 0.5911
Precision: 0.6054
Recall: 0.7097
Confusion Matrix:
[[255 290]
 [182 445]]

Model: XGBoost
Accuracy: 0.6032
F1 Score: 0.5995
Precision: 0.6147
Recall: 0.6922
Confusion Matrix:
[[273 272]
 [193 434]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.



### Initial Results

All these give weighted f1 scores of ~60%, which is not bad, but let's see if we can fine tune them

### Fine tuning

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define a scoring metric
scorer = make_scorer(f1_score, average='weighted')

# Initialize models and their hyperparameters
models = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10],
            'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none']
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 5, 7],
            'classifier__learning_rate': [0.01, 0.1, 0.2]
        }
    }
}

# Store the results
results = {}

for model_name, config in models.items():
    # Create a pipeline with the ColumnTransformer and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', column_transformer),
        ('classifier', config['model'])
    ])
    
    # Set up GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid=config['params'], scoring=scorer, cv=5)

    # Train the model
    grid_search.fit(X_train, y_train)

    # Make predictions
    y_pred = grid_search.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # Store the results
    results[model_name] = {
        "best_params": grid_search.best_params_,
        "accuracy": accuracy,
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm
    }

# Print the results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {metrics['best_params']}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"Confusion Matrix:\n{metrics['confusion_matrix']}\n")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
75 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the fa

Model: Random Forest
Best Parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}
Accuracy: 0.6152
F1 Score: 0.6009
Precision: 0.6185
Recall: 0.6152
Confusion Matrix:
[[228 317]
 [134 493]]

Model: Logistic Regression
Best Parameters: {'classifier__C': 1, 'classifier__penalty': 'l2'}
Accuracy: 0.5896
F1 Score: 0.5805
Precision: 0.5875
Recall: 0.5896
Confusion Matrix:
[[237 308]
 [173 454]]

Model: XGBoost
Best Parameters: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 100}
Accuracy: 0.6041
F1 Score: 0.5927
Precision: 0.6042
Recall: 0.6041
Confusion Matrix:
[[234 311]
 [153 474]]



Parameters: { "use_label_encoder" } are not used.



This gives us a better score, and is more balanced as opposed to unweighted f1 scores

Final model we can try to use is Random Forest Classifier with parameters 'classifier__max_depth': 10, 'classifier__n_estimators': 100

The best model also yields 

Best found model on both training and testing dataset
F1 Score: 0.6026
Accuracy: 0.6195
Precision: 0.6092
Recall: 0.8054

Accuracy is ~62%