<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Random Forest Models with Basic Feature Set</h4>
    <h5 style="font-weight: bold; font-size: 24px;">Hyperparameter Tuning using Expanding Window</h5>
    <p style="font-size: 20px;">NBA API Seasons 2021-22 to 2023-24</p>
</div>

<a name="Models"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

[Inspect Expanding Training Window](#Inspect-Training-Windows)

**[1. Target: Total Points (over / under)](#1.-Target:-Total-Points-(over-/-under))**
  
**[2. Target: Difference in Points (plus / minus)](#2.-Target:-Difference-in-Points-(plus-/-minus))**

**[3. Target: Game Winner (moneyline)](#3.-Target:-Game-Winner-(moneyline))**

# Setup

[Return to top](#Models)

In [1]:
# basic modules
import os
import time
import random as rn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from functools import reduce
import itertools
import json

# plotting style
plt.style.use('seaborn-v0_8-notebook')
sns.set_style('white')
#sns.set_style('darkgrid')

# pandas tricks for better display
pd.options.display.max_columns = 50  
pd.options.display.max_rows = 500     
pd.options.display.max_colwidth = 100
pd.options.display.precision = 3

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
 
# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
  BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor,
  AdaBoostClassifier, GradientBoostingClassifier
)
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor

# metrics & utilities 
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import (
  accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score,  
  roc_curve, RocCurveDisplay, auc, average_precision_score, precision_recall_curve, 
  PrecisionRecallDisplay, precision_score, recall_score, f1_score, mean_squared_error
)
from sklearn.utils import resample, class_weight

# variable importance
import shap
from shap.explainers import Tree
from lime.lime_tabular import LimeTabularExplainer
from sklearn.inspection import permutation_importance

# warnings
import warnings
warnings.filterwarnings("ignore")

# user defined functions
import utility_functions as utl

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Data

[Return to top](#Models)

Data splits:

- Define NBA Season 2021-22 as the TRAINING set: regular season is 2021-10-19 to 2022-04-10. 
- Define NBA Season 2022-23 as the VALIDATION set: regular season is 2022-10-18 to 2023-04-09.
- Define NBA Season 2023-24 as the TESTING set: regular season is 2023-10-24 to 2024-04-14.

In [2]:
# load, filter (by time) and scale data
pts_scaled_df, pm_scaled_df, res_scaled_df = utl.load_and_scale_data(
    file_path='../data/processed/nba_team_matchups_rolling_box_scores_2022_2024_r05.csv',
    seasons_to_keep=['2021-22', '2022-23', '2023-24'], 
    training_season='2021-22',
    scaler_type='minmax', 
    scale_target=False
)

Season 2021-22: 1186 games
Season 2022-23: 1181 games
Season 2023-24: 692 games
Total number of games across sampled seasons: 3059 games


In [3]:
# define number of games in seasons
season_22_ngames = 1186
season_23_ngames = 1181

In [4]:
pts_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,TOTAL_PTS
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,226
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,229
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,214
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,220
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,240


In [5]:
pm_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,PLUS_MINUS
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,18.0
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,7.0
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,-18.0
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,20.0
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,18.0


In [6]:
res_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,GAME_RESULT
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,1
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,1
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,0
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,1
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,1


# Inspect Expanding Training Window

[Return to top](#Models)

In [7]:
# expanding window configuration
initial_train_size = 10  # starting size of the training set
test_size = 1            # leave-one-out (LOO) cross-validation

counter = 0
max_splits_to_show = 15

# show first few splits
for train_indices, test_indices in utl.expanding_window_ts_split(pts_scaled_df, initial_train_size, test_size=test_size):
    print("TRAIN:", train_indices, "TEST:", test_indices)
    counter += 1
    if counter >= max_splits_to_show:
        break

TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10] TEST: [11]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] TEST: [13]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] TEST: [14]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] TEST: [15]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16] TEST: [17]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18] TEST: [19]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20] TEST: [21]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9

<a name="1.-Target:-Total-Points-(over-/-under)"></a>
# 1. Target: Total Points (over / under)

[Return to top](#Models)

In [8]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50      # maximum number of new training observations in expansion
test_size = 1             # leave-one-out (LOO) cross-validation
df = pts_scaled_df        # data set to use
target_col = 'TOTAL_PTS'  # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'n_estimators': 500
}

# parameters to explore
explore_params = {
    'max_depth': [15, 20],                # tried: 10, 15, 20, 25
    'min_samples_split': [2, 4],          # tried: 2, 4, 6
    'min_samples_leaf': [1, 2],           # tried: 1, 2
    'max_features': [0.3, 0.5],           # tried: 0.3, 0.5
    'min_impurity_decrease': [0.1, 0.3]   # tried: 0.1, 0.3
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = RandomForestRegressor(**constant_params, **explore_param)

    # train over expanding window
    model_outputs, y_true = utl.train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3, 'min_impurity_decrease': 0.1}
Total time taken: 25.45 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3, 'min_impurity_decrease': 0.3}
Total time taken: 24.75 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'min_impurity_decrease': 0.1}
Total time taken: 31.63 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'min_impurity_decrease': 0.3}
Total time taken: 30.83 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 0.3, 'min_impurity_decrease': 0.1}
Total time taken: 24.43 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_feat

In [None]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_rmse', ascending=True).head()

In [10]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_rmse')

# save the dictionary to a file
with open('../hyperparameters/RF_pts_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)

<a name="2.-Target:-Difference-in-Points-(plus-/-minus)"></a>
# 2. Target: Difference in Points (plus / minus)

[Return to top](#Models)

In [11]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50      # maximum number of new training observations in expansion
test_size = 1             # leave-one-out (LOO) cross-validation
df = pm_scaled_df         # data set to use
target_col = 'PLUS_MINUS' # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'n_estimators': 500
}

# parameters to explore
explore_params = {
    'max_depth': [15, 20],                # tried: 10, 15, 20, 25
    'min_samples_split': [2, 4],          # tried: 2, 4, 6
    'min_samples_leaf': [1, 2],           # tried: 1, 2
    'max_features': [0.3, 0.5],           # tried: 0.3, 0.5
    'min_impurity_decrease': [0.1, 0.3]   # tried: 0.1, 0.3
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = RandomForestRegressor(**constant_params, **explore_param)
    
    # train over expanding window
    model_outputs, y_true = utl.train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3, 'min_impurity_decrease': 0.1}
Total time taken: 25.60 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3, 'min_impurity_decrease': 0.3}
Total time taken: 24.52 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'min_impurity_decrease': 0.1}
Total time taken: 30.76 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.5, 'min_impurity_decrease': 0.3}
Total time taken: 30.04 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 0.3, 'min_impurity_decrease': 0.1}
Total time taken: 23.93 seconds
Parameters currently explored: {'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_feat

In [12]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_rmse', ascending=True).head()

Unnamed: 0,run_id,average_rmse,max_depth,max_features,min_impurity_decrease,min_samples_leaf,min_samples_split
2,run_2,12.02,15,0.5,0.1,1,2
30,run_30,12.024,20,0.5,0.1,2,4
22,run_22,12.024,20,0.5,0.1,2,2
6,run_6,12.148,15,0.5,0.1,2,2
14,run_14,12.148,15,0.5,0.1,2,4


In [13]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_rmse')

# save the dictionary to a file
with open('../hyperparameters/RF_pm_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)

<a name="3.-Target:-Game-Winner-(moneyline)"></a>
# 3. Target: Game Winner (moneyline)

[Return to top](#Models)

In [14]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50       # maximum number of new training observations in expansion
test_size = 1              # leave-one-out (LOO) cross-validation
df = res_scaled_df         # data set to use
target_col = 'GAME_RESULT' # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'n_estimators': 500,
    'max_features': 'sqrt'
}

# parameters to explore
explore_params = {
    'criterion': ['gini', 'entropy'],     # tried: 'gini', 'entropy'
    'max_depth': [15, 20],                # tried: 10, 15, 20, 25
    'min_samples_split': [2, 4],          # tried: 2, 4, 6
    'min_samples_leaf': [1, 2],           # tried: 1, 2
    'min_impurity_decrease': [0.1, 0.3]   # tried: 0.1, 0.3
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = RandomForestClassifier(**constant_params, **explore_param)
    
    # train over expanding window
    model_outputs, y_true = utl.train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1}
Total time taken: 17.40 seconds
Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.3}
Total time taken: 17.29 seconds
Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.1}
Total time taken: 17.36 seconds
Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2, 'min_impurity_decrease': 0.3}
Total time taken: 17.78 seconds
Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.1}
Total time taken: 16.90 seconds
Parameters currently explored: {'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 4, 'min_sample

In [15]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_accuracy', ascending=False).head()

Unnamed: 0,run_id,average_accuracy,average_f1_score,criterion,max_depth,min_impurity_decrease,min_samples_leaf,min_samples_split,overall_auc,pred_labels
0,run_0,0.64,0.78,gini,15,0.1,1,2,0.408,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
1,run_1,0.64,0.78,gini,15,0.3,1,2,0.408,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
30,run_30,0.64,0.78,entropy,20,0.1,2,4,0.408,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
29,run_29,0.64,0.78,entropy,20,0.3,1,4,0.408,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
28,run_28,0.64,0.78,entropy,20,0.1,1,4,0.408,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."


In [16]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_accuracy')

# save the dictionary to a file
with open('../hyperparameters/RF_res_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)