<div class="alert alert-danger">
    <h4 style="font-weight: bold; font-size: 28px;">Extreme Gradient Boosting with Basic Feature Set</h4>
    <h5 style="font-weight: bold; font-size: 24px;">Hyperparameter Tuning using Expanding Window</h5>
    <p style="font-size: 20px;">NBA API Seasons 2021-22 to 2023-24</p>
</div>

<a name="Models"></a>

# Table of Contents

[Setup](#Setup)

[Data](#Data)

[Inspect Expanding Training Window](#Inspect-Training-Windows)

[Functions](#Functions)

**[1. Target: Total Points (over / under)](#1.-Target:-Total-Points-(over-/-under))**
  
**[2. Target: Difference in Points (plus / minus)](#2.-Target:-Difference-in-Points-(plus-/-minus))**

**[3. Target: Game Winner (moneyline)](#3.-Target:-Game-Winner-(moneyline))**

# Setup

[Return to top](#Models)

In [1]:
# basic modules
import os
import time
import random as rn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from functools import reduce
import itertools
import json

# plotting style
plt.style.use('seaborn-v0_8-notebook')
sns.set_style('white')
#sns.set_style('darkgrid')

# pandas tricks for better display
pd.options.display.max_columns = 50  
pd.options.display.max_rows = 500     
pd.options.display.max_colwidth = 100
pd.options.display.precision = 3

# preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
 
# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
  BaggingClassifier, BaggingRegressor, RandomForestClassifier, RandomForestRegressor,
  AdaBoostClassifier, GradientBoostingClassifier
)
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import StackingClassifier, StackingRegressor

# metrics & utilities 
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import (
  accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score,  
  roc_curve, RocCurveDisplay, auc, average_precision_score, precision_recall_curve, 
  PrecisionRecallDisplay, precision_score, recall_score, f1_score, mean_squared_error
)
from sklearn.utils import resample, class_weight

# variable importance
import shap
from shap.explainers import Tree
from lime.lime_tabular import LimeTabularExplainer
from sklearn.inspection import permutation_importance

# warnings
import warnings
warnings.filterwarnings("ignore")

# user defined functions
import utility_functions as utl

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


# Data

[Return to top](#Models)

Data splits:

- Define NBA Season 2021-22 as the TRAINING set: regular season is 2021-10-19 to 2022-04-10. 
- Define NBA Season 2022-23 as the VALIDATION set: regular season is 2022-10-18 to 2023-04-09.
- Define NBA Season 2023-24 as the TESTING set: regular season is 2023-10-24 to 2024-04-14.

In [2]:
# load, filter (by time) and scale data
pts_scaled_df, pm_scaled_df, res_scaled_df = utl.load_and_scale_data(
    file_path='../data/processed/nba_team_matchups_rolling_box_scores_2022_2024_r05.csv',
    seasons_to_keep=['2021-22', '2022-23', '2023-24'], 
    training_season='2021-22',
    scaler_type='minmax', 
    scale_target=False
)

Season 2021-22: 1186 games
Season 2022-23: 1181 games
Season 2023-24: 692 games
Total number of games across sampled seasons: 3059 games


In [3]:
# define number of games in seasons
season_22_ngames = 1186
season_23_ngames = 1181

In [4]:
pts_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,TOTAL_PTS
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,226
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,229
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,214
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,220
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,240


In [5]:
pm_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,PLUS_MINUS
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,18.0
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,7.0
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,-18.0
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,20.0
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,18.0


In [6]:
res_scaled_df.head()

Unnamed: 0,ROLL_HOME_PTS,ROLL_HOME_FGM,ROLL_HOME_FGA,ROLL_HOME_FG_PCT,ROLL_HOME_FG3M,ROLL_HOME_FG3A,ROLL_HOME_FG3_PCT,ROLL_HOME_FTM,ROLL_HOME_FTA,ROLL_HOME_FT_PCT,ROLL_HOME_OREB,ROLL_HOME_DREB,ROLL_HOME_REB,ROLL_HOME_AST,ROLL_HOME_STL,ROLL_HOME_BLK,ROLL_HOME_TOV,ROLL_HOME_PF,ROLL_AWAY_PTS,ROLL_AWAY_FGM,ROLL_AWAY_FGA,ROLL_AWAY_FG_PCT,ROLL_AWAY_FG3M,ROLL_AWAY_FG3A,ROLL_AWAY_FG3_PCT,ROLL_AWAY_FTM,ROLL_AWAY_FTA,ROLL_AWAY_FT_PCT,ROLL_AWAY_OREB,ROLL_AWAY_DREB,ROLL_AWAY_REB,ROLL_AWAY_AST,ROLL_AWAY_STL,ROLL_AWAY_BLK,ROLL_AWAY_TOV,ROLL_AWAY_PF,GAME_RESULT
1,113.0,45.0,94.0,0.479,15.0,35.0,0.429,8.0,9.0,0.889,6.0,49.0,55.0,31.0,8.0,9.0,13.0,16.0,82.0,34.0,88.0,0.386,5.0,28.0,0.179,9.0,15.0,0.6,11.0,42.0,53.0,14.0,9.0,8.0,20.0,17.0,1
2,117.5,45.5,92.0,0.495,13.5,33.5,0.402,13.0,15.0,0.873,8.0,44.0,52.0,27.5,9.5,6.0,13.0,17.5,101.333,37.333,86.667,0.434,7.667,30.333,0.259,19.0,25.0,0.761,9.333,41.0,50.333,18.0,9.0,6.667,16.0,18.0,1
3,117.667,43.0,89.0,0.483,13.333,33.667,0.395,18.333,19.667,0.915,9.667,40.667,50.333,26.333,9.333,5.667,12.333,17.333,109.5,37.0,85.5,0.435,13.75,40.75,0.336,21.75,24.75,0.886,11.0,40.75,51.75,17.0,8.25,6.0,16.25,17.75,0
4,112.75,41.0,87.25,0.469,11.75,32.25,0.359,19.0,21.25,0.889,8.5,37.25,45.75,24.25,9.75,5.25,11.5,19.0,110.8,41.4,92.2,0.45,16.8,41.8,0.396,11.2,16.0,0.703,12.4,36.0,48.4,23.6,7.4,4.8,12.0,15.8,1
5,114.2,42.2,89.2,0.472,12.4,32.8,0.373,17.4,19.6,0.88,9.8,37.0,46.8,23.6,9.0,4.4,11.6,19.0,105.4,37.0,85.0,0.435,14.6,39.2,0.384,16.8,22.8,0.724,10.4,37.4,47.8,21.8,5.0,5.6,15.8,22.2,1


# Inspect Expanding Training Window

[Return to top](#Models)

In [7]:
# expanding window configuration
initial_train_size = 10  # starting size of the training set
test_size = 1            # leave-one-out (LOO) cross-validation

counter = 0
max_splits_to_show = 15

# show first few splits
for train_indices, test_indices in utl.expanding_window_ts_split(pts_scaled_df, initial_train_size, test_size=test_size):
    print("TRAIN:", train_indices, "TEST:", test_indices)
    counter += 1
    if counter >= max_splits_to_show:
        break

TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10] TEST: [11]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST: [12]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12] TEST: [13]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13] TEST: [14]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14] TEST: [15]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] TEST: [16]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16] TEST: [17]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17] TEST: [18]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18] TEST: [19]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] TEST: [20]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20] TEST: [21]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21] TEST: [22]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] TEST: [23]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9

# Functions

[Return to top](#Models)

For some idiotic reason 'XGBoost' requires the training function to be in the notebook rather than imported, at least while using early stopping. Also, we need to pass in validation set data directly to the `.fit` method, which requires a slight modification to the function in the `utl` library.

In [8]:
def train_with_expanding_window(df, initial_train_size, test_size, target_col, model, ensure_diversity=False, expansion_limit=None):
    """
    Trains a given model using an expanding window approach on a specified DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the features and target variable.
    - initial_train_size (int): The initial size of the training dataset.
    - test_size (int): The size of the test dataset for each split, typically 1 for LOO CV.
    - target_col (str): The name of the target column in `df`.
    - model (model object): The instantiated model to be trained, e.g., LinearRegression() or LogisticRegression().
    - ensure_diversity (bool, optional): For logistic regression, ensures the initial training data includes both classes. Default is False.
    - expansion_limit (int, optional): The maximum number of times the training set is expanded by 1 observation during the expanding window process. This parameter controls the total number of train-test splits generated, indirectly determining the final size of the training set. If set, the training process will stop once this limit is reached, potentially leaving some data unused. If None, the training set will expand until all but the last observation are used for training.

    Returns:
    - model_outputs (list): A list of model predictions or probabilities for the test sets across all splits.
    - y_true (list): A list of the actual target values corresponding to each prediction in `model_outputs`.

    This function iterates over the dataset using an expanding window to create training and test splits, 
    trains the specified `model` on each training split, and stores the model's predictions or probabilities.
    """
    import time
    from xgboost import XGBClassifier, XGBRegressor
    
    start_time = time.time()

    # initialize storage for model outputs and true labels
    model_outputs = []  # store predictions or probabilities
    y_true = []

    for train_indices, test_indices in utl.expanding_window_ts_split(
        df, initial_train_size, test_size=test_size, ensure_diversity=ensure_diversity, 
        target_col=target_col if ensure_diversity else None, expansion_limit=expansion_limit):
        
        # get training and testing data for this window
        X_train = df.iloc[train_indices].drop(columns=target_col)
        y_train = df.iloc[train_indices][target_col]
        X_test = df.iloc[test_indices].drop(columns=target_col)
        y_test = df.iloc[test_indices][target_col]
        
        # train the model
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) # for XGBoost
        
        # check if the model has the predict_proba method (i.e., likely a classifier)
        if hasattr(model, 'predict_proba'):
            # store predicted probabilities of the positive class
            proba = model.predict_proba(X_test)[:, 1]
            model_outputs.extend(proba)
        elif hasattr(model, 'predict'):
            # for models that support predict (regressors and classifiers without predict_proba)
            predictions = model.predict(X_test)
            model_outputs.extend(predictions)
        else:
            raise ValueError("Model does not support required prediction methods.")

        # store true labels for evaluation
        y_true.extend(y_test)

    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

    return model_outputs, y_true

<a name="1.-Target:-Total-Points-(over-/-under)"></a>
# 1. Target: Total Points (over / under)

[Return to top](#Models)

In [9]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50      # maximum number of new training observations in expansion
test_size = 1             # leave-one-out (LOO) cross-validation
df = pts_scaled_df        # data set to use
target_col = 'TOTAL_PTS'  # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'eval_metric': 'rmse',
    'early_stopping_rounds': 20
}

# parameters to explore
explore_params = {
    'booster': ['gbtree'],              # tried: 'gbtree', 'gblinear', 'dart'
    'learning_rate': [0.5, 1.0, 10],    # tried: 0.001, 0.01, 0.1, 0.5, 1.0
    'max_depth': [2, 4, 6],             # tried: 1, 2, 3, 4
    'alpha': [1, 2],                    # tried: 0.1, 1, 2
    'lambda': [1, 5, 10],               # tried: 0.1, 1, 2, 5, 10
    'gamma': [1, 5, 10]                 # tried: 0.1, 1, 2, 5, 10
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = XGBRegressor(**constant_params, **explore_param)
    
    # train over expanding window
    model_outputs, y_true = train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 1}
Total time taken: 2.52 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 5}
Total time taken: 2.52 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 10}
Total time taken: 2.53 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 1}
Total time taken: 2.37 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 5}
Total time taken: 2.55 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 10}
Total time taken: 2.64 seconds
Parameters currently explored: {'booster': '

In [10]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_rmse', ascending=True).head()

Unnamed: 0,run_id,alpha,average_rmse,booster,gamma,lambda,learning_rate,max_depth
94,run_94,1,10.519,gbtree,5,5,1.0,6
73,run_73,1,10.959,gbtree,5,1,1.0,4
74,run_74,1,11.172,gbtree,10,1,1.0,4
92,run_92,1,11.176,gbtree,10,1,1.0,6
90,run_90,1,11.287,gbtree,1,1,1.0,6


In [11]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_rmse')

# save the dictionary to a file
with open('../hyperparameters/XGB_pts_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)

<a name="2.-Target:-Difference-in-Points-(plus-/-minus)"></a>
# 2. Target: Difference in Points (plus / minus)

[Return to top](#Models)

In [12]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50      # maximum number of new training observations in expansion
test_size = 1             # leave-one-out (LOO) cross-validation
df = pm_scaled_df         # data set to use
target_col = 'PLUS_MINUS' # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'objective': 'reg:squarederror',
    'n_estimators': 1000,
    'eval_metric': 'rmse',
    'early_stopping_rounds': 20
}

# parameters to explore
explore_params = {
    'booster': ['gbtree'],              # tried: 'gbtree', 'gblinear', 'dart'
    'learning_rate': [0.5, 1.0, 10],    # tried: 0.001, 0.01, 0.1, 0.5, 1.0
    'max_depth': [2, 4, 6],             # tried: 1, 2, 3, 4
    'alpha': [1, 2],                    # tried: 0.1, 1, 2
    'lambda': [1, 5, 10],               # tried: 0.1, 1, 2, 5, 10
    'gamma': [1, 5, 10]                 # tried: 0.1, 1, 2, 5, 10
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = XGBRegressor(**constant_params, **explore_param)
    
    # train over expanding window
    model_outputs, y_true = train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 1}
Total time taken: 1.95 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 5}
Total time taken: 1.96 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 10}
Total time taken: 1.97 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 1}
Total time taken: 2.36 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 5}
Total time taken: 2.51 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 10}
Total time taken: 2.23 seconds
Parameters currently explored: {'booster': '

In [13]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_rmse', ascending=True).head()

Unnamed: 0,run_id,alpha,average_rmse,booster,gamma,lambda,learning_rate,max_depth
76,run_76,1,7.216,gbtree,5,5,1.0,4
77,run_77,1,7.541,gbtree,10,5,1.0,4
73,run_73,1,7.667,gbtree,5,1,1.0,4
74,run_74,1,7.685,gbtree,10,1,1.0,4
72,run_72,1,7.767,gbtree,1,1,1.0,4


In [14]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_rmse')

# save the dictionary to a file
with open('../hyperparameters/XGB_pm_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)

<a name="3.-Target:-Game-Winner-(moneyline)"></a>
# 3. Target: Game Winner (moneyline)

[Return to top](#Models)

In [None]:
# configuration for expanding window
initial_train_size = season_22_ngames # starting size of the training set
expansion_limit = 50       # maximum number of new training observations in expansion
test_size = 1              # leave-one-out (LOO) cross-validation
df = res_scaled_df         # data set to use
target_col = 'GAME_RESULT' # target column name

# constant parameters
constant_params = {
    'random_state': 599,
    'n_jobs': -1,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'eval_metric': 'error', # 'auc'
    'early_stopping_rounds': 20
}

# parameters to explore
explore_params = {
    'booster': ['gbtree'],              # tried: 'gbtree', 'gblinear', 'dart'
    'learning_rate': [0.5, 1.0, 10],    # tried: 0.001, 0.01, 0.1, 0.5, 1.0
    'max_depth': [2, 4, 6],             # tried: 1, 2, 3, 4
    'alpha': [1, 2],                    # tried: 0.1, 1, 2
    'lambda': [1, 5, 10],               # tried: 0.1, 1, 2, 5, 10
    'gamma': [1, 5, 10]                 # tried: 0.1, 1, 2, 5, 10
}

# dict to store results
results = {}

# generate all combinations of hyperparameters to explore
keys, values = zip(*explore_params.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

start_time = time.time()

# iterate over each combination of hyperparameters
for i, explore_param in enumerate(param_combinations):

    print('Parameters currently explored:', explore_param)
    
    # instantiate the model with combined parameters
    model = XGBClassifier(**constant_params, **explore_param)
    
    # train over expanding window
    model_outputs, y_true = train_with_expanding_window(
        df=df,  
        initial_train_size=initial_train_size, 
        expansion_limit=expansion_limit,
        test_size=test_size,           
        target_col=target_col, 
        model=model
    )
    
    # store outputs and true values in the results dictionary
    results[f"run_{i}"] = {
        "params": {**explore_param},
        "model_outputs": model_outputs,
        "y_true": y_true
    }

end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 1}
Total time taken: 1.51 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 5}
Total time taken: 1.42 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 1, 'gamma': 10}
Total time taken: 1.41 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 1}
Total time taken: 1.45 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 5}
Total time taken: 1.44 seconds
Parameters currently explored: {'booster': 'gbtree', 'learning_rate': 0.5, 'max_depth': 2, 'alpha': 1, 'lambda': 5, 'gamma': 10}
Total time taken: 1.40 seconds
Parameters currently explored: {'booster': '

In [None]:
# get metrics for each combination of parameter values
results_df = utl.compile_results_to_dataframe(results)

# print best hyperparameter settings
results_df.sort_values(by='average_accuracy', ascending=False).head()

In [None]:
# get best parameters from validation as dictionary
best_params = utl.get_best_params(results_df, metric='average_accuracy')

# save the dictionary to a file
with open('../hyperparameters/XGB_res_best_params.json', 'w') as json_file:
    json.dump(best_params, json_file, default=utl.handle_non_serializable, indent=4)