In [1]:
from itertools import product

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

#### SPLIT X / y

In [None]:
# Separate the predictor columns from the regression target.
X = train_df.drop(columns="Item_Outlet_Sales")
y = train_df["Item_Outlet_Sales"]

#### ENCODING

In [None]:
# Object-dtype columns are one-hot encoded; every other column is passed
# through unchanged.
cat_cols = X.select_dtypes(include="object").columns
# Pass-through columns, by name and in training order. Selecting them by name
# from both frames keeps the train and test matrices column-aligned even if
# test_df has its columns in a different order or carries extra columns.
passthrough_cols = X.columns.drop(cat_cols)

# Prefer the `sparse_output` keyword and fall back to the older `sparse`
# keyword when the installed OneHotEncoder does not accept it.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# The encoder is fitted on the training frame only; categories that appear
# only in test_df are ignored (handle_unknown="ignore").
encoder.fit(X[cat_cols])

X_final = np.hstack([
    X[passthrough_cols].to_numpy(),
    encoder.transform(X[cat_cols])
])

# Raises KeyError if test_df is missing any training column, instead of
# silently producing a matrix whose columns mean something different.
test_final = np.hstack([
    test_df[passthrough_cols].to_numpy(),
    encoder.transform(test_df[cat_cols])
])

#### CV RMSE FUNCTION

In [None]:
def cv_rmse(model, X, y, folds=5):
    """Return the mean RMSE of ``model`` over a shuffled K-fold split.

    A fresh clone of ``model`` is fitted on each fold, so the estimator
    passed in is never fitted or modified by this function and no fitted
    state is carried from one fold to the next.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)`` that can be cloned
        with ``sklearn.base.clone``.
    X : numpy.ndarray or pandas.DataFrame
        Feature matrix; rows are selected by position.
    y : numpy.ndarray or pandas.Series
        Target values; rows are selected by position.
    folds : int, default 5
        Number of K-fold splits.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.

    Notes
    -----
    The split is shuffled with the notebook-level ``RANDOM_STATE`` seed.
    """
    def _rows(data, idx):
        # pandas objects need .iloc for positional selection; arrays can be
        # indexed with the integer index array directly.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    kf = KFold(n_splits=folds, shuffle=True, random_state=RANDOM_STATE)
    rmses = []

    for tr, val in kf.split(X):
        fold_model = clone(model)
        fold_model.fit(_rows(X, tr), _rows(y, tr))
        preds = fold_model.predict(_rows(X, val))
        rmses.append(
            np.sqrt(mean_squared_error(_rows(y, val), preds))
        )

    return np.mean(rmses)

#### STAGE-WISE HYPERPARAMETER TUNING (HistGB)

In [None]:
def _run_search_stage(grid, base_params, best_rmse, best_params):
    """Evaluate every combination in ``grid`` and return the best result so far.

    Parameters
    ----------
    grid : dict
        Maps a hyperparameter name to the list of values to try.
    base_params : dict
        Fixed hyperparameters merged under every grid combination; grid
        values win when a key appears in both.
    best_rmse : float
        Best CV RMSE found before this stage.
    best_params : dict or None
        Parameters that produced ``best_rmse``.

    Returns
    -------
    tuple
        ``(best_rmse, best_params)`` after this stage. Both are returned
        unchanged when no combination scores strictly lower than the
        incoming ``best_rmse``.
    """
    for values in product(*grid.values()):
        params = {**base_params, **dict(zip(grid.keys(), values))}
        model = HistGradientBoostingRegressor(random_state=RANDOM_STATE, **params)
        rmse = cv_rmse(model, X_final, y)
        print(f"{params} -> RMSE: {rmse:.2f}")

        # Strict "<" keeps the earlier candidate on ties.
        if rmse < best_rmse:
            best_rmse, best_params = rmse, params

    return best_rmse, best_params


print("\nStage 1: Coarse Search")

stage1_grid = {
    "learning_rate": [0.02, 0.04, 0.06],
    "max_depth": [5, 7, 9],
    "min_samples_leaf": [20, 30, 40]
}

best_rmse, best_params = _run_search_stage(stage1_grid, {}, np.inf, None)

# Without a stage-1 winner the refined grid below cannot be built.
if best_params is None:
    raise RuntimeError("Stage 1 found no candidate with a valid CV RMSE")

print("\nStage 2: Refined Search")

# Refine around the stage-1 winner: +/-20% on the learning rate and leaf size,
# +/-1 on the tree depth. The grid is built once, before the stage runs.
stage2_grid = {
    "learning_rate": [
        best_params["learning_rate"] * 0.8,
        best_params["learning_rate"],
        best_params["learning_rate"] * 1.2
    ],
    "max_depth": [
        best_params["max_depth"] - 1,
        best_params["max_depth"],
        best_params["max_depth"] + 1
    ],
    # round() rather than int(): the scaled value is a float, and truncation
    # would drop a whole unit if it landed just below an integer.
    "min_samples_leaf": [
        round(best_params["min_samples_leaf"] * 0.8),
        best_params["min_samples_leaf"],
        round(best_params["min_samples_leaf"] * 1.2)
    ]
}

best_rmse, best_params = _run_search_stage(stage2_grid, {}, best_rmse, best_params)

print("\nStage 3: Regularization Search")

stage3_grid = {
    "l2_regularization": [0.0, 0.1, 0.3],
    "max_bins": [200, 255]
}

# Stage 3 layers the regularization settings on top of the stage-2 winner.
# final_params falls back to a copy of best_params if nothing improves.
best_rmse, final_params = _run_search_stage(
    stage3_grid, best_params, best_rmse, best_params.copy()
)


#### TRAIN FINAL MODEL

In [None]:
# Summarise the selected configuration and its cross-validated score.
print(
    "\n==============================",
    "FINAL MODEL (OPTIMIZED)",
    "==============================",
    f"Parameters: {final_params}",
    f"CV RMSE   : {best_rmse:.2f}",
    sep="\n",
)

# Refit on the full training matrix using the tuned hyperparameters.
final_model = HistGradientBoostingRegressor(**final_params, random_state=RANDOM_STATE)

final_model.fit(X_final, y)