In [2]:
import numpy as np
import joblib
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def hist_gradient_boosting_tuning(X_train, y_train, X_test, y_test,
                                  param_grid=None, cv=5, scoring='accuracy',
                                  n_jobs=-1, verbose=2):
    """Grid-search a HistGradientBoostingClassifier and score it on a test set.

    An exhaustive ``GridSearchCV`` is run over ``param_grid`` on the training
    data; the best estimator found is then used to predict ``X_test`` and the
    resulting accuracy is printed alongside the best parameters and the best
    cross-validation score.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, y_test
        Hold-out features and labels used only for the final accuracy print.
    param_grid : dict, optional
        Mapping of HistGradientBoostingClassifier parameter names to the list
        of values to try. ``None`` (default) uses the built-in grid below
        (3 * 3 * 4 * 3 * 3 = 324 candidates). Pass a smaller grid for a quick
        run.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Single metric forwarded to ``GridSearchCV``. It must be a single
        metric because ``best_estimator_`` is read afterwards.
    n_jobs : int, default -1
        Parallelism forwarded to ``GridSearchCV`` (-1 uses all cores).
    verbose : int, default 2
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, best_params)`` - the best estimator found by the search
        and the parameter dict that produced it.
    """
    if param_grid is None:
        # Default search space; built per call so callers can never mutate a
        # shared default.
        param_grid = {
            'learning_rate': [0.01, 0.1, 0.5],
            'max_iter': [100, 200, 500],
            'max_depth': [3, 5, 7, None],
            'min_samples_leaf': [1, 5, 10],
            'l2_regularization': [0, 0.1, 1.0]
        }

    # Fixed random_state keeps the base estimator's seed identical across runs.
    grid_search = GridSearchCV(
        estimator=HistGradientBoostingClassifier(random_state=42),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=verbose,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    print("Best Hyperparameters:", grid_search.best_params_)
    print("\nBest Cross-Validation Score:", grid_search.best_score_)

    # Hold-out accuracy, reported as a percentage.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # The fitted model is returned so the caller can produce a classification
    # report / confusion matrix or persist it (e.g. with joblib) as needed.
    return best_model, grid_search.best_params_

In [4]:
def main():
    """Load the preprocessed arrays, normalise the test split and run tuning.

    Reads the arrays stored in ``processed_data.npz`` and the pickled
    transformer, fits the transformer on ``X_train``, transforms ``X_test``
    with it, and calls ``hist_gradient_boosting_tuning`` with
    ``x_tr_resample`` / ``y_tr_resample`` as training data and the transformed
    test features as the hold-out set.
    """
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; read every array inside the context manager so the handle is
    # released afterwards (the extracted arrays stay valid).
    with np.load('../../Data/#1/processed_data.npz') as data:
        x_tr_resample = data['x_tr_resample']
        y_tr_resample = data['y_tr_resample']
        X_test = data['X_test']
        y_test = data['y_test']
        X_train = data['X_train']

    # SECURITY: pickle.load can execute arbitrary code; only load this file
    # if it comes from a trusted source.
    with open('../../Data/#1/power_transformer.pkl', 'rb') as f:
        norm = pickle.load(f)

    # Load normalization transformer
    # norm = joblib.load('../../Data/#1/power_transformer.joblib')

    # Fit the transformer on the training split; the transformed training
    # matrix itself is not used below, so it is not kept.
    # NOTE(review): this re-fits the transformer that was just unpickled,
    # replacing whatever state it was saved with - confirm that is intended.
    norm.fit_transform(X_train)
    norm_test_feature = norm.transform(X_test)

    # NOTE(review): the search trains on x_tr_resample while the test split is
    # transformed by `norm`; presumably x_tr_resample was normalised the same
    # way before being saved - verify against the preprocessing notebook.
    best_model, best_params = hist_gradient_boosting_tuning(
        x_tr_resample,
        y_tr_resample,
        norm_test_feature,
        y_test
    )

    print("\nHyperparameter Tuning Complete!")

In [5]:
# Entry point: run the full load -> normalise -> tune pipeline when this cell
# (or the exported script) executes as the main module.
if __name__ == "__main__":
    main()

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Hyperparameters: {'l2_regularization': 0, 'learning_rate': 0.1, 'max_depth': None, 'max_iter': 500, 'min_samples_leaf': 10}

Best Cross-Validation Score: 0.9900245298446444
Accuracy: 98.07%

Hyperparameter Tuning Complete!
