##### Hyperparameter Tuning

Here I tune the hyperparameters of the models on the balanced, unscaled dataset, since that was determined to be the best-performing dataset.

Import required packages and libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

import warnings

Define the evaluate_model function as done in Model training

In [3]:
def evaluate_model(true, predicted, model, X_test):
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    true : array-like
        Ground-truth labels for the split.
    predicted : array-like
        Hard class predictions for the same rows.
    model : estimator
        Fitted classifier; must expose ``predict_proba``.
    X_test : array-like
        Features of the split, passed to ``model.predict_proba``.

    Returns
    -------
    tuple
        ``(precision, recall, f1, roc_auc, pr_auc)`` in that order.
    """
    # Label-based metrics; zero_division=0 makes an undefined ratio score 0.
    label_scores = tuple(
        metric(true, predicted, zero_division=0)
        for metric in (precision_score, recall_score, f1_score)
    )

    # Score-based metrics use column index 1 of predict_proba
    # (the second class in the model's class ordering).
    class1_proba = model.predict_proba(X_test)[:, 1]
    ranking_scores = tuple(
        metric(true, class1_proba)
        for metric in (roc_auc_score, average_precision_score)
    )

    return label_scores + ranking_scores

Define the `no_scale_classification_hyperparam_tuning` function, based on the `no_scale_classification` function defined in Model training

The following hyperparameter search spaces are used:

Decision Tree
* 'max_depth': [3, 5, 7, 10, 15, 20, None]
* 'min_samples_split': [2, 5, 10, 20]
* 'min_samples_leaf': [1, 2, 4, 8]
* 'criterion': ['gini', 'entropy']
* 'max_features': ['sqrt', 'log2', None]
* 'class_weight': ['balanced', None]

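Random Forest
* 'n_estimators': [50, 100, 200, 300]
* 'max_depth': [5, 10, 15, 20, None]
* 'min_samples_split': [2, 5, 10]
* 'min_samples_leaf': [1, 2, 4]
* 'max_features': ['sqrt', 'log2']
* 'class_weight': ['balanced', 'balanced_subsample', None]
* 'bootstrap': [True, False]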

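XGBoost
* 'n_estimators': [50, 100, 200, 300]
* 'max_depth': [3, 5, 7, 9]
* 'learning_rate': [0.01, 0.05, 0.1, 0.2]
* 'subsample': [0.6, 0.8, 1.0]
* 'colsample_bytree': [0.6, 0.8, 1.0]
* 'gamma': [0, 0.1, 0.5, 1]
* 'min_child_weight': [1, 3, 5]
* 'scale_pos_weight': [1, 2, 3]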

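CatBoost
* 'iterations': [50, 100, 200, 300]
* 'depth': [4, 6, 8, 10]
* 'learning_rate': [0.01, 0.05, 0.1, 0.2]
* 'l2_leaf_reg': [1, 3, 5, 7]
* 'border_count': [32, 64, 128]
* 'class_weights': [[1, 1], [1, 2], [1, 3]]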

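AdaBoost
* 'n_estimators': [50, 100, 200, 300]
* 'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
* 'algorithm': ['SAMME', 'SAMME.R']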

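Gradient Boosting
* 'n_estimators': [50, 100, 200, 300]
* 'max_depth': [3, 5, 7, 9]
* 'learning_rate': [0.01, 0.05, 0.1, 0.2]
* 'subsample': [0.6, 0.8, 1.0]
* 'min_samples_split': [2, 5, 10]
* 'min_samples_leaf': [1, 2, 4]
* 'max_features': ['sqrt', 'log2', None]
* 'loss': ['log_loss', 'exponential']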

In [4]:
def _as_1d_target(y):
    """Return a single-column target as a flat 1-D array; pass anything else through.

    A one-column DataFrame (or an ``(n, 1)`` array) handed to ``fit`` makes
    scikit-learn emit a DataConversionWarning, so it is flattened once here.
    """
    values = np.asarray(y)
    if values.ndim == 2 and values.shape[1] == 1:
        return values.ravel()
    return y


def no_scale_classification_hyperparam_tuning(X_train, y_train, X_test, y_test,
                                              n_iter=50, cv=5, scoring='roc_auc'):
    """Tune six tree-based classifiers with randomized search and compare them.

    For every model in the search-space table below, a ``RandomizedSearchCV``
    is fitted on the training split, the refitted best estimator is scored on
    the test split, and three summaries are printed: all metrics, the best
    hyperparameters per model, and the top five models by test ROC AUC.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the two splits. A single-column DataFrame or ``(n, 1)``
        array is flattened to 1-D before use.
    n_iter : int, default=50
        Number of parameter settings sampled per model.
    cv : int, default=5
        Number of cross-validation folds used by the search.
    scoring : str, default='roc_auc'
        Metric the search optimises when selecting the best setting.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Train Accuracy``,
        ``Test Accuracy``, ``Test Precision``, ``Test Recall``, ``Test F1``,
        ``Test ROC AUC``, ``Test PR AUC``, ``Tuned`` and ``Best Params``
        (the best parameter dict rendered as a string).
    """

    # Models with hyperparameter tuning
    tuned_models = {
        "Decision Tree": {
            "model": DecisionTreeClassifier(random_state=42),
            "params": {
                'max_depth': [3, 5, 7, 10, 15, 20, None],
                'min_samples_split': [2, 5, 10, 20],
                'min_samples_leaf': [1, 2, 4, 8],
                'criterion': ['gini', 'entropy'],
                'max_features': ['sqrt', 'log2', None],
                'class_weight': ['balanced', None]
            }
        },
        "Random Forest": {
            "model": RandomForestClassifier(random_state=42),
            "params": {
                'n_estimators': [50, 100, 200, 300],
                'max_depth': [5, 10, 15, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2'],
                'class_weight': ['balanced', 'balanced_subsample', None],
                'bootstrap': [True, False]
            }
        },
        "XGBoost": {
            "model": XGBClassifier(random_state=42, eval_metric='logloss'),
            "params": {
                'n_estimators': [50, 100, 200, 300],
                'max_depth': [3, 5, 7, 9],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'subsample': [0.6, 0.8, 1.0],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'gamma': [0, 0.1, 0.5, 1],
                'min_child_weight': [1, 3, 5],
                'scale_pos_weight': [1, 2, 3]
            }
        },
        "CatBoost": {
            "model": CatBoostClassifier(random_state=42, verbose=False),
            "params": {
                'iterations': [50, 100, 200, 300],
                'depth': [4, 6, 8, 10],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'l2_leaf_reg': [1, 3, 5, 7],
                'border_count': [32, 64, 128],
                'class_weights': [[1, 1], [1, 2], [1, 3]]
            }
        },
        "AdaBoost": {
            "model": AdaBoostClassifier(random_state=42),
            "params": {
                'n_estimators': [50, 100, 200, 300],
                'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
                # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn
                # releases; confirm it is still accepted by the installed version.
                'algorithm': ['SAMME', 'SAMME.R']
            }
        },
        "GradientBoosting": {
            "model": GradientBoostingClassifier(random_state=42),
            "params": {
                'n_estimators': [50, 100, 200, 300],
                'max_depth': [3, 5, 7, 9],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'subsample': [0.6, 0.8, 1.0],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', None],
                'loss': ['log_loss', 'exponential'],
            }
        }
    }

    # Estimators expect a 1-D target; flatten single-column inputs once up front.
    y_train = _as_1d_target(y_train)
    y_test = _as_1d_target(y_test)

    results = []
    # Best parameter dicts kept as real objects so they can be printed later
    # without parsing the string stored in the results table.
    best_params_by_model = {}

    # Train models with hyperparameter tuning
    print("\nTraining models with hyperparameter tuning...")
    for model_name, model_info in tuned_models.items():
        print(f"  - {model_name} (this may take a while...)")

        random_search = RandomizedSearchCV(
            estimator=model_info["model"],
            param_distributions=model_info["params"],
            n_iter=n_iter,
            cv=cv,
            scoring=scoring,
            random_state=42,
            n_jobs=-1,
            verbose=0
        )

        random_search.fit(X_train, y_train)
        best_model = random_search.best_estimator_

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Train accuracy is the only train-side metric that is reported.
        accuracy_train = accuracy_score(y_train, y_train_pred)
        accuracy_test = accuracy_score(y_test, y_test_pred)

        test_precision, test_recall, test_f1, test_roc_auc, test_pr_auc = evaluate_model(
            y_test, y_test_pred, best_model, X_test
        )

        best_params_by_model[model_name] = dict(random_search.best_params_)

        results.append({
            'Model': model_name,
            'Train Accuracy': accuracy_train,
            'Test Accuracy': accuracy_test,
            'Test Precision': test_precision,
            'Test Recall': test_recall,
            'Test F1': test_f1,
            'Test ROC AUC': test_roc_auc,
            'Test PR AUC': test_pr_auc,
            'Tuned': 'Yes',
            'Best Params': str(random_search.best_params_)
        })

    # Create DataFrame
    df_results = pd.DataFrame(results)

    # Display main results table
    print("\n" + "=" * 150)
    print("MODEL COMPARISON - ALL METRICS")
    print("=" * 150)
    display_df = df_results.drop('Best Params', axis=1)
    print(display_df.to_string(index=False, float_format=lambda x: f'{x:.4f}'))

    # Display best parameters for tuned models (every model in this table is tuned)
    print("\n" + "=" * 150)
    print("BEST HYPERPARAMETERS FOR TUNED MODELS")
    print("=" * 150)
    for tuned_name, params in best_params_by_model.items():
        print(f"\n{tuned_name}:")
        for param, value in params.items():
            print(f"  - {param}: {value}")

    # Display sorted by Test ROC AUC
    print("\n" + "=" * 150)
    print("TOP 5 MODELS BY TEST ROC AUC")
    print("=" * 150)
    top_models = df_results.nlargest(5, 'Test ROC AUC')[['Model', 'Test Accuracy', 'Test ROC AUC', 'Test F1', 'Tuned']]
    print(top_models.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
    print("\n")

    return df_results

Load in the unscaled and balanced dataset, with final feature selection performed

In [5]:
# Balanced, unscaled training features after final feature selection.
X_train = pd.read_csv('data/final_X_train.csv')
# Squeeze the single-column label frame to a Series so estimators receive a
# 1-D target; a column-vector y triggers scikit-learn's DataConversionWarning.
y_train = pd.read_csv('data/unscaled_balanced_y_train.csv').squeeze('columns')

Load in the test datasets

In [6]:
# Test split: features first, then the matching labels.
X_test = pd.read_csv('data/final_X_test.csv')
y_test = pd.read_csv('data/y_test.csv')

Run the hyperparam tuning

In [7]:
# Keep the returned summary so the expensive search does not have to be re-run
# to inspect it; the bare name on the last line still renders the table.
tuning_results = no_scale_classification_hyperparam_tuning(X_train, y_train, X_test, y_test)
tuning_results


Training models with hyperparameter tuning...
  - Decision Tree (this may take a while...)
  - Random Forest (this may take a while...)


  return fit_method(estimator, *args, **kwargs)


  - XGBoost (this may take a while...)
  - CatBoost (this may take a while...)
  - AdaBoost (this may take a while...)


  y = column_or_1d(y, warn=True)


  - GradientBoosting (this may take a while...)


  y = column_or_1d(y, warn=True)



MODEL COMPARISON - ALL METRICS
           Model  Train Accuracy  Test Accuracy  Test Precision  Test Recall  Test F1  Test ROC AUC  Test PR AUC Tuned
   Decision Tree          0.8359         0.7962          0.3934       0.5571   0.4611        0.7938       0.4030   Yes
   Random Forest          0.9737         0.8205          0.4276       0.4342   0.4309        0.8016       0.4065   Yes
         XGBoost          0.8636         0.8383          0.4819       0.4384   0.4591        0.8252       0.4577   Yes
        CatBoost          0.8655         0.8387          0.4833       0.4420   0.4618        0.8264       0.4590   Yes
        AdaBoost          0.8455         0.8166          0.4300       0.5281   0.4740        0.8205       0.4452   Yes
GradientBoosting          0.8672         0.8359          0.4745       0.4489   0.4614        0.8245       0.4557   Yes

BEST HYPERPARAMETERS FOR TUNED MODELS

Decision Tree:
  - min_samples_split: 2
  - min_samples_leaf: 8
  - max_features: None
  - max_

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Test Precision,Test Recall,Test F1,Test ROC AUC,Test PR AUC,Tuned,Best Params
0,Decision Tree,0.83591,0.79622,0.393384,0.557109,0.461146,0.793831,0.402966,Yes,"{'min_samples_split': 2, 'min_samples_leaf': 8..."
1,Random Forest,0.97371,0.820482,0.427632,0.434202,0.430892,0.801592,0.406459,Yes,"{'n_estimators': 300, 'min_samples_split': 5, ..."
2,XGBoost,0.86363,0.83834,0.481933,0.438358,0.459114,0.825184,0.457683,Yes,"{'subsample': 1.0, 'scale_pos_weight': 1, 'n_e..."
3,CatBoost,0.865475,0.838714,0.483338,0.44201,0.461751,0.826403,0.459016,Yes,"{'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iter..."
4,AdaBoost,0.845525,0.81656,0.429977,0.528145,0.474032,0.820547,0.445158,Yes,"{'n_estimators': 300, 'learning_rate': 1.0, 'a..."
5,GradientBoosting,0.867205,0.835935,0.474511,0.448936,0.461369,0.824529,0.455708,Yes,"{'subsample': 0.8, 'n_estimators': 300, 'min_s..."


In [None]:
# Saved copy of the printed output from the tuning run above, kept for reference.
# ======================================================================================================================================================
# MODEL COMPARISON - ALL METRICS
# ======================================================================================================================================================
#            Model  Train Accuracy  Test Accuracy  Test Precision  Test Recall  Test F1  Test ROC AUC  Test PR AUC Tuned
#    Decision Tree          0.8359         0.7962          0.3934       0.5571   0.4611        0.7938       0.4030   Yes
#    Random Forest          0.9737         0.8205          0.4276       0.4342   0.4309        0.8016       0.4065   Yes
#          XGBoost          0.8636         0.8383          0.4819       0.4384   0.4591        0.8252       0.4577   Yes
#         CatBoost          0.8655         0.8387          0.4833       0.4420   0.4618        0.8264       0.4590   Yes
#         AdaBoost          0.8455         0.8166          0.4300       0.5281   0.4740        0.8205       0.4452   Yes
# GradientBoosting          0.8672         0.8359          0.4745       0.4489   0.4614        0.8245       0.4557   Yes

# ======================================================================================================================================================
# BEST HYPERPARAMETERS FOR TUNED MODELS
# ======================================================================================================================================================

# Decision Tree:
#   - min_samples_split: 2
#   - min_samples_leaf: 8
#   - max_features: None
#   - max_depth: 15
#   - criterion: gini
#   - class_weight: None

# Random Forest:
#   - n_estimators: 300
#   - min_samples_split: 5
#   - min_samples_leaf: 2
#   - max_features: sqrt
#   - max_depth: None
#   - class_weight: balanced_subsample
#   - bootstrap: False

# XGBoost:
#   - subsample: 1.0
#   - scale_pos_weight: 1
#   - n_estimators: 200
#   - min_child_weight: 1
#   - max_depth: 5
#   - learning_rate: 0.2
#   - gamma: 0.1
#   - colsample_bytree: 0.6

# CatBoost:
#   - learning_rate: 0.1
#   - l2_leaf_reg: 5
#   - iterations: 300
#   - depth: 6
#   - class_weights: [1, 1]
#   - border_count: 128

# AdaBoost:
#   - n_estimators: 300
#   - learning_rate: 1.0
#   - algorithm: SAMME.R

# GradientBoosting:
#   - subsample: 0.8
#   - n_estimators: 300
#   - min_samples_split: 2
#   - min_samples_leaf: 1
#   - max_features: None
#   - max_depth: 5
#   - loss: log_loss
#   - learning_rate: 0.1

# ======================================================================================================================================================
# TOP 5 MODELS BY TEST ROC AUC
# ======================================================================================================================================================
#            Model  Test Accuracy  Test ROC AUC  Test F1 Tuned
#         CatBoost         0.8387        0.8264   0.4618   Yes
#          XGBoost         0.8383        0.8252   0.4591   Yes
# GradientBoosting         0.8359        0.8245   0.4614   Yes
#         AdaBoost         0.8166        0.8205   0.4740   Yes
#    Random Forest         0.8205        0.8016   0.4309   Yes