In [9]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset
from sklearn.datasets import fetch_openml

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Fetch OpenML dataset 4538 as pandas objects and separate features from target.
dataset = fetch_openml(data_id=4538, as_frame=True)
X, y = dataset.data, dataset.target

# Sanity check: size of the feature matrix and number of distinct target labels.
n_classes = len(y.unique())
print("Dataset shape:", X.shape)
print("Number of classes:", n_classes)

Dataset shape: (9873, 32)
Number of classes: 5


In [11]:
# Replace missing feature values with the per-column median, then rebuild a
# DataFrame so the original column names are kept.
# NOTE(review): the imputer is fitted on every row of X, before the train/test
# split in a later cell, so test rows contribute to the medians. Fitting on the
# training split only would avoid that leakage -- confirm whether it matters here.
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)

print("Missing values handled.")

Missing values handled.


In [12]:
# Map the target labels to consecutive integer codes; the fitted encoder is
# kept so the codes can be translated back to the original labels later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Show the distinct integer codes that were produced.
encoded_classes = np.unique(y_encoded)
print("Encoded classes:", encoded_classes)

Encoded classes: [0 1 2 3 4]


In [13]:
# Hold out 30% of the rows for testing. Stratifying on the encoded labels keeps
# the class proportions approximately equal in both partitions; the fixed seed
# makes the split repeatable.
split_options = dict(test_size=0.3, stratify=y_encoded, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (6911, 32)
Test shape: (2962, 32)


In [14]:
# Baseline: gradient boosting with default hyperparameters (only the seed is set).
# fit() returns the estimator itself, so construction and training are chained.
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split.
gb_preds = gb_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, gb_preds)
gb_baseline_report = classification_report(y_test, gb_preds)

print("Gradient Boosting Baseline Accuracy:", baseline_accuracy)
print(gb_baseline_report)

Gradient Boosting Baseline Accuracy: 0.5793382849426063
              precision    recall  f1-score   support

           0       0.62      0.82      0.71       822
           1       0.53      0.30      0.38       300
           2       0.54      0.45      0.49       629
           3       0.54      0.36      0.43       326
           4       0.57      0.62      0.59       885

    accuracy                           0.58      2962
   macro avg       0.56      0.51      0.52      2962
weighted avg       0.57      0.58      0.57      2962



In [15]:
# Candidate hyperparameter values for gradient boosting.
param_dist_gb = dict(
    n_estimators=[100, 150, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 4],
    subsample=[0.8, 1.0],
)

# Randomized search over the grid above: 2 sampled candidates, each scored with
# 2-fold cross-validation on the training split only, using all CPU cores.
# NOTE(review): 2 candidates x 2 folds is a very small search -- presumably
# chosen to limit runtime; confirm before drawing conclusions from it.
random_search_gb = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_dist_gb,
    n_iter=2,
    cv=2,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# The winning candidate, as refit by the search (scikit-learn's default refit=True).
best_gb_model = random_search_gb.best_estimator_

# Evaluate the tuned model on the held-out split.
gb_tuned_preds = best_gb_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, gb_tuned_preds)
gb_tuned_report = classification_report(y_test, gb_tuned_preds)

print("Gradient Boosting Tuned Accuracy:", tuned_accuracy)
print(gb_tuned_report)

Gradient Boosting Tuned Accuracy: 0.6090479405806887
              precision    recall  f1-score   support

           0       0.65      0.82      0.73       822
           1       0.55      0.39      0.46       300
           2       0.55      0.46      0.50       629
           3       0.58      0.44      0.50       326
           4       0.61      0.65      0.63       885

    accuracy                           0.61      2962
   macro avg       0.59      0.55      0.56      2962
weighted avg       0.60      0.61      0.60      2962



In [21]:
# Baseline: extra-trees ensemble with default hyperparameters (only the seed is set).
# fit() returns the estimator itself, so construction and training are chained.
et_model = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split.
et_preds = et_model.predict(X_test)
et_baseline_accuracy = accuracy_score(y_test, et_preds)
et_baseline_report = classification_report(y_test, et_preds)

print("Extra Trees Baseline Accuracy:", et_baseline_accuracy)
print(et_baseline_report)

Extra Trees Baseline Accuracy: 0.6772451046590142
              precision    recall  f1-score   support

           0       0.68      0.88      0.77       822
           1       0.88      0.40      0.55       300
           2       0.71      0.52      0.60       629
           3       0.75      0.39      0.51       326
           4       0.63      0.80      0.70       885

    accuracy                           0.68      2962
   macro avg       0.73      0.60      0.63      2962
weighted avg       0.70      0.68      0.66      2962



In [22]:
# Candidate hyperparameter values for the extra-trees ensemble.
param_dist_et = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    max_features=['sqrt', 'log2'],
)

# Randomized search over the grid above: 5 sampled candidates, each scored with
# 3-fold cross-validation on the training split only, using all CPU cores.
random_search_et = RandomizedSearchCV(
    ExtraTreesClassifier(random_state=42),
    param_dist_et,
    n_iter=5,
    cv=3,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# The winning candidate, as refit by the search (scikit-learn's default refit=True).
best_et_model = random_search_et.best_estimator_

# Evaluate the tuned model on the held-out split.
et_tuned_preds = best_et_model.predict(X_test)
et_tuned_accuracy = accuracy_score(y_test, et_tuned_preds)
et_tuned_report = classification_report(y_test, et_tuned_preds)

print("Extra Trees Tuned Accuracy:", et_tuned_accuracy)
print(et_tuned_report)

Extra Trees Tuned Accuracy: 0.5955435516542876
              precision    recall  f1-score   support

           0       0.56      0.89      0.69       822
           1       0.84      0.09      0.16       300
           2       0.67      0.45      0.54       629
           3       0.81      0.32      0.45       326
           4       0.58      0.71      0.63       885

    accuracy                           0.60      2962
   macro avg       0.69      0.49      0.49      2962
weighted avg       0.64      0.60      0.56      2962



In [27]:
# Summarise, for each model family, the better of its baseline and tuned test
# accuracies. Using max() instead of hard-coding which variant won keeps the
# table correct if a re-run changes which variant comes out ahead.
# NOTE(review): both variants are compared on the test split, so the reported
# "best" figure is an optimistic estimate; choosing between them on a separate
# validation split would give an unbiased number.
comparison_df = pd.DataFrame({
    'Model': ['Gradient Boosting', 'Extra Trees'],
    'Best Accuracy': [
        max(baseline_accuracy, tuned_accuracy),
        max(et_baseline_accuracy, et_tuned_accuracy),
    ]
})

print(comparison_df)

               Model  Best Accuracy
0  Gradient Boosting       0.609048
1        Extra Trees       0.677245


The Gradient Boosting classifier's test accuracy increased from 57.9% to 60.9% after hyperparameter tuning.
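Hyperparameter tuning did not improve the Extra Trees classifier: its test accuracy fell from 67.7% with the default parameters to 59.6% with the parameters chosen by the randomized search, which sampled only 5 configurations. The baseline parameters yielded the highest accuracy (67.7%), and so these parameters were used in the final model.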
Overall, Extra Trees performed better than Gradient Boosting on this dataset (67.7% vs. 60.9% test accuracy).
