In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# August 2024
# License: MIT

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from sklearn.metrics import roc_auc_score

# Seed NumPy's global random number generator so repeated runs start from the same state
np.random.seed(42)

Dataset Exploration

In [3]:
# Load the HIGGS data. header=None tells pandas the file has no header row,
# so the columns are labeled with integers instead of names.
df = pd.read_csv('data/higgs.csv', header=None)
# Preview the first rows
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [4]:
# Print a summary of the frame: row count, per-column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000000 entries, 0 to 10999999
Data columns (total 29 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       float64
 1   1       float64
 2   2       float64
 3   3       float64
 4   4       float64
 5   5       float64
 6   6       float64
 7   7       float64
 8   8       float64
 9   9       float64
 10  10      float64
 11  11      float64
 12  12      float64
 13  13      float64
 14  14      float64
 15  15      float64
 16  16      float64
 17  17      float64
 18  18      float64
 19  19      float64
 20  20      float64
 21  21      float64
 22  22      float64
 23  23      float64
 24  24      float64
 25  25      float64
 26  26      float64
 27  27      float64
 28  28      float64
dtypes: float64(29)
memory usage: 2.4 GB


In [5]:
# Total number of missing cells in the frame:
# flag each missing cell, count per column, then add the column counts together
missing_values = df.isna().sum().sum()
print(missing_values)

0


In [6]:
# Frequency of each distinct value in column 0 (the column used as the target label below)
df[0].value_counts()

0
1.0    5829123
0.0    5170877
Name: count, dtype: int64

Data Preparation

In [7]:
# Features: every column except column 0
X = df.drop(columns=[0])
# Labels: column 0, converted from float to integer class ids
y = df[0].astype(int)

In [8]:
# Hold out the final 500,000 rows as the test set; all rows before them form the training set
test_size = 500_000
X_train, X_test = X.iloc[:-test_size], X.iloc[-test_size:]
y_train, y_test = y.iloc[:-test_size], y.iloc[-test_size:]

In [9]:
# Report the number of rows in each split as a quick sanity check
print('Training set size:', len(X_train))
print('Test set size:', len(X_test))

Training set size: 10500000
Test set size: 500000


Benchmark Gradient Boosting Classifiers

In [10]:
def evaluate_classifiers(classifiers, X_train, y_train, X_test, y_test, verbose=True):
    """Fit each classifier and collect its test-set metrics and training time.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to an estimator. Each estimator must provide
        ``fit``, ``score`` and ``predict_proba``.
    X_train, y_train
        Training features and labels, passed unchanged to ``fit``.
    X_test, y_test
        Held-out features and labels used for ``score`` and the AUC.
    verbose : bool, default=True
        If True, print each classifier's metrics as soon as they are computed.

    Returns
    -------
    dict
        Maps each name in ``classifiers`` to a dict with the keys
        ``'Test Accuracy (%)'``, ``'Test AUC'`` and ``'Training Time (s)'``.
    """
    results = {}

    for name, clf in classifiers.items():
        # Time only the call to fit(). perf_counter() is a monotonic,
        # high-resolution clock, so the elapsed time cannot be distorted
        # (or even become negative) when the system clock is adjusted,
        # which is possible with time.time().
        start_time = time.perf_counter()
        clf.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        # score() is reported as the accuracy; the AUC is computed from the
        # second column of predict_proba()
        test_accuracy = clf.score(X_test, y_test)
        test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

        # Store the evaluation metrics under the classifier's display name
        results[name] = {
            'Test Accuracy (%)': test_accuracy * 100,
            'Test AUC': test_auc,
            'Training Time (s)': np.round(training_time, 3)
        }
        if verbose:
            print(results[name])

    return results

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Benchmark line-up, keyed by the label that will identify each model in the results.
# Every model is seeded with 42; the boosting libraries' own logging is turned down.
classifiers = {}

# Models created without any device argument (library default device)
classifiers['LogisticRegression'] = LogisticRegression(random_state=42)
classifiers['HistGradientBoosting'] = HistGradientBoostingClassifier(random_state=42)
classifiers['XGBoost (CPU)'] = XGBClassifier(random_state=42, verbosity=0)

# Models explicitly configured for GPU training
# (note: the LightGBM and CatBoost labels do not mention the GPU setting)
classifiers['XGBoost (GPU)'] = XGBClassifier(device='gpu', random_state=42, verbosity=0)
classifiers['LightGBM'] = LGBMClassifier(device='gpu', random_state=42, verbose=0)
classifiers['CatBoost'] = CatBoostClassifier(task_type='GPU', random_state=42, verbose=0)

# Fit and score every model on the held-out split; metrics are printed per model
results = evaluate_classifiers(classifiers, X_train, y_train, X_test, y_test)

{'Test Accuracy (%)': 64.13759999999999, 'Test AUC': 0.6844149305586753, 'Training Time (s)': 35.118}
{'Test Accuracy (%)': 73.172, 'Test AUC': 0.8121522445298159, 'Training Time (s)': 175.034}
{'Test Accuracy (%)': 74.1528, 'Test AUC': 0.823515052394782, 'Training Time (s)': 82.585}
{'Test Accuracy (%)': 74.1496, 'Test AUC': 0.8234700204891525, 'Training Time (s)': 27.661}
{'Test Accuracy (%)': 73.152, 'Test AUC': 0.8122095080704034, 'Training Time (s)': 73.56}
{'Test Accuracy (%)': 73.0466, 'Test AUC': 0.8104931627727872, 'Training Time (s)': 253.875}


In [12]:
# Build a table with one row per classifier and one column per metric:
# the nested dict yields classifiers as columns, so transpose it
results_df = pd.DataFrame(results).transpose()
results_df

Unnamed: 0,Test Accuracy (%),Test AUC,Training Time (s)
LogisticRegression,64.1376,0.684415,35.118
HistGradientBoosting,73.172,0.812152,175.034
XGBoost (CPU),74.1528,0.823515,82.585
XGBoost (GPU),74.1496,0.82347,27.661
LightGBM,73.152,0.81221,73.56
CatBoost,73.0466,0.810493,253.875
