In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# License: MIT

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time

# Seed NumPy's global random number generator so that any code drawing from
# np.random produces the same values on every run.
np.random.seed(0)

Higgs Dataset

In [2]:
# Load the raw CSV. The file is read without a header row (header=None),
# so the columns are addressed by integer labels in the cells below.
df = pd.read_csv(
    'data/higgs.csv',
    header=None,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [3]:
# Column 0 becomes the target y (cast to int); the remaining columns form
# the feature matrix X.
y = df[0].astype(int)
X = df.drop(columns=[0])

In [4]:
# Tally how often each distinct value occurs in y to check the class balance.
y.value_counts()

1    5829123
0    5170877
Name: 0, dtype: int64

In [5]:
# The last 500,000 examples are used as a test set
TEST_SIZE = 500_000  # number of trailing rows held out for evaluation

# .iloc makes the row slicing explicitly positional, so the split does not
# depend on what the index labels happen to be.
X_train, y_train = X.iloc[:-TEST_SIZE], y.iloc[:-TEST_SIZE]
X_test, y_test = X.iloc[-TEST_SIZE:], y.iloc[-TEST_SIZE:]

In [15]:
def evaluate_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit a classifier and print its training time and train/test scores.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    X_train, y_train
        Data passed to ``clf.fit`` and then to ``clf.score`` for the
        "Train accuracy" line.
    X_test, y_test
        Data passed to ``clf.score`` for the "Test accuracy" line.

    Returns
    -------
    None
        All results are printed rather than returned.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals. time.time() follows the wall clock, which can be adjusted
    # while a long fit is running and distort the reported duration.
    fit_started = time.perf_counter()
    clf.fit(X_train, y_train)
    elapsed = time.perf_counter() - fit_started
    print(f'Training time: {elapsed:.2f} sec')

    # Only the fit is timed; scoring happens after the timer has stopped.
    print(f'Train accuracy: {clf.score(X_train, y_train):.4f}')
    print(f'Test accuracy: {clf.score(X_test, y_test):.4f}')

In [6]:
from xgboost import XGBClassifier

# XGBoost with its default settings; only the random seed is fixed.
xgb_clf = XGBClassifier(random_state=0)
evaluate_classifier(
    clf=xgb_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Training time: 743.600 sec


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn's GradientBoostingClassifier with default settings and a
# fixed seed.
gbt_clf = GradientBoostingClassifier(random_state=0)
evaluate_classifier(
    clf=gbt_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Training time: 9071.17 sec
Train accuracy: 0.7144
Test accuracy: 0.7138


In [8]:
from sklearn.ensemble import HistGradientBoostingClassifier

# scikit-learn's HistGradientBoostingClassifier with default settings and a
# fixed seed.
hist_gbt_clf = HistGradientBoostingClassifier(random_state=0)
evaluate_classifier(
    clf=hist_gbt_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Training time: 168.070 sec


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed and n_jobs=-1. No timer is started here:
# evaluate_classifier measures the fit duration itself.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
evaluate_classifier(rf_clf, X_train, y_train, X_test, y_test)

Training time: 2421.052 sec


In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with all default settings, evaluated the same way as
# the other models.
log_clf = LogisticRegression()
evaluate_classifier(
    clf=log_clf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Training time: 82.619 sec
