In [None]:
#===============================================================================
# Copyright 2014-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

# daal4py Gradient Boosting Classification model creation from XGBoost example

In this example we will train an XGBoost model and predict using the daal4py prediction method for increased performance. Intel-optimized XGBoost and daal4py are shipped as part of the oneAPI AI Analytics Toolkit.

In this example, we will use a dataset with particle features and functions of those features **to distinguish between a signal process which produces Higgs bosons (1) and a background process which does not (0)**. The Higgs boson is a basic particle in the standard model produced by the quantum excitation of the Higgs field, named after physicist Peter Higgs. Users can opt to remove the data portion of this sample and replace it with their own data as they see fit.

## Importing and Organizing Data

Let's start by **importing** all the necessary packages.


In [None]:
import daal4py as d4p
import xgboost as xgb
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import os
import requests
import sys

Now let's **load** in the Higgs dataset and **organize** it as necessary to work with our model. You can opt to remove this cell and add your own data as you see fit.

In [None]:
def load_higgs(nrows_train, nrows_test, dtype=np.float32):
    """Load the HIGGS data set and split it into train and test arrays.

    The gzipped CSV is expected at ``./data/batch/HIGGS.csv.gz``; when it is
    missing it is downloaded first. Only the first
    ``nrows_train + nrows_test`` rows of the file are read. Column 0 of the
    CSV is used as the label and every remaining column as a feature.

    Parameters
    ----------
    nrows_train : int
        Number of leading rows used for training.
    nrows_test : int
        Number of rows, immediately after the training rows, used for testing.
    dtype : numpy dtype, optional
        dtype passed to ``pandas.read_csv`` for every column.

    Returns
    -------
    tuple
        ``(train_data, train_label, test_data, test_label, n_classes,
        n_features)`` where the four arrays are C-contiguous, ``n_classes`` is
        the number of distinct values in ``train_label`` and ``n_features`` is
        the number of feature columns.

    Raises
    ------
    requests.HTTPError
        If the download is attempted and the server answers with an error
        status.
    """
    data_path = "./data/batch/HIGGS.csv.gz"
    if not os.path.isfile(data_path):
        print("Loading data set...")
        url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
        # The target directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(data_path), exist_ok=True)
        # Download into a temporary name and rename at the end, so that an
        # interrupted transfer never leaves a truncated file that would pass
        # the isfile() check on the next run.
        tmp_path = data_path + ".part"
        # Stream in chunks instead of buffering the whole archive in memory.
        with requests.get(url, stream=True) as response:
            # Fail loudly rather than saving an HTTP error page as the data set.
            response.raise_for_status()
            with open(tmp_path, 'wb') as archive:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    archive.write(chunk)
        os.replace(tmp_path, data_path)
    print("Reading data set...")
    data = pd.read_csv(data_path, delimiter=",", header=None, compression="gzip", dtype=dtype, nrows=nrows_train+nrows_test)
    print("Pre-processing data set...")
    n_features = data.shape[1]-1
    # Materialise the frame once; label is column 0, features are columns 1..end.
    values = data.values
    split_end = nrows_train+nrows_test
    train_data = np.ascontiguousarray(values[:nrows_train, 1:])
    train_label = np.ascontiguousarray(values[:nrows_train, 0])
    test_data = np.ascontiguousarray(values[nrows_train:split_end, 1:])
    test_label = np.ascontiguousarray(values[nrows_train:split_end, 0])
    n_classes = len(np.unique(train_label))
    print(sys.getsizeof(train_data))
    return train_data, train_label, test_data, test_label, n_classes, n_features

We will run this model and prediction using the first 1,000,000 rows of the Higgs dataset: 990,000 rows for training and the following 10,000 rows for testing.

In [None]:
# The first 990000 rows are used for training; the following 10000 rows are held out for testing.
(train_data, train_label,
 test_data, test_label,
 n_classes, n_features) = load_higgs(nrows_train=990000, nrows_test=10000)

## Training the Model

Now let's **fit and train the model** using the training dataset, which consists of particle features and functions of those features to help discern between a signal process that produces Higgs bosons and a background process that does not.

In [None]:
# Set XGBoost parameters
xgb_params = {
    'verbosity':                    0,
    'alpha':                        0.9,
    'max_bin':                      256,
    'scale_pos_weight':             2,
    'learning_rate':                0.1,
    'subsample':                    1,
    'reg_lambda':                   1,
    'min_child_weight':             0,
    'max_depth':                    8,
    'max_leaves':                   2**8,
    'objective':                    'binary:logistic',
    'predictor':                    'cpu_predictor',
    'tree_method':                  'hist',
    'n_estimators':                 1000
}

# Train the model and time the fit
t0 = time.time()  # begin timer
model_xgb = xgb.XGBClassifier(**xgb_params)
model_xgb.fit(train_data, train_label)
t1 = time.time()  # end timer

# The timer was previously started but never read; keep the result so the
# training cost is visible and is not silently lost when t0 is reused below.
xgb_train_total = t1 - t0
print("XGBoost training time, s:", xgb_train_total)

## Using daal4py for Prediction

### XGBoost Prediction

We will also make a prediction using XGBoost for accuracy/performance comparison.

In [None]:
# Baseline: predict with XGBoost itself, timing only the predict() call
t0 = time.time()
xgb_prediction = model_xgb.predict(test_data)
t1 = time.time()
xgb_total = t1 - t0

# Number of test rows whose predicted class differs from the label
xgb_errors_count = np.count_nonzero(xgb_prediction != np.ravel(test_label))

### daal4py Prediction

Now let's make a prediction using daal4py for increased performance.

In [None]:
# Convert the trained XGBoost booster into a daal4py model (not included in the timing)
daal_model = d4p.get_gbt_model_from_xgboost(model_xgb.get_booster())

# Timed region: build the daal4py prediction algorithm and run it on the test rows
t0 = time.time()
daal_prediction = (
    d4p.gbt_classification_prediction(nClasses=n_classes)
    .compute(test_data, daal_model)
)
t1 = time.time()
d4p_total = t1 - t0

# Number of test rows whose predicted class differs from the label
daal_errors_count = np.count_nonzero(np.ravel(daal_prediction.prediction) != test_label)

In [None]:
# Both predictors must misclassify exactly the same number of test rows.
assert xgb_errors_count == daal_errors_count, (
    "XGBoost and daal4py error counts differ: "
    f"{xgb_errors_count} vs {daal_errors_count}"
)
y_test = np.ravel(test_label)
# Flatten the daal4py result into a 1-D array under the same name.
# getattr() keeps this cell re-runnable: on a second run `daal_prediction`
# is already the flattened array and has no `.prediction` attribute.
daal_prediction = np.ravel(getattr(daal_prediction, "prediction", daal_prediction))


## Accuracy & Performance Comparison: XGBoost Prediction vs. daal4py Prediction
### No accuracy loss!

In [None]:
# Preview the first ten predictions from each library next to the true labels
for caption, preview_source in (
    ("\nXGBoost prediction results (first 10 rows):\n", xgb_prediction),
    ("\ndaal4py prediction results (first 10 rows):\n", daal_prediction),
    ("\nGround truth (first 10 rows):\n", y_test),
):
    print(caption, preview_source[:10])

# Error counts and the accuracy derived from them (fraction of correct rows)
print("XGBoost errors count:", xgb_errors_count)
print("XGBoost accuracy score:", 1 - xgb_errors_count / xgb_prediction.shape[0])

print("\ndaal4py errors count:", daal_errors_count)
print("daal4py accuracy score:", 1 - daal_errors_count / daal_prediction.shape[0])

# Wall-clock prediction time measured for each library
print("\n XGBoost Prediction Time:", xgb_total)
print("\n daal4py Prediction Time:", d4p_total)
print("\nAll looks good!")

### Visualizations

#### Performance

In [None]:
# Bar chart comparing the wall-clock prediction time of the two libraries
left = [1, 2]
tick_label = ['XGBoost Prediction', 'daal4py Prediction']
pred_times = [xgb_total, d4p_total]

plt.bar(
    left,
    pred_times,
    width=0.5,
    color=['red', 'blue'],
    tick_label=tick_label,
)
plt.xlabel('Prediction Method')
plt.ylabel('time,s')
plt.title('Prediction time,s')
plt.show()

# Ratio of XGBoost prediction time to daal4py prediction time
print("speedup:",xgb_total/d4p_total)

#### Accuracy

In [None]:
# Bar chart comparing the prediction accuracy of the two libraries
left = [1, 2]
# Accuracy as a fraction of correctly classified test rows (0..1)
xgb_acc = 1 - xgb_errors_count / xgb_prediction.shape[0]
d4p_acc = 1 - daal_errors_count / daal_prediction.shape[0]
# The axis label and title are in percent, so scale the fractions by 100
# before plotting; previously 0..1 fractions were drawn on a "%" axis.
pred_acc = [100 * xgb_acc, 100 * d4p_acc]
tick_label = ['XGBoost Prediction', 'daal4py Prediction']
plt.bar(left, pred_acc, tick_label=tick_label, width=0.5, color=['red', 'blue'])
plt.xlabel('Prediction Method')
plt.ylabel('accuracy, %')
plt.title('Prediction Accuracy, %')
plt.show()
# Difference of the two accuracy fractions (not scaled to percent)
print("Accuracy Difference",xgb_acc-d4p_acc)

In [None]:
# Marker line printed at the very end of the notebook, after all cells above have run.
# NOTE(review): "SUCCESFULLY" looks misspelled, but this exact string may be matched
# verbatim by external tooling — confirm before changing it.
print("[CODE_SAMPLE_COMPLETED_SUCCESFULLY]")