In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # for plotting facilities

## Data Loading


In [6]:
# Path to the Wholesale customers CSV inside the Kaggle input directory.
data = '/kaggle/input/wholesale-customers-data/Wholesale customers data.csv'

# Read the CSV into a DataFrame; every later cell works from `df`.
df = pd.read_csv(data)

In [7]:
# (number of rows, number of columns) of the loaded frame
df.shape

(440, 8)

In [8]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [9]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


In [10]:
# Count missing values per column
df.isnull().sum()

Channel             0
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
dtype: int64

In [11]:
# Target vector: the Channel column.
y = df['Channel']

# Feature matrix: every column except the target.
X = df.drop(columns='Channel')

In [12]:
# Preview the feature matrix (Channel has been removed)
X.head()

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,3,12669,9656,7561,214,2674,1338
1,3,7057,9810,9568,1762,3293,1776
2,3,6353,8808,7684,2405,3516,7844
3,3,13265,1196,4221,6404,507,1788
4,3,22615,5410,7198,3915,1777,5185


In [14]:
# Preview the target labels before re-encoding
y.head()

0    2
1    2
2    2
3    1
4    2
Name: Channel, dtype: int64

In [15]:
# Convert the Channel labels into binary 0/1 values: 2 -> 0, 1 stays 1.
#
# A new Series is built and rebound to `y` instead of assigning through a
# boolean mask. `y` was taken straight from `df`, so on pandas versions
# without copy-on-write a masked assignment can write through to
# df['Channel'] and silently change the source frame. The original second
# statement (`y[y == 1] = 1`) was a no-op and is dropped.
# Re-running this cell is harmless: once no 2s remain, nothing is replaced.
y = y.replace({2: 0})

In [16]:
# Preview the labels again to confirm they are now encoded as 0/1

y.head()

0    0
1    0
2    0
3    1
4    0
Name: Channel, dtype: int64

Now we will convert the dataset into DMatrix, an optimized data structure that XGBoost supports natively and that gives it its acclaimed performance and efficiency gains.

We will do it as follows -

## What is DMatrix?
A DMatrix is a matrix-like structure used to store the features (input data) and corresponding labels (target values) for training, validation, or testing in XGBoost.
It is designed to handle large datasets efficiently, with optimizations for sparse data and memory usage.
### Features of DMatrix

- **Compact representation:** Internally uses efficient memory structures to store data, especially when dealing with sparse datasets. This reduces memory overhead compared to traditional data structures like Pandas DataFrames or NumPy arrays.
- **Support for sparse data:** Handles sparse datasets (with many zeros or missing values) effectively using compressed sparse row (CSR) or compressed sparse column (CSC) formats.
- **Incorporates labels:** Can include both feature data (X) and labels (y) in one structure for supervised learning tasks.
- **Supports feature weights:** Allows you to assign weights to features or data instances for more advanced use cases.
- **Optimized for XGBoost:** Faster data access and processing during training and prediction.

## When to Use DMatrix?
- **For training:** Convert your input features and labels into a DMatrix before training a model with XGBoost's native API.
- **For validation:** Use DMatrix for test/validation datasets to maintain efficiency and compatibility with XGBoost's functions.
- **For sparse data:** Essential when working with datasets containing a significant number of zeros or missing values.

In [20]:
# XGBoost's native Python API
import xgboost as xgb

# Bundle the feature matrix and its labels into a single DMatrix
data_dmatrix = xgb.DMatrix(X, label=y)

## Data Splitting

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Model training

In [22]:
# scikit-learn style wrapper around XGBoost
from xgboost import XGBClassifier

# Hyperparameters for the classifier
params = {
    'objective': 'binary:logistic',  # binary classification
    'max_depth': 4,                  # maximum depth of each tree
    'alpha': 10,                     # L1 regularization term
    'learning_rate': 1.0,            # step size for each boosting round
    'n_estimators': 100,             # number of boosted trees
}

# Build the estimator from the settings above ...
xgb_clf = XGBClassifier(**params)

# ... and train it on the training split only
xgb_clf.fit(X_train, y_train)

In [23]:
# Print the fitted estimator to inspect the parameters it was configured with

print(xgb_clf)

XGBClassifier(alpha=10, base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=1.0, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)


In [24]:
# Predict class labels for the held-out test features

y_pred = xgb_clf.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label,
# reported to four decimal places.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'XGBoost model accuracy score: {test_accuracy:0.4f}')

XGBoost model accuracy score: 0.8788


## Cross Validation
dtrain=data_dmatrix: The full dataset in DMatrix format (features and labels); `cv` splits it into training and validation folds internally.

params=params: A dictionary of hyperparameters for the model:

objective: Defines the task (binary:logistic for binary classification).

colsample_bytree: Fraction of features to sample per tree (0.3).

learning_rate: Step size for learning (0.1).

max_depth: Maximum tree depth (5).

alpha: L1 regularization term (10).

nfold=3: Number of folds for cross-validation (3 folds).

num_boost_round=50: Maximum number of boosting rounds (50); early stopping may end training sooner.

early_stopping_rounds=10: Stop early if no improvement in 10 rounds.

metrics="auc": Evaluation metric (AUC for binary classification).

as_pandas=True: Return results as a Pandas DataFrame.

seed=123: Random seed for reproducibility.

In [26]:
from xgboost import cv

# Booster settings used for every fold
params = {
    "objective": "binary:logistic",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}

# 3-fold cross-validation on the full DMatrix, scored by AUC, with up to
# 50 boosting rounds and early stopping after 10 rounds without improvement.
xgb_cv = cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="auc",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)

In [30]:
# Per-round mean and standard deviation of train/test AUC across the folds
xgb_cv.head()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.91613,0.017352,0.883424,0.031286
1,0.916677,0.017711,0.884176,0.031818
2,0.942047,0.010679,0.921339,0.004898
3,0.959857,0.004367,0.947506,0.010233
4,0.963008,0.002962,0.944536,0.010897


In [31]:
# Each row of xgb_cv is one boosting round; 'test-auc-mean' is the test AUC
# averaged over the folds for that round. Take the best round's value
# (this is a maximum over rounds, not an average over them).
test_auc_by_round = xgb_cv['test-auc-mean']
overall_test_auc = test_auc_by_round.max()

# Report the score
print(f"Overall test AUC score: {overall_test_auc}")


Overall test AUC score: 0.9560670309317983
