In [1]:
%load_ext watermark
%watermark -p numpy,pandas,scikit-learn,xgboost,lightgbm,catboost

numpy       : 1.21.2
pandas      : 1.2.3
scikit-learn: 1.0
xgboost     : 1.3.3
lightgbm    : 3.2.1
catboost    : 0.26.1



In [2]:
import pandas as pd
import numpy as np

# Demo Notebook Illustrating How To Use Common Gradient Boosting Implementations With Categorical Data

# Dataset Loading

- To keep things simple, we will be using the Titanic dataset. Consequently, please don't overinterpret the predictive performance values. This is more intended as a technical demo/reference for how to use categorical support, not how to achieve good predictive performance.
- Titanic dataset reference: https://www.openml.org/d/40945

In [3]:
df = pd.read_csv('titanic.csv', sep=',')
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


### Remove rows with missing values

In [4]:
(df[['embarked']] == '?').sum()

embarked    2
dtype: int64

In [5]:
df = df[~(df['embarked'] == '?')]

In [6]:
(df[['fare']] == '?').sum()

fare    1
dtype: int64

In [7]:
df = df[~(df['fare'] == '?')]

In [8]:
df['fare'] = df['fare'].astype(float)

### Convert to array format

- To keep things simple, we will only use a few columns in this dataset:

In [9]:
y = df['survived'].values

feature_names = ['pclass', 'sex', 'fare', 'embarked']

X = df[feature_names].values

X[:10]

array([[1, 'female', 211.3375, 'S'],
       [1, 'male', 151.55, 'S'],
       [1, 'female', 151.55, 'S'],
       [1, 'male', 151.55, 'S'],
       [1, 'female', 151.55, 'S'],
       [1, 'male', 26.55, 'S'],
       [1, 'female', 77.9583, 'S'],
       [1, 'male', 0.0, 'S'],
       [1, 'female', 51.4792, 'S'],
       [1, 'male', 49.5042, 'C']], dtype=object)

In [10]:
np.bincount(y)

array([808, 498])

In [11]:
for i in feature_names:
    print(f'{i}: {np.unique(df[i].values)}')

pclass: [1 2 3]
sex: ['female' 'male']
fare: [  0.       3.1708   4.0125   5.       6.2375   6.4375   6.45     6.4958
   6.75     6.8583   6.95     6.975    7.       7.0458   7.05     7.0542
   7.125    7.1417   7.225    7.2292   7.25     7.2833   7.3125   7.4958
   7.5208   7.55     7.575    7.5792   7.6292   7.65     7.7208   7.725
   7.7292   7.7333   7.7375   7.7417   7.75     7.775    7.7792   7.7875
   7.7958   7.8      7.8208   7.8292   7.85     7.8542   7.875    7.8792
   7.8875   7.8958   7.925    8.0292   8.05     8.1125   8.1375   8.1583
   8.3      8.3625   8.4042   8.4333   8.4583   8.5167   8.6542   8.6625
   8.6833   8.7125   8.85     8.9625   9.       9.2167   9.225    9.325
   9.35     9.475    9.4833   9.5      9.5875   9.6875   9.825    9.8375
   9.8417   9.8458  10.1708  10.4625  10.5     10.5167  10.7083  11.1333
  11.2417  11.5     12.      12.1833  12.275   12.2875  12.35    12.475
  12.525   12.65    12.7375  12.875   13.      13.4167  13.5     13.775
  13.7917 

- In this dataset, `'sex'` is a binary variable with only two values, so using categorical or onehot encoding is not necessary. However, we will do it anyways for demo purposes.
- Here, `'embarked'` is a categorical variable with 3 possible values.

### Onehot encoder pipeline

- The OneHot encoder pipeline encodes `'sex'` and `'embarked'` into a onehot-encoded form. The remaining features remain unchanged.

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


ohe_features = ['sex', 'embarked']
ohe_transformer = make_pipeline(OneHotEncoder(drop='first'))

ohe_preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', ohe_transformer, ohe_features)],
    remainder='passthrough')

ohe_preprocessor.fit_transform(df[feature_names])

array([[  0.    ,   0.    ,   1.    ,   1.    , 211.3375],
       [  1.    ,   0.    ,   1.    ,   1.    , 151.55  ],
       [  0.    ,   0.    ,   1.    ,   1.    , 151.55  ],
       ...,
       [  1.    ,   0.    ,   0.    ,   3.    ,   7.225 ],
       [  1.    ,   0.    ,   0.    ,   3.    ,   7.225 ],
       [  1.    ,   0.    ,   1.    ,   3.    ,   7.875 ]])

In [13]:
ohe_preprocessor.get_feature_names_out()

array(['ohe__sex_male', 'ohe__embarked_Q', 'ohe__embarked_S',
       'remainder__pclass', 'remainder__fare'], dtype=object)

### Ordinal/Categorical encoder pipeline

- This pipeline will convert the string encoding of `'sex'` and `'embarked'` into an integer format.

In [14]:
from sklearn.preprocessing import OrdinalEncoder


cat_features = ['sex', 'embarked']
cat_transformer = make_pipeline(OrdinalEncoder())

cat_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_features)],
    remainder='passthrough')

cat_preprocessor.fit_transform(df[feature_names])

array([[  0.    ,   2.    ,   1.    , 211.3375],
       [  1.    ,   2.    ,   1.    , 151.55  ],
       [  0.    ,   2.    ,   1.    , 151.55  ],
       ...,
       [  1.    ,   0.    ,   3.    ,   7.225 ],
       [  1.    ,   0.    ,   3.    ,   7.225 ],
       [  1.    ,   2.    ,   3.    ,   7.875 ]])

- Note that feature index 0 corresponds to `'sex'`, and feature index 1 to `'embarked'`

### Train/Valid/Test splits

- Next, we are splitting the dataset into the usual subsets.

In [15]:
from sklearn.model_selection import train_test_split


df_X_temp, df_X_test, df_y_temp, df_y_test = \
    train_test_split(df[feature_names], df['survived'], test_size=0.20, random_state=123, stratify=df['survived'])

df_X_train, df_X_valid, df_y_train, df_y_valid = \
    train_test_split(df_X_temp, df_y_temp, test_size=0.25, random_state=123, stratify=df_y_temp)

print('Train/Valid/Test sizes:', df_y_train.shape[0], df_y_valid.shape[0], df_y_test.shape[0])

Train/Valid/Test sizes: 783 261 262


## Performance Baselines

### Majority class prediction

In [16]:
bins = np.bincount(df_y_test)
print(f'Test accuracy: {100* np.max(bins) / np.sum(bins):.2f}%', )

Test accuracy: 61.83%


### Decision Tree (Onehot)

In [17]:
from sklearn.tree import DecisionTreeClassifier


tree = DecisionTreeClassifier(random_state=123)
clf_pipe = make_pipeline(ohe_preprocessor, tree)

clf_pipe.fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 90.80%
Validation Accuracy: 77.01%
Test Accuracy: 77.86%


### Decision Tree (Ordinal)

- "Ordinal" means that the `'embarked'` variable is treated as an ordinal variable due to the integer encoding.

In [18]:
from sklearn.tree import DecisionTreeClassifier


tree = DecisionTreeClassifier(random_state=123)
clf_pipe = make_pipeline(cat_preprocessor, tree)

clf_pipe.fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 90.80%
Validation Accuracy: 77.01%
Test Accuracy: 77.48%


## Original gradient boosting (Onehot)

In [19]:
from sklearn.ensemble import GradientBoostingClassifier


boost = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    random_state=1)

clf_pipe = make_pipeline(ohe_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 89.66%
Validation Accuracy: 78.16%
Test Accuracy: 78.24%


## Original gradient boosting (Ordinal)

In [20]:
from sklearn.ensemble import GradientBoostingClassifier


boost = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=4,
    random_state=1)

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 89.27%
Validation Accuracy: 79.31%
Test Accuracy: 76.34%


## HistGradientBoostingClassifier (Onehot)

In [21]:
#from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier


boost = HistGradientBoostingClassifier(
    learning_rate=0.1,
    random_state=1)

clf_pipe = make_pipeline(ohe_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.74%
Validation Accuracy: 77.01%
Test Accuracy: 76.72%


## HistGradientBoostingClassifier (Ordinal)

In [22]:
boost = HistGradientBoostingClassifier(
    learning_rate=0.1,
    random_state=1)

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 88.25%
Validation Accuracy: 77.78%
Test Accuracy: 77.10%


## HistGradientBoostingClassifier (Categorical)

- In contrast to the "ordinal" control above, the "categorical" sections shows how to use the implemented support for categorical variables

In [23]:
boost = HistGradientBoostingClassifier(
    learning_rate=0.1,
    categorical_features=[0, 1], # -> ['sex', 'embarked'],
    random_state=1)

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.87%
Validation Accuracy: 77.39%
Test Accuracy: 78.63%


## XGBoost (Onehot)

In [24]:
import numpy as np
import xgboost as xgb


boost = xgb.XGBClassifier()

clf_pipe = make_pipeline(ohe_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")





Training Accuracy: 89.78%
Validation Accuracy: 77.78%
Test Accuracy: 78.63%


## XGBoost (Ordinal)

In [25]:
boost = xgb.XGBClassifier()

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")





Training Accuracy: 89.66%
Validation Accuracy: 77.01%
Test Accuracy: 77.86%


## XGBoost (Categorical) -- experimental

- In contrast to the "ordinal" control above, the "categorical" sections shows how to use the implemented support for categorical variables.
- Afaik, XGBoost detects categorical features via their `DataFrame` column type:

In [26]:
df_X_train_new = df_X_train.copy()
df_X_valid_new = df_X_valid.copy()
df_X_test_new = df_X_test.copy()


for name in ['sex', 'embarked']:
    df_X_train_new[name] = df_X_train_new[name].astype('category')
    df_X_valid_new[name] = df_X_valid_new[name].astype('category')
    df_X_test_new[name] = df_X_test_new[name].astype('category')

df_X_train_new.dtypes

pclass         int64
sex         category
fare         float64
embarked    category
dtype: object

- string variabels are not supported yet:

In [27]:
d = {'female': 0, 'male': 1}
df_X_train_new['sex'] = df_X_train_new['sex'].map(d)
df_X_valid_new['sex'] = df_X_valid_new['sex'].map(d)
df_X_test_new['sex'] = df_X_test_new['sex'].map(d)

d = {'C': 0, 'Q': 1, 'S': 2}
df_X_train_new['embarked'] = df_X_train_new['embarked'].map(d)
df_X_valid_new['embarked'] = df_X_valid_new['embarked'].map(d)
df_X_test_new['embarked'] = df_X_test_new['embarked'].map(d)

- `boost = xgb.XGBClassifier(enable_categorical=True)` throws an error later, not recognizing categorical columns in `DataFrame`, hence use `DMatrix as a workaround:



In [28]:
dtrain = xgb.DMatrix(df_X_train_new, label=df_y_train, enable_categorical=True)
dvalid = xgb.DMatrix(df_X_valid_new, label=df_y_valid, enable_categorical=True)
dtest = xgb.DMatrix(df_X_test_new, label=df_y_test, enable_categorical=True)

In [29]:
from sklearn.metrics import accuracy_score

#boost = xgb.XGBClassifier(enable_categorical=True)

gbm_model = xgb.train(params={}, dtrain=dtrain)

train_predict = (gbm_model.predict(dtrain) > 0.5).astype(int)
valid_predict = (gbm_model.predict(dvalid) > 0.5).astype(int)
test_predict = (gbm_model.predict(dtest) > 0.5).astype(int)

print(f"Training Accuracy: {100*accuracy_score(train_predict, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*accuracy_score(valid_predict, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*accuracy_score(test_predict, df_y_test):0.2f}%")

Training Accuracy: 87.87%
Validation Accuracy: 77.01%
Test Accuracy: 75.95%


## LightGBM (Onehot)

In [30]:
import lightgbm as lgb


boost = lgb.LGBMClassifier()

clf_pipe = make_pipeline(ohe_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.23%
Validation Accuracy: 77.39%
Test Accuracy: 77.48%


## LightGBM (Ordinal)

In [31]:
boost = lgb.LGBMClassifier()

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.23%
Validation Accuracy: 78.16%
Test Accuracy: 77.10%


## LightGBM (Categorical)

- In contrast to the "ordinal" control above, the "categorical" sections shows how to use the implemented support for categorical variables

In [32]:
boost = lgb.LGBMClassifier(categorical_feature="0,1")

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Training Accuracy: 87.23%
Validation Accuracy: 78.16%
Test Accuracy: 77.10%


## CatBoost (Onehot)

In [33]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(verbose=0)

clf_pipe = make_pipeline(ohe_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 85.31%
Validation Accuracy: 80.84%
Test Accuracy: 75.57%


## CatBoost (Ordinal)

In [34]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(verbose=0)

clf_pipe = make_pipeline(cat_preprocessor, boost)
clf_pipe.fit(df_X_train, df_y_train)
    
print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 85.06%
Validation Accuracy: 80.84%
Test Accuracy: 76.34%


## CatBoost (Categorical)

- In contrast to the "ordinal" control above, the "categorical" sections shows how to use the implemented support for categorical variables

In [35]:
boost = CatBoostClassifier(verbose=0, cat_features=['sex', 'embarked'])


#clf_pipe = make_pipeline(cat_preprocessor, boost)
boost.fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*boost.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*boost.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*boost.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 83.78%
Validation Accuracy: 81.99%
Test Accuracy: 75.57%
