<a href="https://colab.research.google.com/github/stepthom/869_course/blob/main/ensemble/boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Playground for Boosting

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869


Here we try some of the best boosting algorithms: XGBoost, Catboost, and LGBM.

We'll use the Portugese Bank Marketing dataset.

In [1]:
import pandas as pd
import time
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
!pip install -U scikit-learn



In [3]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.0.


In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/stepthom/869_course/main/data/bank.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [6]:
# Make sure the label is in 0/1 format, not "no/yes" format
from sklearn.preprocessing import LabelEncoder
label_transformer = LabelEncoder()
df['y'] = label_transformer.fit_transform(df['y'])

# LightGBM

LightGBM is fast and can handle categorical features internally. A very good choice!

In [7]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate

# Split the dataframe into X (features) and y (target, aka, label).
X = df.copy().drop('y', axis=1)
y = df.copy()['y']

# This is a simple dataset that does not need any cleaning or feature engineering,
# so we will skip those here. In most real-world situations, we'd want to do the
# normal pipeline dance at this point.

# LightGBM can handle categorical features by itself. We just need to make 
# sure the dataframe is using panda's built-in "category" data type.
cat_cols = ['job', 'marital', 'education', 'default', 
            'loan', 'housing', 'contact', 'month', 'poutcome']
X[cat_cols] = X[cat_cols].astype('category')

params = {
      "cat_l2": 10,
      "cat_smooth": 10,
      "colsample_bytree": 0.8,
      "feature_fraction_bynode":  0.8,
      "learning_rate": 0.03,
      "max_depth": 6,
      "min_child_samples": 63,
      "min_data_per_group": 100,
      "n_estimators": 200,
      "num_leaves": 63,
      "path_smooth": 0,
      "reg_alpha":  0.05,
      "reg_lambda": 0.05,
      "subsample_freq": 1,
      "subsample": 0.8,
      "max_bin": 127,
      "extra_trees": False,
      "is_unbalance": True,
      "boosting_type": 'gbdt',
      "n_jobs": 1,
      "verbosity": -1,
      "seed": 77,
}
        
fit_params= {
    'feature_name': "auto",
    'categorical_feature': cat_cols,
}

start = time.time()
estimator = LGBMClassifier(**params)

# The more CV, the better our estimate of the score
inner_cv_scores = cross_validate(estimator, X, y, 
                                 fit_params=fit_params, 
                                 cv=15, 
                                 scoring="roc_auc", 
                                 n_jobs=5, 
                                 verbose=0, 
                                 return_train_score=True)
cv_scores=inner_cv_scores['test_score'].tolist()
duration = time.time() - start

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))
print("Fit times: {}".format(duration))
print("Total duration: {}".format(duration))

CV Scores:
['0.9123', '0.9084', '0.9264', '0.8633', '0.9157', '0.8977', '0.9347', '0.9186', '0.9049', '0.9030', '0.9302', '0.9331', '0.9159', '0.8726', '0.9233']
CV Score mean: 0.9107 
CV Score range: 0.8908 -- 0.9306
Fit times: 7.002415895462036
Total duration: 7.002415895462036


In [8]:
# Of course, at this point, once you've found the best parameters etc., you would retrain the model on the full dataset:
estimator = LGBMClassifier(**params)
estimator = estimator.fit(X, y, **fit_params)

New categorical_feature is ['contact', 'default', 'education', 'housing', 'job', 'loan', 'marital', 'month', 'poutcome']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


# Catboost

In [9]:
pip install catboost



In [10]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate

# Split the dataframe into X (features) and y (target, aka, label).
X = df.copy().drop('y', axis=1)
y = df.copy()['y']

# This is a simple dataset that does not need any cleaning or feature engineering,
# so we will skip those here. In most real-world situations, we'd want to do the
# normal pipeline dance at this point.

# Catboost can handle categorical features by itself. We just need to make 
# sure the dataframe is using panda's built-in "category" data type.
cat_cols = ['job', 'marital', 'education', 'default', 
            'loan', 'housing', 'contact', 'month', 'poutcome']
X[cat_cols] = X[cat_cols].astype('category')

params= {
    'cat_features': cat_cols,
    'verbose': 0,
}

start = time.time()
estimator = CatBoostClassifier(**params)

# The more CV, the better our estimate of the score
inner_cv_scores = cross_validate(estimator, X, y, 
                                 cv=15, 
                                 scoring="roc_auc", 
                                 n_jobs=5, 
                                 verbose=0, 
                                 return_train_score=True)
cv_scores=inner_cv_scores['test_score'].tolist()
fit_times=inner_cv_scores['fit_time'].tolist()
duration = time.time() - start

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))
print("Total duration: {}".format(duration))

CV Scores:
['0.9240', '0.9005', '0.9247', '0.8809', '0.9129', '0.9007', '0.9430', '0.9265', '0.9348', '0.9005', '0.9378', '0.9376', '0.9086', '0.8871', '0.9396']
CV Score mean: 0.9173 
CV Score range: 0.8979 -- 0.9367
Total duration: 156.7729434967041


# XGBoost

In [11]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Split the dataframe into X (features) and y (target, aka, label).
X = df.copy().drop('y', axis=1)
y = df.copy()['y']

# XGBoost does not handle categorical automatically, so we will OHE them.

# We just need to make sure the dataframe is using panda's
#  built-in "category" data type.
cat_cols = ['job', 'marital', 'education', 'default', 
            'loan', 'housing', 'contact', 'month', 'poutcome']
X[cat_cols] = X[cat_cols].astype('category')

params = {
      "learning_rate": 0.03,
      "max_depth": 6,
      "min_child_samples": 63,
      "min_data_per_group": 100,
      "n_estimators": 200,
      "num_leaves": 63,
      "reg_alpha":  0.05,
      "reg_lambda": 0.05,
      "subsample": 0.8,
      "boosting_type": 'gbtree',
      "n_jobs": 1,
      "verbosity": 0,
      "seed": 77,
}

pipe = Pipeline(steps=[
   #('encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype=np.int32)),
   ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=np.int32)),
   ('clf', XGBClassifier(**params)),                    
])

start = time.time()
pipe.fit(X, y)

# The more CV, the better our estimate of the score
inner_cv_scores = cross_validate(pipe, X, y, 
                                 cv=5, 
                                 scoring="roc_auc", 
                                 n_jobs=15, 
                                 verbose=0, 
                                 return_train_score=True)
cv_scores=inner_cv_scores['test_score'].tolist()
fit_times=inner_cv_scores['fit_time'].tolist()
duration = time.time() - start

print("CV Scores:")
print(["{:0.4f}".format(cv_score) for cv_score in cv_scores])
print("CV Score mean: {:.4f} ".format(np.mean(cv_scores)))
print("CV Score range: {:0.4f} -- {:0.4f}".format(np.mean(cv_scores) - np.std(cv_scores), np.mean(cv_scores) + np.std(cv_scores)))
print("Total duration: {}".format(duration))

Pipeline(steps=[('encoder',
                 OneHotEncoder(dtype=<class 'numpy.int32'>,
                               handle_unknown='ignore', sparse=False)),
                ('clf',
                 XGBClassifier(boosting_type='gbtree', learning_rate=0.03,
                               max_depth=6, min_child_samples=63,
                               min_data_per_group=100, n_estimators=200,
                               num_leaves=63, reg_alpha=0.05, reg_lambda=0.05,
                               seed=77, subsample=0.8, verbosity=0))])

CV Scores:
['0.7655', '0.7482', '0.7226', '0.7562', '0.7303']
CV Score mean: 0.7446 
CV Score range: 0.7286 -- 0.7605
Total duration: 505.54399943351746
