# Gradient Boosting Baselines

In this notebook we get baselines for the GPU enabled LightGBM, XGBoost and CatBoost models. We mostly leave our settings as default and do not preprocess the training data with the exception of the following:

* Downcast our data to its lowest subtype (e.g. `float64` to `float32`)
* We set `n_estimators = 10000` and `learning_rate = 0.016` with early stopping.
* Enable training on GPU

We change the learning rates so that all the models considered are consistent with what CatBoost chooses by default.

**Note:** The performance of one model relative to another does not indicate that it will perform better than the other after we tweak more parameters or perform feature engineering.

In [None]:
# Global variables for testing changes to this notebook quickly
# Seed used for the stratified fold split and for every model's `random_state`
RANDOM_SEED = 0
# Number of cross-validation folds
NUM_FOLDS = 5
# Learning rate passed to LightGBM and tree-based XGBoost
ETA = 0.016
# `n_estimators` for CatBoost, LightGBM and tree-based XGBoost
NUM_TREES = 10000
# Early-stopping rounds passed to every model's fit call
EARLY_STOP = 150

## Install GPU-enabled LightGBM

We follow [this notebook](https://www.kaggle.com/abhishek/running-lightgbm-on-gpu/notebook) for installing GPU-enabled LightGBM.

In [None]:
# Remove CPU-only version
!pip uninstall -y lightgbm

# Install boost development library
!apt-get install -y libboost-all-dev

# Clone LightGBM repository (--recursive also fetches its git submodules)
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
%%bash
# Stop at the first failing command so cmake/make never run in the wrong directory
set -e
cd LightGBM
# -p keeps the cell re-runnable when the build directory already exists
mkdir -p build
cd build
# Configure a GPU (OpenCL) build against the CUDA toolkit's OpenCL library and headers
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile with one job per available core
make -j$(nproc)

In [None]:
# Reinstall LightGBM
!cd LightGBM/python-package/;python3 setup.py install --precompile

# Cleanup
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

## Imports

In [None]:
# General Imports
import numpy as np
import pandas as pd
import time
import gc

# Models
import xgboost
import lightgbm
import catboost
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Model evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Plotting
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

# Prepare Data

In [None]:
%%time

# Read the competition files: training set, test set and the submission template
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

# Model inputs are all training columns other than the row id, the label and the fold marker
features = [
    column
    for column in train.columns
    if column not in ('id', 'target', 'kfold')
]

In [None]:
# Downcast float/int datatypes
def reduce_memory_usage(df, verbose=True):
    """Downcast the integer and float columns of ``df`` to smaller dtypes.

    Columns whose dtype name starts with ``int`` are passed through
    ``pd.to_numeric(..., downcast='integer')`` and those starting with
    ``float`` through ``pd.to_numeric(..., downcast='float')``. All other
    columns are left untouched. The columns are reassigned on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and percentage reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for col, dtype in df.dtypes.items():
        if dtype.name.startswith('int'):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype.name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard against a frame that reports zero bytes (avoids ZeroDivisionError)
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
%%time

# Downcast data
# reduce_memory_usage reassigns the numeric columns on the frame it receives
# and returns that same frame, so rebinding the names keeps the flow explicit.
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

In [None]:
# Create cross-validation scheme
# Every row starts at -1 and is then labelled with the fold in which it is a validation row.
train['kfold'] = -1
skf = StratifiedKFold(n_splits = NUM_FOLDS, shuffle = True, random_state = RANDOM_SEED)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['target'])):
    # Single positional assignment on the frame itself. The chained form
    # `train['kfold'].iloc[...] = fold` writes to a temporary Series and is a
    # silent no-op when pandas Copy-on-Write is active.
    train.iloc[valid_idx, train.columns.get_loc('kfold')] = fold

# CatBoost Baseline

In [None]:
# CatBoost parameters
catboost_params = dict(
    random_state = RANDOM_SEED,
    n_estimators = NUM_TREES,
    # learning_rate = ETA is deliberately left unset here
    eval_metric = 'AUC:hints=skip_train~false',
    task_type = 'GPU',
)

In [None]:
def train_catboost(model_params = None, fit_params = None):
    """Cross-validate a CatBoost classifier on the precomputed folds.

    Reads the notebook-level names ``train``, ``test``, ``features``,
    ``catboost_params``, ``NUM_FOLDS`` and ``EARLY_STOP``. Prints the AUC of
    each fold, the mean AUC and the summed per-fold wall time.

    Parameters
    ----------
    model_params : dict, optional
        Constructor arguments merged over ``catboost_params``; on a key
        clash the value given here wins.
    fit_params : dict, optional
        Keyword arguments for ``model.fit``. They are merged over the
        defaults built inside the loop, so a key such as ``verbose`` can be
        overridden instead of raising a duplicate-keyword ``TypeError``.

    Returns
    -------
    model : CatBoostClassifier
        The model fitted on the last fold only.
    test_preds : numpy.ndarray
        Class-1 probabilities for ``test``, averaged over the folds.
    oof_preds : numpy.ndarray
        Out-of-fold class-1 probabilities, one entry per row of ``train``.
    """
    # None defaults avoid sharing one mutable dict between calls
    model_params = model_params or {}
    fit_params = fit_params or {}

    # Store the predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # The test features do not depend on the fold, so slice them once
    X_test = test[features]

    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):

        # Training and Validation Sets (the fold mask is computed once and reused)
        valid_mask = (train['kfold'] == fold).to_numpy()
        X_train, y_train = train.loc[~valid_mask, features], train.loc[~valid_mask, 'target']
        X_valid, y_valid = train.loc[valid_mask, features], train.loc[valid_mask, 'target']

        # Define Model
        model = CatBoostClassifier(**{**catboost_params, **model_params})
        gc.collect()

        # Start the clock after model construction, as the other train_* functions do
        start = time.time()

        # Defaults first, caller overrides last
        fit_kwargs = {
            'verbose': False,
            'eval_set': [(X_valid, y_valid)],
            'early_stopping_rounds': EARLY_STOP,
            'use_best_model': True,
            **fit_params,
        }
        model.fit(X_train, y_train, **fit_kwargs)

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_mask] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        # The measured time covers fitting plus the validation/test predictions above
        end = time.time()
        print(f'Fold {fold} (AUC):', fold_auc)
        scores[fold] = fold_auc
        times[fold] = end-start

    print("\nAverage AUC:", scores.mean())
    print(f'Training Time: {round(times.sum(), 2)}s')

    return model, test_preds, oof_preds

In [None]:
# Make CatBoost submission
# Run the cross-validated training with default parameters, then write the
# fold-averaged test predictions into the submission template and save it.
catb_model, catb_preds, catb_oof = train_catboost()
submission['target'] =  catb_preds
submission.to_csv('catboost_submission.csv', index=False)

In [None]:
# Evaluation history of the returned CatBoost model (the one fitted on the last fold)
results = catb_model.get_evals_result()
num_iter = len(results['learn']['AUC'])
x_axis = range(num_iter)

# Train vs. validation AUC per iteration; the dashed line marks the best iteration
fig, ax = plt.subplots(figsize = (9,6))
ax.plot(x_axis, results['learn']['AUC'], label='Train')
ax.plot(x_axis, results['validation']['AUC'], label='Valid')
ax.axvline(x=catb_model.get_best_iteration(), color='k', linestyle='--')
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Iterations')
ax.set_title('CatBoost AUC')
ax.grid(True)
plt.show()

# LightGBM Baseline

In [None]:
# LightGBM parameters
lightgbm_params = dict(
    random_state = RANDOM_SEED,
    n_estimators = NUM_TREES,
    learning_rate = ETA,
    verbose = 0,
    # Train on the first GPU of the first OpenCL platform
    device = 'gpu',
    gpu_platform_id = 0,
    gpu_device_id = 0,
)

In [None]:
def train_lightgbm(model_params = None, fit_params = None):
    """Cross-validate a LightGBM classifier on the precomputed folds.

    Reads the notebook-level names ``train``, ``test``, ``features``,
    ``lightgbm_params``, ``NUM_FOLDS`` and ``EARLY_STOP``. Prints the AUC of
    each fold, the mean AUC and the summed per-fold wall time.

    Parameters
    ----------
    model_params : dict, optional
        Constructor arguments merged over ``lightgbm_params``; on a key
        clash the value given here wins.
    fit_params : dict, optional
        Keyword arguments for ``model.fit``. They are merged over the
        defaults built inside the loop, so a key such as ``eval_metric`` can
        be overridden instead of raising a duplicate-keyword ``TypeError``.

    Returns
    -------
    model : LGBMClassifier
        The model fitted on the last fold only.
    test_preds : numpy.ndarray
        Class-1 probabilities for ``test``, averaged over the folds.
    oof_preds : numpy.ndarray
        Out-of-fold class-1 probabilities, one entry per row of ``train``.
    """
    # None defaults avoid sharing one mutable dict between calls
    model_params = model_params or {}
    fit_params = fit_params or {}

    # Store the holdout predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # The test features do not depend on the fold, so slice them once
    X_test = test[features]

    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):

        # Training and Validation Sets (the fold mask is computed once and reused)
        valid_mask = (train['kfold'] == fold).to_numpy()
        X_train, y_train = train.loc[~valid_mask, features], train.loc[~valid_mask, 'target']
        X_valid, y_valid = train.loc[valid_mask, features], train.loc[valid_mask, 'target']

        # Define Model
        model = LGBMClassifier(**{**lightgbm_params, **model_params})
        gc.collect()

        start = time.time()

        # Defaults first, caller overrides last. The training set is included in
        # eval_set so that its AUC history is recorded alongside the validation one.
        fit_kwargs = {
            'eval_set': [(X_train, y_train), (X_valid, y_valid)],
            'eval_names': ['Train', 'Valid'],
            'eval_metric': "auc",
            'callbacks': [lightgbm.early_stopping(EARLY_STOP, verbose = False)],
            **fit_params,
        }
        model.fit(X_train, y_train, **fit_kwargs)

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_mask] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        # The measured time covers fitting plus the validation/test predictions above
        end = time.time()
        print(f'Fold {fold} (AUC):', fold_auc)
        scores[fold] = fold_auc
        times[fold] = end-start

    print("\nAverage AUC:", scores.mean())
    print(f'Training Time: {round(times.sum(), 2)}s')

    return model, test_preds, oof_preds

In [None]:
# Make LightGBM submission
# Run the cross-validated training with default parameters, then write the
# fold-averaged test predictions into the submission template and save it.
lgbm_model, lgbm_preds, lgbm_oof = train_lightgbm()
submission['target'] =  lgbm_preds
submission.to_csv('lightgbm_submission.csv', index=False)

In [None]:
# Evaluation history of the returned LightGBM model (the one fitted on the last fold)
results = lgbm_model.evals_result_
num_iter = len(results['Train']['auc'])
x_axis = range(num_iter)

# Train vs. validation AUC per iteration; the dashed line marks the best iteration
fig, ax = plt.subplots(figsize = (9,6))
ax.plot(x_axis, results['Train']['auc'], label='Train')
ax.plot(x_axis, results['Valid']['auc'], label='Valid')
ax.axvline(x=lgbm_model.best_iteration_, color='k', linestyle='--')
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Iterations')
ax.set_title('LightGBM AUC')
ax.grid(True)
plt.show()

# XGBoost Baseline (Boosted Trees)

In [None]:
# XGBoost parameters (tree booster, GPU histogram method and GPU predictor)
xgboost_params = dict(
    random_state = RANDOM_SEED,
    n_estimators = NUM_TREES,
    learning_rate = ETA,
    tree_method = 'gpu_hist',
    predictor = "gpu_predictor",
)

In [None]:
def train_xgboost(model_params = None, fit_params = None):
    """Cross-validate an XGBoost classifier on the precomputed folds.

    Reads the notebook-level names ``train``, ``test``, ``features``,
    ``xgboost_params``, ``NUM_FOLDS`` and ``EARLY_STOP``. Prints the AUC of
    each fold, the mean AUC and the summed per-fold wall time.

    Parameters
    ----------
    model_params : dict, optional
        Constructor arguments merged over ``xgboost_params``; on a key
        clash the value given here wins.
    fit_params : dict, optional
        Keyword arguments for ``model.fit``. They are merged over the
        defaults built inside the loop, so a key such as ``verbose`` can be
        overridden instead of raising a duplicate-keyword ``TypeError``.

    Returns
    -------
    model : XGBClassifier
        The model fitted on the last fold only.
    test_preds : numpy.ndarray
        Class-1 probabilities for ``test``, averaged over the folds.
    oof_preds : numpy.ndarray
        Out-of-fold class-1 probabilities, one entry per row of ``train``.
    """
    # None defaults avoid sharing one mutable dict between calls
    model_params = model_params or {}
    fit_params = fit_params or {}

    # Store the  predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # The test features do not depend on the fold, so slice them once
    X_test = test[features]

    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):

        # Training and Validation Sets (the fold mask is computed once and reused)
        valid_mask = (train['kfold'] == fold).to_numpy()
        X_train, y_train = train.loc[~valid_mask, features], train.loc[~valid_mask, 'target']
        X_valid, y_valid = train.loc[valid_mask, features], train.loc[valid_mask, 'target']

        # Define Model
        model = XGBClassifier(**{**xgboost_params, **model_params})
        gc.collect()

        start = time.time()

        # Defaults first, caller overrides last.
        # NOTE(review): newer XGBoost releases expect eval_metric and
        # early_stopping_rounds on the constructor rather than fit() - confirm
        # against the installed version before upgrading.
        fit_kwargs = {
            'verbose': False,
            'eval_set': [(X_train, y_train), (X_valid, y_valid)],
            'eval_metric': "auc",
            'early_stopping_rounds': EARLY_STOP,
            **fit_params,
        }
        model.fit(X_train, y_train, **fit_kwargs)

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_mask] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        # The measured time covers fitting plus the validation/test predictions above
        end = time.time()
        print(f'Fold {fold} (AUC):', fold_auc)
        scores[fold] = fold_auc
        times[fold] = end-start

    print("\nAverage AUC:", scores.mean())
    print(f'Training Time: {round(times.sum(), 2)}s')

    return model, test_preds, oof_preds

In [None]:
# Make XGBoost submission
# Run the cross-validated training with default parameters, then write the
# fold-averaged test predictions into the submission template and save it.
xgb_model, xgb_preds, xgb_oof = train_xgboost()
submission['target'] = xgb_preds
submission.to_csv('xgboost_trees_submission.csv', index=False)

In [None]:
# Evaluation history of the returned XGBoost model (the one fitted on the last fold).
# 'validation_0' is the first eval_set entry (training data), 'validation_1' the second (validation data).
results = xgb_model.evals_result()
num_iter = len(results['validation_0']['auc'])
x_axis = range(num_iter)

# Train vs. validation AUC per iteration; the dashed line marks the best iteration
fig, ax = plt.subplots(figsize = (9,6))
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Valid')
ax.axvline(x=xgb_model.best_iteration, color='k', linestyle='--')
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Iterations')
ax.set_title('XGBoost w/ Trees')
ax.grid(True)
plt.show()

# XGBoost (Linear Boosting)

In [None]:
# XGBoost parameters (linear booster)
# This rebinds the `xgboost_params` name used by the tree-boosting section, so
# re-running the tree cells afterwards picks up this configuration instead.
# n_estimators and learning_rate are not set here, so the library defaults apply.
xgboost_params = dict(
    random_state = RANDOM_SEED,
    booster = "gblinear",
)

In [None]:
def train_xgboost(model_params = None, fit_params = None):
    """Cross-validate an XGBoost classifier on the precomputed folds.

    This definition shadows the function of the same name defined in the
    tree-boosting section. It reads ``xgboost_params`` at call time, so it
    uses whichever parameter dict is currently bound to that name.

    Also reads the notebook-level names ``train``, ``test``, ``features``,
    ``NUM_FOLDS`` and ``EARLY_STOP``. Prints the AUC of each fold, the mean
    AUC and the summed per-fold wall time.

    Parameters
    ----------
    model_params : dict, optional
        Constructor arguments merged over ``xgboost_params``; on a key
        clash the value given here wins.
    fit_params : dict, optional
        Keyword arguments for ``model.fit``. They are merged over the
        defaults built inside the loop, so a key such as ``verbose`` can be
        overridden instead of raising a duplicate-keyword ``TypeError``.

    Returns
    -------
    model : XGBClassifier
        The model fitted on the last fold only.
    test_preds : numpy.ndarray
        Class-1 probabilities for ``test``, averaged over the folds.
    oof_preds : numpy.ndarray
        Out-of-fold class-1 probabilities, one entry per row of ``train``.
    """
    # None defaults avoid sharing one mutable dict between calls
    model_params = model_params or {}
    fit_params = fit_params or {}

    # Store the  predictions
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    times = np.zeros(NUM_FOLDS)
    print('')

    # The test features do not depend on the fold, so slice them once
    X_test = test[features]

    # Stratified k-fold cross-validation
    for fold in range(NUM_FOLDS):

        # Training and Validation Sets (the fold mask is computed once and reused)
        valid_mask = (train['kfold'] == fold).to_numpy()
        X_train, y_train = train.loc[~valid_mask, features], train.loc[~valid_mask, 'target']
        X_valid, y_valid = train.loc[valid_mask, features], train.loc[valid_mask, 'target']

        # Define Model
        model = XGBClassifier(**{**xgboost_params, **model_params})
        gc.collect()

        start = time.time()

        # Defaults first, caller overrides last.
        # NOTE(review): newer XGBoost releases expect eval_metric and
        # early_stopping_rounds on the constructor rather than fit() - confirm
        # against the installed version before upgrading.
        fit_kwargs = {
            'verbose': False,
            'eval_set': [(X_train, y_train), (X_valid, y_valid)],
            'eval_metric': "auc",
            'early_stopping_rounds': EARLY_STOP,
            **fit_params,
        }
        model.fit(X_train, y_train, **fit_kwargs)

        # validation and test predictions
        valid_preds = model.predict_proba(X_valid)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / NUM_FOLDS
        oof_preds[valid_mask] = valid_preds

        # fold auc score
        fold_auc = roc_auc_score(y_valid, valid_preds)
        # The measured time covers fitting plus the validation/test predictions above
        end = time.time()
        print(f'Fold {fold} (AUC):', fold_auc)
        scores[fold] = fold_auc
        times[fold] = end-start

    print("\nAverage AUC:", scores.mean())
    print(f'Training Time: {round(times.sum(), 2)}s')

    return model, test_preds, oof_preds

In [None]:
# Make XGBoost submission
# Run the cross-validated training with the currently bound `xgboost_params`,
# then write the fold-averaged test predictions into the submission template and save it.
# Note: this rebinds xgb_model / xgb_preds / xgb_oof from the tree-boosting section.
xgb_model, xgb_preds, xgb_oof = train_xgboost()
submission['target'] = xgb_preds
submission.to_csv('xgboost_linear_submission.csv', index=False)

In [None]:
# Evaluation history of the returned XGBoost model (the one fitted on the last fold).
# 'validation_0' is the first eval_set entry (training data), 'validation_1' the second (validation data).
results = xgb_model.evals_result()
num_iter = len(results['validation_0']['auc'])
x_axis = range(num_iter)

# Train vs. validation AUC per iteration; the dashed line marks the best iteration
fig, ax = plt.subplots(figsize = (9,6))
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Valid')
ax.axvline(x=xgb_model.best_iteration, color='k', linestyle='--')
ax.legend()
ax.set_ylabel('AUC')
ax.set_xlabel('Iterations')
ax.set_title('XGBoost w/ Linear Boosting')
ax.grid(True)
plt.show()