# Simple XGBoost Models

In this notebook we get a baseline for the GPU-enabled XGBoost model. We mostly use default settings except for the following:

* We set a higher value for `n_estimators` and enable `early_stopping_rounds` to avoid overfitting on each fold. (A lower `learning_rate` was also intended, but the model below does not currently set it, so the XGBoost default is used.)

We use [this dataset](https://www.kaggle.com/rsizem2/tps0921foldsfeather), which is equivalent to the original data except we have a predefined cross-validation scheme and each feature has been downcast to its lowest possible subtype.

In [None]:
# Global variables for testing changes to this notebook quickly
NUM_FOLDS = 8  # number of stratified cross-validation splits
RANDOM_SEED = 0  # seeds both the fold shuffling and the XGBoost model
NUM_TREES = 2000  # upper bound on boosting rounds (passed as n_estimators)
EARLY_STOP = 50  # passed as early_stopping_rounds when fitting each fold
SUBMIT = True  # presumably toggles writing submission files -- confirm the cells below honor it

In [None]:
# Essentials
import numpy as np
import pandas as pd
import scipy.stats as stats
import pyarrow
import pickle
import time
import matplotlib
from matplotlib import pyplot as plt

# Models and Evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier, plot_importance

# Hide warnings
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [None]:
%%time
# Load data
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

# Drop ID
train.drop('id', axis = 'columns', inplace = True)
test.drop('id', axis = 'columns', inplace = True)


def _downcast_numeric(frame):
    """Downcast the int/float columns of `frame` in place to their smallest subtype.

    Only columns whose dtype name starts with 'int' or 'float' are touched;
    every other column is left as-is.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `frame` object, for convenient reassignment.
    """
    # Series.items() is used instead of Series.iteritems(): the latter was
    # deprecated in pandas 1.5 and removed in pandas 2.0.
    for col, dtype in frame.dtypes.items():
        if dtype.name.startswith('int'):
            frame[col] = pd.to_numeric(frame[col], downcast='integer')
        elif dtype.name.startswith('float'):
            frame[col] = pd.to_numeric(frame[col], downcast='float')
    return frame


# Downcast training and test data to reduce memory use
train = _downcast_numeric(train)
test = _downcast_numeric(test)

# Feature columns
features = [x for x in train.columns if x not in ['id','claim']]

## Helper Functions

### 1. Scoring Function

In [None]:
def score_xgboost():
    """Cross-validate a GPU XGBoost classifier on the notebook's `train` frame.

    Reads the notebook-level names `train`, `test`, `features`, `NUM_FOLDS`,
    `RANDOM_SEED`, `NUM_TREES` and `EARLY_STOP`; takes no arguments.

    For every stratified fold a fresh classifier is fitted with early
    stopping, then used to fill the out-of-fold predictions and to add its
    share (1 / NUM_FOLDS) to the averaged test predictions and feature
    importances. Per-fold AUC and timing are printed as the folds finish.

    Returns
    -------
    tuple
        ``(model, mean_auc, oof_preds, test_preds, fi_scores)`` where `model`
        is the classifier fitted on the last fold, `mean_auc` is the mean of
        the per-fold validation AUCs, `oof_preds` holds one validation
        probability per training row, and `test_preds` / `fi_scores` are
        fold-averaged test probabilities and feature importances.
    """
    # Accumulators for predictions, importances and per-fold scores
    oof_preds = np.zeros((train.shape[0],))
    test_preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)
    fi_scores = np.zeros(len(features))

    # Stratified k-fold cross-validation on the target
    splitter = StratifiedKFold(
        n_splits=NUM_FOLDS,
        shuffle=True,
        random_state=RANDOM_SEED,
    )

    for fold, (train_idx, valid_idx) in enumerate(splitter.split(train, train["claim"])):

        # Slice this fold's training and validation rows
        feature_frame, target = train[features], train["claim"]
        X_train, y_train = feature_frame.iloc[train_idx], target.iloc[train_idx]
        X_valid, y_valid = feature_frame.iloc[valid_idx], target.iloc[valid_idx]

        start = time.time()

        # A new model per fold so no state carries over between folds
        model = XGBClassifier(
            random_state=RANDOM_SEED,
            n_estimators=NUM_TREES,
            tree_method='gpu_hist',
            gpu_id=0,
            predictor="gpu_predictor",
        )

        # Both sets are evaluated so the training curve can be plotted later.
        # NOTE(review): with several metrics/eval sets XGBoost is documented to
        # early-stop on the last metric of the last eval set (logloss on the
        # validation fold here) -- confirm this is the intended criterion.
        model.fit(
            X_train, y_train,
            verbose=False,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            eval_metric=["auc", "logloss"],
            early_stopping_rounds=EARLY_STOP,
        )

        # Out-of-fold predictions and this fold's AUC
        valid_preds = model.predict_proba(X_valid)[:, 1]
        oof_preds[valid_idx] = valid_preds

        # Each fold contributes an equal share to the averaged outputs
        test_preds += model.predict_proba(test[features])[:, 1] / NUM_FOLDS
        fi_scores += model.feature_importances_ / NUM_FOLDS

        scores[fold] = roc_auc_score(y_valid, valid_preds)
        end = time.time()
        print(f'Fold {fold} (AUC): {round(scores[fold], 6)} in {round(end-start, 3)}s')

    mean_auc = scores.mean()
    print("\nAverage AUC:", round(mean_auc, 6))
    print("Worst AUC:", round(scores.min(), 6))
    return model, mean_auc, oof_preds, test_preds, fi_scores

### 2. Training Plot

In [None]:
def training_plot(xgb_model):
    """Plot the train and validation AUC curves of a fitted XGBoost model.

    Parameters
    ----------
    xgb_model
        A fitted model exposing ``evals_result()`` with 'auc' recorded under
        'validation_0' (drawn as "Train") and 'validation_1' (drawn as
        "Valid"), plus a ``best_iteration`` attribute, which is marked with a
        dashed vertical line.
    """
    # Per-iteration AUC recorded during fitting
    history = xgb_model.evals_result()
    train_auc = history['validation_0']['auc']
    valid_auc = history['validation_1']['auc']
    iterations = range(0, len(train_auc))

    # Draw everything through the Axes object
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.plot(iterations, train_auc, label='Train')
    ax.plot(iterations, valid_auc, label='Valid')
    ax.axvline(x=xgb_model.best_iteration, color='k', linestyle='--')
    ax.legend()
    ax.set_ylabel('AUC')
    ax.set_xlabel('Iterations')
    ax.set_title('XGBoost AUC')
    ax.grid(True)
    plt.show()

# XGBoost Baseline

In [None]:
# Cross-validate on the raw features to establish the baseline
model, score, oof_preds, test_preds, fi_scores = score_xgboost()

# Only write the submission file when the SUBMIT flag from the configuration
# cell is enabled, so quick experiments do not overwrite outputs.
if SUBMIT:
    submission['claim'] = test_preds
    submission.to_csv('baseline_submission.csv', index=False)

In [None]:
# `model` is the classifier returned by score_xgboost, i.e. the one fitted on
# the last cross-validation fold, so this curve describes that fold only.
training_plot(model)

# Feature Engineering

In [None]:
def create_row_stats(data, feature_cols=None):
    """Append row-wise summary statistics of the feature columns to `data`.

    All statistics skip missing values, which is the pandas default for the
    reductions used here. The two statistics previously computed with
    `scipy.stats` are computed with pandas as well, because the scipy
    defaults propagate NaN: `stats.zscore` (column-wise, axis=0) turns every
    column that contains a NaN into all-NaN, making the per-row sum NaN, and
    `stats.median_abs_deviation` returns NaN for every row with a missing
    value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns. It is modified in place: the
        new columns are added to it.
    feature_cols : sequence of str, optional
        Columns to summarise. Defaults to the notebook-level `features` list.

    Returns
    -------
    pandas.DataFrame
        The same `data` object with the added columns, in this order:
        nan_count, nan_std, min, std, max, median, mean, var, sum, sem, skew,
        median_abs_dev, zscore.
    """
    cols = features if feature_cols is None else list(feature_cols)

    # Snapshot the inputs once so the columns added below never feed back
    # into the statistics.
    values = data[cols]
    na_mask = values.isnull()

    data['nan_count'] = na_mask.sum(axis=1)
    data['nan_std'] = na_mask.std(axis=1)
    data['min'] = values.min(axis=1)
    data['std'] = values.std(axis=1)
    data['max'] = values.max(axis=1)
    row_median = values.median(axis=1)
    data['median'] = row_median
    data['mean'] = values.mean(axis=1)
    data['var'] = values.var(axis=1)
    data['sum'] = values.sum(axis=1)
    data['sem'] = values.sem(axis=1)
    data['skew'] = values.skew(axis=1)

    # Median absolute deviation per row, ignoring missing values
    data['median_abs_dev'] = values.sub(row_median, axis=0).abs().median(axis=1)

    # Column-wise z-scores (population std, ddof=0, matching scipy's default),
    # then the sum of their magnitudes per row. Missing entries contribute
    # nothing; a row with no observed values sums to 0.
    column_z = (values - values.mean()) / values.std(ddof=0)
    data['zscore'] = column_z.abs().sum(axis=1)
    return data

## Train Model

In [None]:
# Add the row-statistic columns to both frames, then rebuild the feature list
# so it includes them (everything except the id and the target)
train, test = create_row_stats(train), create_row_stats(test)
features = [col for col in train.columns if col not in ('id', 'claim')]

# Cross-validate again with the engineered features
model, score, oof_preds, test_preds, fi_scores = score_xgboost()

# Label the fold-averaged importances by feature name, most important first
fi_scores = pd.Series(fi_scores, index=features).sort_values(ascending=False)

## Feature Importances

We look at the most important and least important features as determined by XGBoost.

In [None]:
# Most important features: the first five entries of the fold-averaged
# importances, which were sorted in descending order above
fi_scores.head(5)

In [None]:
# Least important features: the last five entries of the fold-averaged
# importances, which were sorted in descending order above
fi_scores.tail(5)

# Final Submission

We use only the features which XGBoost found more important than any of the original features.

## Remove Low Importance Features

In [None]:
# Remove low importance features
# NOTE(review): this list is hard-coded rather than derived from `fi_scores`;
# re-check it against the importances above whenever the features change.
train.drop(['nan_std','var','zscore','std'], axis = 'columns', inplace = True)
test.drop(['nan_std','var','zscore','std'], axis = 'columns', inplace = True)

# `test` holds only feature columns here (its id was dropped on load and it
# has no target), so its columns are the feature list.
features = list(test.columns)

model, score, oof_preds, test_preds, fi_scores = score_xgboost()

# Only write the submission file when the SUBMIT flag from the configuration
# cell is enabled, so quick experiments do not overwrite outputs.
if SUBMIT:
    submission['claim'] = test_preds
    submission.to_csv('best_features_submission.csv', index=False)