# 5. XGBoost Model

We train an XGBoost model on the GPU (`tree_method='gpu_hist'`).

In [None]:
import gc
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Start optimistic: the flag is only cleared when the RAPIDS imports fail.
GPU = True
try:
    import cupy
    import cudf
except ImportError:
    GPU = False

# `cudf` is rebound to pandas unconditionally, so every later `cudf.*` call in
# this notebook goes through pandas even when the import above succeeded.
cudf = pd

In [None]:
# Experiment identifiers: model version and the feature-set version it uses.
VER = 1
FEATURE_VER = 111

# The seed is derived from both versions, so changing either one changes it.
SEED = 108 + 5 * VER + 100 * FEATURE_VER

FOLDS = 5  # number of cross-validation splits

FEATURE_PATH = './'  # prefix prepended to the engineered-feature file name

DO_SUBMIT = False  # gates the test-inference and submission cells

print("VER:", VER)
print("fVER:", FEATURE_VER)

## Load Dataset

Feature engineering: https://www.kaggle.com/code/roberthatch/amex-feature-engg-gpu-or-cpu-process-in-chunks

In [None]:
def do_miss_nan(df):
    """Replace infinities with NaN and downcast 64-bit numeric columns.

    The frame is modified in place and also returned. Columns named by
    `get_not_used()` keep their dtype; every other `int64` column becomes
    `int32` and every other `float64` column becomes `float32`.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame object, after cleaning.
    """
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Build the exclusion set once instead of rebuilding the list per column.
    skipped = set(get_not_used())
    for c in df.columns:
        if c in skipped:
            continue
        dtype_name = str(df[c].dtype)
        if dtype_name == 'int64':
            df[c] = df[c].astype('int32')
        elif dtype_name == 'float64':
            df[c] = df[c].astype('float32')
    return df


def get_not_used():
    """Return the column names that `do_miss_nan` leaves at their original dtype."""
    return ['row_id', 'customer_ID', 'target', 'cid', 'S_2', 'D_103', 'D_139']


# This cell runs before the cells that load `train` and `test`, so on a fresh
# kernel neither name exists yet. Only re-clean frames that are already in the
# namespace (e.g. when the cell is re-run) instead of raising NameError.
if 'train' in globals():
    train = do_miss_nan(train)
if 'test' in globals():
    test = do_miss_nan(test)

In [None]:
# Location of the engineered training features.
TRAIN_PATH = f'{FEATURE_PATH}train_fe_v1.pickle'

print('Reading train data...')
train = pd.read_pickle(TRAIN_PATH)
print(train.shape)

# Clean/downcast, then shuffle the rows reproducibly and renumber them 0..n-1.
train = (
    do_miss_nan(train)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
train.head()

## Training

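The `IterLoadForDMatrix` helper below feeds data in batches, as required by `DeviceQuantileDMatrix`, which has a very small GPU memory footprint. Note that `run_training` currently builds plain `xgb.DMatrix` objects instead.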

In [None]:
import xgboost as xgb
from sklearn.model_selection import KFold

print('XGB Version', xgb.__version__)

# Reference step size; run_training multiplies it per stage.
BASE_LEARNING_RATE = 0.01

# Shared booster settings. run_training overwrites `num_parallel_tree` and
# `learning_rate`, and increments `random_state`, for every stage it trains.
xgb_params = {
    'max_depth': 7,
    'subsample': 0.75,
    'colsample_bytree': 0.35,
    'gamma': 1.5,
    'lambda': 70,
    'min_child_weight': 8,
    'objective': 'binary:logistic',
    # Early stopping is based on the last metric listed.
    'eval_metric': ['logloss', 'auc'],
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'random_state': SEED,
    'num_parallel_tree': 1,
}

In [None]:
class IterLoadForDMatrix(xgb.core.DataIter):
    """Hand a DataFrame to XGBoost in consecutive fixed-size row batches.

    Args:
        df: source frame; must support `len()` and `.iloc` slicing.
        features: column selection passed as `data` for each batch.
        target: column selection passed as `label` for each batch.
        batch_size: maximum number of rows per batch.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.features = features
        self.target = target
        self.df = df
        self.it = 0  # index of the next batch to hand out
        self.batch_size = batch_size
        n_rows = len(df)
        self.batches = int(np.ceil(n_rows / self.batch_size))
        super().__init__()

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the next batch to `input_data`.

        Returns 0 once every batch has been delivered, otherwise 1.
        """
        if self.it == self.batches:
            return 0
        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

In [None]:
def amex_metric_mod(y_true, y_pred):
    """Print and return the mean of the normalized weighted Gini and top-4% capture.

    Rows whose label is 0 get weight 20, all other rows weight 1.

    Args:
        y_true: 1-D sequence of labels.
        y_pred: 1-D sequence of scores, same length as `y_true`.

    Returns:
        0.5 * (gini_ratio + top_four).
    """
    stacked = np.array([y_true, y_pred]).T

    # Share of positives found in the highest-scored rows that together hold
    # at most 4% of the total weight.
    by_score = stacked[stacked[:, 1].argsort()[::-1]]
    row_weights = np.where(by_score[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(row_weights))
    captured = by_score[np.cumsum(row_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini with rows ordered descending by column `sort_col`.
        ordered = stacked[stacked[:, sort_col].argsort()[::-1]]
        weight = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ordered[:, 0] * weight)
        cum_pos_found = np.cumsum(ordered[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Ordering by score (column 1) is normalized by ordering by label (column 0).
    gini_by_score = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)
    gini_ratio = gini_by_score / gini_by_label

    score = 0.5 * (gini_ratio + top_four)
    print("  4%  :", top_four)
    print("  Gini:", gini_ratio)
    print("Kaggle:", score)
    return score

In [None]:
# Margin weight of each training stage; run_training asserts that this list
# matches the last element of every tuple in its `pyramid_layers`.
PYRAMID_W = [0.5, 2 / 3, 0.75, 0.875, 1, 0]

# One feature-importance frame per fold, appended by run_training.
importances = []

def run_training(train, features):
    """Run K-fold cross-validation with stage-wise ("pyramid") XGBoost training.

    For every fold a sequence of models is trained, one per entry of
    `pyramid_layers`. Each non-final stage writes its (optionally down-weighted)
    raw margin predictions back into the train/valid DMatrix as base margin, so
    the next stage continues from them. The final stage (weight 0) uses early
    stopping and produces the out-of-fold predictions.

    Side effects:
        * mutates the module-level `xgb_params` (`num_parallel_tree`,
          `learning_rate`, and `random_state`, which is incremented per stage);
        * appends one importance frame per fold to the module-level `importances`;
        * saves every stage model to `XGB_v{VER}_fold{fold}_layer{layer}.xgb`;
        * shows a histogram of validation margins after each non-final stage;
        * prints the metric per fold and overall.

    Args:
        train: DataFrame containing `features`, `customer_ID` and `target`.
        features: list of column names used as model inputs.

    Returns:
        DataFrame indexed by `customer_ID` with columns `target` and `oof_pred`.
    """
    oof = []
    # NOTE(review): plain KFold despite the `skf` name; `train.target` is passed
    # to split() but whether it influences the split should be confirmed.
    skf = KFold(n_splits=FOLDS)
    for fold,(train_idx, valid_idx) in enumerate(skf.split(
                train, train.target )):
        print('#'*25)
        print('### Fold',fold+1)
        # The positional fold indices are used as labels with .loc, which
        # presumably relies on `train` having a default 0..n-1 index.
        X_train = train.loc[train_idx, features]
        y_train = train.loc[train_idx, 'target']
        X_valid = train.loc[valid_idx, features]
        y_valid = train.loc[valid_idx, 'target']
        print('### Train size',len(train_idx),'Valid size',len(valid_idx),'Valid positives',y_valid.sum())
        print(f'### Training with all of fold data...')
        print('#'*25)
        dtrain = xgb.DMatrix(data=X_train, label=y_train)
        dvalid = xgb.DMatrix(data=X_valid, label=y_valid)
        # Each tuple is (num_parallel_tree, num_boost_round,
        # learning-rate multiplier, margin weight w).
        pyramid_layers = [(100,  10,  1.56,  0.5),
                          ( 20,  50,  1.3,   2/3),
                          (  1,1000,  1.25,  0.75),
                          (  1,1000,  1.125, 0.875),
                          (  1,3000,  1.0,   1),
                          (  1,9000,  0.5,   0)]
        # The inference cell replays the stages from PYRAMID_W, so both must agree.
        assert(PYRAMID_W == [layer[-1] for layer in pyramid_layers])
        for (layer, (n_trees, n_rounds, adj_learning, w)) in enumerate(pyramid_layers):
            xgb_params['num_parallel_tree'] = n_trees
            xgb_params['learning_rate'] = n_trees*adj_learning*BASE_LEARNING_RATE
            # xgb_params is module-level, so the seed keeps increasing across
            # stages and folds.
            xgb_params['random_state'] += 1
            # Only the final stage (w == 0) uses early stopping.
            early_stop = None
            if w == 0:
                early_stop = 300
            print("Learning Rate:", xgb_params['learning_rate'])
            model = xgb.train(xgb_params, 
                        dtrain=dtrain,
                        evals=[(dtrain,'train'),(dvalid,'valid')],
                        num_boost_round=n_rounds,
                        early_stopping_rounds=early_stop,
                        verbose_eval=100//n_trees)
            model.save_model(f'XGB_v{VER}_fold{fold}_layer{layer}.xgb')
            if (w != 0):
                # Non-final stage: feed this stage's raw margins, scaled by w,
                # into the next stage as base margin.
                ptrain = model.predict(dtrain, output_margin=True)
                pvalid = model.predict(dvalid, output_margin=True)
                if (w < 1.0):
                    ptrain = ptrain * w
                    pvalid = pvalid * w
                dtrain.set_base_margin(ptrain)
                dvalid.set_base_margin(pvalid)
                plt.hist(pvalid, bins=100)
                plt.title(f'Layer {layer} OOF Predictions')
                plt.show()
                del model, ptrain, pvalid
                gc.collect()
        # `model` is still bound here because the final stage (w == 0) skips
        # the `del` above.
        dd = model.get_score(importance_type='weight')
        df = pd.DataFrame({'feature':dd.keys(),f'importance_{fold}':dd.values()})
        importances.append(df)
        # NOTE(review): `best_ntree_limit` may not exist in every XGBoost
        # release — confirm against the installed version.
        print("Best_ntree_limit:", model.best_ntree_limit//xgb_params['num_parallel_tree'])
        oof_preds = model.predict(dvalid, iteration_range=(0,model.best_ntree_limit//xgb_params['num_parallel_tree']))
        print('For this fold:')
        amex_metric_mod(y_valid.values, oof_preds)
        df = train.loc[valid_idx, ['customer_ID','target'] ].copy()
        df['oof_pred'] = oof_preds
        oof.append( df )
        # `dtrain` is not deleted here; it is rebound at the start of the next fold.
        del X_train, y_train, dd, df
        del X_valid, y_valid, dvalid, model
        gc.collect()
    print('#'*25)
    print('OVERALL CV:')
    oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
    amex_metric_mod(oof.target.values, oof.oof_pred.values)
    return oof

In [None]:
# Model inputs: every column except the identifiers/label, and nothing whose
# name contains 'B_29'.
features = [
    col
    for col in train.columns
    if col not in ['customer_ID', 'target', 'S_2'] and 'B_29' not in col
]
print(f'There are {len(features)} features!')
print(train.shape)

oof = run_training(train, features)

## Out-of-Fold Predictions

In [None]:
# Re-read the customer IDs from disk, attach the out-of-fold predictions by
# customer_ID, and write them out sorted by customer_ID.
oof_xgb = (
    pd.read_pickle(TRAIN_PATH)[['customer_ID']]
    .drop_duplicates()
    .set_index('customer_ID')
    .merge(oof, left_index=True, right_index=True)
    .sort_index()
    .reset_index()
)
oof_xgb.to_csv(f'oof_xgb_v{VER}.csv', index=False)
oof_xgb.head()

## Feature Importance

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Start from fold 0 and left-join the remaining folds on the feature name.
df = importances[0].copy()
for fold_idx in range(1, FOLDS):
    df = df.merge(importances[fold_idx], on='feature', how='left')

# Average across all per-fold importance columns (everything after 'feature').
per_fold = df.iloc[:, 1:]
df['importance'] = per_fold.mean(axis=1)
df = df.sort_values('importance', ascending=False)
df.to_csv(f'xgb_feature_importance_v{VER}.csv', index=False)

In [None]:
NUM_FEATURES = 30

# Descending positions so the most important feature is drawn at the top.
bar_positions = np.arange(NUM_FEATURES, 0, -1)

plt.figure(figsize=(10, 5 * NUM_FEATURES // 10))
plt.barh(bar_positions, df.importance.values[:NUM_FEATURES])
plt.yticks(bar_positions, df.feature.values[:NUM_FEATURES])
plt.title(f'XGB Feature Importance - Top {NUM_FEATURES}')
plt.show()

## Test Inference

In [1]:
# Enable the test-inference and submission cells. Kept as a bool to match the
# flag's declaration in the configuration cell.
DO_SUBMIT = True

In [None]:
if DO_SUBMIT:
    # Test-set inference: for each fold, replay the saved stage models in order,
    # chaining their (weighted) margins through the DMatrix base margin, then
    # average the final-stage probabilities over all folds.
    gc.collect()
    TEST_SECTIONS = 1
    TEST_SUB_SECTIONS = 1
    test_preds = []
    customers = False
    for k in range(TEST_SECTIONS):
        for i in range(TEST_SUB_SECTIONS):
            print(f'\nReading test data...')
            # Read from FEATURE_PATH, the same location as the train features.
            test = cudf.read_pickle(f'{FEATURE_PATH}test_fe_v1.pickle')
            test = do_miss_nan(test)
            if i == 0:
                print(f'=> Test part {k+1} has shape', test.shape )
                # Collect the row index once per section for the submission cell.
                if k == 0:
                    customers = test.index.copy()
                else:
                    customers = customers.append(test.index)
            X_test = test[features]
            n_rows = len(test.index)//TEST_SUB_SECTIONS
            print(".")
            # Slice out this sub-section; the last one also takes the remainder.
            if i+1 < TEST_SUB_SECTIONS:
                X_test = X_test.iloc[i*n_rows:(i+1)*n_rows, :].copy()
            elif TEST_SUB_SECTIONS > 1:
                X_test = X_test.iloc[i*n_rows:, :].copy()
            print(f'=> Test piece {k+1}, {i+1} has shape', X_test.shape )
            del test
            gc.collect()
            dtest = xgb.DMatrix(data=X_test)
            del X_test
            gc.collect()
            # Base margin of the untouched DMatrix, used to undo the previous
            # fold's chained margins.
            reset_margin = dtest.get_base_margin()
            print(".")
            for f in range(FOLDS):
                if (f > 0):
                    dtest.set_base_margin(reset_margin)
                # Replay every non-final stage, same weighting as in training.
                for (layer, w) in enumerate(PYRAMID_W[:-1]):
                    model = xgb.Booster()
                    model.load_model(f'XGB_v{VER}_fold{f}_layer{layer}.xgb')
                    print(f'Loaded fold{f}, layer{layer}')
                    ptest = model.predict(dtest, output_margin=True)
                    if (w < 1.0):
                        ptest = ptest * w
                    dtest.set_base_margin(ptest)
                # Final stage: predict probabilities up to the best iteration.
                layer = len(PYRAMID_W) - 1
                model = xgb.Booster()
                model.load_model(f'XGB_v{VER}_fold{f}_layer{layer}.xgb')
                best_rounds = model.best_ntree_limit//xgb_params['num_parallel_tree']
                print("Best_ntree_limit", best_rounds)
                fold_preds = model.predict(dtest, output_margin=False, iteration_range=(0, best_rounds))
                if f == 0:
                    preds = fold_preds
                else:
                    preds += fold_preds
            preds /= FOLDS
            test_preds.append(preds)
            del dtest, model, reset_margin
            _ = gc.collect()

## Create Submission

In [None]:
if DO_SUBMIT:
    # Build the submission file from the averaged test predictions.
    # Use every predicted piece, not just the first, so the length matches
    # `customers` when the test set is processed in several (sub-)sections.
    all_preds = np.concatenate(test_preds)
    test = cudf.DataFrame(index=customers, data={'prediction': all_preds})
    sub = cudf.read_csv('../input/amex-default-prediction/sample_submission.csv')[['customer_ID']]
    # NOTE(review): the join is on the row index, not on customer_ID — this
    # presumably relies on the test rows being in sample-submission order; confirm.
    sub = sub.merge(test[['prediction']], left_index=True, right_index=True, how='left')
    sub = sub.reset_index(drop=True)

    # Make sure the output directory exists before writing.
    os.makedirs('../sub', exist_ok=True)
    sub.to_csv('../sub/submission_xgb.csv', index=False)
    print('Submission file shape is', sub.shape )
    sub.head()

    plt.hist(sub.prediction, bins=100)
    plt.title('Test Predictions')
    plt.show()