**<h4>This notebook implements XGBoost on the dataset processed by this [notebook](https://www.kaggle.com/code/susnato/amex-data-preprocesing-feature-engineering). Some of its code is inspired by some notebooks mentioned below.</h4>**



**IMPORTING PYTHON LIBRARIES**

In [None]:
import os
import gc
import glob
import tqdm
import numpy as np
import pandas as pd

# Single seed shared by the stochastic steps of this notebook.
SEED = 42
# PYTHONHASHSEED is read at interpreter start-up, so this export only affects
# Python processes launched after this point, not the running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seed NumPy's legacy global random generator.
np.random.seed(SEED)

**LOAD THE DATA**

The data used for training here is created by the notebook [AMEX-Data Preprocesing & Feature Engineering](https://www.kaggle.com/code/susnato/amex-data-preprocesing-feature-engineering). You can check out that notebook for more details.

In [None]:
# Labels: replace the hex customer_ID string by the integer value of its last
# 16 hex digits and use that integer as the index, so labels can be looked up
# by the index of the feature frames.
train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
train_labels['customer_ID'] = train_labels['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype(np.int64)
train_labels = train_labels.set_index('customer_ID')

train_pkls = sorted(glob.glob('../input/amex-data-preprocesing-feature-engineering/train_data_*'))
test_pkls = sorted(glob.glob('../input/amex-data-preprocesing-feature-engineering/test_data_*'))

# Read every chunk as float32 and concatenate once. `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole accumulated frame on each pass.
# NOTE: unpickling executes arbitrary code; only load pickles from a trusted source.
train_chunks = []
for pkl_path in train_pkls:
    print(pkl_path)
    train_chunks.append(pd.read_pickle(pkl_path).astype(np.float32))
train_df = pd.concat(train_chunks)
del train_chunks
gc.collect()

# Targets in the same row order as train_df; kept as an (n_rows, 1) int8 array.
y = train_labels.loc[train_df.index.values].values.astype(np.int8)
train_df = train_df.drop(['D_64_1', 'D_66_0', 'D_68_0'], axis=1)
print(train_df.shape, y.shape)

**XGBOOST**

In [None]:
import xgboost as xgb

def amex_metric(y_true, y_pred):
    """Return 0.5 * (normalized weighted Gini + top-4% capture rate).

    Rows whose label is 0 get weight 20, all other rows weight 1.

    Parameters
    ----------
    y_true : array-like
        Binary labels. May be flat or a column vector of shape (n, 1).
    y_pred : array-like
        Predicted scores, one per label. May be flat or shape (n, 1).

    Returns
    -------
    float
        Mean of the normalized Gini coefficient and the share of positive
        labels found in the highest-scored 4% of total weight.
    """
    # Flatten first: stacking an (n, 1) label array with an (n,) score array
    # would otherwise produce a ragged array.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    # Column 0 holds the label, column 1 the score; built once and reused.
    pairs = np.transpose(np.array([y_true, y_pred]))

    # Share of positives captured within the top 4% of cumulative weight,
    # with rows ordered by descending score.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    weights = np.where(by_score[:, 0] == 0, 20, 1)
    cut_vals = by_score[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(by_score[:, 0])

    # gini[1]: ordering by the predicted score; gini[0]: ordering by the true
    # label (the best achievable ordering), used as the normalizer.
    gini = [0, 0]
    for i in [1, 0]:
        ordered = pairs[pairs[:, i].argsort()[::-1]]
        weight = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ordered[:, 0] * weight)
        cum_pos_found = np.cumsum(ordered[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1] / gini[0] + top_four)


def create_model(seed):
    """Build the (unfitted) XGBoost classifier used in this notebook.

    Parameters
    ----------
    seed : int
        Value passed to the classifier as ``random_state``.

    Returns
    -------
    xgb.XGBClassifier
        Classifier configured for GPU training, with ``amex_metric`` as the
        evaluation metric.
    """
    # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in XGBoost 2.x
    # in favour of tree_method='hist' with device='cuda' - confirm against the
    # installed version before changing.
    return xgb.XGBClassifier(
        n_estimators = 1500,
        max_depth = 4,
        learning_rate = 0.07,
        subsample = 0.9,
        colsample_bytree = 0.3,
        min_child_weight = 7,
        reg_alpha = 2,
        reg_lambda = 0.5,
        tree_method ='gpu_hist',
        predictor = 'gpu_predictor',
        eval_metric = amex_metric,
        # Use the caller's seed; the argument was previously ignored in
        # favour of the global SEED.
        random_state = seed)

**TRAINING**

If I try to use KFold CV, the notebook crashes due to a shortage of system memory, so I preferred to go with an 80%-20% holdout set.

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split. The seed is passed explicitly so the split
# does not depend on how much of NumPy's global random state was consumed
# before this cell ran.
X_train, X_val, y_train, y_val = train_test_split(train_df, y,
                                                    stratify=y,
                                                    test_size=0.20,
                                                    random_state=SEED)
# Free the full frame; only the two splits are needed from here on.
del train_df, y
gc.collect()

In [None]:
model = create_model(SEED)

# Both splits are scored during training; progress is reported every 45 rounds.
# Early stopping is not enabled, so all configured boosting rounds are trained.
eval_sets = [(X_train, y_train), (X_val, y_val)]
model.fit(X_train, y_train, eval_set=eval_sets, verbose=45)

<h4>Now let's plot the feature importance</h4>

In [None]:
# Number of top-scoring features shown in the chart.
TOP_N_FEATURES = 25

# 'weight' importance: how often each feature is used to split, over all trees.
feature_important = model.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
# Plot the TOP_N_FEATURES highest-scoring features.
data.nlargest(TOP_N_FEATURES, columns="score").plot(
    kind='barh',
    figsize=(20, 10),
    title=f"Top {TOP_N_FEATURES} features by split count");

**SUBMISSION**

In [None]:
# Sample submission indexed by the integer-encoded customer ID (last 16 hex
# digits of customer_ID), so the original string IDs can be looked up from the
# index of the test frames.
submission = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
# Only `submission` is assigned here; the previous chained assignment also
# wrote this column into `train_labels`.
submission['customer_ID_encoded'] = submission['customer_ID'].apply(lambda x: int(x[-16:], 16)).astype(np.int64)
# set_index moves the column into the index; `set_axis(..., inplace=True)` is
# no longer available in pandas 2.x.
submission = submission.set_index('customer_ID_encoded')
submission['prediction'] = submission['prediction'].astype(np.float32)

In [None]:
%%time

# Score the test set one pickle at a time, collecting the original string
# customer IDs and the positive-class probability for every row.
customer_ids_list = []
preds_list = []
for pkl_path in test_pkls:
    test_df = pd.read_pickle(pkl_path)
    encoded_ids = test_df.index.values
    # Map the encoded index values back to the original customer_ID strings.
    customer_ids_list.extend(submission.loc[encoded_ids, 'customer_ID'].values)
    # Column 1 of predict_proba is the probability of the positive class.
    preds_list.extend(model.predict_proba(test_df)[:, 1])
    gc.collect()

# Column vectors of shape (n_rows, 1).
preds_list = np.array(preds_list).reshape(-1, 1)
customer_ids_list = np.array(customer_ids_list).reshape(-1, 1)

In [None]:
# Build the frame column by column so predictions stay numeric; concatenating
# the string ID array with the float prediction array would coerce every
# prediction to a string. np.ravel accepts both flat and (n, 1) inputs.
sub = pd.DataFrame({
    'customer_ID': np.ravel(customer_ids_list),
    'prediction': np.ravel(preds_list),
})
sub.to_csv('submission.csv', index=False)

# Thanks