# AMEX - simple XGBoost baseline model

I wanted to get an initial baseline model / score. This extremely simple XGBoost model is built using:
- train/test dataframes whose numerical features have been converted to 16-bit types (for compression; customer_ID is replaced by an integer as well)
- the only encoding is a LabelEncoder on the categorical fields (just to make XGBoost happy)
- no imputation of missing values
- no feature engineering
- uses *only* the most recent statement for each customer
- no hyperparameter tuning

I created test and train datasets that compress the features to 16-bit numerics and drop all but the most recent statement for each customer, to make things simpler.

NOTE: I trained with a GPU; remove `tree_method='gpu_hist'` from the XGBClassifier calls if you want to use the CPU (I don't know how long it will take).

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Notebook display preferences: show every column, print floats with 3 decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format

# Seed shared by the models, and the folder holding the pre-built feather files.
RANDOM_STATE = 42
INPUT_PATH = Path("../input/amex-eda")

### Load premade train dataset; 16 bit numerics, most recent statements only

Note: the customer_ID has been replaced with integer c_ID field

In [None]:
# Features: read, index by the integer customer key, and order rows by that key.
train = (
    pd.read_feather(INPUT_PATH / "train_16_recent_data.feather")
    .set_index(['c_ID'])
    .sort_index()
)

# Targets: same indexing and ordering as the features.
labels = (
    pd.read_feather(INPUT_PATH / "train_labels.feather")
    .set_index('c_ID')
    .sort_index()
)

# Quick visual check of both frames.
for frame in (train, labels):
    display(frame.head(2))

### Preprocessing

Here we encode the categoricals with a LabelEncoder.

(I tried to use a sklearn ColumnTransformer, but it blows out all the numerics back to float64, exhausting memory - I didn't want to fight with it anymore).

In [None]:
# Fitted LabelEncoder per categorical column, filled on the first make_x_y call
# and reused by later calls.
encoders = {}
categoricals = ['D_63', 'D_64', 'D_66', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']


def make_x_y(df, labels=None):
    """Build the model inputs: label-encode the categorical columns of ``df``.

    Each column listed in ``categoricals`` is replaced by a ``<col>_enc``
    column. The first time a column is seen its ``LabelEncoder`` is fitted and
    stored in the module-level ``encoders`` dict; later calls reuse the stored
    encoder via ``transform``.
    NOTE(review): ``transform`` presumably fails on categories that were not
    present when the encoder was fitted - confirm against the sklearn docs.

    Parameters
    ----------
    df : DataFrame
        Feature frame. It is not modified; a copy sorted by index is returned.
    labels : DataFrame, optional
        Frame with a ``target`` column. When given, it is sorted by index so
        its rows are in the same order as the returned features (assuming
        both frames carry the same index values).

    Returns
    -------
    tuple
        ``(features, target)`` where ``target`` is the sorted ``target``
        Series, or ``None`` when ``labels`` was not supplied.
    """
    # sort_index returns a new frame, so the in-place drops below never touch
    # the caller's DataFrame.
    df = df.sort_index()
    for col in categoricals:
        le = encoders.get(col)
        if le is None:
            le = LabelEncoder()
            encoded = le.fit_transform(df[col])
            encoders[col] = le
        else:
            encoded = le.transform(df[col])
        df[f'{col}_enc'] = encoded
        df.drop(columns=col, inplace=True)
    if labels is not None:
        labels = labels.sort_index().target
    return df, labels


### AMEX metric

(thanks to https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations)

In [None]:
def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AMEX metric with plain NumPy.

    The score is the mean of two terms:

    * ``d`` - the share of all positives found among the highest-scored rows
      that make up the first 4% of cumulative row weight;
    * ``g`` - the weighted Gini coefficient divided by its maximum value.

    Rows with target 0 carry weight 20 and rows with target 1 carry weight 1.

    Parameters
    ----------
    y_true : array-like
        Binary targets (0/1), one per row.
    y_pred : array-like
        Predicted scores, one per row; higher means more likely positive.

    Returns
    -------
    float
        ``0.5 * (g + d)``.

    Notes
    -----
    There is no guard for inputs without positives or without negatives: the
    divisions by ``n_pos`` / ``gini_max`` then yield a non-finite result
    (NumPy division by zero) instead of raising.
    """
    # Accept lists and pandas Series too; all indexing below is positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # order the targets by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


# Train a first model

I use 3-fold cross-validation just to sanity-check the consistency of the results. The loop is hand-rolled so that I can use the AMEX scorer.

In [None]:
# Encode the categoricals and pull out the target Series.
X, y = make_x_y(train, labels)
display(X.head(2))
display(X.dtypes)

xgb = XGBClassifier(
    objective='binary:logistic',
    random_state=RANDOM_STATE,
    tree_method='gpu_hist',
)

# Hand-rolled 3-fold stratified CV so each fold can be scored with the AMEX metric.
skf = StratifiedKFold(n_splits=3)
scores = []
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    xgb.fit(X_train, y_train)
    # keep only the second probability column for scoring
    probs = xgb.predict_proba(X_test)[:, 1]
    fold_score = amex_metric_numpy(y_test.to_numpy(), probs)
    scores.append(fold_score)

print("Scores: ")
display(scores)

# Fit full model

In [None]:
# Same settings as the cross-validation model, now fitted on every training row.
final_params = {
    'objective': 'binary:logistic',
    'random_state': RANDOM_STATE,
    'tree_method': 'gpu_hist',
}
xgb = XGBClassifier(**final_params)
xgb.fit(X, y)

In [None]:
# try to free up RAM
# Dropping only X and train is not enough: the last cross-validation fold
# leaves its row slices (X_train / X_test / y_train / y_test) bound at module
# level, and together they hold roughly another full copy of the data.
# pop(..., None) also keeps this cell safe to re-run after the names are gone.
for _name in ('X', 'train', 'X_train', 'X_test', 'y_train', 'y_test'):
    globals().pop(_name, None)
del _name

# Make Predictions

Again, the test file has been converted to 16-bit numerics and contains only the most recent statement for each customer.

In [None]:
import gc

# try to free up RAM before the test data is loaded
gc.collect()

# Test features: indexed and ordered by the integer customer key.
test = (
    pd.read_feather(INPUT_PATH / "test_16_recent_data.feather")
    .set_index(['c_ID'])
    .sort_index()
)

# original customer keys (for submission file)
cust = (
    pd.read_feather(INPUT_PATH / "test_cust.feather")
    .set_index('c_ID')
    .sort_index()
)

for frame in (test, cust):
    display(frame.head(2))


In [None]:
# Encode the test features; make_x_y reuses any encoders already stored by an
# earlier call. No labels are passed, so the second return value is None.
X_test, _ = make_x_y(test)
# Keep the full predict_proba output; the submission step reads column 1 of it.
probs = xgb.predict_proba(X_test)
# Bare expression: shown as the cell output when run in a notebook.
probs

# Make Submission

NOTE: join with the saved customer keys from the raw dataset

In [None]:
# One prediction per test row, keyed by c_ID, then re-keyed by the original
# customer_ID taken from the saved customer table.
submit = (
    pd.DataFrame({'prediction': probs[:, 1]}, index=test.index)
    .join(cust)
    .reset_index()
    .set_index('customer_ID')
)
display(submit.head(2))

# Only the prediction column (with the customer_ID index) goes to the file.
submit['prediction'].to_csv('./submission.csv')