# Keras Quickstart for the AMEX Competition: Training and Inference

This notebook shows
- how to do space-efficient feature engineering
- how to implement a simple Keras model
- how to train and cross-validate the model
- how to understand the competition metric graphically

The notebook is based on insights of the [EDA which makes sense ⭐️⭐️⭐️⭐️⭐️](https://www.kaggle.com/code/ambrosm/amex-eda-which-makes-sense).

In [None]:
# TPU setup. This cell has to run before any other TensorFlow work is done.
import tensorflow as tf
# NOTE(review): tpu='' presumably lets the resolver auto-detect the attached TPU — confirm for the target runtime.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))
# `strategy` is used further down: models are created and compiled inside strategy.scope().
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
import random
import datetime
import math
import gc
import warnings
import seaborn as sns
warnings.simplefilter(action='ignore', category=FutureWarning)

from matplotlib.ticker import MaxNLocator
from colorama import Fore, Back, Style

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, OneHotEncoder, PowerTransformer
from sklearn.metrics import roc_curve, roc_auc_score, average_precision_score
from sklearn.impute import SimpleImputer
from sklearn.utils import class_weight 
from sklearn.utils.class_weight import compute_class_weight

# tf.config.threading.set_inter_op_parallelism_threads(4)
import tensorflow_addons as tfa
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, InputLayer, Add, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.utils import plot_model
import tensorflow.keras.backend as K

In [None]:
# Plot training history
def plot_history(history, *, n_epochs=None, plot_lr=False, title=None, bottom=None, top=None):
    """Draw the learning curves stored in a Keras-style history mapping.

    Parameters
    ----------
    history : mapping
        Must contain 'loss'; 'val_loss' and 'lr' are plotted when present.
    n_epochs : int, optional
        Restrict the plot to the last ``n_epochs`` epochs (default: all).
    plot_lr : bool
        Also draw the learning rate on a secondary y axis (needs 'lr').
    title : str, optional
        Figure title.
    bottom, top : float, optional
        Lower / upper limit of the loss axis.
    """
    plt.figure(figsize=(15, 6))
    n_recorded = len(history['loss'])
    from_epoch = 0 if n_epochs is None else max(n_recorded - n_epochs, 0)
    shown_epochs = np.arange(from_epoch, n_recorded)

    # Training loss is always available.
    plt.plot(shown_epochs, history['loss'][from_epoch:], label='Training loss')

    # Validation loss and its best points are optional.
    if 'val_loss' in history:
        plt.plot(shown_epochs, history['val_loss'][from_epoch:], label='Validation loss')
        val_losses = np.array(history['val_loss'])
        best_epoch = np.argmin(val_losses)
        best_val_loss = history['val_loss'][best_epoch]
        if best_epoch >= from_epoch:
            plt.scatter([best_epoch], [best_val_loss], c='r',
                        label=f'Best val_loss = {best_val_loss:.5f}')
        if best_epoch > 0:
            # Lowest validation loss among the epochs preceding the best one.
            almost_epoch = np.argmin(val_losses[:best_epoch])
            almost_val_loss = history['val_loss'][almost_epoch]
            if almost_epoch >= from_epoch:
                plt.scatter([almost_epoch], [almost_val_loss], c='orange',
                            label='Second best val_loss')

    if bottom is not None:
        plt.ylim(bottom=bottom)
    if top is not None:
        plt.ylim(top=top)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(loc='lower left')
    if title is not None:
        plt.title(title)

    # Learning rate on a twin axis.
    if plot_lr and 'lr' in history:
        lr_axis = plt.gca().twinx()
        lr_epochs = np.arange(from_epoch, len(history['lr']))
        lr_axis.plot(lr_epochs, np.array(history['lr'][from_epoch:]),
                     color='g', label='Learning rate')
        lr_axis.set_ylabel('Learning rate')
        lr_axis.legend(loc='upper right')

    plt.show()

In [None]:
def amex_metric(y_true, y_pred, return_components=False):
    """Competition metric: mean of normalized weighted Gini and top-4% capture.

    Rows with ``target == 0`` carry a weight of 20, all other rows a weight
    of 1. The score is ``0.5 * (G + D)`` where ``G`` is the normalized
    weighted Gini coefficient and ``D`` is the fraction of positives found
    among the highest-ranked rows that make up 4 % of the total weight.

    Parameters
    ----------
    y_true : array-like
        Binary labels. Anything ``np.asarray`` accepts (ndarray, pandas
        Series, list); it is flattened.
    y_pred : array-like
        Predicted scores, same number of elements as ``y_true``.
    return_components : bool, default False
        If true, return the tuple ``(G, D, 0.5 * (G + D))`` instead of the
        score alone.

    Returns
    -------
    float, or tuple of three floats when ``return_components`` is true.

    Notes
    -----
    Both components divide by the number of positives, so the result is
    undefined (NaN) when ``y_true`` contains no positive label.
    """
    def _weights(frame):
        # Vectorised replacement for a per-row lambda: 20 for negatives, 1 otherwise.
        return pd.Series(np.where(frame['target'] == 0, 20, 1), index=frame.index)

    def top_four_percent_captured(frame):
        """Corresponds to the recall for a threshold of 4 %"""
        weight = _weights(frame)
        four_pct_cutoff = int(0.04 * weight.sum())
        in_top = weight.cumsum() <= four_pct_cutoff
        return (frame.loc[in_top, 'target'] == 1).sum() / (frame['target'] == 1).sum()

    def weighted_gini(frame):
        """Weighted Gini of ``frame`` in its current row order."""
        weight = _weights(frame)
        random = (weight / weight.sum()).cumsum()
        weighted_pos = frame['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(frame):
        """Corresponds to 2 * AUC - 1"""
        # Reference: the Gini obtained when the ranking is perfect.
        perfect = pd.DataFrame({'target': frame.target, 'prediction': frame.target})
        perfect = perfect.sort_values('prediction', ascending=False)
        return weighted_gini(frame) / weighted_gini(perfect)

    # np.asarray makes the function independent of the container type
    # (ndarray, Series, list) instead of relying on a `.ravel()` method.
    df = pd.DataFrame({'target': np.asarray(y_true).ravel(),
                       'prediction': np.asarray(y_pred).ravel()})
    df = df.sort_values('prediction', ascending=False)
    g = normalized_weighted_gini(df)
    d = top_four_percent_captured(df)

    if return_components:
        return g, d, 0.5 * (g + d)
    return 0.5 * (g + d)
def create_weighted_binary_crossentropy(zero_weight, one_weight):
    """Build a class-weighted binary cross-entropy loss.

    The returned function weights the per-element cross-entropy by
    ``2 * one_weight`` where the label equals 1 and by ``2 * zero_weight``
    everywhere else, then averages over all elements.
    """
    def weighted_cross_entropy_fn(y_true, y_pred):
        labels = tf.cast(y_true, dtype=y_pred.dtype)
        predictions = tf.cast(y_pred, dtype=y_pred.dtype)

        per_element_ce = K.binary_crossentropy(labels, predictions)
        is_positive = tf.equal(labels, 1)
        per_element_weight = tf.where(is_positive, 2*one_weight, 2*zero_weight)
        return K.mean(tf.multiply(per_element_ce, per_element_weight))

    return weighted_cross_entropy_fn

# Reading and preprocessing the training data

We read the data from @munumbutt's [AMEX-Feather-Dataset](https://www.kaggle.com/datasets/munumbutt/amexfeather). Then we create some groups of features:
- Selected features taken as minimums, maximums, averages over all statements of a customer
- Selected features taken from the last statement of a customer
- Other features including the number of unique statements a customer gets and the time between their statements

We one-hot encode the categorical features and fill all missing values with 0.

The code has been optimized for memory efficiency rather than readability. In particular, `.iloc[mask_array, columns]` needs much less RAM than the groupby construction used in previous versions of the notebook.

We process test data first since it takes more memory


In [None]:
# Load the preprocessed training table and order it by customer so that
# `cid`, `target` and the feature rows share one positional index.
train = (
    pd.read_feather('../input/amex-aggreation-dataset/train_processed.ftr')
    .sort_values(by='customer_ID')
    .reset_index(drop=True)
)
cid, target = train['customer_ID'], train['target']

# Keep only the feature columns, standardize them and store them as float16.
train = train.drop(columns=['target', 'customer_ID'])
train = pd.DataFrame(
    StandardScaler().fit_transform(train),
    columns=train.columns,
).astype('float16')

In [None]:
# Peek at the first rows of the standardized feature table.
train.head()

In [None]:
# Peek at the first labels; they come from the same sorted table as the features.
target.head()

# The model

Our model has four hidden layers, enriched by a skip connection and two Dropout layers.

In [None]:
def my_model(n_inputs=len(train.columns)):
    """Feed-forward network with one skip connection.

    Layout: BatchNormalization -> Dense(256) -> Dropout(0.1) -> Dense(64)
    -> Dense(64) -> Concatenate with the dropped-out Dense(256) output
    -> Dropout(0.1) -> Dense(16) -> Dense(1, sigmoid).

    Returns an (uncompiled) instance of tensorflow.keras.models.Model;
    the caller is responsible for compiling it.
    """
    def regularized_dense(units):
        """Swish-activated Dense layer with the shared L1/L2 kernel penalty."""
        return Dense(
            units,
            activation='swish',
            kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-7, l2=4e-4),
        )

    inputs = Input(shape=(n_inputs, ))

    # Wide first stage; its output is reused by the skip connection.
    wide = BatchNormalization()(inputs)
    wide = regularized_dense(256)(wide)
    wide = Dropout(0.1)(wide)

    # Narrow stage.
    narrow = regularized_dense(64)(wide)
    narrow = regularized_dense(64)(narrow)

    # Skip connection: join the narrow and wide representations.
    merged = Concatenate()([narrow, wide])
    merged = Dropout(0.1)(merged)

    head = regularized_dense(16)(merged)
    output = Dense(1, activation='sigmoid')(head)
    return Model(inputs, output)

# Cross-validation

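We use a standard cross-validation loop. The features were already standardized when the training data was loaded, so inside the loop we only train the models: every fold is trained with several random seeds and the run with the best validation score is kept. We use a StratifiedKFold because the data is imbalanced.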


In [None]:
# %%time
# Cross-validation of the classifier: configuration constants and seeds

FOLDS = 11  # number of StratifiedKFold splits; one model is saved per fold
EPOCHS_EXPONENTIALDECAY = 125  # epoch budget when the exponential learning-rate schedule is used
VERBOSE = 0 # set to 0 for less output, or to 2 for more output
LR_START = 0.012  # initial learning rate of the optimizer / start of the decay schedule
LR_END = 1e-5 # learning rate at the end of training
CYCLES = 1  # number of decay cycles the exponential schedule is split into
EPOCHS = 300  # epoch budget when ReduceLROnPlateau is used (early stopping may end sooner)
DIAGRAMS = False  # set to True to plot the learning curve after every training run
USE_PLATEAU = True # set to True for plateau, or to False for exponential learning rate decay
BATCH_SIZE = 2048  # batch size passed to model.fit
one_weight = 0.55  # loss weight of the positive class; the negative class gets 1 - one_weight

# Seed NumPy and Python's `random`; the TensorFlow seed is set per training run in the loop below.
np.random.seed(1)
random.seed(1)

def fit_model(X_tr, y_tr, X_va=None, y_va=None, fold=0):
    """Train one model and, if validation data is given, score it.

    A fresh model is built and compiled inside ``strategy.scope()`` with the
    weighted binary cross-entropy loss and a Nadam optimizer. The learning
    rate is controlled by ReduceLROnPlateau when ``USE_PLATEAU`` is set and
    validation data is available, otherwise by a cyclic exponential decay
    from ``LR_START`` to ``LR_END``. Early stopping restores the best weights.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels.
    X_va, y_va : optional
        Validation features and labels. When omitted, the model is trained
        without validation, early stopping monitors the training loss and no
        score is computed.
    fold : int
        Fold number, only used in the progress message.

    Returns
    -------
    (score, model)
        ``score`` is the competition metric on the validation data (``None``
        without validation data); ``model`` is the trained Keras model.

    Side effects
    ------------
    Overwrites the module-level ``oof_pred`` with the flattened validation
    predictions (``None`` without validation data), prints a one-line summary
    and, if ``DIAGRAMS`` is set, plots the learning curve.
    """
    global oof_pred
    start_time = datetime.datetime.now()
    has_validation = X_va is not None
    # Without validation data there is no val_loss to monitor.
    monitor = "val_loss" if has_validation else "loss"

    if USE_PLATEAU and has_validation: # use plateau
        epochs = EPOCHS
        lr = ReduceLROnPlateau(monitor="val_loss", factor=0.80,
                               patience=5, verbose=VERBOSE)
        es = EarlyStopping(monitor="val_loss",
                           patience=20,
                           verbose=1,
                           mode="min",
                           restore_best_weights=True)

    else: # use exponential learning rate decay
        epochs = EPOCHS_EXPONENTIALDECAY

        def exponential_decay(epoch):
            # v decays from e^a to 1 in every cycle
            # w decays from 1 to 0 in every cycle
            # epoch == 0                  -> w = 1 (first epoch of cycle)
            # epoch == epochs_per_cycle-1 -> w = 0 (last epoch of cycle)
            # higher a -> decay starts with a steeper decline
            a = 4
            # At least one epoch per cycle, so the modulo below cannot divide by zero.
            epochs_per_cycle = max(epochs // CYCLES, 1)
            epoch_in_cycle = epoch % epochs_per_cycle
            if epochs_per_cycle > 1:
                v = math.exp(a * (1 - epoch_in_cycle / (epochs_per_cycle-1)))
                w = (v - 1) / (math.exp(a) - 1)
            else:
                w = 1
            return w * LR_START + (1 - w) * LR_END

        lr = LearningRateScheduler(exponential_decay, verbose=0)
        es = EarlyStopping(monitor=monitor,
                           patience=100,
                           verbose=1,
                           mode="min",
                           restore_best_weights=True)

    callbacks = [lr, es, tf.keras.callbacks.TerminateOnNaN()]

    # Build and compile a fresh model inside the distribution strategy scope.
    with strategy.scope():
        loss = create_weighted_binary_crossentropy(1-one_weight, one_weight)
        model = my_model(train.shape[1])
        model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=LR_START,
                                                          clipvalue=0.5,
                                                          clipnorm=1.0 # prevent gradient explosion
                                                         ),
                      loss=loss,
                     )

    # Train the model
    history = model.fit(X_tr, y_tr,
                        validation_data=(X_va, y_va) if has_validation else None,
                        epochs=epochs,
                        verbose=VERBOSE,
                        batch_size=BATCH_SIZE,
                        shuffle=True,
                        callbacks=callbacks).history

    # Drop the local references to the training data and callbacks.
    X_tr, y_tr, callbacks, es, lr = None, None, None, None, None

    lastloss = f"Training loss: {history['loss'][-1]:.4f}"
    if has_validation:
        lastloss += f" | Val loss: {history['val_loss'][-1]:.4f}"

        # Inference for validation (the whole validation set as a single batch)
        oof_pred = model.predict(X_va, batch_size=len(X_va), verbose=0).ravel()
        score = amex_metric(np.asarray(y_va), oof_pred)
        score_text = f"{score:.5f}"
    else:
        # Do not leave stale predictions of an earlier run behind.
        oof_pred = None
        score = None
        score_text = "n/a"

    # Evaluation: execution time (mm:ss part of the timedelta), loss and metric
    print(f"{Fore.GREEN}{Style.BRIGHT}Fold {fold} | {str(datetime.datetime.now() - start_time)[-12:-7]}"
          f" | {len(history['loss']):3} ep"
          f" | {lastloss} | Score: {score_text}{Style.RESET_ALL}")

    if DIAGRAMS:
        # Plot training history
        plot_history(history,
                     title="Learning curve",
                     plot_lr=True)

    return score, model


print(f"{len(train.columns)} features")

# Imported once up front (instead of in every fold) so that a missing h5py
# is reported before any training time is spent; the weights are saved as .h5.
import h5py

N_SEEDS = 4  # number of differently seeded training runs per fold

oof_pred_list = []
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):
    print('-' * 15 + f'fold {fold}' + '-' * 15)
    gc.collect()

    # Best run of this fold so far. Starting at -inf (and always accepting the
    # first run) guarantees that a model exists to be saved, even if every
    # score is non-positive or NaN.
    b_score = -np.inf
    b_oofpred = None
    b_model = None
    for seed in range(N_SEEDS):
        print('seed:', seed)
        tf.random.set_seed(1 + seed)
        score, model = fit_model(train.iloc[idx_tr], target.iloc[idx_tr],
                                 train.iloc[idx_va], target.iloc[idx_va], fold=fold)
        if b_model is None or score > b_score:
            b_model = model
            b_score = score
            # fit_model publishes its validation predictions through the
            # module-level `oof_pred`.
            b_oofpred = pd.DataFrame({'customer_ID': cid.iloc[idx_va],
                                      'prediction': oof_pred})

    oof_pred_list.append(b_oofpred)

    # Save the best model of the fold: weights as HDF5, architecture as JSON.
    b_model.save_weights(f"model_{fold}.h5")
    with open(f"model_{fold}.json", "w") as json_file:
        json_file.write(b_model.to_json())

    del b_score, b_oofpred, b_model, score, model
    gc.collect()

# NOTE(review): the best seed of each fold is selected on that fold's own
# validation data, so the OOF score below is presumably somewhat optimistic.
oof_pred = pd.concat(oof_pred_list, axis=0).sort_values(by='customer_ID')
oof_score = amex_metric(target.to_numpy(), oof_pred['prediction'].to_numpy())
print(f"{Fore.GREEN}{Style.BRIGHT}OOF Score: {oof_score}{Style.RESET_ALL}")
# Fold 0 | 17:40 | 125 ep | Training loss: 0.2172 | Val loss: 0.2289 | Score: 0.78622
# Fold 1 | 17:17 | 125 ep | Training loss: 0.2152 | Val loss: 0.2288 | Score: 0.79317
# Fold 2 | 16:47 | 125 ep | Training loss: 0.2156 | Val loss: 0.2304 | Score: 0.78242
# Fold 3 | 17:26 | 125 ep | Training loss: 0.2152 | Val loss: 0.2312 | Score: 0.78592
# Fold 4 | 17:21 | 125 ep | Training loss: 0.2158 | Val loss: 0.2266 | Score: 0.78993
# Fold 5 | 17:17 | 125 ep | Training loss: 0.2154 | Val loss: 0.2282 | Score: 0.78551

# Fold 0 | 18:40 | 132 ep | Training loss: 0.2172 | Val loss: 0.2323 | Score: 0.78299
# Fold 1 | 21:40 | 159 ep | Training loss: 0.2167 | Val loss: 0.2258 | Score: 0.79349

In [None]:
# Free the training data and cross-validation helpers, then write the
# out-of-fold predictions to disk.
# Note: this cell deletes `train` and `target`, so re-running it (or any cell
# above that needs them) requires reloading the training data first.
del oof_pred_list, kf, train, target
gc.collect()

# The DataFrame index is written as the first (unnamed) csv column.
oof_pred.to_csv('oof_keras.csv')

# Submission

We submit the mean of the predictions of all `FOLDS` models (one model per cross-validation fold).

In [None]:
# Load the preprocessed test table and keep only the feature columns.
test = (
    pd.read_feather('../input/amex-aggreation-dataset/test_processed.ftr')
    .drop(columns=['customer_ID'])
    .reset_index(drop=True)
)
gc.collect()

In [None]:
# Scale the test data and predict in chunks
SPLITS = 10
def split(a, n):
    """Cut ``a`` into ``n`` consecutive slices of nearly equal length.

    The first ``len(a) % n`` slices are one element longer than the rest.
    Returns a generator over the slices.
    """
    base_len, n_longer = divmod(len(a), n)
    # Slice i spans bounds[i]:bounds[i + 1].
    bounds = [i * base_len + min(i, n_longer) for i in range(n + 1)]
    return (a[start:stop] for start, stop in zip(bounds, bounds[1:]))



from tensorflow.keras.models import model_from_json

# Fit ONE scaler on the whole test set, chunk by chunk to keep memory usage
# low. Every chunk and every fold is then transformed with the same
# statistics; fitting a new scaler per chunk would scale each chunk differently.
# NOTE(review): the training features were standardized with their own
# statistics and that scaler was not kept; reusing it here would presumably
# be the more rigorous choice — confirm.
scaler = StandardScaler()
for ids in split(test.index, SPLITS):
    scaler.partial_fit(test.iloc[ids])

y_pred_avg = np.zeros(len(test))

for fold in range(FOLDS):
    # The JSON file holds the architecture only (no loss / compile
    # information), so no custom objects are needed to rebuild the model.
    with open(f'model_{fold}.json', 'r') as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(f"model_{fold}.h5")

    # Predict chunk by chunk; collect the pieces and join them once.
    chunk_preds = []
    for ids in split(test.index, SPLITS):
        scaled_chunk = scaler.transform(test.iloc[ids])
        chunk_preds.append(model.predict(scaled_chunk).ravel())
    y_pred_avg += np.concatenate(chunk_preds) / FOLDS

    del model, chunk_preds
    gc.collect()

del scaler, test
gc.collect()

In [None]:
# Fill the sample submission with the averaged predictions and save it.
# NOTE(review): assumes the rows of the test table are in the same order as
# the sample submission — confirm.
sub = (
    pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
    .assign(prediction=y_pred_avg)
)
sub.to_csv('keras_submission.csv', index=False)
sub

As a plausibility test, we plot a histogram of the predictions. The histogram should resemble the OOF histogram (see above), and the majority of the predictions should be near 0 (because the classes are imbalanced).

In [None]:
# Histogram of the submitted predictions in 20 equal-width bins on [0, 1].
fig, ax = plt.subplots(figsize=(16, 5))
ax.hist(sub.prediction, bins=np.linspace(0, 1, 21), density=True)
ax.set_title("Plausibility check", fontsize=20)
ax.set_xlabel('Prediction')
ax.set_ylabel('Density')
plt.show()