# LightGBM Quickstart for the *American Express - Default Prediction* competition

This notebook shows how to apply LightGBM to the competition data.

It is based on the [EDA which makes sense ⭐️⭐️⭐️⭐️⭐️](https://www.kaggle.com/code/ambrosm/amex-eda-which-makes-sense).

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
import seaborn as sns
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibrationDisplay
from lightgbm import LGBMClassifier, log_evaluation

# plt.rcParams['axes.facecolor'] = '#0057b8' # blue
# plt.rcParams['axes.prop_cycle'] = cycler(color=['#ffd700'] +
#                                          plt.rcParams['axes.prop_cycle'].by_key()['color'][1:])
# plt.rcParams['text.color'] = 'w'

In [None]:
# From https://www.kaggle.com/code/inversion/amex-competition-metric-python
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the competition score for a set of predictions.

    The score is the average of two parts: the normalized weighted Gini
    coefficient and the share of positives captured in the top 4 % of the
    (weighted) ranking.

    Args:
        y_true: frame with a ``target`` column.
        y_pred: series named ``prediction`` (or a frame with that column).
            It is joined to ``y_true`` on the index, so both must share
            the same index.

    Returns:
        ``0.5 * (gini + top_four_percent_capture)``.
    """

    def ranked_with_weights(truth, scores):
        # Join on the index, order by descending score, and give every
        # negative (target == 0) a weight of 20 and everything else 1.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = ranked_with_weights(y_true, y_pred)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside_cutoff = ranked['weight'].cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (inside_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = ranked_with_weights(y_true, y_pred)
        weight = ranked['weight']
        random_share = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        """Almost equal to 2 * auc - 1"""
        # Gini of a perfect ranking: use the target itself as the prediction.
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_pred)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

def lgb_amex_metric(y_true, y_pred):
    """Wrap ``amex_metric`` for use as a LightGBM eval metric.

    Returns the tuple ``('amex', score, True)``. The trailing ``True`` is
    presumably LightGBM's "higher is better" flag.
    """
    truth = pd.DataFrame({'target': y_true})
    scores = pd.Series(y_pred, name='prediction')
    return ('amex', amex_metric(truth, scores), True)

#From : https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod_lgbm(y_pred: np.ndarray, data: lgb.Dataset):
    """NumPy version of the competition metric for ``lgb.train(feval=...)``.

    Args:
        y_pred: predictions for the rows of ``data``.
        data: dataset whose ``get_label()`` provides the true targets.

    Returns:
        ``('AMEX', score, True)``. The trailing ``True`` is presumably
        LightGBM's "higher is better" flag.
    """
    y_true = data.get_label()
    # Two columns: 0 = true label, 1 = prediction.
    pairs = np.array([y_true, y_pred]).T

    def ranked_by(column):
        # Rows in descending order of the chosen column, plus the matching
        # weights (20 for label 0, 1 otherwise).
        rows = pairs[pairs[:, column].argsort()[::-1]]
        return rows, np.where(rows[:, 0] == 0, 20, 1)

    def weighted_gini(column):
        rows, weight = ranked_by(column)
        weight_random = np.cumsum(weight / np.sum(weight))
        weighted_pos = rows[:, 0] * weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - weight_random) * weight)

    # Share of positives inside the top 4 % of total weight.
    by_pred, weights = ranked_by(1)
    inside_cutoff = np.cumsum(weights) <= int(0.04 * np.sum(weights))
    top_four = np.sum(by_pred[inside_cutoff][:, 0]) / np.sum(by_pred[:, 0])

    # Ranking by column 1 scores the model; ranking by column 0 (the labels
    # themselves) gives the perfect-ranking Gini used for normalization.
    gini_ratio = weighted_gini(1) / weighted_gini(0)

    return 'AMEX', 0.5 * (gini_ratio + top_four), True

# Reading and preprocessing the data

We read the data from @munumbutt's [AMEX-Feather-Dataset](https://www.kaggle.com/datasets/munumbutt/amexfeather). Then we reduce the amount of data by keeping only the most recent statement for every customer, as suggested by @inversion [here](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327094).

In [None]:
%%time
# Load the Feather copies of the competition's train and test tables.
INPUT_DIR = '../input/amexfeather'
train = pd.read_feather(f'{INPUT_DIR}/train_data.ftr')
test = pd.read_feather(f'{INPUT_DIR}/test_data.ftr')

In [None]:
def _latest_statement(frame):
    """Reduce ``frame`` to the last row of every customer.

    The result is indexed and sorted by ``customer_ID`` and no longer has
    the ``S_2`` column.
    """
    # NOTE(review): tail(1) takes the last row in file order; this assumes
    # the rows of each customer are already ordered by statement date.
    last_rows = frame.groupby('customer_ID').tail(1)
    return (last_rows
            .set_index('customer_ID', drop=True)
            .sort_index()
            .drop(columns=['S_2']))


train = _latest_statement(train)
test = _latest_statement(test)

# **EDA**

# The categorical features

According to the [data description](https://www.kaggle.com/competitions/amex-default-prediction/data), there are eleven categorical features. We plot histograms for target=0 and target=1:

 - customer_ID = Unique Customer ID
 - D_* = Delinquency variables
 - S_* = Spend variables
 - P_* = Payment variables
 - B_* = Balance variables
 - R_* = Risk variables


In [None]:
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# One panel per categorical feature: relative frequency of every value
# (NaN included), drawn separately for target=0 and target=1.
plt.figure(figsize=(16, 16))
for panel, feature in enumerate(cat_features, start=1):
    plt.subplot(4, 3, panel)
    for target_value in (0, 1):
        shares = (train[feature][train.target == target_value]
                  .value_counts(dropna=False, normalize=True)
                  .sort_index()
                  .rename('count'))
        temp = pd.DataFrame(shares)
        temp.index.name = 'value'
        temp = temp.reset_index()
        # Bars are placed by position in the sorted value list.
        # NOTE(review): this assumes both targets show the same set of values.
        plt.bar(temp.index, temp['count'], alpha=0.5, label=f'target={target_value}')
    plt.xlabel(feature)
    plt.ylabel('frequency')
    plt.legend()
    # Tick labels come from the last group drawn (target=1).
    plt.xticks(temp.index, temp.value)
plt.show()


**Insight:**
- Every feature has at most eight categories (including a nan category). One-hot encodings are feasible.
- The distributions for target=0 and target=1 differ. This means that every feature gives some information about the target.
- Some features take only the values 0, 1, or NaN.
- D_66 is mainly 1.0 or NaN, so it can be treated as a binary feature.


# The binary features

Three features are binary:
- B_31 is always 0 or 1.
- D_87 and D_66 are always 1 or missing.


In [None]:
bin_features = ['B_31', 'D_87', 'D_66']

# One panel per binary feature: relative frequency of every value
# (NaN included), drawn separately for target=0 and target=1.
plt.figure(figsize=(16, 4))
for panel, feature in enumerate(bin_features, start=1):
    plt.subplot(1, 3, panel)
    for target_value in (0, 1):
        shares = (train[feature][train.target == target_value]
                  .value_counts(dropna=False, normalize=True)
                  .sort_index()
                  .rename('count'))
        temp = pd.DataFrame(shares)
        temp.index.name = 'value'
        temp = temp.reset_index()
        # Bars are placed by position in the sorted value list.
        # NOTE(review): this assumes both targets show the same set of values.
        plt.bar(temp.index, temp['count'], alpha=0.5, label=f'target={target_value}')
    plt.xlabel(feature)
    plt.ylabel('frequency')
    plt.legend()
    # Tick labels come from the last group drawn (target=1).
    plt.xticks(temp.index, temp.value)
plt.show()


# The continuous features

If we plot histograms of the continuous features, we see that they have all kinds of distributions:

In [None]:
# Everything that is neither categorical, binary, the ID nor the target is
# treated as a continuous feature.
non_continuous = cat_features + bin_features + ['customer_ID', 'target']
cont_features = sorted(name for name in train.columns if name not in non_continuous)
print(cont_features, end = '')

# Show only the first 20 features: 4 rows of 5 histograms each.
ncols = 5
preview = cont_features[:20]
for row_start in range(0, len(preview), ncols):
    _, axes = plt.subplots(1, ncols, figsize=(16, 3))
    for ax, feature in zip(axes, preview[row_start:row_start + ncols]):
        sns.histplot(data=train, x=feature, hue="target", bins = 100, kde=True, ax=ax)
        ax.set_xlabel(feature)
    plt.show()

**Insight:** Histograms with white space at the left or right end indicate that the data contains outliers. We will have to deal with these outliers.

We also need to look at the correlations between features in order to impute the missing values later.

# Impute Continuous 

Impute with median

In [None]:
%%time
from sklearn.impute import SimpleImputer

# Learn the per-column medians on the training set only, then use them to
# fill the missing values of both the training and the test set.
print('Fit')
continuous_imputer = SimpleImputer(strategy = 'median').fit(train[cont_features])
print('Pred')
for frame in (train, test):
    frame[cont_features] = continuous_imputer.transform(frame[cont_features])
print('Done')

del continuous_imputer

# Continuous Correlations

Because there are so many columns, we compute the correlations one row of the correlation matrix at a time; computing the whole matrix in one go would take too much memory.

This also takes a very long time, so in the interest of time I copy-pasted the results into the code cell after the next one.

In [None]:
%%time
# One correlation vector per continuous feature: its correlation with every
# continuous feature, in cont_features order.
cont_frame = train[cont_features]
corr_dict = {
    feature: cont_frame.corrwith(train[feature]).to_numpy()
    for feature in cont_features
}
del cont_frame

In [None]:
# from numpy import nan

# Assemble the correlation vectors into a square, labelled matrix.
num_corr = pd.DataFrame(corr_dict, index=cont_features)
num_corr

In [None]:
# The correlation results are no longer needed; release them.
del corr_dict, num_corr

# Treat Continuous Outliers

In [None]:
%%time
# Winsorize every continuous feature.
#
# The clipping bounds are learned on the training set only and then applied
# unchanged to the test set, the same way the median imputer is fitted on
# train and reused for test. Clipping the test set at its own percentiles
# (as an earlier version did, with different percentiles than train) would
# transform the two sets differently, and the model would see test features
# on a different scale than the one it was trained on.
LOWER_PCT = 10  # lower clipping percentile (value previously used for train)
UPPER_PCT = 95  # upper clipping percentile (value previously used for train)

for col in cont_features:
    lower, upper = np.percentile(train[col], [LOWER_PCT, UPPER_PCT])
    train[col] = train[col].clip(lower, upper)
    test[col] = test[col].clip(lower, upper)

In [None]:
# Re-plot the first 10 continuous features (2 rows of 5) after clipping.
ncols = 5
preview = cont_features[:10]
for row_start in range(0, len(preview), ncols):
    _, axes = plt.subplots(1, ncols, figsize=(16, 3))
    for ax, feature in zip(axes, preview[row_start:row_start + ncols]):
        sns.histplot(data=train, x=feature, hue="target", bins = 100, kde=True, ax=ax)
        ax.set_xlabel(feature)
    plt.show()

# Binary Preprocessing

Need to turn D_87 and D_66 into 0s and 1s.

In [None]:
# D_87 and D_66 are either 1 or missing: cast to float and encode
# "missing" as 0 in both data sets.
one_or_missing = ['D_87', 'D_66']
for frame in (train, test):
    frame[one_or_missing] = frame[one_or_missing].astype('float').fillna(0)

# Sanity check: no missing values should remain.
train[one_or_missing].isnull().sum()

# Categorical Correlations

Use Cramér's V to measure the association between categorical features.

Include binary variables too

In [None]:
# Label Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

# Categorical and binary columns that enter the Cramér's V analysis.
association_cols = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                    'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'D_87']

# Replace the values of every column by integer codes. The encoder is
# refitted for each column, so codes are only meaningful within a column.
label = LabelEncoder()
train_cat = pd.DataFrame(
    {name: label.fit_transform(values)
     for name, values in train[association_cols].astype('object').items()},
    index=train.index,
)

train_cat.head()

In [None]:
from scipy.stats.contingency import association       
    
def Cramers_V(var1, var2):
    """Return Cramér's V, a measure of association, for two categorical series."""
    # Contingency table of the two variables as a plain array.
    observed = pd.crosstab(index=var1, columns=var2).to_numpy()
    return association(observed, method='cramer')

# Pairwise Cramér's V for every pair of encoded columns; rows and columns
# follow the column order of train_cat.
CramersV_results = np.array([
    [Cramers_V(train_cat[var1], train_cat[var2]) for var2 in train_cat]
    for var1 in train_cat
])
CramersV_df = pd.DataFrame(CramersV_results, columns = train_cat.columns, index = train_cat.columns)

In [None]:
# Annotated heatmap of the Cramér's V matrix.
_, heat_ax = plt.subplots(figsize=(20,7))
sns.heatmap(CramersV_df, cmap='cool', annot=True, center=0, ax=heat_ax)

**Insight**: most features have significant correlations to other categorical features.

Use categorical features to impute each other's NaN values

# Impute Categorical

Use KNNImputer to impute columns using their label encodings


In [None]:
# Prepare the categorical/binary columns for KNN imputation: every column is
# label-encoded to numeric codes while its missing values stay missing.
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
# From here on cat_features also contains the binary columns (13 columns in total).
cat_features = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'D_87']

# A single encoder object is refitted for every column below; only the
# returned codes are kept, not the fitted encoders.
encoder = LabelEncoder()
cat_imputer = KNNImputer(n_neighbors=1)

train_cat = train[cat_features]
test_cat = test[cat_features]

# Encode only the non-null entries of each column. The result keeps the
# index of those entries, so rows with a missing value have no code.
train_cat = train_cat.apply(lambda series: pd.Series(
    encoder.fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))
# NOTE(review): the test columns are encoded with an encoder fitted on the
# test values themselves. The codes only agree with the train codes if both
# sets contain exactly the same categories per column - confirm.
test_cat = test_cat.apply(lambda series: pd.Series(
    encoder.fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))
# Rebuild the frames on the full row index and cast to float so that rows
# without a code can hold NaN.
train_cat =  pd.DataFrame(train_cat ,columns = cat_features, index = train.index).astype(float)
test_cat =  pd.DataFrame(test_cat ,columns = cat_features, index = test.index).astype(float)
train_cat.head()

In [None]:
# Fit the imputer on the first 50000 training rows only, then fill the
# missing codes of the complete train and test frames.
cat_imputer.fit(train_cat.head(50000))
train_cat = cat_imputer.transform(train_cat)
test_cat = cat_imputer.transform(test_cat)

# transform() returns plain arrays: wrap them in frames again (with a default
# integer index) and cast the codes to int.
# NOTE(review): the int cast assumes every imputed value is a whole-number
# code, which should hold with n_neighbors=1 as long as a neighbour is found - confirm.
train_cat =  pd.DataFrame(train_cat ,columns = cat_features).astype(int)
test_cat =  pd.DataFrame(test_cat ,columns = cat_features).astype(int)

# Sanity check: no missing values should remain.
train_cat.isnull().sum()

In [None]:
# Map the integer codes back to the original category values, column by column.
for col in cat_features:
    # Refit on the non-null train values to rebuild the code -> value mapping.
    encoder.fit(train[col][train[col].notnull()])
    train_cat[col] = encoder.inverse_transform(train_cat[col])
    # NOTE(review): the test codes are decoded with a mapping fitted on the
    # test values, but the imputed test codes come from an imputer fitted on
    # train codes. This assumes train and test share the same categories per
    # column - confirm.
    encoder.fit(test[col][test[col].notnull()])
    test_cat[col] = encoder.inverse_transform(test_cat[col])

train_cat

In [None]:
# The imputed frames carry a default integer index; relabel their rows with
# the customer index before writing the columns back.
train[cat_features] = train_cat.set_axis(train.index, axis=0)
test[cat_features] = test_cat.set_axis(test.index, axis=0)
# Order the columns alphabetically for readability.
train, test = (frame.sort_index(axis=1) for frame in (train, test))
train.head()

In [None]:
# Release the temporary frames and fitted helpers of the imputation step.
del train_cat, test_cat, cat_imputer, encoder

# Final Preprocessing

Finally prep data for modeling

In [None]:
#one-hot encode the non-binary categorical features
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features with more than two distinct values;
# columns with at most two values are left as they are.
cat_cols = [name for name in cat_features if train[name].nunique() > 2]

# Use the encoder's default (sparse) output and densify it explicitly.
# The `sparse=False` argument was deprecated in scikit-learn 1.2 and removed
# in 1.4, while `.toarray()` works on old and new versions alike.
# The encoder is fitted on train only; a category that appears only in test
# raises an error here instead of being silently mis-encoded.
ohenc = OneHotEncoder()
OH_cols_train = pd.DataFrame(ohenc.fit_transform(train[cat_cols]).toarray(),
                             columns=ohenc.get_feature_names_out(),
                             index=train.index)
OH_cols_test = pd.DataFrame(ohenc.transform(test[cat_cols]).toarray(),
                            columns=ohenc.get_feature_names_out(),
                            index=test.index)

# Replace the original categorical columns with their indicator columns.
train = pd.concat([train.drop(cat_cols, axis=1), OH_cols_train], axis=1)
test = pd.concat([test.drop(cat_cols, axis=1), OH_cols_test], axis=1)

train

In [None]:
del OH_cols_train
del OH_cols_test

# Check Helpfulness of Each Feature for Modelling

I will be using LGBMClassifier for modeling.

Check the Permutation Importance of each feature

In [None]:
# import eli5
# from lightgbm import LGBMRegressor
# from eli5.sklearn import PermutationImportance
# from sklearn.model_selection import train_test_split

# train_X, val_X, train_y, val_y = train_test_split(train.drop('target',axis = 1).astype('float64'),train['target'].astype('float64'), random_state=1,test_size =0.1)

# model = LGBMRegressor(n_estimators=170,
#                       objective = 'binary',
#                       min_child_samples=2400,
#                       num_leaves=127,
#                       max_bins=511, random_state=1)


# model.fit(train_X,train_y)

# perm = PermutationImportance(model, random_state=1).fit(val_X, val_y)

In [None]:
# eli5.show_weights(perm, feature_names = val_X.columns.tolist(),top=188)

In [None]:
# Quick cross-validation (few folds, few seeds, few trees) whose only purpose
# is to collect gain-based feature importances for the feature selection below.
n_fold = 2
n_seed = 2
n_estimators = 100

kf = StratifiedKFold(n_splits=n_fold)

importances = []   # one gain-importance vector per (fold, seed) model
models = {}        # trained boosters keyed by "<fold>-<seed>"
df_scores = []     # (fold, seed, validation score) tuples

SAMPLE = False

# Split features and target once instead of dropping the target column
# several times per fold. The splitter only needs the row count and the
# labels, so no float64 copy of the features is required.
X_all = train.drop('target', axis=1)
y_all = train[['target']]

for fold, (idx_tr, idx_va) in enumerate(kf.split(X_all, y_all)):

    X_tr, X_va = X_all.iloc[idx_tr], X_all.iloc[idx_va]
    y_tr, y_va = y_all.iloc[idx_tr], y_all.iloc[idx_va]

    # The datasets are shared by all seeds of this fold.
    lgb_train_data = lgb.Dataset(X_tr, label=y_tr)
    lgb_val_data = lgb.Dataset(X_va, label=y_va)

    for seed in range(n_seed):
        print('Fold: '+str(fold)+ ' - seed: '+str(seed))
        key = str(fold)+'-'+str(seed)

        parameters = {
            'objective': 'binary',
            'boosting': 'gbdt',
            'learning_rate': 0.05,
            'min_child_samples': 1000,
            'reg_lambda':10,
            'seed':seed,
            'n_estimators':n_estimators
        }

        # Early stopping and logging are passed as callbacks: the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from lgb.train in LightGBM 4.0, whereas the callbacks work
        # in 3.3+ and match the log_evaluation callback used in the main
        # cross-validation of this notebook.
        clf = lgb.train(parameters,
                        lgb_train_data,
                        valid_sets=[lgb_train_data, lgb_val_data],
                        feval=amex_metric_mod_lgbm,
                        callbacks=[lgb.early_stopping(stopping_rounds=50),
                                   lgb.log_evaluation(period=100)])

        # Score the validation fold with the reference implementation of the
        # metric; the index is reset so target and prediction align by position.
        va_pred = pd.Series(clf.predict(X_va)).rename('prediction')
        score = amex_metric(y_va.reset_index(drop=True), va_pred)
        models[key] = clf
        df_scores.append((fold, seed, score))
        print(f'Fold: {fold} - seed: {seed} - score {score:.2%}')
        importances.append(clf.feature_importance(importance_type='gain'))

# The full-size helper frames are not needed after the loop.
del X_all, y_all

In [None]:
# Arrange the scores as a fold x seed table and append the mean of every
# seed (extra row) and of every fold (extra column).
score_records = pd.DataFrame(df_scores, columns=['fold', 'seed', 'score'])
df_results = score_records.pivot(index='fold', columns='seed', values='score')

per_seed_mean = df_results.mean(numeric_only=True, axis=0)
df_results.loc['seed_mean'] = per_seed_mean
# Computed after the 'seed_mean' row was added, so that row gets a mean too.
df_results['fold_mean'] = df_results.mean(numeric_only=True, axis=1)
df_results

In [None]:
def plot_importance(importances, features, PLOT_TOP_N = 20, figsize=(10, 10)):
    """Draw horizontal box plots of the most important features.

    Args:
        importances: 2-D data with one row per model and one column per feature.
        features: column labels for ``importances``.
        PLOT_TOP_N: number of features to show, ranked by median importance.
        figsize: size of the figure.
    """
    importance_df = pd.DataFrame(data=importances, columns=features)
    # Rank the features by their median importance over all models.
    ranking = importance_df.median(axis=0).sort_values(ascending=False).index
    top_features = ranking[:PLOT_TOP_N]

    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=importance_df[top_features], orient='h', ax=ax)
    plt.show()


feature_names = train.drop('target',axis=1).columns
plot_importance(np.array(importances), feature_names, PLOT_TOP_N = 20, figsize=(10, 20))

# Recalibrate

Keep most important features

In [None]:
# Keep the top three quarters of the features, ranked by median importance
# over all models.
top_ratio = 3/4
candidate_names = train.columns.drop('target')
importance_df = (pd.DataFrame(data=importances, columns=candidate_names)
                 .median(axis=0)
                 .sort_values(ascending=False))
n_keep = int(len(importance_df.index) * top_ratio)
keep_features = importance_df.index[:n_keep].to_list()
Features = keep_features

# Cross-validation

We cross-validate with a five-fold StratifiedKFold.

Notice that lightgbm logs the validation score with the competition's scoring function every ten iterations.

In [None]:
#%%time
# Cross-validate the classifier on the selected features and, if requested,
# collect one test-set prediction per fold for the final ensemble.

INFERENCE = True

def my_booster(random_state=1, n_estimators=300):
    """Create the LGBMClassifier that is trained in every fold."""
    settings = dict(n_estimators=n_estimators,
                    min_child_samples=2400,
                    num_leaves=127,
                    max_bins=511,
                    random_state=random_state)
    return LGBMClassifier(**settings)

print(f"{len(Features)} features")
score_list = []
y_pred_list = []
kf = StratifiedKFold(n_splits=5)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, train.target)):
    start_time = datetime.datetime.now()
    rows_tr, rows_va = train.iloc[idx_tr], train.iloc[idx_va]
    X_tr, y_tr = rows_tr[Features], rows_tr.target
    X_va, y_va = rows_va[Features], rows_va.target

    model = my_booster()
    with warnings.catch_warnings():
        # Silence UserWarnings raised while fitting.
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, y_tr,
                  eval_set = [(X_tr, y_tr),(X_va, y_va)],
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(10)])

    # Probability of the positive class for the validation fold.
    y_va_pred = model.predict_proba(X_va)[:,1]
    score = amex_metric(pd.DataFrame({'target': y_va.values}),
                        pd.Series(y_va_pred, name='prediction'))

    # best_iteration_ is None here; fall back to the configured tree count.
    n_trees = model.best_iteration_
    if n_trees is None:
        n_trees = model.n_estimators

    elapsed = str(datetime.datetime.now() - start_time)[-12:-7]
    print(f"Fold {fold} | {elapsed} |"
          f" {n_trees:5} trees |"
          f"                Score = {score:.5f}")
    score_list.append(score)
    evals_result = model.evals_result_

    if INFERENCE:
        y_pred_list.append(model.predict_proba(test[Features])[:,1])

    # break # we only want the first fold

# Mean of the per-fold validation scores.
print(f"OOF Score:                       {np.mean(score_list):.5f}")

# Calibration diagram

The calibration diagram shows how the model predicts the default probability of customers:

In [None]:
# Calibration curve of the last fold's validation predictions
# (y_va / y_va_pred still hold the values of the final loop iteration).
_, calib_ax = plt.subplots(figsize=(12, 4))
CalibrationDisplay.from_predictions(y_va, y_va_pred, n_bins=50, strategy='quantile', ax=calib_ax)
calib_ax.set_title('Probability calibration')
plt.show()

# Inference

Make an ensemble

In [None]:
if INFERENCE:
    # Ensemble: average the test predictions of all fold models.
    sub = pd.DataFrame({'customer_ID': test.index,
                        'prediction': np.mean(y_pred_list, axis=0)})
    sub.to_csv('submission.csv', index=False)
    # A bare `sub` inside an if-block is not rendered by the notebook,
    # so show the frame explicitly.
    display(sub)