# American Express - Default Prediction - Exploratory Data Analysis

Predict if a customer will default in the future

Quick Exploratory Data Analysis for [American Express - Default Prediction](https://www.kaggle.com/competitions/amex-default-prediction/overview) challenge    


![](https://storage.googleapis.com/kaggle-competitions/kaggle/35332/logos/header.png?t=2022-03-23-01-05-50)


<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:white; background:#8D8F8A; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [Overview](#1)
* [Visualizations](#2)
* [Modeling](#3)

<a id="1"></a>
<h2 style='background:#8D8F8A; border:0; color:white'><center>Overview</center></h2>

In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

warnings.filterwarnings('ignore')

In [None]:
# Both training files live in the same competition input directory.
_TRAIN_INPUT_DIR = "../input/amex-default-prediction"
TRAIN_DATA_PATH = f"{_TRAIN_INPUT_DIR}/train_data.csv"
TRAIN_LABELS_PATH = f"{_TRAIN_INPUT_DIR}/train_labels.csv"

The dataset is too big to load at once, so we read it in chunks

In [None]:
# Number of CSV rows handed back per chunk by the reader below.
chunksize = 13_000

# Passing chunksize makes read_csv return an iterator of DataFrames instead of one frame.
train_df_iter = pd.read_csv(
    TRAIN_DATA_PATH,
    chunksize=chunksize,
)

Load only one chunk for EDA. With a for loop we could load and process all of the data

In [None]:
# To process the whole file, iterate over the chunk reader instead:
# for chunk in train_df_iter:
#     process(chunk)

# Pull just the first chunk from the reader; it is the sample used for the EDA below.
train_df_example = next(train_df_iter)

Labels

In [None]:
# Label table; the cells below join it on "customer_ID" and read its "target" column.
train_labels_df = pd.read_csv(TRAIN_LABELS_PATH)

Let's look at one customer

In [None]:
# customer_ID of the single customer inspected in the next few cells.
example_customer_id = "000f1c950ae4e388f44e9ba96dd6334dfa85d8be0416d9d0d30228301f2e4cc4"

In [None]:
# Keep only the rows of the sample chunk that belong to the example customer.
customer_data_ex = train_df_example.loc[
    train_df_example["customer_ID"].eq(example_customer_id)
]

In [None]:
customer_data_ex

Features are anonymized and normalized, and fall into the following general categories:
- D_* = Delinquency variables   
- S_* = Spend variables   
- P_* = Payment variables   
- B_* = Balance variables   
- R_* = Risk variables   

In [None]:
# Column names of the sample, as a plain Python list.
all_cols = customer_data_ex.columns.tolist()
print(all_cols)

In [None]:
# Names of the columns whose prefix is "B_".
b_cols = [name for name in all_cols if name.startswith("B_")]
print(b_cols)

Check whether the customer defaults on a future payment

In [None]:
train_labels_df[train_labels_df["customer_ID"] == example_customer_id]

In [None]:
# customer_data_ex is a filtered selection of train_df_example, so take an explicit
# copy before modifying it, and replace the column outright: `.loc[:, "S_2"] = ...`
# writes into the existing column, which may keep its original (non-datetime) dtype.
customer_data_ex = customer_data_ex.copy()
customer_data_ex["S_2"] = pd.to_datetime(customer_data_ex["S_2"])

In [None]:
# P_2 over S_2 for the example customer.
fig, ax = plt.subplots(figsize=(16, 5))
sn.lineplot(data=customer_data_ex, x="S_2", y="P_2", ax=ax)
ax.set_title("P_2", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("P_2", fontsize=14);

<a id="2"></a>
<h2 style='background:#8D8F8A; border:0; color:white'><center>Visualizations</center></h2>

Show only the first 10 customers

In [None]:
# IDs of the first ten labelled customers, and their rows in the sample chunk.
ex_customer_ids = train_labels_df["customer_ID"].head(10).tolist()
_in_first_ten = train_df_example["customer_ID"].isin(ex_customer_ids)
ex_customer_data = train_df_example[_in_first_ten]

In [None]:
# Attach the labels of those ten customers, then parse S_2 as datetimes.
ex_customer_data = ex_customer_data.merge(train_labels_df.head(10), on="customer_ID")
ex_customer_data = ex_customer_data.assign(
    S_2=pd.to_datetime(ex_customer_data["S_2"]),
)

In [None]:
ex_customer_data.head()

What their feature time series look like

In [None]:
# One P_2 line per customer; the legend entry is that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _customer_id, customer_rows in ex_customer_data.groupby("customer_ID"):
    customer_target = customer_rows["target"].max()
    sn.lineplot(data=customer_rows, x="S_2", y="P_2", label=customer_target, ax=ax)
ax.set_title("P_2", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("P_2", fontsize=14);

In [None]:
# One B_1 line per customer; the legend entry is that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _customer_id, customer_rows in ex_customer_data.groupby("customer_ID"):
    customer_target = customer_rows["target"].max()
    sn.lineplot(data=customer_rows, x="S_2", y="B_1", label=customer_target, ax=ax)
ax.set_title("B_1", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("B_1", fontsize=14);

In [None]:
# One B_2 line per customer; the legend entry is that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _customer_id, customer_rows in ex_customer_data.groupby("customer_ID"):
    customer_target = customer_rows["target"].max()
    sn.lineplot(data=customer_rows, x="S_2", y="B_2", label=customer_target, ax=ax)
ax.set_title("B_2", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("B_2", fontsize=14);

Let's take 1000 customers and show feature histograms for target and non-target customers

In [None]:
# Same sampling as before, now for the first 1000 labelled customers.
_first_labels = train_labels_df.iloc[:1000]
ex_customer_ids = _first_labels["customer_ID"].tolist()
ex_customer_data = train_df_example[
    train_df_example["customer_ID"].isin(ex_customer_ids)
].merge(_first_labels, on="customer_ID")
ex_customer_data["S_2"] = pd.to_datetime(ex_customer_data["S_2"])

ex_customer_data.shape

We have 735 non-target customers and 265 target customers

In [None]:
# One target value per customer (max over that customer's rows), counted per class.
_target_per_customer = ex_customer_data.groupby("customer_ID")["target"].max()
plt.figure(figsize=(16, 5))
ax = sn.countplot(y=_target_per_customer)
ax.set_title("Class distribution", fontsize=16)
ax.set_xlabel("count", fontsize=14)
ax.set_ylabel("target", fontsize=14);

In [None]:
# Number of rows each customer has in the sample, counted per distinct row count.
_records_per_customer = ex_customer_data.groupby("customer_ID")["target"].count()
plt.figure(figsize=(16, 5))
ax = sn.countplot(y=_records_per_customer)
ax.set_title("Distribution of the number of records for the client", fontsize=16)
ax.set_xlabel("count", fontsize=14)
ax.set_ylabel("n_records", fontsize=14);

In [None]:
# Histogram of the S_2 datetimes: how many sample records fall into each time bin.
plt.figure(figsize=(16, 5))
sn.histplot(data=ex_customer_data, x="S_2", bins=100)
plt.title("Distribution of records by time", fontsize=16)
# The x axis carries the S_2 values themselves; the record counts are on the y axis.
plt.xlabel("S_2", fontsize=14)
plt.ylabel("n_records", fontsize=14);

In [None]:
def sort_f(x):
    """Return a sort key ``(prefix, number)`` for a column name like ``"B_30"``.

    Names that are not exactly ``<prefix>_<integer>`` (for example
    ``"customer_ID"`` or a name without an underscore) get the key
    ``("0", 0)`` so they can still be ordered alongside the others.
    """
    try:
        prefix, number = x.split("_")
        return prefix, int(number)
    except (AttributeError, TypeError, ValueError):
        # ValueError: wrong number of "_"-separated parts or a non-integer suffix.
        # AttributeError / TypeError: x is not a str.
        return "0", 0

# Order columns by letter prefix, then by numeric suffix (so "B_2" precedes "B_10").
all_cols = sorted(all_cols, key=sort_f)

In [None]:
# Columns drawn as count plots below and left out of the histogram grid.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

In [None]:
# Count plots of every categorical column split by target, four panels per figure.
for ind, col in enumerate(categorical_cols):
    slot = ind % 4
    if slot == 0:
        # Start a new row of four panels.
        plt.figure(figsize=(16, 3))
    plt.subplot(1, 4, slot + 1)

    sn.countplot(data=ex_customer_data, x=col, hue="target")
    plt.ylabel("")

    if slot == 3:
        plt.show()

# The last figure is only partly filled when the column count is not a multiple
# of four; show it explicitly instead of relying on the notebook to flush it.
if len(categorical_cols) % 4:
    plt.show()

In [None]:
# Histograms of every remaining column split by target, four panels per figure.
# The exclusion set is built once rather than on each loop iteration.
_skipped_cols = {"S_2", "customer_ID", "target", *categorical_cols}
_hist_cols = [col for col in all_cols if col not in _skipped_cols]

for ind, col in enumerate(_hist_cols):
    slot = ind % 4
    if slot == 0:
        # Start a new row of four panels.
        plt.figure(figsize=(16, 4))
    plt.subplot(1, 4, slot + 1)

    sn.histplot(data=ex_customer_data, x=col, hue="target", bins=20)
    plt.ylabel("")

    if slot == 3:
        plt.show()

# Show the trailing, partly filled figure when the column count is not a multiple of four.
if len(_hist_cols) % 4:
    plt.show()

In [None]:
ex_customer_data[ex_customer_data["target"] == 0][b_cols[:10]].describe()

In [None]:
ex_customer_data[ex_customer_data["target"] == 1][b_cols[:10]].describe()

<a id="3"></a>
<h2 style='background:#8D8F8A; border:0; color:white'><center>Modeling</center></h2>

Select baseline features based on the graphs above

In [None]:
# Raw feature columns used by the baseline model.
X_cols = [
    "B_2",
    "B_7",
    "B_18",
    "B_23",
    "B_32",
    "D_48",
    "D_55",
    "D_61",
    "D_121",
    "P_2",
    "S_11",
]

Take a small portion of the train data (only the selected columns) to train the baseline

In [None]:
chunksize = 1000000

# Only the ID column and the selected features are read from the CSV.
train_df_iter = pd.read_csv(TRAIN_DATA_PATH, chunksize=chunksize, usecols=["customer_ID"] + X_cols)

# To experiment on a single chunk only: train_df = next(train_df_iter)

# Collect the chunks and concatenate once at the end; concatenating inside the
# loop would re-copy everything loaded so far on every iteration.
_train_chunks = []
_rows_loaded = 0
for chunk in train_df_iter:
    _train_chunks.append(chunk)
    _rows_loaded += len(chunk)
    # Progress report in the same (rows, columns) form as a DataFrame shape.
    print((_rows_loaded, chunk.shape[1]))

# pd.concat raises on an empty list, so fall back to an empty frame if nothing was read.
train_df = pd.concat(_train_chunks) if _train_chunks else pd.DataFrame()
del _train_chunks

Create mean and last values for selected features

In [None]:
# Per-customer aggregates of the selected features: the mean and the last value.
_features_by_customer = train_df.groupby("customer_ID")[X_cols]
train_df_mean = _features_by_customer.mean().reset_index()
train_df_last = _features_by_customer.last().reset_index()

In [None]:
# Join both aggregate tables per customer; the overlapping feature names are
# disambiguated with the "_mean" / "_last" suffixes.
train_df = train_df_mean.merge(
    train_df_last,
    on="customer_ID",
    how="inner",
    suffixes=("_mean", "_last"),
)

In [None]:
# Left join: every aggregated customer is kept, whether or not a label row exists.
train_df = train_df.merge(train_labels_df, how="left", on="customer_ID")

Crude fillna with zeros. We need to revisit this

In [None]:
# Replace every NaN in the frame with 0. This covers the feature columns and also
# "target": a customer left without a label by the left merge becomes target 0.
# NOTE(review): confirm no training customer is missing a label.
train_df = train_df.fillna(0)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [None]:
# Model inputs are all columns except the ID and the label. They are dropped by
# name so the selection does not depend on where those two columns sit in the frame.
_X_cols = train_df.columns.drop(["customer_ID", "target"])

In [None]:
# 80/20 split, stratified on the label so both parts keep the same class balance.
_features = train_df[_X_cols]
_labels = train_df["target"]
X_train, X_valid, y_train, y_valid = train_test_split(
    _features,
    _labels,
    test_size=0.2,
    random_state=42,
    stratify=_labels,
)

tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

Train random forest model and select the best hyperparameters

In [None]:
# Hyperparameter grid: 2 x 2 combinations, each scored by 5-fold cross-validated F1.
parameters = {"n_estimators": [5, 50], "max_depth": [3, 5]}

_base_forest = RandomForestClassifier(class_weight="balanced", random_state=42)

model = GridSearchCV(
    estimator=_base_forest,
    param_grid=parameters,
    scoring="f1",
    cv=5,
)

In [None]:
model.fit(X_train, y_train)

In [None]:
model.best_estimator_

See the best features

In [None]:
# Bar chart of the best estimator's feature importances, largest first.
feature_importances = model.best_estimator_.feature_importances_
# A stable argsort of the negated values gives descending order with ties kept in place.
vis_indexes = np.argsort(-feature_importances, kind="stable").tolist()
_sorted_importances = feature_importances[vis_indexes]
_sorted_names = _X_cols[vis_indexes]
plt.figure(figsize=(8, 8))
sn.barplot(x=_sorted_importances, y=_sorted_names)
plt.yticks(fontsize=14);

https://www.kaggle.com/code/inversion/amex-competition-metric-python

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows whose ``target`` equals 0 carry a weight of 20; all other rows carry
    a weight of 1.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column; higher means ranked earlier.
            The two frames are joined column-wise, so they must share the same
            index (callers in this file reset both to a default index).

    Returns:
        ``0.5 * (G + D)``, where ``G`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``D`` is the
        share of all ``target == 1`` rows found among the highest-ranked rows
        whose cumulative weight stays within 4% of the total weight.
    """

    def ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, and attach row weights.
        df = (pd.concat([labels, scores], axis='columns')
              .sort_values('prediction', ascending=False))
        # Vectorised equivalent of the per-row "20 if target == 0 else 1".
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        within_cutoff = df['weight'].cumsum() <= four_pct_cutoff
        captured = (df.loc[within_cutoff, 'target'] == 1).sum()
        return captured / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = ranked_with_weights(y_true, y_pred)
        # Cumulative share of total weight seen so far (the "random" baseline).
        random = (df['weight'] / df['weight'].sum()).cumsum()
        weighted_pos = df['target'] * df['weight']
        # Cumulative share of weighted positives found so far.
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * df['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Using the labels themselves as scores yields the best attainable Gini.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

Check train metrics

In [None]:
# Probability of class 1 on the training split, then F1 at a 0.5 threshold.
_train_proba = model.predict_proba(X_train)
y_pred_train = _train_proba[:, 1]
f1_score(y_true=y_train, y_pred=y_pred_train >= 0.5)

In [None]:
plt.figure(figsize=(6, 6))
sn.histplot(x=y_pred_train, bins=50);

In [None]:
# Both frames get a fresh 0..n-1 index so the metric can join them row by row.
_train_truth = y_train.reset_index(drop=True).to_frame("target")
_train_scores = pd.DataFrame({"prediction": y_pred_train})
amex_metric(_train_truth, _train_scores)

Check valid metrics

In [None]:
# Probability of class 1 on the validation split, then F1 at a 0.5 threshold.
_valid_proba = model.predict_proba(X_valid)
y_pred_valid = _valid_proba[:, 1]
f1_score(y_true=y_valid, y_pred=y_pred_valid >= 0.5)

In [None]:
plt.figure(figsize=(6, 6))
sn.histplot(x=y_pred_valid, bins=50);

In [None]:
# Both frames get a fresh 0..n-1 index so the metric can join them row by row.
_valid_truth = y_valid.reset_index(drop=True).to_frame("target")
_valid_scores = pd.DataFrame({"prediction": y_pred_valid})
amex_metric(_valid_truth, _valid_scores)

In [None]:
# Test data and the sample submission live in the same competition input directory.
_TEST_INPUT_DIR = "../input/amex-default-prediction"
TEST_DATA_PATH = f"{_TEST_INPUT_DIR}/test_data.csv"
SAMPLE_SUBMISSION_PATH = f"{_TEST_INPUT_DIR}/sample_submission.csv"

In [None]:
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [None]:
chunksize = 1_000_000

# Chunked reader over the test CSV, restricted to the ID column and the model's raw features.
test_df_iter = pd.read_csv(
    TEST_DATA_PATH,
    chunksize=chunksize,
    usecols=["customer_ID", *X_cols],
)

Iterate over chunks of test data and make predictions for them

In [None]:
# Customer IDs and their predicted probabilities, accumulated chunk by chunk.
_index = []
_vals = []

for chunk in test_df_iter:
    # Build the same per-customer mean/last features as for training, within this chunk.
    # NOTE(review): a customer whose rows span two chunks is aggregated and scored once
    # per chunk; the cell that builds res_df averages such duplicate predictions.
    _chunk_features = chunk.groupby("customer_ID")[X_cols]
    _chunk = _chunk_features.mean().reset_index().merge(
        _chunk_features.last().reset_index(),
        on="customer_ID",
        how="inner",
        suffixes=("_mean", "_last"),
    )

    X_test = _chunk[_X_cols].fillna(0)
    y_test_pred = model.predict_proba(X_test)[:, 1]
    _index.extend(_chunk["customer_ID"])
    _vals.extend(y_test_pred)

    # Running count of predictions collected so far.
    print(len(_index))

In [None]:
# One row per customer: predictions that share a customer_ID are averaged.
_raw_predictions = pd.DataFrame({"customer_ID": _index, "prediction": _vals})
res_df = _raw_predictions.groupby("customer_ID")["prediction"].mean().reset_index()

In [None]:
res_df.isna().sum()

See test distribution of predictions

In [None]:
plt.figure(figsize=(6, 6))
sn.histplot(data=res_df, x="prediction", bins=50);

In [None]:
res_df.to_csv("submission.csv", index=False)

In [None]:
res_df