In [None]:
# Import libraries
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc  # Garbage collector

# NOTE(review): silences every warning category for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

### Data Description
The objective of this competition is to predict the probability that a customer does not pay back their credit card balance amount in the future based on their monthly customer profile. The target binary variable is calculated by observing an 18-month performance window after the latest credit card statement; if the customer does not pay the due amount within 120 days of their latest statement date, it is considered a default event.

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

- `D_*` = Delinquency variables
- `S_*` = Spend variables
- `P_*` = Payment variables
- `B_*` = Balance variables
- `R_*` = Risk variables

with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.

### Data Exploration

In [None]:
# Read the training data from Feather format (memory efficient)
# Source: https://www.kaggle.com/datasets/munumbutt/amexfeather

train_raw = pd.read_feather('../input/amexfeather/train_data.ftr')

In [None]:
# Preview the first 15 rows of the training frame
train_raw.head(15)

In [None]:
# Print a summary of the training frame (columns, dtypes, memory usage)
train_raw.info()

In [None]:
# (number of rows, number of columns) of the training frame
train_raw.shape

In [None]:
# Number of distinct customers in the dataset
# (the data holds one row per customer per statement date, so this is smaller than the row count)
train_raw["customer_ID"].nunique()

In [None]:
# Date range covered by the date column S_2, shown as (earliest, latest)
train_raw["S_2"].min(), train_raw["S_2"].max()

In [None]:
# Count the columns belonging to each feature family, identified by name prefix
print("No. of features for each category:")
for pref in ("D_", "S_", "P_", "B_", "R_"):
    # Tally the column names that carry this prefix
    n_matching = sum(1 for col in train_raw.columns if col.startswith(pref))
    print(f"{pref} : {n_matching}")


In [None]:
# Target values distribution, as the fraction of rows in each class.
# `normalize=True` is spelled out: the former positional "%" argument was bound to
# `normalize` and only worked because a non-empty string is truthy.
train_raw["target"].value_counts(normalize=True)

In [None]:
# Missing values: percentage of missing entries per column, highest first
tmp = (
    train_raw.isna()
    .sum()
    .mul(100)
    .div(len(train_raw))
    .sort_values(ascending=False)
)

# Two stacked bar charts: the 100 columns with the most missing data on top,
# all remaining columns underneath
fig, ax = plt.subplots(2, 1, figsize=(30, 10))
for panel, chunk in zip(ax, (tmp[:100], tmp[100:])):
    sns.barplot(x=chunk.index, y=chunk.values, ax=panel)
    panel.set_ylabel("Percentage [%]")
    # Column names are long; rotate them so they stay readable
    panel.tick_params(axis='x', rotation=90)
plt.suptitle("Amount of missing data")
plt.tight_layout()
plt.show()

In [None]:
# Turn the missing-percentage Series into a two-column frame:
# 'index' holds the column name, 0 holds the percentage of missing values
missingDF = tmp.reset_index()

# Columns with more than 90 % of their values missing
missingDF.loc[missingDF[0].gt(90)]

In [None]:
# Number of columns that have no missing values at all (0 % missing)
int(missingDF[0].eq(0).sum())

In [None]:
# Only 3 columns fall in the payment (P_) category;
# show the percentage of missing values in each of them
(
    train_raw.loc[:, train_raw.columns.str.startswith("P_")]
    .isna()
    .sum()
    .div(len(train_raw))
    .mul(100)
)

### Data Exploration for Two Customers with different "target" values (risk scores)

In [None]:
# Keep only the statement rows that belong to the two selected customers
filtered_customer_data = train_raw.loc[
    train_raw["customer_ID"].isin(
        [
            '0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a',
            '00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5',
        ]
    )
]

In [None]:
# First row of the two-customer subset
filtered_customer_data.head(1)

In [None]:
# Last row of the two-customer subset
filtered_customer_data.tail(1)

In [None]:
# Print a summary of the two-customer subset (columns, dtypes, memory usage)
filtered_customer_data.info()

In [None]:
# filtered_customer_data.describe()

In [None]:
# filtered_customer_data.isna().sum().mul(100).div(len(train_raw)).sort_values(ascending=False)[:5]

In [None]:
# Columns to plot over time: everything except the target, the customer identifier,
# the date column and the categorical features
plot_cols = [
    col
    for col in filtered_customer_data.columns
    if col not in {
        "target", "customer_ID", "S_2",
        'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
        'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
    }
]

In [None]:
# Number of features that will be plotted
len(plot_cols)

#### How each feature varies with time for both target values

In [None]:
# Small-multiples grid: one panel per feature in plot_cols, one line per customer,
# so the two selected customers can be compared over time (x axis = S_2).
#
# The previous leading `plt.subplots_adjust(...)` call was dropped: it ran before any
# figure existed, so it only produced a stray empty figure, and its all-None arguments
# changed nothing.
n_grid_cols = 8
# Ceiling division so every feature gets a panel, however many columns plot_cols holds
n_grid_rows = max(1, (len(plot_cols) + n_grid_cols - 1) // n_grid_cols)
fig, axs = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(25, 6.5 * n_grid_rows),  # ~6.5 inches per row (the original 23-row grid was 150 tall)
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
panels = axs.ravel()

for i, ax in zip(plot_cols, panels):
    # Iterate the groups explicitly so each line is labelled with that customer's own
    # target value instead of relying on a hard-coded ["0", "1"] group order.
    # observed=True is a no-op unless customer_ID is stored as a categorical.
    for _, customer_rows in filtered_customer_data.groupby("customer_ID", observed=True):
        customer_rows.plot(
            x="S_2",
            y=i,
            marker="o",
            ax=ax,
            label=str(customer_rows["target"].iloc[0]),
        )
    ax.legend(title="target")
    ax.set_title(i)

# Hide the panels left over when the grid has more cells than features
for ax in panels[len(plot_cols):]:
    ax.set_visible(False)

plt.show()


#### MORE EDA COMING SOON
# DO UPVOTE!