# Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pandas: raise the display limits so wide/long frames are shown without truncation
import pandas as pd
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [None]:
# Plotly
import plotly.io as pio
import plotly.graph_objects as go

# Settings of the single layout annotation carried by the custom "draft" template.
# NOTE(review): no `text` is set on this annotation - confirm whether a watermark label was intended.
draft_annotation = dict(
    textangle=-30,
    opacity=0.1,
    font=dict(color="black", size=100),
    xref="paper",
    yref="paper",
    x=0.5,
    y=0.5,
    showarrow=False,
)

# Register the template, then make it the default for every figure created afterwards.
pio.templates["draft"] = go.layout.Template(layout_annotations=[draft_annotation])
pio.templates.default = "draft"

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# Reading Data

From the Competition Data Description:<br><br>
<b>D_* = Delinquency variables<br>
S_* = Spend variables<br>
P_* = Payment variables<br>
B_* = Balance variables<br>
R_* = Risk variables</b><br><br>
With the following features being categorical:<br>
['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

## Getting Optimal Column Types

We'll read only a few rows from the dataset to check the column types.

In [None]:
# Read only the first 5000 rows: enough to inspect column names and dtypes
train_data_part = pd.read_csv(
    "/kaggle/input/amex-default-prediction/train_data.csv",
    nrows=5000,
)
train_data_part.head()

In [None]:
# Categorical features, as listed in the competition data description.
cat_columns = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
non_cat_columns = set(train_data_part.columns) - set(cat_columns)

# Build each family list by walking the DataFrame's columns (not the set) so the order is the
# file's column order on every kernel run, and match on the name prefix rather than on a
# substring so a column merely containing e.g. "D_" cannot land in the wrong family.
_ordered_non_cat = [col for col in train_data_part.columns if col in non_cat_columns]
d_columns = [col for col in _ordered_non_cat if col.startswith("D_")]  # delinquency
s_columns = [col for col in _ordered_non_cat if col.startswith("S_")]  # spend
p_columns = [col for col in _ordered_non_cat if col.startswith("P_")]  # payment
r_columns = [col for col in _ordered_non_cat if col.startswith("R_")]  # risk
b_columns = [col for col in _ordered_non_cat if col.startswith("B_")]  # balance

In [None]:
# dtype checks noted per family (run as `<frame>[<family>_columns].dtypes`):
#   b_columns: all float64 except B_31 (int64)
#   r_columns: all float64
#   p_columns: all float64
#   s_columns: all float64 except S_2 (object)
#   d_columns: all float64
# S_2 is not a float feature, so leave it out of the spend list. Filtering instead of
# list.remove() keeps the cell re-runnable: remove() raises ValueError once S_2 is gone.
s_columns = [col for col in s_columns if col != "S_2"]
# Every remaining family column is float-typed, except the integer B_31.
float_cols = set([*d_columns, *s_columns, *p_columns, *r_columns, *b_columns]) - {"B_31", "S_2"}

In [None]:
# Distinct values taken by B_31 in the sample
b31_values = train_data_part['B_31'].unique()
print(f"B_31 Unique Values: {b31_values}")

In [None]:
# For every categorical feature: distinct values seen in the sample and the dtype pandas inferred
for col in cat_columns:
    sample_col = train_data_part[col]
    print(f"{col} Unique Values: {sample_col.unique()} --- column type: {sample_col.dtypes}")

<b>Using the float16 dtype with pandas isn't recommended, but we'll set this type to be able to read the data easily; we can then adjust it when manipulating the dataset.</b><br>
GitHub issue for float16 with pandas: [https://github.com/pandas-dev/pandas/issues/9220](https://github.com/pandas-dev/pandas/issues/9220)

In [None]:
# Column -> dtype mapping handed to pd.read_csv when loading the full training file
dtypes_optimal = {
    **dict.fromkeys(float_cols, "float16"),    # float features, stored as half precision
    "B_31": "int16",                           # the only integer feature
    **dict.fromkeys(cat_columns, "category"),  # categorical features
    "customer_ID": "object",                   # customer identifier
}

## Read All Train Data

In [None]:
# Load the complete training file, applying the compact dtypes from dtypes_optimal
train_data = pd.read_csv(
    "/kaggle/input/amex-default-prediction/train_data.csv",
    dtype=dtypes_optimal,
)

In [None]:
# Load the labels file (later merged into train_data on customer_ID)
train_labels = pd.read_csv(
    "/kaggle/input/amex-default-prediction/train_labels.csv"
)

In [None]:
# deep=True also measures the Python strings held by object columns (customer_ID, S_2);
# the default ignores their contents and therefore understates the real footprint.
mem_usage = round(train_data.memory_usage(deep=True).sum() / (1024**3), 3)
print(f"Data Memory Usage: {mem_usage}GB")

In [None]:
# Attach the label columns to every statement row of the matching customer
train_data = train_data.merge(train_labels, on="customer_ID")

## Preprocessing Data

In [None]:
# Percentage of missing values per column, ordered from the most to the least incomplete
nan_fraction = train_data.isna().sum() / len(train_data)
nan_values_pct = (100 * round(nan_fraction, 4)).sort_values(ascending=False)

In [None]:
# One bar per column, in the order nan_values_pct was sorted
nan_bars = go.Bar(x=nan_values_pct.index, y=nan_values_pct.values)
fig = go.Figure(data=[nan_bars])
fig.update_layout(
    title="Percentage of NaN Values By Column",
    xaxis_title="Column Name",
    yaxis_title="NaN %",
)
fig.show()

### Dropping Columns With a High NaN Rate

In [None]:
# Removing Columns With NaNs Rate Higher Than Threshold
nan_rate_threshold = 80  # percent of missing values above which a column is dropped
# Filter with the named threshold so that changing it above actually changes the selection.
to_remove_cols = list(nan_values_pct[nan_values_pct > nan_rate_threshold].index)
train_data = train_data.drop(columns=to_remove_cols)

Some Customers Have Data For Multiple Days

In [None]:
# Use a customer with the largest number of statement rows as the example.
# value_counts() already orders by descending count, so no extra sort is needed before idxmax().
customer_value_counts = train_data["customer_ID"].value_counts()
customer_ex = customer_value_counts.idxmax()
# grid expects a boolean; a string only works by truthiness (even "False" would enable the grid).
train_data.loc[train_data.customer_ID == customer_ex].plot(
    x="S_2",
    y=["P_2", "D_39"],
    figsize=(14, 5),
    grid=True,
    title=f"Customer {customer_ex}",
)

The first approach to deal with this data is to keep the latest information per customer

In [None]:
# Run this to check that target variable isn't a function of time (S_2) if customer has multiple data rows.
# train_data.groupby(["customer_ID"])["target"].nunique().unique()

In [None]:
# A groupby will take too much time and memory, we'll try another method.
# This may be updated later by keeping information in another way.
# "Latest" should mean the most recent S_2 per customer, not merely the last row in file order,
# so rank the rows on a narrow (customer_ID, S_2) view instead of relying on how the CSV is sorted.
# NOTE(review): assumes S_2 strings sort chronologically (e.g. ISO YYYY-MM-DD) - confirm.
latest_rows = (
    train_data[["customer_ID", "S_2"]]
    .sort_values(["customer_ID", "S_2"])
    .drop_duplicates(subset=["customer_ID"], keep="last")
    .index
)
# Select the winners in their original row order, then drop S_2 since we don't need it anymore.
train_data = (
    train_data.loc[latest_rows.sort_values()]
    .drop(columns=["S_2"])
    .reset_index(drop=True)
)
# Preview only the first rows rather than rendering the whole frame.
train_data.head()

# Exploratory Data Analysis (EDA)

In [None]:
# train_data now holds one row per customer, so its length is the customer count
n_customers = len(train_data)
print(f"The Number Of Customers In Train Data is {n_customers}")

In [None]:
# Target Variable: share of each label value among customers
target_counts = train_data.target.value_counts()
fig = go.Figure(
    data=[
        go.Pie(
            labels=target_counts.index,
            values=target_counts.values,
            textfont_size=20,
            marker=dict(line=dict(color='#000000', width=2)),
        )
    ]
)
fig.update_layout(title="Target Variable Distribution")
fig.show()

In [None]:
# Number of variables in each family list.
# NOTE(review): at this point the lists have not yet been pruned of the columns dropped for a
# high NaN rate, and they exclude the categorical columns - confirm this is the intended count.
family_sizes = {
    "Delinquency variables": len(d_columns),
    "Spend variables": len(s_columns),
    "Payment variables": len(p_columns),
    "Balance variables": len(b_columns),
    "Risk variables": len(r_columns),
}
var_counts = list(family_sizes.values())

fig = go.Figure(
    data=[
        go.Bar(
            x=list(family_sizes),
            y=var_counts,
            text=[str(count) for count in var_counts],
            textfont=dict(size=20),
            textposition='auto',
        )
    ]
)
fig.update_layout(
    title="Variables Count By Category",
    xaxis_title="Variables Category",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Let's Start With Payment Variables

NB: Since we dropped some columns with high NaN rates, we should update the different lists of columns before exploring each variable family, starting with the Payment variables.

In [None]:
# Remove the dropped high-NaN columns from every family list. Filtering (instead of a set
# difference) keeps each list in its existing order, so the tables and traces built from these
# lists do not change order from one kernel run to the next.
_removed = set(to_remove_cols)
p_columns = [col for col in p_columns if col not in _removed]
s_columns = [col for col in s_columns if col not in _removed]
d_columns = [col for col in d_columns if col not in _removed]
b_columns = [col for col in b_columns if col not in _removed]
r_columns = [col for col in r_columns if col not in _removed]

In [None]:
# Family name -> percentage value filled in by the per-family sections that follow
high_skew_reduction_rates = dict()

## Payment Variables

In [None]:
# Using the columns with float16 type will generate errors while plotting,
# so convert every payment column to float32 in a single astype call
train_data = train_data.astype(dict.fromkeys(p_columns, "float32"))

In [None]:
# Only P_2 and P_3 share this figure: adding P_4 makes the plot unclear unless its trace is unselected
fig = go.Figure(
    data=[go.Histogram(x=train_data[name], name=name) for name in ("P_2", "P_3")]
)
fig.update_layout(title="Payment Variables Distributions")
fig.show()

In [None]:
# P_4 on its own figure
p4_hist = go.Histogram(x=train_data["P_4"])
fig = go.Figure(data=[p4_hist])
fig.update_layout(title="P_4 Variable Distributions")
fig.show()

In [None]:
# train_data[train_data.P_4==0]

**Skewness Checking**

"Skewness refers to a distortion or asymmetry that deviates from the symmetrical bell curve, or normal distribution, in a set of data.
If the curve is shifted to the left or to the right, it is said to be skewed."

[https://www.investopedia.com/terms/s/skewness.asp](https://www.investopedia.com/terms/s/skewness.asp)

In [None]:
def skewness_level(x):
    """Classify a skewness coefficient by its magnitude.

    Parameters
    ----------
    x : float
        Skewness value, e.g. one entry of ``DataFrame.skew()``.

    Returns
    -------
    str
        ``"high"`` when ``|x| > 1``, ``"symmetrical"`` when ``-0.5 <= x <= 0.5``,
        ``"moderate"`` for the remaining range (``0.5 < |x| <= 1``), and
        ``"undefined"`` when ``x`` is NaN.
    """
    # Every comparison with NaN is False, so without this guard a missing skewness would
    # fall through to the last branch and be reported as "moderate".
    if np.isnan(x):
        return "undefined"
    if np.abs(x) > 1:
        return "high"
    if -0.5 <= x <= 0.5:
        return "symmetrical"
    return "moderate"

In [None]:
# One row per payment variable: its skewness and the matching severity label
p_skewness = (
    train_data[p_columns]
    .agg(["skew"])
    .transpose()
    .assign(skewness_level=lambda table: table["skew"].map(skewness_level))
)
p_skewness

In [None]:
# Payment variables labelled "high"; overlay the histograms of their natural log.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
skewed_p_variables = [
    col for col, level in p_skewness["skewness_level"].items() if level == "high"
]
fig = go.Figure(
    data=[go.Histogram(x=np.log(train_data[col]), name=col) for col in skewed_p_variables]
)
fig.update_layout(title="Skewed Payment Variables Log Transformation")
fig.show()

All skewness levels are now moderate, so we'll log-transform these features in the train data.

In [None]:
# Replace each highly skewed payment column by its natural log.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
train_data = train_data.assign(
    **{col: np.log(train_data[col]) for col in skewed_p_variables}
)

In [None]:
# Re-measure the payment columns after the transform and label them on the same scale
p_skewness["log_transform_skewness_level"] = (
    train_data[p_columns].skew().map(skewness_level)
)

In [None]:
# Share of all payment variables that were "high" before the transform and no longer are
was_high = p_skewness["skewness_level"] == "high"
still_high = p_skewness["log_transform_skewness_level"] == "high"
p_rate = int((was_high & ~still_high).sum()) / len(p_skewness)
high_skew_reduction_rates["PAYMENT"] = round(100*p_rate,2)

## Spend Variables

In [None]:
# float16 columns generate errors while plotting: convert the spend columns to float32 in one call
train_data = train_data.astype(dict.fromkeys(s_columns, "float32"))

In [None]:
# One row per spend variable: its skewness and the matching severity label
s_skewness = (
    train_data[s_columns]
    .agg(["skew"])
    .transpose()
    .assign(skewness_level=lambda table: table["skew"].map(skewness_level))
)

In [None]:
# Spend variables labelled "high" are replaced by their natural log.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
skewed_s_variables = [
    col for col, level in s_skewness["skewness_level"].items() if level == "high"
]
train_data = train_data.assign(
    **{col: np.log(train_data[col]) for col in skewed_s_variables}
)

In [None]:
# Re-measure the spend columns after the transform and label them on the same scale
s_skewness["log_transform_skewness_level"] = (
    train_data[s_columns].skew().map(skewness_level)
)
s_skewness

In [None]:
# Share of all spend variables that were "high" before the transform and no longer are
s_reduced = s_skewness["skewness_level"].eq("high") & s_skewness["log_transform_skewness_level"].ne("high")
s_rate = int(s_reduced.sum()) / len(s_skewness)
high_skew_reduction_rates["SPEND"] = round(100*s_rate,2)

## Delinquency variables

In [None]:
# float16 columns generate errors while plotting: convert the delinquency columns to float32 in one call
train_data = train_data.astype(dict.fromkeys(d_columns, "float32"))

In [None]:
# One row per delinquency variable: its skewness and the matching severity label
d_skewness = (
    train_data[d_columns]
    .agg(["skew"])
    .transpose()
    .assign(skewness_level=lambda table: table["skew"].map(skewness_level))
)

In [None]:
# Delinquency variables labelled "high" are replaced by their natural log.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
skewed_d_variables = [
    col for col, level in d_skewness["skewness_level"].items() if level == "high"
]
train_data = train_data.assign(
    **{col: np.log(train_data[col]) for col in skewed_d_variables}
)

In [None]:
# Re-measure the delinquency columns after the transform and label them on the same scale
d_skewness["log_transform_skewness_level"] = (
    train_data[d_columns].skew().map(skewness_level)
)

In [None]:
# Share of all delinquency variables that were "high" before the log transform and no longer are
d_rate = len(d_skewness.loc[(d_skewness.skewness_level=="high") & (d_skewness.log_transform_skewness_level != "high")]) / len(d_skewness)
# The key is shown as a category label on the summary bar chart, so it is spelled out correctly.
high_skew_reduction_rates["DELINQUENCY"] = round(100*d_rate,2)

## Balance Variables & Risk Variables

In [None]:
# float16 columns generate errors while plotting: convert the balance and risk columns to float32
train_data = train_data.astype(dict.fromkeys([*b_columns, *r_columns], "float32"))

In [None]:
# Balance variables: measure skewness, log-transform the "high" columns, then re-measure.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
b_skewness = (
    train_data[b_columns]
    .agg(["skew"])
    .transpose()
    .assign(skewness_level=lambda table: table["skew"].map(skewness_level))
)
skewed_b_variables = [
    col for col, level in b_skewness["skewness_level"].items() if level == "high"
]
train_data = train_data.assign(
    **{col: np.log(train_data[col]) for col in skewed_b_variables}
)
b_skewness["log_transform_skewness_level"] = (
    train_data[b_columns].skew().map(skewness_level)
)

In [None]:
# Risk variables: measure skewness, log-transform the "high" columns, then re-measure.
# NOTE(review): np.log gives NaN for negative inputs and -inf for zeros - confirm these
# columns are strictly positive.
r_skewness = (
    train_data[r_columns]
    .agg(["skew"])
    .transpose()
    .assign(skewness_level=lambda table: table["skew"].map(skewness_level))
)
skewed_r_variables = [
    col for col, level in r_skewness["skewness_level"].items() if level == "high"
]
train_data = train_data.assign(
    **{col: np.log(train_data[col]) for col in skewed_r_variables}
)
r_skewness["log_transform_skewness_level"] = (
    train_data[r_columns].skew().map(skewness_level)
)


In [None]:
# Share of all balance variables that were "high" before the transform and no longer are
b_reduced = (b_skewness["skewness_level"] == "high") & (b_skewness["log_transform_skewness_level"] != "high")
b_rate = int(b_reduced.sum()) / len(b_skewness)
high_skew_reduction_rates["BALANCE"] = round(100*b_rate,2)

# Same measure for the risk variables
r_reduced = (r_skewness["skewness_level"] == "high") & (r_skewness["log_transform_skewness_level"] != "high")
r_rate = int(r_reduced.sum()) / len(r_skewness)
high_skew_reduction_rates["RISK"] = round(100*r_rate,2)

In [None]:
# One bar per variable family, using the percentages collected in high_skew_reduction_rates
fig = go.Figure()

fig.add_trace(go.Bar(x=list(high_skew_reduction_rates.keys()),
                    y=list(high_skew_reduction_rates.values())))

# Axis title spelling corrected ("Reduction").
fig.update_layout(title="Skewness Reduction Rate", xaxis_title="Variables Category", yaxis_title="Reduction Rate %")

fig.show()

## Correlation

In [None]:
# Categorical columns that survived the NaN filter, kept in their listed order.
_removed = set(to_remove_cols)
cat_columns = [col for col in cat_columns if col not in _removed]

# The correlation matrix needs numeric data. Start from the numeric dtypes so the string
# customer_ID column is left out (recent pandas versions raise when corr() meets it), then
# exclude the categorical features explicitly. Column order follows train_data.
_categorical = set(cat_columns)
continuous_columns = [
    col
    for col in train_data.select_dtypes(include="number").columns
    if col not in _categorical
]

In [None]:
# Pairwise correlations between the continuous columns
corr_matrix = train_data[continuous_columns].corr()
# All-ones matrix reduced to its upper triangle (diagonal included), used as the heatmap mask
mask = np.triu(np.ones(corr_matrix.shape))

In [None]:
# Draw the masked correlation matrix on an explicit 15x15 axes
heatmap_fig, ax = plt.subplots(figsize=(15, 15))
heatmap = sns.heatmap(corr_matrix, mask=mask, ax=ax)
ax.set_title("Continuous Variables Correlation Heatmap")
plt.show()

## Categorical Variables

In [None]:
# One bar trace per categorical column; only the first column's trace starts visible
fig = go.Figure()
first_cat = cat_columns[0]

for name in cat_columns:
    counts = train_data[name].value_counts()
    fig.add_trace(go.Bar(x=counts.index, y=counts.values, visible=(name == first_cat)))

# One button per column: show only that column's trace and retitle the figure
buttons = [
    dict(
        method="update",
        label=name,
        args=[
            {"visible": [other == name for other in cat_columns]},
            {"title": f"{name} Variable Distribution"},
        ],
    )
    for name in cat_columns
]

fig.update_layout(title=f"{first_cat} Variable Distribution", yaxis_title="Count")
fig.update_layout(
    updatemenus=[
        {"buttons": buttons, "active": 0, "showactive": True, "direction": "left", "x": 1, "y": 1.35}
    ]
)
fig.update_xaxes(automargin=True)
fig.show()