In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gc
import seaborn as sns

import matplotlib.pyplot as plt
from scipy.stats import kurtosis,skew, norm

In [None]:
# Load the pickled lookup that is later indexed with raw customer_ID values
# (see the label-loading cell) and print how many entries it holds.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# this is only acceptable for a trusted dataset artifact — confirm provenance.
with open("../input/amex-datasetcategorical-encoders/train_customer2id.pkl", 'rb') as file:
    train_customer2id = pickle.load(file)
print(len(train_customer2id))

In [None]:
%%time
# Read the training rows from the pre-built parquet file.
train_df = pd.read_parquet("../input/amex-traindataset/train_dataset.parquet")

# Read the labels and replace every customer_ID with its train_customer2id entry.
# Presumably train_df.customer_ID is already encoded the same way, so the two
# frames can be joined on customer_ID — TODO confirm.
# A customer_ID absent from the lookup raises here (KeyError for a plain dict).
train_label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train_label.customer_ID=train_label.customer_ID.apply(lambda k: train_customer2id[k])

train_df.head()

In [None]:
def get_missing_value_percentages(df):
    """Compute the share of missing values for every non-categorical column of ``df``.

    Columns listed in the hard-coded categorical/identifier set below are skipped;
    every other column of ``df`` is treated as a numeric feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Column names are taken from this frame itself, not
        from any notebook-level variable.

    Returns
    -------
    pandas.DataFrame
        One row per analysed column with columns ``feat_name`` and ``percent``,
        sorted ascending by ``percent``. Despite its name, ``percent`` is a
        fraction in [0, 1] (missing count divided by ``len(df)``). When no
        column qualifies, an empty frame with both columns is returned.
    """
    cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                    'D_126', 'D_63', 'D_64', 'D_66', 'D_68'] + ['customer_ID', 'S_2', 'target']
    excluded = set(cat_features)

    # Use the columns of the frame that was passed in; the previous version read
    # them from the global `train_df`, which broke the function for any other frame.
    numeric_features = [colname for colname in df.columns if colname not in excluded]

    n_rows = len(df)
    # Evaluate one column at a time so no frame-sized boolean mask is materialised.
    fractions = [df[featname].isna().sum() / n_rows for featname in numeric_features]

    # Building from named columns keeps `percent` present even with zero features,
    # so the sort below cannot fail with a KeyError.
    na_df = pd.DataFrame({'feat_name': numeric_features, 'percent': fractions},
                         columns=['feat_name', 'percent'])
    return na_df.sort_values('percent')

In [None]:
%%time
# Missing-value fraction per non-categorical column, sorted ascending.
na_df = get_missing_value_percentages(train_df)
# Keep only the columns whose missing fraction is below 0.01 (i.e. under 1%);
# these are the columns plotted in the cells further down.
numeric_columns = na_df[na_df.percent<0.01].feat_name.values
print("number of numeric columns:", len(numeric_columns))

na_df.head()

In [None]:
# Count the non-null S_2 entries per customer and keep only customers with exactly
# 13 of them: plot_stats_by_timeseries indexes positions 0..12 of each customer's
# series, so every retained customer must have a full-length history.
# Note: `df` here is a temporary counts frame; later cells rebind the same name.
df = train_df.groupby('customer_ID', as_index=False)[['S_2']].count().rename(columns={'S_2': 'num_records'})
df = df[df.num_records==13]
customer_ids = df.customer_ID.values

print("number of customers:", len(customer_ids))

In [None]:
def _summarise_by_period(value_lists, n_periods):
    """Return mean and quartiles of position ``k`` across all lists, for each k.

    ``value_lists`` is a Series whose elements are per-customer lists. The result
    is a dict with keys ``mean``, ``q25``, ``q50``, ``q75``, each a list of length
    ``n_periods``. NaN entries are ignored; a period without any value yields NaN.
    """
    summary = {'mean': [], 'q25': [], 'q50': [], 'q75': []}
    for k in range(n_periods):
        values = value_lists.apply(lambda lst: lst[k])
        values = values[values.notna()]
        if len(values) == 0:
            # Nothing to summarise (empty class or all-NaN period): record NaN
            # instead of relying on numpy's behaviour for empty input.
            for stat_values in summary.values():
                stat_values.append(np.nan)
            continue
        summary['mean'].append(np.mean(values))
        summary['q25'].append(np.quantile(values, 0.25))
        summary['q50'].append(np.quantile(values, 0.5))
        summary['q75'].append(np.quantile(values, 0.75))
    return summary


def plot_stats_by_timeseries(df, colname, labels=None, n_periods=13):
    """Plot per-position mean and quartiles of ``colname`` for each target class.

    Rows of ``df`` are grouped by ``customer_ID`` and the values of ``colname``
    are collected into one list per customer, in the order the rows appear in
    ``df``. For every position ``k`` in ``range(n_periods)`` the mean and the
    25%/50%/75% quantiles of the k-th value are computed separately for
    ``target == 0`` ("Non Defaulter", left panel) and ``target == 1``
    ("Defaulter", right panel). A vertical line at each ``k`` spans the minimum
    and maximum of those four statistics.

    NOTE(review): position ``k`` is only a time index if each customer's rows in
    ``df`` are already in chronological order — confirm upstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID`` and ``colname``. Every customer needs at
        least ``n_periods`` rows, otherwise indexing the list raises IndexError.
    colname : str
        Column to summarise; also used as the figure title.
    labels : pandas.DataFrame, optional
        Frame with ``customer_ID`` and ``target`` columns. Defaults to the
        notebook-level ``train_label``.
    n_periods : int, optional
        Number of leading positions of each customer's series to plot
        (default 13).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if labels is None:
        labels = train_label  # notebook-level label frame (original behaviour)

    per_customer = df.groupby('customer_ID')[[colname]].agg(list)
    per_customer = per_customer.merge(labels, on='customer_ID')

    # Compute all statistics before creating the figure so a failure here does
    # not leave an empty figure behind. Each class is filtered once, not per k.
    panels = [
        ("Non Defaulter", _summarise_by_period(
            per_customer.loc[per_customer.target == 0, colname], n_periods)),
        ("Defaulter", _summarise_by_period(
            per_customer.loc[per_customer.target == 1, colname], n_periods)),
    ]

    fig, ax = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
    fig.suptitle(colname)

    for axis, (title, stats) in zip(ax, panels):
        for k in range(n_periods):
            spread = [stats['q25'][k], stats['q50'][k], stats['q75'][k], stats['mean'][k]]
            axis.vlines(x=k, ymin=min(spread), ymax=max(spread))

        axis.set_title(title)
        axis.plot(stats['mean'], marker='*', color='green', label='mean')
        axis.plot(stats['q25'], marker='x', color='red', label='25%')
        axis.plot(stats['q50'], marker='s', color='m', label='50%')
        axis.plot(stats['q75'], marker='o', color='blue', label='75%')
        # Both panels get a legend; previously only the left one did.
        axis.legend(loc='upper right')
    plt.show()

    # Release the per-customer list frame promptly; it can be large.
    del per_customer
    gc.collect()

In [None]:
# Restrict the data to customers with exactly 13 records (customer_ids), then plot
# every low-missing column whose name starts with "P_".
# Note: this rebinds the notebook-level `df`, which later plotting cells reuse.
df=train_df[train_df.customer_ID.isin(customer_ids)]
for colname in numeric_columns:
    if colname.startswith("P_"):
        plot_stats_by_timeseries(df, colname)

In [None]:
# Rebuild the frame limited to customers with exactly 13 records (customer_ids),
# then plot every low-missing column whose name starts with "B_".
# The following plotting cells reuse this `df` without recomputing it.
df=train_df[train_df.customer_ID.isin(customer_ids)]
for colname in numeric_columns:
    if colname.startswith("B_"):
        plot_stats_by_timeseries(df, colname)

In [None]:
# Plot every low-missing column whose name starts with "R_".
# Depends on `df` (the customer-filtered frame) assigned in an earlier cell;
# run that cell first on a fresh kernel.
for colname in numeric_columns:
    if colname.startswith("R_"):
        plot_stats_by_timeseries(df, colname)

In [None]:
# Plot every low-missing column whose name starts with "S_".
# Depends on `df` (the customer-filtered frame) assigned in an earlier cell;
# run that cell first on a fresh kernel.
for colname in numeric_columns:
    if colname.startswith("S_"):
        plot_stats_by_timeseries(df, colname)

In [None]:
# Plot every low-missing column whose name starts with "D_".
# Depends on `df` (the customer-filtered frame) assigned in an earlier cell;
# run that cell first on a fresh kernel.
for colname in numeric_columns:
    if colname.startswith("D_"):
        plot_stats_by_timeseries(df, colname)