In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import gc
import seaborn as sns

import matplotlib.pyplot as plt
from scipy.stats import kurtosis,skew, norm

In [None]:
# Load the pickled lookup table; a later cell indexes it by raw customer_ID
# values to re-key the label file.
# NOTE(review): pickle.load can execute arbitrary code from the file, so this
# path must only ever point at a trusted artifact.
with open(
    "../input/amex-datasetcategorical-encoders/train_customer2id.pkl", "rb"
) as fh:
    train_customer2id = pickle.load(fh)

# Sanity check: report how many entries the lookup table holds.
print(len(train_customer2id))

In [None]:
%%time
train_df = pd.read_parquet("../input/amex-traindataset/train_dataset.parquet")

train_label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train_label.customer_ID=train_label.customer_ID.apply(lambda k: train_customer2id[k])

train_df.head()

In [None]:
def get_uniform_qqplot(s, min_value, max_value):
    """Build the points of a Q-Q plot of ``s`` against a uniform distribution.

    The reference distribution is Uniform(min_value, max_value). Quantiles are
    evaluated at the probability levels 0.00, 0.05, ..., 0.95.

    Parameters
    ----------
    s : array-like
        Sample whose empirical quantiles are computed.
    min_value, max_value : float
        Lower and upper bound of the reference uniform distribution.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a NumPy array holding the theoretical uniform
        quantiles and ``y`` is a list holding the empirical quantiles of ``s``.
        Both always have exactly one entry per probability level.
    """
    probs = np.arange(0, 1.0, 0.05)
    y = list(np.quantile(s, probs))

    # Derive x from the same probability grid as y instead of a second
    # np.arange(min_value, max_value, step): with a float step the number of
    # generated points can be off by one, which would make x and y differ in
    # length. This form also covers max_value == min_value (x is then constant).
    x = min_value + probs * (max_value - min_value)

    return (x, y)

def get_normal_qqplot(s):
    """Build the points of a Q-Q plot of ``s`` against the standard normal.

    The sample is standardised (mean subtracted, divided by the population
    standard deviation) and its quantiles are paired with the standard-normal
    quantiles at the probability levels 0.01, 0.06, ..., 0.96. The grid starts
    at 0.01 rather than 0 so that ``norm.ppf`` stays finite.

    Parameters
    ----------
    s : array-like
        Sample values.

    Returns
    -------
    tuple
        ``(x, y)``: two lists of equal length with the theoretical normal
        quantiles (``x``) and the quantiles of the standardised sample (``y``).
    """
    values = np.asarray(s, dtype=float)
    probs = np.arange(0.01, 1.0, 0.05)

    if values.size > 0 and values.min() == values.max():
        # Constant sample: the spread is zero, so dividing by it would yield
        # NaN (or +/-1 from rounding noise in the mean). Use an all-zero
        # standardised sample instead, mirroring the degenerate-range handling
        # of get_uniform_qqplot.
        z = np.zeros_like(values)
    else:
        z = (values - values.mean()) / values.std()

    x = list(norm.ppf(probs))
    y = list(np.quantile(z, probs))
    return (x, y)

In [None]:
def get_quantile_plot(featname, df=None):
    """Show a three-panel distribution summary for one feature.

    Panels, left to right:

    1. the quantile function of the feature at the levels 0, 0.05, ..., 1;
    2. a Q-Q plot against a uniform distribution spanning the (clipped)
       minimum and maximum of the feature;
    3. a Q-Q plot against the standard normal distribution.

    Missing values are dropped first. The remaining values are clipped to
    ``[q01 - 2*d, q99 + 2*d]`` with ``d = q99 - q01`` so that extreme outliers
    do not dominate the axes.

    Parameters
    ----------
    featname : str
        Column to plot.
    df : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``train_df``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()`` and then closed.
    """
    if df is None:
        df = train_df

    s = df[featname].dropna()
    if s.empty:
        # Nothing to take quantiles of (column is entirely missing).
        print(f"{featname}: no non-missing values, skipping plot")
        return

    q01, q99 = np.quantile(s, [0.01, 0.99])
    d = q99 - q01
    s = np.clip(s, q01 - 2 * d, q99 + 2 * d)

    min_value = np.min(s)
    max_value = np.max(s)

    # The 0- and 1-quantiles are the minimum and maximum, so a single evenly
    # spaced grid covers both end points. linspace (unlike arange with a float
    # step) guarantees exactly 21 levels, all inside [0, 1].
    probs = np.linspace(0, 1, 21)
    levels = np.quantile(s, probs)

    fig, ax = plt.subplots(1, 3, figsize=(14, 6))
    fig.suptitle(featname)

    ax[0].set_title("Quantile graph")
    ax[0].plot(probs, levels, marker="s")
    # Ticks include 1.0 because the curve is drawn up to the 1-quantile.
    ax[0].set_xticks(np.linspace(0, 1, 11))
    ax[0].set_xlabel("probability level")
    ax[0].set_ylabel("quantile value")

    (x, y) = get_uniform_qqplot(s, min_value, max_value)
    ax[1].set_title("Q-Q graph for uniform distribution")
    ax[1].plot(x, y, label="q-q plot", marker='s', color='blue')
    ax[1].plot(x, x, label="line: 45", color='green')
    ax[1].set_xlabel("theoretical quantile")
    ax[1].set_ylabel("sample quantile")
    ax[1].legend(loc='best')

    (x, y) = get_normal_qqplot(s)
    ax[2].set_title("Q-Q graph for normal distribution")
    ax[2].plot(x, y, label="q-q plot", marker='s', color='blue')
    ax[2].plot(x, x, label="line: 45", color='green')
    ax[2].set_xlabel("theoretical quantile")
    ax[2].set_ylabel("standardised sample quantile")
    ax[2].legend(loc='best')

    plt.show()
    # This function is called once per feature; release the figure explicitly
    # so that figures do not pile up in memory on non-inline backends.
    plt.close(fig)

In [None]:
# Columns that must not be treated as numeric features: the categorical
# columns first, then three further columns (presumably id / date / label --
# confirm against the dataset description).
cat_features = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
    'customer_ID', 'S_2', 'target',
]

# Everything else in train_df, kept in the frame's column order.
numeric_features = [
    column for column in train_df.columns
    if column not in cat_features
]

In [None]:
# Draw the three-panel quantile / Q-Q figure for every numeric feature,
# visiting the features in sorted name order. One figure is produced per
# feature, so this cell's output grows with len(numeric_features).
for featname in sorted(numeric_features):
    get_quantile_plot(featname)