In [None]:
import datatable as dt
import pandas as pd
import numpy as np
import time
import gc
from collections import Counter

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Raise pandas display limits so wide frames and long cell values are not truncated.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

import warnings
# NOTE(review): this silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Draw figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Functions

In [None]:
def frame_corr(dt_frame):
    """Pairwise correlation matrix of the columns of a datatable Frame.

    Every column of ``dt_frame`` is correlated with every column (itself
    included) via ``dt.corr``; one single-row frame is built per column and
    the rows are stacked with ``dt.rbind``.

    NOTE(review): no dtype filtering happens here -- presumably callers pass
    numeric columns only.

    Returns:
        A pandas DataFrame whose columns and index are both the column
        names of ``dt_frame``.
    """
    columns = list(dt_frame)

    # One row of correlations per column: that column against all columns.
    corr_rows = []
    for left in columns:
        corr_rows.append(dt_frame[:, [dt.corr(left, right) for right in columns]])

    stacked = dt.rbind(corr_rows)
    stacked.names = dt_frame[:, columns].names

    corr_pd = stacked.to_pandas()
    # Label the rows with the same names as the columns.
    row_labels = pd.Index(corr_pd.columns)
    return corr_pd.set_index([row_labels])

### Load csv to datatable dataframe 

In [None]:
# Directory containing the input CSV files.
FPATH = "../input/amex-default-prediction/"

# Read the training data with datatable's fread.
train_df = dt.fread(FPATH+"train_data.csv")



In [None]:
# Read the labels file into its own frame; it is joined onto train_df in a later cell.
train_labels_df = dt.fread(FPATH+"train_labels.csv")

In [None]:
# Inspect the freshly loaded training frame.
train_df.view()

In [None]:
# Inspect the labels frame.
train_labels_df.view()

In [None]:
# Key the labels frame on customer_ID; presumably required so that the
# dt.join in the following cell can match rows on that column.
train_labels_df.key = "customer_ID"


In [None]:
# Keep all rows and columns of train_df and add the columns of the labels frame via a join.
# NOTE(review): overwrites train_df, and train_labels_df is deleted right after,
# so this cell cannot be re-run without reloading both files.
train_df = train_df[:, :, dt.join(train_labels_df)]

In [None]:
# The labels frame has been joined into train_df; drop the standalone name.
del train_labels_df

In [None]:
# Run a garbage-collection pass now that train_labels_df has been deleted.
gc.collect()

### Column Descriptions :

- customer_ID = Unique customer ID
- D_* = Delinquency variables
- S_* = Spend variables
- P_* = Payment variables
- B_* = Balance variables
- R_* = Risk variables

In [None]:
# Report the dimensions of the training frame after the label join.
print("Train shape (nrows, ncols): ",train_df.shape)   # (nrows, ncols)


In [None]:
# Tally how many columns share each entry of train_df.stypes.
Counter(train_df.stypes)

### Customer_ID

There are 458913 unique customer IDs in the train dataset, and from the two graphs below we can see that each customer has one record per date.

In [None]:
# Count of customer_ID values versus number of distinct customer_ID values.
id_count = train_df[:, dt.count(dt.f.customer_ID)].to_pandas().values[0][0]
id_unique = dt.unique(train_df["customer_ID"]).to_pandas().shape[0]
print("Customer_ID (qrows, unique):", id_count, id_unique)

In [None]:
# Rows per month: S_2 is collapsed to the first day of its month before grouping.
month_start = dt.time.ymd(dt.time.year(dt.f.S_2), dt.time.month(dt.f.S_2), 1)
rows_per_month = train_df[:, dt.count(dt.f.customer_ID), dt.by(month_start)]

count_df = rows_per_month.to_pandas()
count_df.columns = ['date', 'qcustomerid']

plt.figure(figsize=(12,8))
ax = sns.barplot(x="date", y="qcustomerid", data=count_df, ci = 0)
ax.tick_params(axis='x', rotation=90)
ax.set(title='Number row per date transaction')
plt.show()

In [None]:

# Distinct customers per month: S_2 is collapsed to the first day of its month.
month_start = dt.time.ymd(dt.time.year(dt.f.S_2), dt.time.month(dt.f.S_2), 1)

# NOTE(review): dt.f[0] assumes customer_ID is the first column of train_df.
df_tmp = train_df[:,{'customer_ID': dt.f[0], 'date': month_start}].to_pandas()
count_df = df_tmp.groupby('date').nunique().reset_index()
count_df.columns = ['date', 'quniq_customerid']

# df_tmp is a pandas copy with one row per training row; release it once
# aggregated, otherwise it stays referenced and gc.collect() cannot reclaim it.
del df_tmp

plt.figure(figsize=(12,8))
ax = sns.barplot(x="date", y="quniq_customerid", data=count_df, ci = 0)
ax.tick_params(axis='x', rotation=90)
ax.set(title='Number unique customerId per date transaction')
plt.show()


In [None]:
# Run a garbage-collection pass after the customer_ID plots.
gc.collect()

### Null values

In [None]:
# Missing-value count per column, as a two-column table (variable, qnull).
null_counts = train_df.countna().to_pandas().T.reset_index()
null_counts.columns = ['variable', 'qnull']

# Keep only columns that actually have missing values, smallest count first.
has_nulls = null_counts['qnull'] > 0
train_null = null_counts[has_nulls].sort_values(by='qnull', ascending=True)

plt.figure(figsize=(28,8))
ax = sns.barplot(x="variable", y="qnull", data=train_null, ci = 0)
ax.tick_params(axis='x', rotation=90)
ax.set(title='Variable null distribution')
plt.show()

### Describe

In [None]:
# Per-column summary statistics, one row per train_df column.
# Label and statistic are declared together so the column headers cannot
# drift out of step with the concatenation order.
summary_stats = {
    'nunique': train_df.nunique(),
    'max': train_df.max(),
    'min': train_df.min(),
    'mean': train_df.mean(),
    'stddev': train_df.sd(),
    'mode': train_df.mode(),
}

# Each statistic frame is transposed so that train_df's column names become
# the row index, then the statistics are laid side by side.
summary_df = pd.concat(
    [stat.to_pandas().T for stat in summary_stats.values()],
    axis=1,
)
summary_df.columns = list(summary_stats)
summary_df

### Categorical vars and target

In [None]:
# Row-count bar chart for each categorical feature and for the target,
# one panel per variable on a 4x3 grid.
count_plot_vars = [
    "B_30", "B_38", "D_114",
    "D_116", "D_117", "D_120",
    "D_126", "D_63", "D_64",
    "D_66", "D_68", "target",
]

fig, ax = plt.subplots(4, 3, figsize=(20, 20))

# ax.flat walks the grid row by row, so panels follow the list order above.
for cat_var, panel in zip(count_plot_vars, ax.flat):
    # The count column is read back under the name "customer_ID" (used as y below).
    agg_df = train_df[:, dt.count(dt.f.customer_ID), dt.by(cat_var)].to_pandas()
    sns.barplot(x=cat_var, y="customer_ID", data=agg_df, ci=0, ax=panel)

# plt.show() matches the other plotting cells of this notebook.
plt.show()

### Target

In [None]:
# Number of distinct customers for each target value.
target_pd = train_df[:, ['customer_ID', 'target']].to_pandas()
agg_df = target_pd.groupby('target').nunique().reset_index()
ax = sns.barplot(x='target', y="customer_ID", data=agg_df, ci = 0)

In [None]:
import random

# Reproducible 10% row sample, used by the box-plot cells below.
# The row count is read from the frame instead of being hard-coded, and the
# generator is seeded so that re-running the notebook draws the same rows.
SAMPLE_SEED = 42
n_rows = train_df.nrows
sample = random.Random(SAMPLE_SEED).sample(range(n_rows), n_rows // 10)

# Columns left out of the per-family numeric plots: the categorical features
# plus S_2, which this notebook handles as a date.
cat_vars = ['B_30','B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68','S_2']

### Delinquency variables

In [None]:
# Box plot of every non-categorical delinquency (D_*) feature against the
# target, drawn on the row sample.
delinquency_vars = [s for s in train_df.names if s.startswith("D_") and s not in cat_vars]

fig, ax = plt.subplots(13, 7, figsize=(28, 28))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and delinquency_vars is left intact.
for num_var, panel in zip(reversed(delinquency_vars), ax.flat):
    agg_df = train_df[sample, [num_var, 'target']].to_pandas()
    sns.boxplot(data=agg_df, y=num_var, x='target', ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
# Correlation heatmap of the non-categorical delinquency features.
excluded = set(cat_vars) | {'customer_ID', 'S_2'}
delinquency_vars = [name for name in train_df.names if "D_" in name and name not in excluded]

corr = frame_corr(train_df[:, delinquency_vars])

f, ax = plt.subplots(figsize=(34, 34))
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

In [None]:
NBINS = 1000

# Histogram of every non-categorical delinquency (D_*) feature over all rows.
delinquency_vars = [s for s in train_df.names if s.startswith("D_") and s not in cat_vars]

fig, ax = plt.subplots(13, 7, figsize=(34, 28))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and delinquency_vars is left intact.
for num_var, panel in zip(reversed(delinquency_vars), ax.flat):
    # dt.cut maps each value to the id of its equal-width bin (per the
    # datatable docs), so the histogram is drawn over bin ids, not raw values.
    agg_df = train_df[:, dt.cut(dt.f[num_var], nbins=NBINS)].to_pandas()
    sns.distplot(agg_df[num_var], kde=False, color='red', bins=NBINS, ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

### Spend variables

In [None]:
# Box plot of every spend (S_*) feature against the target, drawn on the row
# sample. S_2 is dropped through cat_vars.
spend_vars = [s for s in train_df.names if s.startswith("S_") and s not in cat_vars]

fig, ax = plt.subplots(3, 7, figsize=(24, 12))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so a feature
# count that differs from the 3x7 grid no longer raises IndexError.
for num_var, panel in zip(reversed(spend_vars), ax.flat):
    agg_df = train_df[sample, [num_var, 'target']].to_pandas()
    sns.boxplot(data=agg_df, y=num_var, x='target', ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
NBINS = 1000

# Histogram of every spend (S_*) feature over all rows. S_2 is dropped
# through cat_vars.
spend_vars = [s for s in train_df.names if s.startswith("S_") and s not in cat_vars]

fig, ax = plt.subplots(3, 7, figsize=(24, 12))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so a feature
# count that differs from the 3x7 grid no longer raises IndexError.
for num_var, panel in zip(reversed(spend_vars), ax.flat):
    # dt.cut maps each value to the id of its equal-width bin (per the
    # datatable docs), so the histogram is drawn over bin ids, not raw values.
    agg_df = train_df[:, dt.cut(dt.f[num_var], nbins=NBINS)].to_pandas()
    sns.distplot(agg_df[num_var], kde=False, color='red', bins=NBINS, ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
# Correlation heatmap of the spend features.
excluded = set(cat_vars) | {'customer_ID', 'S_2'}
spend_vars = [name for name in train_df.names if "S_" in name and name not in excluded]

corr = frame_corr(train_df[:, spend_vars])

f, ax = plt.subplots(figsize=(28, 28))
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

### Payment variables

In [None]:
# Box plot of every payment (P_*) feature against the target, drawn on the
# row sample.
payment_vars = [s for s in train_df.names if s.startswith("P_") and s not in cat_vars]

fig, ax = plt.subplots(3, figsize=(6, 12))

# Panels are filled top to bottom, starting from the last feature name.
# zip stops when either the features or the panels run out, so a feature
# count other than three no longer raises IndexError.
for num_var, panel in zip(reversed(payment_vars), ax.flat):
    agg_df = train_df[sample, [num_var, 'target']].to_pandas()
    sns.boxplot(data=agg_df, y=num_var, x='target', ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
NBINS = 1000

# Histogram of every payment (P_*) feature over all rows.
payment_vars = [s for s in train_df.names if s.startswith("P_") and s not in cat_vars]

fig, ax = plt.subplots(3, figsize=(10, 12))

# Panels are filled top to bottom, starting from the last feature name.
# zip stops when either the features or the panels run out, so a feature
# count other than three no longer raises IndexError.
for num_var, panel in zip(reversed(payment_vars), ax.flat):
    # dt.cut maps each value to the id of its equal-width bin (per the
    # datatable docs), so the histogram is drawn over bin ids, not raw values.
    agg_df = train_df[:, dt.cut(dt.f[num_var], nbins=NBINS)].to_pandas()
    sns.distplot(agg_df[num_var], kde=False, color='red', bins=NBINS, ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
# Correlation heatmap of the payment features.
excluded = set(cat_vars) | {'customer_ID', 'S_2'}
payment_vars = [name for name in train_df.names if "P_" in name and name not in excluded]

corr = frame_corr(train_df[:, payment_vars])

f, ax = plt.subplots(figsize=(6, 6))
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

### Balance variables

In [None]:
# Box plot of every non-categorical balance (B_*) feature against the target,
# drawn on the row sample.
balance_vars = [s for s in train_df.names if s.startswith("B_") and s not in cat_vars]

fig, ax = plt.subplots(7, 6, figsize=(26, 13))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and balance_vars is left intact.
for num_var, panel in zip(reversed(balance_vars), ax.flat):
    agg_df = train_df[sample, [num_var, 'target']].to_pandas()
    sns.boxplot(data=agg_df, y=num_var, x='target', ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
NBINS = 1000

# Histogram of every non-categorical balance (B_*) feature over all rows.
balance_vars = [s for s in train_df.names if s.startswith("B_") and s not in cat_vars]

fig, ax = plt.subplots(7, 6, figsize=(24, 28))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and balance_vars is left intact.
for num_var, panel in zip(reversed(balance_vars), ax.flat):
    # dt.cut maps each value to the id of its equal-width bin (per the
    # datatable docs), so the histogram is drawn over bin ids, not raw values.
    agg_df = train_df[:, dt.cut(dt.f[num_var], nbins=NBINS)].to_pandas()
    sns.distplot(agg_df[num_var], kde=False, color='red', bins=NBINS, ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
# Correlation heatmap of the non-categorical balance features.
balance_vars = [name for name in train_df.names if "B_" in name and name not in cat_vars]

corr = frame_corr(train_df[:, balance_vars])

f, ax = plt.subplots(figsize=(28, 28))
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

### Risk variables

In [None]:
# Box plot of every risk (R_*) feature against the target, drawn on the row
# sample.
risk_vars = [s for s in train_df.names if s.startswith("R_") and s not in cat_vars]

fig, ax = plt.subplots(7, 4, figsize=(20, 16))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and risk_vars is left intact.
for num_var, panel in zip(reversed(risk_vars), ax.flat):
    agg_df = train_df[sample, [num_var, 'target']].to_pandas()
    sns.boxplot(data=agg_df, y=num_var, x='target', ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
NBINS = 1000

# Histogram of every risk (R_*) feature over all rows.
risk_vars = [s for s in train_df.names if s.startswith("R_") and s not in cat_vars]

fig, ax = plt.subplots(7, 4, figsize=(28, 28))

# Panels are filled row by row, starting from the last feature name.
# zip stops when either the features or the panels run out, so leftover
# panels stay blank and risk_vars is left intact.
for num_var, panel in zip(reversed(risk_vars), ax.flat):
    # dt.cut maps each value to the id of its equal-width bin (per the
    # datatable docs), so the histogram is drawn over bin ids, not raw values.
    agg_df = train_df[:, dt.cut(dt.f[num_var], nbins=NBINS)].to_pandas()
    sns.distplot(agg_df[num_var], kde=False, color='red', bins=NBINS, ax=panel)

gc.collect()

# plt.show() matches the other plotting cells of this notebook.
plt.show()

In [None]:
# Correlation heatmap of the risk features.
risk_vars = [name for name in train_df.names if "R_" in name and name not in cat_vars]

corr = frame_corr(train_df[:, risk_vars])

f, ax = plt.subplots(figsize=(24, 24))
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

### To be continued..