In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
from tqdm.auto import tqdm
import plotly.express as px
import dask.dataframe as dd

# CONTENT TABLE
----------------
* [1) Introduction](#01)
* [2) Variables Groups](#02)
* [3) Null Variables](#03)
* [4) Types of Variable](#04)
* [5) Correlation of Variables](#05)
* [6) Analysing groups correlation](#06)
    * [6.1) D Variables](#06.1)
    * [6.2) S Variables](#06.2)
    * [6.3) P Variables](#06.3)
    * [6.4) R Variables](#06.4)
    * [6.5) B Variables](#06.5)
* [7) Summary](#07)
  

<a id="01"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Introduction </p>
<div style="font-family: courier; font-size:20px">
<li>This notebook is a brief analysis of the Amex train dataset. The original dataset was used, without any transformation of the variables (reading one of the many discussion posts about reducing the data size is highly recommended).
    
<li>To read the data, I used the dask.dataframe library and the main plots were done with plotly. Other libraries recommended to cope with the huge amount of data are the cudf and dask_cudf. <br>

<li>The main observations in this notebook concern the null values/variables in the training dataset and the correlation between the variables, including their correlation with the target.
</div>

In [None]:
# Open the competition CSVs as dask dataframes (statement-level features and
# the per-customer labels). Later cells call .compute() on what they need.
df = dd.read_csv('../input/amex-default-prediction/train_data.csv')
y = dd.read_csv('../input/amex-default-prediction/train_labels.csv')

In [None]:
# The row count of a dask dataframe is lazy and needs an explicit compute();
# the column count is read directly from the shape tuple.
n_registers = df.shape[0].compute()
n_columns = df.shape[1]
print(f'Total number of registers {n_registers}, with {n_columns} columns')

<a id="02"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Variables Groups </p>
<div style="font-family: courier; font-size:20px">
    Separating the variables according to the competition data description  <br><br>
        <b>Variables</b> 
        <li>D_* = Delinquency variables</li>
        <li>S_* = Spend variables</li>
        <li>P_* = Payment variables</li>
        <li>B_* = Balance variables</li>
        <li>R_* = Risk variables</li>
</div>

In [None]:
# Function to separate each variable into its category
def variables(cols, verbose = True):
    """Split column names into the competition's variable families.

    A column belongs to a family when the text before its first underscore is
    exactly the family letter (``S_2`` -> ``S``, ``D_39`` -> ``D``). Matching on
    the prefix, rather than on "letter appears anywhere in the name", keeps a
    name such as ``D_SUM`` in ``D`` instead of mis-filing it under ``S``.

    Parameters
    ----------
    cols : iterable of str
        Column names to classify.
    verbose : bool, default True
        When True, print the group keys and the number of groups.

    Returns
    -------
    dict
        Keys ``'S'``, ``'D'``, ``'B'``, ``'R'``, ``'P'`` and ``'idx'``; each
        value is the list of matching column names in input order. ``'idx'``
        holds ``customer_ID``. Columns matching no family are left out.
    """
    vars_groups = {'S':[],"D":[],
            "B":[],"R":[],"P":[], 'idx':[]}
    families = ('S', 'D', 'B', 'R', 'P')

    for c in cols:
        if c == 'customer_ID':
            vars_groups['idx'].append(c)
            continue

        # Only the part before the first underscore identifies the family.
        family = c.split('_', 1)[0]
        if family in families:
            vars_groups[family].append(c)

    if verbose:
        print(' Groups:', vars_groups.keys(),'\n','Number of Groups:', len(vars_groups))
    return vars_groups

In [None]:
# Classify every column of the training data; the resulting dict is reused by
# the per-group sections further down.
vars_groups = variables(df.columns)

In [None]:
# Tabulate how many columns fell into each family and echo the counts.
print('Number of variables for each categories ')
table = {
    'Group': list(vars_groups.keys()),
    'Total': [len(members) for members in vars_groups.values()],
}
for group_name, group_size in zip(table['Group'], table['Total']):
    print(group_name, group_size, ' variables')

In [None]:
# Bar chart of the number of variables per family, largest family first.
legend_layout = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.9,
    title='Types of Variables',
)

table_df = pd.DataFrame(table).sort_values(by='Total', ascending=False)

fig = px.bar(
    table_df,
    x="Group",
    y="Total",
    color='Group',
    text='Total',
    width=800,
    height=600,
)

# Left as the final expression of the cell, exactly like the original layout call.
fig.update_layout(legend=legend_layout)

<a id="03"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Null Variables </p>
<div style="font-family: courier; font-size:20px">
</div>

In [None]:
# Fraction of missing values per column, materialised from dask and then
# ordered from the most to the least incomplete column.
null_fraction = df.isnull().mean().compute()
nulls = null_fraction.sort_values(ascending=False)

In [None]:
# Null-share threshold, expressed as a fraction (0.6 == 60 % of the rows).
thr = 0.6

# Variables whose share of nulls exceeds the threshold; computed before the
# plot so the title and the list come from the same mask.
above_thr = nulls > thr
nulls_var = list(nulls[above_thr].index)

# `thr` is a fraction, so it is formatted with the percent specifier: the
# previous title printed "0.6%" for what is really 60 %.
fig = px.bar(nulls,
        color = above_thr.values,
        title  = f'Percentage of Nulls | Red-> higher than {thr:.0%} of nulls | With {len(nulls_var)}/{len(nulls)} variables above threshold',
        color_discrete_map={1: 'red', 0:'blue'})

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.9,
    title= 'Above Threshold'
))
fig.show()

<a id="04"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Types of Variable </p>
<div style="font-family: courier; font-size:20px">

</div>

In [None]:
# Column name -> dtype, taken from the dataframe metadata in one go.
vars_type = df.dtypes.to_dict()

# Count columns per dtype. The count column is named explicitly through
# reset_index(name=...): after value_counts() its default name is 0 on
# pandas < 2 and 'count' on pandas >= 2, so renaming {0: 'Count'} silently did
# nothing on newer versions and the plot could not find 'Count'.
df_table = (
    pd.Series(vars_type, name='types')
    .astype(str)
    .value_counts()
    .rename_axis('types')
    .reset_index(name='Count')
)

fig =px.bar(df_table, 
            x = 'types', 
            y = 'Count',
            text = 'Count',
            color ='types',
            width=800, height=400)


fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.9,
    title= 'Types of Variables'
))

fig.show()

<a id="05"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Correlation of Variables </p>
<div style="font-family: courier; font-size:20px">
    <li> Group by customer ID and keep the last available value of each variable
    <li> Merge target column to the grouped data
</div>

In [None]:
# One row per customer: the last entry of every column within each customer's
# group, materialised as in-memory pandas frames indexed by customer_ID.
last_statement = df.groupby('customer_ID').last().compute()
y_grouped = y.groupby('customer_ID').last().compute()

# Both operands are already pandas objects, so merge with pandas directly
# instead of routing through dask. This guarantees that df_grouped is a plain
# pandas DataFrame for the correlation cells that follow.
df_grouped = last_statement.merge(y_grouped, on = 'customer_ID')

In [None]:
# Correlate numeric columns only: string columns make DataFrame.corr() raise
# on recent pandas versions, while older versions dropped them silently.
numeric_features = df_grouped.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()['target']
corr = corr.reset_index()

# Remove the target's correlation with itself by name rather than by assuming
# it is the first row after sorting.
corr_to_plot = corr[corr['index'] != 'target'].sort_values('target', ascending = False)

plt.figure(figsize = (20,30))
sns.barplot(y = 'index', x = 'target', data = corr_to_plot, palette = 'magma')
plt.title('Variables Correlation to the Target')

<div style="font-family: courier; font-size:20px">
    <li> Top positive and negative correlated variables to the target
</div>

In [None]:
# Rank every variable except the target itself. Undefined correlations (NaN,
# e.g. from a constant column) are dropped first: sort_values places NaN last,
# so they would otherwise end up in the "most negative" slice.
ranked = (
    corr[corr['index'] != 'target']
    .dropna(subset = ['target'])
    .sort_values('target', ascending = False)
)
top10_pos = ranked.head(10)
top10_neg = ranked.tail(10)

# Variables with the strongest positive or negative link to the target.
best_cor_vars = [*top10_pos['index'], *top10_neg['index']]
print('Top 10 positive Correlation with target\n',top10_pos,'\n')
print('Top 10 negative Correlation with target\n', top10_neg)


<a id="06"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Analysing groups correlation </p>
<div style="font-family: courier; font-size:20px">
In this section, a correlation analysis is performed for each of the variable groups.
</div>

<a id="06.1"></a>
# <p style="background-color:#002663;height: 30px;text-align: center;vertical-align: middle;line-height: 30px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> D Group </p>

In [None]:
# Restrict the per-customer frame to the delinquency (D) columns.
d_columns = vars_groups['D']
df_v = df_grouped.loc[:, d_columns]

In [None]:
# Absolute correlation above which two variables are treated as redundant.
thr_c = 0.6


def correlation_filter(df_v, thr_c = thr_c, protected = None):
    """Find strongly inter-correlated columns and propose which ones to drop.

    Parameters
    ----------
    df_v : pandas.DataFrame
        Columns of one variable group. Non-numeric columns are ignored for the
        correlation but are still listed in the returned "variables to use".
    thr_c : float
        Two columns count as correlated when their absolute correlation is
        strictly greater than this value.
    protected : iterable of str, optional
        Column names that must never be proposed for removal. When omitted,
        the notebook-level ``best_cor_vars`` list is used.

    Returns
    -------
    var_corr : pandas.DataFrame
        One row per column that has at least one correlated partner, with
        columns ``level_0`` (the column), ``list`` (its partners, strongest
        first) and ``len`` (number of partners).
    to_remove : list of str
        Sorted names of the columns proposed for removal.
    other_Vars : list of str
        Columns of ``df_v`` that are not in ``to_remove``, in original order.
    """
    if protected is None:
        protected = best_cor_vars
    protected = set(protected)

    # corr() raises on string columns in recent pandas, so use numeric data only.
    numeric = df_v.select_dtypes(include=['number', 'bool'])

    # Long format: one row per ordered pair (level_0, level_1).
    pairs = (
        numeric.corr().abs().unstack()
        .rename_axis(['level_0', 'level_1'])
        .rename('corr_value')
        .reset_index()
    )

    # Drop self-pairs by name. Filtering on corr_value < 1 would also discard
    # distinct columns that are perfectly correlated.
    strong = pairs[(pairs['level_0'] != pairs['level_1']) & (pairs['corr_value'] > thr_c)]
    strong = strong.sort_values('corr_value', ascending = False, kind = 'stable')

    # Collect the partners of every variable, strongest correlation first.
    partners = {}
    for left, right in zip(strong['level_0'], strong['level_1']):
        partners.setdefault(left, []).append(right)

    ordered = sorted(partners)
    var_corr = pd.DataFrame({
        'level_0': ordered,
        'list': [partners[name] for name in ordered],
        'len': [len(partners[name]) for name in ordered],
    })

    # Greedy selection: each surviving variable evicts its unprotected
    # partners. A variable that has already been evicted is skipped, so in a
    # chain A~B~C removing B no longer removes C as well.
    kept = []
    dropped = set()
    for var, partner_list in zip(var_corr['level_0'], var_corr['list']):
        if var in dropped:
            continue
        for var_2 in partner_list:
            if var_2 not in protected and var_2 not in kept:
                dropped.add(var_2)
        kept.append(var)

    to_remove = sorted(dropped)
    other_Vars = [col for col in df_v.columns if col not in dropped]
    return var_corr, to_remove,other_Vars

In [None]:
# Run the correlation filter on the D columns currently held in df_v:
# partner table, columns proposed for removal, columns left to use.
var_corr_d, vars_filter_d, vars_filtered_d = correlation_filter(df_v)

In [None]:
# Display the correlated-partner table for the D group.
var_corr_d

In [None]:
# Number of strongly correlated partners per D variable, most connected first.
# The title previously read "correlateded".
fig =px.bar(var_corr_d.sort_values(by = 'len', ascending = False), 
            x = 'level_0', 
            y = 'len',
            text = 'len',
            color ='level_0',
            title = f'Number of correlated variables in D above threshold of {thr_c}',
            width=1800, height=600)

fig.show()

<a id="06.2"></a>
# <p style="background-color:#002663;height: 30px;text-align: center;vertical-align: middle;line-height: 30px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> S Group </p>

In [None]:
# Restrict the per-customer frame to the spend (S) columns.
s_columns = vars_groups['S']
df_v = df_grouped.loc[:, s_columns]

In [None]:
# Run the correlation filter on the S columns currently held in df_v:
# partner table, columns proposed for removal, columns left to use.
var_corr_s, vars_filter_s, vars_filtered_s= correlation_filter(df_v)

In [None]:
# Display the correlated-partner table for the S group.
var_corr_s

In [None]:
# Number of strongly correlated partners per S variable, most connected first.
# The title previously read "correlateded".
fig =px.bar(var_corr_s.sort_values(by = 'len', ascending = False), 
            x = 'level_0', 
            y = 'len',
            text = 'len',
            color ='level_0',
            title = f'Number of correlated variables in S above threshold of {thr_c}',
            width=1800, height=600)

fig.show()

<a id="06.3"></a>
# <p style="background-color:#002663;height: 30px;text-align: center;vertical-align: middle;line-height: 30px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> P Group </p>

In [None]:
# Restrict the per-customer frame to the payment (P) columns.
p_columns = vars_groups['P']
df_v = df_grouped.loc[:, p_columns]

In [None]:
# Run the correlation filter on the P columns currently held in df_v:
# partner table, columns proposed for removal, columns left to use.
var_corr_p, vars_filter_p,vars_filtered_p = correlation_filter(df_v)


In [None]:
# Display the correlated-partner table for the P group.
var_corr_p


<a id="06.4"></a>
# <p style="background-color:#002663;height: 30px;text-align: center;vertical-align: middle;line-height: 30px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> R Group </p>

In [None]:
# Restrict the per-customer frame to the risk (R) columns.
r_columns = vars_groups['R']
df_v = df_grouped.loc[:, r_columns]

In [None]:
# Run the correlation filter on the R columns currently held in df_v:
# partner table, columns proposed for removal, columns left to use.
var_corr_r, vars_filter_r, vars_filtered_r= correlation_filter(df_v)


In [None]:
# Display the correlated-partner table for the R group.
var_corr_r

In [None]:
# Number of strongly correlated partners per R variable, most connected first.
# The title previously read "correlateded".
fig =px.bar(var_corr_r.sort_values(by = 'len', ascending = False), 
            x = 'level_0', 
            y = 'len',
            text = 'len',
            color ='level_0',
            title = f'Number of correlated variables in R above threshold of {thr_c}',
            width=1800, height=600)

fig.show()

<a id="06.5"></a>
# <p style="background-color:#002663;height: 30px;text-align: center;vertical-align: middle;line-height: 30px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> B Group </p>

In [None]:
# Restrict the per-customer frame to the balance (B) columns.
b_columns = vars_groups['B']
df_v = df_grouped.loc[:, b_columns]


In [None]:
# Run the correlation filter on the B columns currently held in df_v:
# partner table, columns proposed for removal, columns left to use.
var_corr_b, vars_filter_b,vars_filtered_b = correlation_filter(df_v)


In [None]:
# Display the correlated-partner table for the B group.
var_corr_b

In [None]:
# Number of strongly correlated partners per B variable, most connected first.
# The title previously read "correlateded".
fig =px.bar(var_corr_b.sort_values(by = 'len', ascending = False), 
            x = 'level_0', 
            y = 'len',
            text = 'len',
            color ='level_0',
            title = f'Number of correlated variables in B above threshold of {thr_c}',
            width=1800, height=600)

fig.show()

<a id="07"></a>
# <p style="background-color:#002663;height: 60px;text-align: center;vertical-align: middle;line-height: 60px;;font-family:courier;color:#FFFFFF;font-size:120%;text-align:center;border-radius:12px 12px;"> Summary </p>


In [None]:
# Recap of the findings: target-correlated variables, heavily-null variables,
# and the keep/drop proposal produced for every variable group.
print('Best Correlation with target:\n', best_cor_vars)
print(f'Above threshold of {thr} nulls:\n',nulls_var)

print('\nPossible Variables for each group\n')
group_results = (
    ('D', vars_filter_d, vars_filtered_d),
    ('S', vars_filter_s, vars_filtered_s),
    ('P', vars_filter_p, vars_filtered_p),
    ('R', vars_filter_r, vars_filtered_r),
    ('B', vars_filter_b, vars_filtered_b),
)
for group_label, group_to_filter, group_to_use in group_results:
    print(f'{group_label})\n\tVariables to filter:\n', group_to_filter,'\n\tVariables to use:\n',group_to_use)