In [None]:
import pandas as pd, numpy as np
from matplotlib import gridspec
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msnum 

# Display settings: render floats with five decimals and show up to 500 rows.
pd.set_option('display.float_format', '{:.5f}'.format)
pd.set_option('display.max_rows', 500)

# Read the first 100,000 rows of the training features, parse the S_2 column
# as datetimes, then left-join the labels file on customer_ID.
df = pd.read_csv('../input/amex-default-prediction/train_data.csv', nrows=100_000)
df['S_2'] = pd.to_datetime(df['S_2'])
labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
df = pd.merge(df, labels, on='customer_ID', how='left')
# The labels frame is not needed after the merge; drop the reference and collect.
del labels
gc.collect()


### Let's look at the distribution of NA values across the dataset
Also, let's remove the columns in which more than 70% of the values are NA

In [None]:
# Fraction of missing values per column, as a frame with columns 'index' and 'total'.
col_na_count = df.isna().sum().div(df.shape[0]).reset_index(name='total')
# Remove every column whose missing fraction is above 0.70.
rem_cols = col_na_count.loc[col_na_count['total'] > 0.70, 'index'].tolist()
df.drop(columns=rem_cols, inplace=True)
# Recompute the missing fractions for the columns that remain.
col_na_count = df.isna().sum().div(df.shape[0]).reset_index(name='total')

### Analysis of missing values per column

In [None]:
# Line plot of the missing fraction of each remaining column.
ax = col_na_count.plot(x='index', y='total', figsize=(40, 15))

# One tick per column, labelled with the column name and rotated by 90 degrees.
tick_labels = [str(name) for name in col_na_count['index']]
ax.set_xticks(range(len(tick_labels)));
ax.set_xticklabels(tick_labels, rotation=90);

### Let's look at the Delinquency variables

In [None]:
### Helper functions
def _annotate_bars(ax, percent_of):
    """Label every bar in ``ax`` with ``percent_of(bar height)`` formatted as 'N%'."""
    for p in ax.patches:
        height = p.get_height()
        # Bars without data can carry a NaN height; there is nothing to label.
        if np.isnan(height):
            continue
        # round(float(...)) works whether the height is a NumPy or a Python number.
        txt = str(round(float(percent_of(height)), 2)) + '%'
        ax.text(p.get_x(), height, txt)


def plot_graphs(x, i, type_of_graph='normal', data=None):
    """Plot how the values of column ``x`` are distributed.

    Parameters
    ----------
    x : str
        Name of the column to plot.
    i : int
        Unused; kept so that existing positional calls keep working.
    type_of_graph : str, default 'normal'
        'catplot' draws, for every value of ``x``, the percentage of rows per
        ``target`` value as bars, plus a horizontal line at the overall
        percentage of rows with ``target == 1``. Any other value draws a count
        plot of ``x`` in which each bar is labelled with its share of all rows.
    data : pandas.DataFrame, optional
        Frame to plot. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    frame = df if data is None else data
    y = 'target'

    if type_of_graph == 'catplot':
        # Overall percentage of rows with target == 1, drawn as a baseline.
        perc = (frame[y] == 1).mean() * 100

        # Percentage of each target value within every value of x. The rename
        # keeps reset_index from colliding with the 'target' index level.
        df1 = (
            frame.groupby(x)[y]
            .value_counts(normalize=True)
            .mul(100)
            .rename('percent')
            .reset_index()
        )
        g = sns.catplot(x=x, y='percent', hue=y, kind='bar', data=df1, height=10, aspect=0.9)
        g.ax.set_ylim(0, 100)
        # Draw on the grid's own axes rather than pyplot's implicit current axes.
        g.ax.axhline(y=perc, color='r', linestyle='-', label="% of defaulters")
        g.ax.text(0, perc, "Baseline credit default Percentage")
        for axes in g.axes.flat:
            axes.tick_params(axis='x', labelrotation=90)
        # Bar heights are already percentages here.
        _annotate_bars(g.ax, lambda height: height)
        plt.show()
    else:
        total = float(len(frame))
        plt.figure(figsize=(7, 6))
        ax = sns.countplot(x=frame[x])
        # Bar heights are row counts; convert them to a share of all rows.
        _annotate_bars(ax, lambda count: (count / total) * 100)
        plt.xticks(rotation=90)
        plt.show()

In [None]:
# Names of all columns that contain the substring 'D_'.
delinq_cols = df.columns[df.columns.str.contains('D_', regex=False)].tolist()

In [None]:
# Split the delinquency columns by dtype: numeric ones first, everything else
# (the complement, in the original column order) is treated as categorical.
numerical_delinq_cols = list(df[delinq_cols].select_dtypes(include='number').columns)
categorical_delinq_cols = [name for name in delinq_cols if name not in numerical_delinq_cols]

In [None]:
# Draw the 'catplot' chart for each categorical delinquency column.
for position, column_name in enumerate(categorical_delinq_cols):
    plot_graphs(column_name, position, type_of_graph='catplot')

#### Identify relationships between different variables

In [None]:
# Grid of D_64 histograms: one panel column per D_63 value and one panel row
# per target value, with the hue levels drawn using the colours 'r' and 'b'.
d = dict(color=['r', 'b'])
g = sns.FacetGrid(data=df, row="target", col="D_63", hue='target', hue_kws=d)
g.map_dataframe(sns.histplot, x="D_64")

In [None]:
# Add 'target' to the numeric column list only once, so that re-running this
# cell does not append a duplicate entry.
if 'target' not in numerical_delinq_cols:
    numerical_delinq_cols = numerical_delinq_cols + ['target']

# Size the grid from the number of columns instead of a fixed 9x9, so that no
# column is silently skipped by zip() when there are more than 81 of them.
n_grid_cols = 9
n_grid_rows = max(1, -(-len(numerical_delinq_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(20, 20 * n_grid_rows / 9),  # same height per row as the 9x9, 20x20 layout
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.ravel()  # flattening the array makes indexing easier

# One density plot per column, split by target value.
for col, ax in zip(numerical_delinq_cols, axes):
    sns.kdeplot(data=df, x=col, hue='target', ax=ax, warn_singular=False)

# Hide the panels of the last row that have no column assigned.
for unused_ax in axes[len(numerical_delinq_cols):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# 20-bin histogram of every column listed in numerical_delinq_cols.
df.hist(column=numerical_delinq_cols, bins=20, figsize=(14, 10), color='g')
plt.show()

### Let's look at the correlations as well

In [None]:
# Absolute pairwise correlations between the columns in numerical_delinq_cols.
correlations_target = df[numerical_delinq_cols].corr().abs()

# Select the strict upper triangle of the correlation matrix so that every
# pair of columns is considered exactly once. The builtin bool is used because
# the np.bool alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(correlations_target.shape, dtype=bool), k=1)
upper = correlations_target.where(upper_mask)
# Flag a column when its correlation with any column before it exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

In [None]:
# Annotated correlation heatmap of the columns that were not flagged in to_drop.
dd = [name for name in numerical_delinq_cols if name not in to_drop]
plt.figure(figsize=(100, 50))
sns.heatmap(df[dd].corr(), annot=True, cmap='Spectral')
plt.show()

##### We have reduced our original Delinquency columns from 76 to 69 based on correlation alone. We should be able to look at the target distribution and shortlist a set of columns that will be useful for our predictions.

### WIP