In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score

# Let pandas display up to 999 rows and 999 columns before truncating the output,
# so the wide application table can be inspected in full.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Data

In [None]:
# Directory that holds the Home Credit Default Risk competition files.
DATA_DIR = '../input/home-credit-default-risk'


def _load_csv(file_name, label):
    """Read one competition CSV and report its shape.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``DATA_DIR``.
    label : str
        Human-readable name printed in front of the row/column counts.

    Returns
    -------
    pandas.DataFrame
        The loaded table.
    """
    frame = pd.read_csv(f'{DATA_DIR}/{file_name}')
    # Report the shape of the frame that was just loaded. The original cell
    # printed train.shape under the "test data" label.
    print("%s: %s rows and %s cols" % ((label,) + frame.shape))
    return frame


train = _load_csv('application_train.csv', 'train data')
test = _load_csv('application_test.csv', 'test data')
cc = _load_csv('credit_card_balance.csv', 'Credit Card Balance')
pmts = _load_csv('installments_payments.csv', 'Installments Payments')

## Explore Train Data

In [None]:
# Preview the first rows of the application training table.
train.head()

In [None]:
# Column names, dtypes and memory usage of the training table.
train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train.describe()

### Missing values

In [None]:
# Count the missing entries per column once, then show only the columns
# with more than 2000 missing values.
missing_per_column = train.isnull().sum()
missing_per_column[missing_per_column > 2000]

## Data Visualization - Train Data
1) The histogram shows that class 0 has far more records than class 1; the overall default rate is about 8%. **This is an imbalanced dataset.**

In [None]:
# Histogram of the binary default indicator.
train['TARGET'].plot(kind='hist', title='target var: default indicator')

In [None]:
# Number of records in each TARGET class.
train['TARGET'].value_counts()

2) The mean and the 75th percentile of the number of children are 0.417052 and 1, respectively. There are some **outliers** that we can remove before building the model.

In [None]:
# Histogram of the number of children per applicant.
train['CNT_CHILDREN'].plot(kind='hist', title='Num of Children')

In [None]:
# Box plot of the number of children; points beyond the whiskers are the outliers.
sns.boxplot(data=train, x='CNT_CHILDREN')

3) Visualization of some other significant features

In [None]:
# Box plot of total income, restricted to incomes below 5,000,000 / 10 = 500,000
# so the bulk of the distribution stays readable.
sns.set(rc={'figure.figsize': (18, 5)})
income_below_cap = train.loc[train['AMT_INCOME_TOTAL'] < 0.05e+07, 'AMT_INCOME_TOTAL']
ax = sns.boxplot(x=income_below_cap)
# One tick every 25,000 across the plotted range.
ax.set_xticks(range(0, 500000, 25000))
ax

In [None]:
# DAYS_BIRTH is negated and divided by 365 to obtain whole years (truncated),
# computed on the whole column at once instead of one Python-level call per row.
ages = (-train['DAYS_BIRTH'] / 365).astype(int).tolist()
sns.set(rc={'figure.figsize': (10, 5)})
# sns.distplot is deprecated; histplot (already used elsewhere in this notebook)
# draws the same count histogram with 10 bins and black bar edges.
sns.histplot(ages, bins=10, kde=False, edgecolor='black')
plt.title('Density of Ages')
plt.xlabel('Ages')
plt.ylabel('Frequency')

In [None]:
# Histogram of the REG_REGION_NOT_LIVE_REGION flag.
train['REG_REGION_NOT_LIVE_REGION'].plot(kind='hist', title='Non-match Address')

In [None]:
# Side-by-side histograms (with KDE overlay) of the two external score features.
fig, ax = plt.subplots(1, 2)
sns.histplot(data=train, x="EXT_SOURCE_2", kde=True, ax=ax[0])
sns.histplot(data=train, x="EXT_SOURCE_3", kde=True, ax=ax[1])
# plt.show() instead of fig.show(): the inline backend is non-GUI, so
# Figure.show() only emits a "cannot show the figure" warning there.
plt.show()

In [None]:
# sns.set(rc={'figure.figsize':(15,9)})
# fig, ax = plt.subplots(2,2)
# sns.histplot(data=train, x="OBS_30_CNT_SOCIAL_CIRCLE",kde=True,ax=ax[0,0])
# sns.histplot(data=train, x="DEF_30_CNT_SOCIAL_CIRCLE",kde=True,ax=ax[0,1])
# sns.histplot(data=train, x="OBS_60_CNT_SOCIAL_CIRCLE",kde=True,ax=ax[1,0])
# sns.histplot(data=train, x="DEF_60_CNT_SOCIAL_CIRCLE",kde=True,ax=ax[1,1])
# fig.show()

## Categorical Vars

In [None]:
# For every object-typed column: report its cardinality, then either print the
# value counts (at most 3 categories) or draw a count plot (more than 3).
object_columns = [name for name in train.columns if train[name].dtype == 'object']
for col in object_columns:
    n_unique = train[col].nunique()
    print("object column %s have %s unique values"%(str(col), n_unique))
    if n_unique > 3:
        grid = sns.catplot(x=col, kind="count", data=train, height=6, aspect=2.2)
        # Rotate the category labels so long names do not overlap.
        for axes in grid.axes.flat:
            axes.set_xticklabels(axes.get_xticklabels(), rotation=45, horizontalalignment='right')
        continue
    print(train[col].value_counts())
    print('-------------')

## Visualization - Credit Card Balance & Installments Payments

In [None]:
# Summary statistics of the numeric columns of the credit card balance table.
cc.describe()

In [None]:
# Summary statistics of the numeric columns of the installments payments table.
pmts.describe()

In [None]:
sns.set(rc={'figure.figsize':(18,5)})
# Box plot of the credit card balance with a tick every 100,000
# from -500,000 up to 1,400,000.
ax = sns.boxplot(data=cc, x='AMT_BALANCE')
balance_ticks = [tenth / 10 * 1e6 for tenth in range(-5, 15)]
ax.set_xticks(balance_ticks)
ax

In [None]:
# Histogram of the drawings count, limited to rows with fewer than 10 drawings.
sns.set(rc={'figure.figsize': (12, 5)})
few_drawings = cc.loc[cc['CNT_DRAWINGS_CURRENT'] < 10]
ax = sns.histplot(data=few_drawings, x="CNT_DRAWINGS_CURRENT")
ax.set_xticks(range(10))
ax

In [None]:
# Distribution of the DAYS_INSTALMENT column of the installments payments table.
sns.histplot(data=pmts, x="DAYS_INSTALMENT")

In [None]:
# Distribution of the DAYS_ENTRY_PAYMENT column of the installments payments table.
sns.histplot(data=pmts, x="DAYS_ENTRY_PAYMENT")

## Prepare Data for modeling

In [None]:
# Feature engineering: derive ratio features from Credit Card Balance and
# Installments Payments, then average everything per applicant (SK_ID_CURR).
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that never divides by zero.

    Parameters
    ----------
    numerator, denominator : pandas.Series
        Columns of the same frame (aligned on the same index).

    Returns
    -------
    pandas.Series
        The ratio, with NaN wherever the denominator is 0 or missing.
    """
    # where() masks zero denominators to NaN; missing denominators are already
    # NaN, so both cases produce NaN in the result.
    return numerator / denominator.where(denominator != 0)


# Drawing amounts expressed as a share of the actual credit limit.
credit_limit = cc['AMT_CREDIT_LIMIT_ACTUAL']
drawing_ratio_sources = {
    'AMT_DRAWINGS_PCT': 'AMT_DRAWINGS_CURRENT',
    'AMT_DRAWINGS_ATM_PCT': 'AMT_DRAWINGS_ATM_CURRENT',
    'AMT_DRAWINGS_OTHER_PCT': 'AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_POS_PCT': 'AMT_DRAWINGS_POS_CURRENT',
}
for pct_col, amount_col in drawing_ratio_sources.items():
    cc[pct_col] = _safe_ratio(cc[amount_col], credit_limit)

# Principal receivable as a share of the total receivable
# ("AMT_RECIVABLE" is the column name as spelled in the data set).
cc['AMT_PRINCIPAL_RECEIVABLE_PCT'] = _safe_ratio(cc['AMT_RECEIVABLE_PRINCIPAL'], cc['AMT_RECIVABLE'])

# Per-applicant mean of the balance, the ratio features, the drawing counts
# and the days-past-due columns.
cc_mean_cols = [
    'AMT_BALANCE',
    'AMT_DRAWINGS_PCT',
    'AMT_DRAWINGS_ATM_PCT',
    'AMT_DRAWINGS_OTHER_PCT',
    'AMT_DRAWINGS_POS_PCT',
    'AMT_PRINCIPAL_RECEIVABLE_PCT',
    'CNT_DRAWINGS_ATM_CURRENT',
    'CNT_DRAWINGS_CURRENT',
    'CNT_DRAWINGS_OTHER_CURRENT',
    'CNT_DRAWINGS_POS_CURRENT',
    'SK_DPD',
    'SK_DPD_DEF',
]
# The string 'mean' selects the same aggregation as np.mean without the
# deprecation warning newer pandas raises for callables passed to agg().
cc_use = cc.groupby(['SK_ID_CURR'], as_index=False).agg({col: 'mean' for col in cc_mean_cols})

# Difference between the scheduled installment day and the actual payment day.
pmts['DAYS_INSTALMENT_DIFF'] = pmts['DAYS_INSTALMENT'] - pmts['DAYS_ENTRY_PAYMENT']
# Paid amount as a share of the installment amount. The column name (including
# its "PATMENT" spelling) is kept unchanged because it becomes a model feature name.
pmts['AMT_PATMENT_PCT'] = _safe_ratio(pmts['AMT_PAYMENT'], pmts['AMT_INSTALMENT'])
pmts_use = pmts.groupby(['SK_ID_CURR'], as_index=False).agg({'DAYS_INSTALMENT_DIFF': 'mean',
                                                            'AMT_PATMENT_PCT': 'mean'})

In [None]:
# Build the modeling frame `train_use` from the raw application table.

# Drop columns that have too many missing values (more than 100,000 nulls).
cols_remove = train.columns[train.isnull().sum() > 100000]
train_use = train.drop(cols_remove, axis=1)

# Remove outliers: keep applicants with at most 5 children and income <= 350,000.
train_use = train_use[(train_use.CNT_CHILDREN <= 5) & (train_use.AMT_INCOME_TOTAL <= 350000)]

# Merge the per-applicant Credit Card Balance and Installment Payments features.
# Left joins keep applicants that have no credit card / installment history.
train_use = train_use.merge(cc_use, on='SK_ID_CURR', how='left')
train_use = train_use.merge(pmts_use, on='SK_ID_CURR', how='left')

# Missing value imputation: assign an extreme value.
# Infinities are converted to NaN *before* imputing; the original cell did this
# after fillna, which would have left any such values as NaN in the model input.
train_use = train_use.replace([np.inf, -np.inf], np.nan)
train_use = train_use.fillna(-999)

# One-hot encode the categorical (object-typed) columns and put the dummy
# columns in front of the remaining non-categorical columns.
cat_cols = [col for col, dtype in train_use.dtypes.items() if dtype == 'object']
cat_cols_df = pd.get_dummies(train_use[cat_cols], drop_first=True)
train_use = pd.concat([cat_cols_df, train_use.drop(columns=cat_cols)], axis=1)

# Remove the ID column; it identifies the applicant and is not a feature.
train_use = train_use.drop(['SK_ID_CURR'], axis=1).reset_index(drop=True)

train_use.shape

## Build Decision Tree

In [None]:
# Separate the features from the TARGET label.
x = train_use.drop(['TARGET'], axis=1).copy()
y = train_use['TARGET'].copy()

# Hold out 30% of the rows for evaluation (fixed seed for a reproducible split).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Seed the tree as well: scikit-learn randomly permutes the features at each
# split, so without random_state the fitted tree (and every metric below)
# can change from run to run.
model = DecisionTreeClassifier(criterion='gini', random_state=0)
model = model.fit(x_train, y_train)

# Probability of the positive class (for AUC) and hard class labels (for F1 / recall).
y_pred_p = model.predict_proba(x_test)[:, 1]
y_pred = model.predict(x_test)

auc_test = roc_auc_score(y_test, y_pred_p)
print("AUC Performance: ", auc_test)
f1_test = f1_score(y_test, y_pred)
print("F1 Score: ", f1_test)
# NOTE: support-weighted recall over both classes is numerically equal to
# accuracy, so on this imbalanced target it is dominated by class 0.
recall_test = recall_score(y_test, y_pred, average='weighted')
print("Recall Score: ", recall_test)

In [None]:
# Actual class distribution in the hold-out set.
y_test.value_counts()

In [None]:
# Predicted class distribution on the hold-out set, for comparison with the actual one.
pd.Series(y_pred).value_counts()

## Check Feature Importance

In [None]:
# Pair every feature with its importance, rank from most to least important,
# and show the top 20.
feature_scores = zip(x_train.columns, model.feature_importances_)
ranked_features = sorted(feature_scores, key=lambda pair: pair[1], reverse=True)
pd.DataFrame(ranked_features, columns=['Name', 'Importance']).head(20)

## Plot evaluation metrics

In [None]:
# Confusion matrix of the hold-out predictions (rows: actual class, columns:
# predicted class), drawn as a heatmap annotated with integer counts.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Confusion matrix normalized over the true labels (each row sums to 1).
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(model,x_test, y_test, normalize='true')

In [None]:
# Precision-Recall Curve
# Use the predicted probabilities of the positive class, not the hard labels:
# precision_recall_curve sweeps a threshold over the scores, and with 0/1
# labels it can only produce a single operating point.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_p)

fig, ax = plt.subplots()
ax.plot(recall, precision, color='red')

ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')

plt.show()

In [None]:
# ROC curve of the fitted tree on the hold-out set.
# NOTE(review): metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use metrics.RocCurveDisplay.from_estimator.
metrics.plot_roc_curve(model, x_test, y_test)