In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import plot_confusion_matrix, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [None]:
import os

# Print the full path of every file found anywhere below the Kaggle input
# directory, so the available datasets are visible in the cell output.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Dataset pre-processing

In [None]:
# Load the raw loan-application table; every later cell derives from this frame.
application_data_path = '/kaggle/input/loan-defaulter/application_data.csv'
application_data = pd.read_csv(application_data_path)

In [None]:
# Pull out the TARGET column and display how many rows carry each distinct
# label value (the tuple of unique values and counts is the cell output).
label_vector = application_data.loc[:, 'TARGET']
np.unique(label_vector, return_counts=True)

In [None]:
# Columns kept from the raw application table: the model inputs followed by
# the TARGET label. The order below is the order used when slicing the frame.
dataset_columns = (
    # Applicant profile and loan amounts.
    [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'AMT_INCOME_TOTAL',
        'AMT_CREDIT',
        'AMT_ANNUITY',
        'NAME_INCOME_TYPE',
        'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'DAYS_BIRTH',
        'DAYS_EMPLOYED',
        'DAYS_ID_PUBLISH',
    ]
    # Contact flags, occupation and household size.
    + [
        'FLAG_MOBIL',
        'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        'FLAG_CONT_MOBILE',
        'FLAG_PHONE',
        'FLAG_EMAIL',
        'OCCUPATION_TYPE',
        'CNT_FAM_MEMBERS',
    ]
    # External scores and social-circle counters.
    + [
        'EXT_SOURCE_1',
        'EXT_SOURCE_2',
        'EXT_SOURCE_3',
        'OBS_30_CNT_SOCIAL_CIRCLE',
        'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE',
        'DAYS_LAST_PHONE_CHANGE',
    ]
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21, in ascending order.
    + ['FLAG_DOCUMENT_{}'.format(document_number) for document_number in range(2, 22)]
    # Credit-bureau enquiry counters.
    + [
        'AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
    ]
    # Label column, always last.
    + ['TARGET']
)

In [None]:
# Feature groups consumed by the column transformer: `categorical_features`
# are one-hot encoded, `numerical_features` are standardised. The two lists
# must be disjoint, otherwise a column is encoded twice.
#
# EXT_SOURCE_1 used to appear in both lists. It is filled with a numeric 0 and
# scaled like EXT_SOURCE_2/3, so one-hot encoding it as well would create one
# indicator column per distinct value. It is now listed as numerical only.
categorical_features = [
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'OCCUPATION_TYPE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
    'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
    'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21'
]
numerical_features = [
    'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
    'AMT_REQ_CREDIT_BUREAU_YEAR'
]

### Missing value treatment

In [None]:
# Replacement used for missing entries, keyed by column name. Columns that are
# not listed here are left untouched.
# NOTE(review): DAYS_LAST_PHONE_CHANGE is filled with a positive 3650 - confirm
# this matches the sign convention of the observed values in that column.
missing_value_fill = {
    'AMT_ANNUITY': 0,
    'OCCUPATION_TYPE': 'UNKNOWN',
    'CNT_FAM_MEMBERS': 0,
    'EXT_SOURCE_1': 0,
    'EXT_SOURCE_2': 0,
    'EXT_SOURCE_3': 0,
    'OBS_30_CNT_SOCIAL_CIRCLE': 0,
    'DEF_30_CNT_SOCIAL_CIRCLE': 0,
    'OBS_60_CNT_SOCIAL_CIRCLE': 0,
    'DEF_60_CNT_SOCIAL_CIRCLE': 0,
    'DAYS_LAST_PHONE_CHANGE': 3650,
    'AMT_REQ_CREDIT_BUREAU_HOUR': 0,
    'AMT_REQ_CREDIT_BUREAU_DAY': 0,
    'AMT_REQ_CREDIT_BUREAU_WEEK': 0,
    'AMT_REQ_CREDIT_BUREAU_MON': 0,
    'AMT_REQ_CREDIT_BUREAU_QRT': 0,
    'AMT_REQ_CREDIT_BUREAU_YEAR': 0,
}

# Overwrite each listed column with its filled version, one column at a time.
for column_name, fill_value in missing_value_fill.items():
    application_data[column_name] = application_data[column_name].fillna(fill_value)


In [None]:
# Build a class-balanced working set and split it into train and test parts.
#
# Up to SAMPLES_PER_CLASS rows are taken per TARGET value, in file order, and
# only the columns in `dataset_columns` are kept. The split is stratified on
# TARGET and seeded, so Restart & Run All reproduces the same partition
# (previously every run produced a different split and different scores).
SAMPLES_PER_CLASS = 20000
SPLIT_SEED = 0

sample_class_1 = application_data[application_data['TARGET'] == 1].head(SAMPLES_PER_CLASS)
sample_class_0 = application_data[application_data['TARGET'] == 0].head(SAMPLES_PER_CLASS)
treated_dataset = pd.concat([sample_class_1, sample_class_0])[dataset_columns]
training_dataset, testing_dataset = train_test_split(
    treated_dataset,
    shuffle=True,
    stratify=treated_dataset['TARGET'],
    random_state=SPLIT_SEED,
)


In [None]:
# Model input columns: everything in `dataset_columns` except the label.
# A list comprehension keeps the declared column order; the earlier set
# difference gave an arbitrary order that could change between kernel
# sessions, so the feature frames were not reproducible column-for-column.
features = [column for column in dataset_columns if column != 'TARGET']

# Separate inputs and labels for each partition.
train_features, Y_train = training_dataset[features], training_dataset['TARGET']
test_features, Y_test = testing_dataset[features], testing_dataset['TARGET']

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numerical ones.
#
# The transformer is fitted on the training rows only. Fitting on the full
# `treated_dataset` let test rows contribute to the scaler's mean/variance and
# to the encoder's category list, leaking test information into training.
# `handle_unknown='ignore'` makes the encoder emit an all-zero group for a
# category that appears only in the test rows instead of raising an error.
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    (StandardScaler(), numerical_features)
)
transformer = column_trans.fit(train_features)

In [None]:
# Apply the already-fitted transformer to both partitions to obtain the
# encoded and scaled design matrices used by every classifier below.
X_train, X_test = (
    transformer.transform(feature_frame)
    for feature_frame in (train_features, test_features)
)

# Task 1: Decision Tree classification

In [None]:
# Train one entropy-based decision tree per depth limit (None = unlimited),
# then report test accuracy and a row-normalised confusion matrix for each.
for max_depth in [5, 10, 20, 50, None]:
    # Seeded so tie-breaking between equally good splits is repeatable.
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, random_state=0)
    clf.fit(X_train, Y_train)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order avoids mistakes if the metric is swapped later.
    print("Accuracy score for depth: {} = {}".format(max_depth, accuracy_score(Y_test, clf.predict(X_test))))
    # The colormap is passed by name: `plt` is never imported in this notebook,
    # so `plt.cm.Blues` raised NameError. 'Blues' selects the same colormap.
    disp = plot_confusion_matrix(clf, X_test, Y_test,
                                 cmap='Blues',
                                 normalize='true')
    disp.ax_.set_title("Confusion matrix for Decision tree classifier with max depth: {}".format(max_depth))

# Task 2: Random Forest classification

In [None]:
# Train one entropy-based random forest per depth limit (None = unlimited),
# then report test accuracy and a row-normalised confusion matrix for each.
for max_depth in [5, 10, 20, 50, None]:
    clf = RandomForestClassifier(criterion='entropy', max_depth=max_depth, random_state=0)
    clf.fit(X_train, Y_train)
    # accuracy_score takes (y_true, y_pred); the value is the same either way,
    # but the documented order avoids mistakes if the metric is swapped later.
    print("Accuracy score for depth: {} = {}".format(max_depth, accuracy_score(Y_test, clf.predict(X_test))))
    # The colormap is passed by name: `plt` is never imported in this notebook,
    # so `plt.cm.Blues` raised NameError. 'Blues' selects the same colormap.
    disp = plot_confusion_matrix(clf, X_test, Y_test,
                                 cmap='Blues',
                                 normalize='true')
    # Title typo fixed ("forrest" -> "forest").
    disp.ax_.set_title("Confusion matrix for Random forest classifier with max depth: {}".format(max_depth))

# Task 3: XGBoost classification

In [None]:
# Grid-search XGBoost over tree depth and number of boosting rounds using
# GridSearchCV's default cross-validation and scoring.
#
# Parallelism is left to the grid search (n_jobs=-1) and each individual model
# runs single-threaded. Requesting all cores at both levels makes the workers
# compete for the same threads and slows the search down. The model is seeded
# so repeated runs select the same parameters.
xgb_model = xgb.XGBClassifier(n_jobs=1, random_state=0)
clf = GridSearchCV(
    xgb_model,
    {
        'max_depth': [2, 4, 6, 10, 20, 40, 60, 75, 100],
        'n_estimators': [50, 100, 200],
    },
    verbose=1,
    n_jobs=-1,
)
clf.fit(X_train, Y_train)

In [None]:
# Report the winning hyper-parameters, then evaluate the refitted best model
# (GridSearchCV delegates predict to it) on the held-out test partition.
print("Best params for XGBoost classifier: {}".format(clf.best_params_))
# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the documented order avoids mistakes if the metric is swapped later.
print("XGBoost classifier accuracy: {}".format(accuracy_score(Y_test, clf.predict(X_test))))
# The colormap is passed by name: `plt` is never imported in this notebook,
# so `plt.cm.Blues` raised NameError. 'Blues' selects the same colormap.
disp = plot_confusion_matrix(clf, X_test, Y_test,
                             cmap='Blues',
                             normalize='true')
disp.ax_.set_title("Confusion matrix for XGBoost classifier")