The data contains the credit details about credit borrowers. Data Description:
- age - Age of Customer
- ed - Education level of customer
- employ: Tenure with current employer (in years)
- address: Number of years in same address
- income: Customer Income
- debtinc: Debt to income ratio
- creddebt: Credit to Debt ratio
- othdebt: Other debts
- default: Customer defaulted in the past (1= defaulted, 0=Never defaulted)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score

# Resampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Load the bank-loan dataset from the Kaggle input directory.
bankloan = pd.read_csv('../input/bankloans/bankloans.csv')
# Leave the frame as the cell's last expression so the notebook renders it.
bankloan

In [None]:
# Summary statistics of the loaded frame, shown as the cell output.
bankloan.describe()

# Data Cleaning

*Check Missing Value and Fill*

In [None]:
# Percentage of missing values per column: the mean of the boolean
# "is missing" mask is the fraction of missing entries.
bankloan.isna().mean() * 100

In [None]:
# `default` is the prediction target, so a missing value means the outcome
# of that loan is unknown.  Filling it with the median would invent a label
# (the majority class) for those rows and bias every model trained and
# evaluated afterwards, so rows without a known target are dropped instead.
bankloan = bankloan.dropna(subset=['default']).reset_index(drop=True)

*Check the Imbalance*

In [None]:
# Share of each target class, as a percentage of all rows.
bankloan['default'].value_counts().div(len(bankloan)).mul(100)

This percentage indicates that the data is **imbalanced**.

*Splitting Data*

In [None]:
# Feature matrix (four predictor columns) and the binary target column.
X = bankloan.loc[:, ['employ', 'debtinc', 'creddebt', 'othdebt']]
y = bankloan.loc[:, 'default']

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Hold out 20% of the rows as the test set.  Stratifying on y keeps the
# class proportions the same in both parts; the fixed seed makes the split
# reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=44,
)

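I use `test_size = 0.2` so that 20% of the rows are held out for testing, `stratify = y` so that both parts keep the same class proportions, and a fixed `random_state = 44` so that the split is reproducible.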

# Define Model

In [None]:
# Baseline classifiers with default hyperparameters (the tree is seeded).
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier(random_state=44)
logreg = LogisticRegression()

In [None]:
# Logistic regression and KNN are fed standardised features; the decision
# tree pipeline has no scaling step.
logreg_pipe_scale = Pipeline(steps=[('scale', StandardScaler()), ('logreg', logreg)])
tree_pipe_scale = Pipeline(steps=[('tree', tree)])
knn_pipe_scale = Pipeline(steps=[('scale', StandardScaler()), ('knn', knn)])

# Cross Validation

*Model Evaluation*

In [None]:
def model_evaluation(model, metric):
    """Cross-validate ``model`` on the training data.

    Runs 5-fold stratified cross-validation on the notebook-level
    ``X_train_val`` / ``y_train_val`` and returns the per-fold scores
    computed with the scorer named by ``metric``.
    """
    return cross_val_score(
        model,
        X_train_val,
        y_train_val,
        scoring=metric,
        cv=StratifiedKFold(n_splits=5),
    )

In [None]:
# Per-fold F1 scores of the three baseline pipelines.
logreg_pipe_scale_cv, tree_pipe_scale_cv, knn_pipe_scale_cv = (
    model_evaluation(pipe, 'f1')
    for pipe in (logreg_pipe_scale, tree_pipe_scale, knn_pipe_scale)
)

In [None]:
# Collapse the fold scores into one mean score per model.
logreg_cv, tree_cv, knn_cv = (
    fold_scores.mean()
    for fold_scores in (logreg_pipe_scale_cv, tree_pipe_scale_cv, knn_pipe_scale_cv)
)

In [None]:
# Tabulate the mean cross-validation score of each baseline model.
method_name = [
    'Logistic Regression CV Score',
    'Decision Tree Classifier CV Score',
    'KNN Classifier CV Score',
]
score_list = [logreg_cv, tree_cv, knn_cv]
cv_summary = pd.DataFrame({'method': method_name, 'score': score_list})
cv_summary

*Fitting Data*

In [None]:
# Fit every baseline pipeline on the training data and print its
# classification report for the held-out test set.
for model_name, model in (
    ('Logistic Regression', logreg_pipe_scale),
    ('Decision Tree Classifier', tree_pipe_scale),
    ('KNN Classifier', knn_pipe_scale),
):
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test)
    print(f'{model_name}:')
    print(classification_report(y_test, y_pred))

I use 3 resampling methods to handle the class imbalance: Under Sampling, Over Sampling and SMOTE.

# Resampling: *UnderSampling*

In [None]:
# Seeded random under-sampler; the resampled copy of the training data is
# kept as X_under / y_under.
# NOTE(review): the pipelines built from `rus` resample on their own during
# fit, so this pre-resampled copy should not be needed for cross-validation
# or fitting -- confirm before relying on it.
rus = RandomUnderSampler(random_state = 44)
X_under, y_under = rus.fit_resample(X_train_val, y_train_val)

In [None]:
# Same three models as the baseline, with the under-sampler inserted right
# before the classifier (after scaling where scaling is used).
logreg_pipe_scale_rus = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('rus', rus),
    ('logreg', logreg),
])
tree_pipe_scale_rus = Pipeline(steps=[('rus', rus), ('tree', tree)])
knn_pipe_scale_rus = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('rus', rus),
    ('knn', knn),
])

*Model Evaluation*

In [None]:
def model_evaluation_rus(model, metric):
    """Cross-validate an under-sampling pipeline on the training data.

    ``model`` is expected to contain the sampler as a pipeline step, so it
    is given the original ``X_train_val`` / ``y_train_val``: the sampler
    then runs only on the training part of each fold, and the validation
    part keeps the real class distribution.  Scoring on data that was
    resampled beforehand would evaluate on artificially balanced folds and
    overstate the score.

    Parameters
    ----------
    model : estimator
        Pipeline (or any estimator) to evaluate.
    metric : str
        Scorer name passed to ``cross_val_score`` (e.g. ``'f1'``).

    Returns
    -------
    Array with one score per fold (5 stratified folds).
    """
    skfold = StratifiedKFold(n_splits=5)
    return cross_val_score(model, X_train_val, y_train_val, cv=skfold, scoring=metric)

In [None]:
# Per-fold F1 scores of the three under-sampling pipelines.
logreg_pipe_rus, tree_pipe_rus, knn_pipe_rus = (
    model_evaluation_rus(pipe, 'f1')
    for pipe in (logreg_pipe_scale_rus, tree_pipe_scale_rus, knn_pipe_scale_rus)
)

In [None]:
# Collapse the fold scores into one mean score per model.
logreg_rus, tree_rus, knn_rus = (
    fold_scores.mean()
    for fold_scores in (logreg_pipe_rus, tree_pipe_rus, knn_pipe_rus)
)

In [None]:
# Tabulate the mean cross-validation score of each under-sampling model.
method_name = [
    'Logistic Regression UnderSampling Score',
    'Decision Tree Classifier UnderSampling Score',
    'KNN Classifier UnderSampling Score',
]
score_list = [logreg_rus, tree_rus, knn_rus]
rus_summary = pd.DataFrame({'method': method_name, 'score': score_list})
rus_summary

*Fitting Data*

In [None]:
# Fit every under-sampling pipeline and print its classification report for
# the held-out test set.  The pipelines already contain the sampler, so they
# are fitted on the original training data: the scaler then learns its
# statistics from the real distribution and the sampler runs once, inside
# the pipeline, instead of being applied to data that was resampled before.
for model, model_name in zip([logreg_pipe_scale_rus, tree_pipe_scale_rus, knn_pipe_scale_rus],
                             ['Logistic Regression UnderSampling', 'Decision Tree Classifier UnderSampling', 'KNN Classifier UnderSampling']):
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test)
    print(model_name + ':')
    print(classification_report(y_test, y_pred))

# Resampling: *OverSampling*

In [None]:
# Seeded random over-sampler; the resampled copy of the training data is
# kept as X_over / y_over.
# NOTE(review): the pipelines built from `ros` resample on their own during
# fit, so this pre-resampled copy should not be needed for cross-validation
# or fitting -- confirm before relying on it.
ros = RandomOverSampler(random_state = 44)
X_over, y_over = ros.fit_resample(X_train_val, y_train_val)

In [None]:
# Same three models as the baseline, with the over-sampler inserted right
# before the classifier (after scaling where scaling is used).
logreg_pipe_scale_ros = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('ros', ros),
    ('logreg', logreg),
])
tree_pipe_scale_ros = Pipeline(steps=[('ros', ros), ('tree', tree)])
knn_pipe_scale_ros = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('ros', ros),
    ('knn', knn),
])

*Model Evaluation*

In [None]:
def model_evaluation_ros(model, metric):
    """Cross-validate an over-sampling pipeline on the training data.

    ``model`` is expected to contain the sampler as a pipeline step, so it
    is given the original ``X_train_val`` / ``y_train_val``: the sampler
    then runs only on the training part of each fold.  Cross-validating on
    data that was over-sampled beforehand would put copies of the same
    minority row into both the training and the validation part of a fold,
    which leaks information and inflates the score.

    Parameters
    ----------
    model : estimator
        Pipeline (or any estimator) to evaluate.
    metric : str
        Scorer name passed to ``cross_val_score`` (e.g. ``'f1'``).

    Returns
    -------
    Array with one score per fold (5 stratified folds).
    """
    skfold = StratifiedKFold(n_splits=5)
    return cross_val_score(model, X_train_val, y_train_val, cv=skfold, scoring=metric)

In [None]:
# Per-fold F1 scores of the three over-sampling pipelines.
logreg_pipe_ros, tree_pipe_ros, knn_pipe_ros = (
    model_evaluation_ros(pipe, 'f1')
    for pipe in (logreg_pipe_scale_ros, tree_pipe_scale_ros, knn_pipe_scale_ros)
)

In [None]:
# Collapse the fold scores into one mean score per model.
logreg_ros, tree_ros, knn_ros = (
    fold_scores.mean()
    for fold_scores in (logreg_pipe_ros, tree_pipe_ros, knn_pipe_ros)
)

In [None]:
# Tabulate the mean cross-validation score of each over-sampling model.
method_name = [
    'Logistic Regression OverSampling Score',
    'Decision Tree Classifier OverSampling Score',
    'KNN Classifier OverSampling Score',
]
score_list = [logreg_ros, tree_ros, knn_ros]
ros_summary = pd.DataFrame({'method': method_name, 'score': score_list})
ros_summary

*Fitting Data*

In [None]:
# Fit every over-sampling pipeline and print its classification report for
# the held-out test set.  The pipelines already contain the sampler, so they
# are fitted on the original training data: the scaler then learns its
# statistics from the real distribution rather than from duplicated rows,
# and the sampler runs once, inside the pipeline.
for model, model_name in zip([logreg_pipe_scale_ros, tree_pipe_scale_ros, knn_pipe_scale_ros],
                             ['Logistic Regression OverSampling', 'Decision Tree Classifier OverSampling', 'KNN Classifier OverSampling']):
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test)
    print(model_name + ':')
    print(classification_report(y_test, y_pred))

# Resampling: *SMOTE*

In [None]:
# Seeded SMOTE sampler; the resampled copy of the training data is kept as
# X_smote / y_smote.
# NOTE(review): the pipelines built from `smote` resample on their own
# during fit, so this pre-resampled copy should not be needed for
# cross-validation or fitting -- confirm before relying on it.
smote = SMOTE(random_state = 44)
X_smote, y_smote = smote.fit_resample(X_train_val, y_train_val)

In [None]:
# Same three models as the baseline, with SMOTE inserted right before the
# classifier (after scaling where scaling is used).
logreg_pipe_scale_smote = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('smote', smote),
    ('logreg', logreg),
])
tree_pipe_scale_smote = Pipeline(steps=[('smote', smote), ('tree', tree)])
knn_pipe_scale_smote = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('smote', smote),
    ('knn', knn),
])

*Model Evaluation*

In [None]:
def model_evaluation_smote(model, metric):
    """Cross-validate a SMOTE pipeline on the training data.

    ``model`` is expected to contain the sampler as a pipeline step, so it
    is given the original ``X_train_val`` / ``y_train_val``: synthetic
    samples are then generated only from the training part of each fold.
    Cross-validating on data that was SMOTE-resampled beforehand would let
    synthetic points derived from training rows end up in the validation
    part of a fold, which leaks information and inflates the score.

    Parameters
    ----------
    model : estimator
        Pipeline (or any estimator) to evaluate.
    metric : str
        Scorer name passed to ``cross_val_score`` (e.g. ``'f1'``).

    Returns
    -------
    Array with one score per fold (5 stratified folds).
    """
    skfold = StratifiedKFold(n_splits=5)
    return cross_val_score(model, X_train_val, y_train_val, cv=skfold, scoring=metric)

In [None]:
# Per-fold F1 scores of the three SMOTE pipelines.
logreg_pipe_smote, tree_pipe_smote, knn_pipe_smote = (
    model_evaluation_smote(pipe, 'f1')
    for pipe in (logreg_pipe_scale_smote, tree_pipe_scale_smote, knn_pipe_scale_smote)
)

In [None]:
# Collapse the fold scores into one mean score per model.
logreg_smote, tree_smote, knn_smote = (
    fold_scores.mean()
    for fold_scores in (logreg_pipe_smote, tree_pipe_smote, knn_pipe_smote)
)

In [None]:
# Tabulate the mean cross-validation score of each SMOTE model.
method_name = [
    'Logistic Regression SMOTE Score',
    'Decision Tree Classifier SMOTE Score',
    'KNN Classifier SMOTE Score',
]
score_list = [logreg_smote, tree_smote, knn_smote]
smote_summary = pd.DataFrame({'method': method_name, 'score': score_list})
smote_summary

*fitting data*

In [None]:
# Fit every SMOTE pipeline and print its classification report for the
# held-out test set.  The pipelines already contain the sampler, so they are
# fitted on the original training data: the scaler then learns its
# statistics from real rows only, and SMOTE runs once, inside the pipeline
# (after scaling where the pipeline has a scaling step).
for model, model_name in zip([logreg_pipe_scale_smote, tree_pipe_scale_smote, knn_pipe_scale_smote],
                             ['Logistic Regression SMOTE', 'Decision Tree Classifier SMOTE', 'KNN Classifier SMOTE']):
    model.fit(X_train_val, y_train_val)
    y_pred = model.predict(X_test)
    print(model_name + ':')
    print(classification_report(y_test, y_pred))

# HyperParam Tuning

Finally, I choose the **Logistic Regression using UnderSampling** model because it has the highest accuracy score. Let's do hyperparameter tuning to see whether the score can be improved further now that the imbalanced data has been handled. Can it improve?

In [None]:
# Rebuild the chosen model from fresh objects: scaling, seeded random
# under-sampling, then logistic regression.
logreg = LogisticRegression()
rus = RandomUnderSampler(random_state=44)

# Under-sampled copy of the training data, kept under the same names as in
# the under-sampling section.
X_under, y_under = rus.fit_resample(X_train_val, y_train_val)

estimator = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('rus', rus),
    ('logreg', logreg),
])

In [None]:
# Grid over the logistic-regression step of the pipeline; the `logreg__`
# prefix routes each parameter to the step named 'logreg'.
hyperparam_space = dict(
    logreg__C=[100, 10, 1, 0.1, 0.01, 0.001],
    logreg__solver=['liblinear', 'newton-cg', 'saga', 'lbfgs'],
    logreg__max_iter=[100, 200, 300, 400],
)

In [None]:
# Exhaustive search over hyperparam_space, scored by F1 with 5-fold
# stratified cross-validation, using all available cores.
grid_search = GridSearchCV(
    estimator,
    param_grid=hyperparam_space,
    scoring='f1',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

In [None]:
# Run the search on the original training data; the under-sampler is a step
# of the pipeline being searched.
grid_search.fit(X_train_val, y_train_val)

In [None]:
# Best mean cross-validated score found by the search, and the parameter
# combination that produced it.
print('best score', grid_search.best_score_)
print('best param', grid_search.best_params_)

# Before VS After Tuning

In [None]:
# "Before tuning" baseline: the untuned pipeline, fitted on the same
# original training data as the grid search so that the two results are
# comparable.  The pipeline under-samples internally, so it must not be fed
# data that was already resampled.
estimator.fit(X_train_val, y_train_val)
y_pred_estimator = estimator.predict(X_test)
print(classification_report(y_test, y_pred_estimator))

In [None]:
# "After tuning": GridSearchCV (refit=True by default) has already refitted
# the best parameter combination on the whole training data, so the search
# object can predict directly -- fitting best_estimator_ again is redundant.
y_pred_grid = grid_search.predict(X_test)
print(classification_report(y_test, y_pred_grid))

In [None]:
# Test-set F1 of the untuned and of the tuned model.
f1_estimator, f1_best_estimator = (
    f1_score(y_test, predictions)
    for predictions in (y_pred_estimator, y_pred_grid)
)

In [None]:
# Side-by-side test-set F1 before and after hyperparameter tuning.
method_name = [
    'Logistic Regression UnderSampling Before Tuning',
    'Logistic Regression UnderSampling After Tuning',
]
score_list = [f1_estimator, f1_best_estimator]
best_summary = pd.DataFrame({'method': method_name, 'score': score_list})
best_summary

Usually, hyperparameter tuning can improve the result, but in this case **it can't**. So I have to try tuning another model until I get an improvement.

* Best Model: Logistic Regression using UnderSampling
* Best Estimator Score: 0.52376
* Best C: 0.1
* Best max_iter: 100
* Best solver: newton-cg

Thank you for reading this notebook.