# Loan Prediction EDA

## Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split ,KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Binarizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier, plot_importance

from imblearn.over_sampling import SMOTE

import graphviz

In [None]:
# Fetch data
# Read the training CSV into a DataFrame.
# NOTE(review): the '../input/...' relative path looks like a Kaggle layout — adjust it when running elsewhere.
data_train = pd.read_csv('../input/loan-prediction-based-on-customer-behavior/Training Data.csv')

## Data Preprocessing

In [None]:
# Preview the first rows to check column names and value formats
data_train.head()

In [None]:
# Print the column dtypes and non-null counts
data_train.info()

In [None]:
# Count the missing values in each column
data_train.isnull().sum()

In [None]:
# Remove the columns that are not needed for modelling (row id and location fields)
unneeded_columns = ['Id', 'CITY', 'STATE']
data_train.drop(columns=unneeded_columns, inplace=True)
data_train

## Evaluation

In [None]:
# Features are every column except the last one; the label is the last column
X, y = data_train.iloc[:, :-1], data_train.iloc[:, -1]

In [None]:
# ML algorithms cannot fit features that contain strings,
# so those columns have to be encoded as numbers.
# One-hot encoding is chosen because several different algorithms are applied to X below.
X = pd.get_dummies(X)
X

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

In [None]:
# Utility Function
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and the main binary-classification metrics.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required in practice; the ``None``
            default is kept only so the signature stays backward compatible.
        pred_proba: Scores or probabilities of the positive class. When it is
            omitted, ROC AUC is left out of the report instead of failing.

    Raises:
        ValueError: If ``pred`` is not provided.
    """
    if pred is None:
        # Every label-based metric below needs predictions; fail with a clear message
        raise ValueError('pred is required to compute classification metrics')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    print('Confusion Matrix')
    print(confusion)

    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    if pred_proba is not None:
        # AUC is the only metric that needs scores rather than hard labels
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += ', AUC:{0:.4f}'.format(roc_auc)
    print(summary)

In [None]:
# Process fitting, prediction and evalution by Logistic Regression
# Create Estimator CLass
dt_clf = DecisionTreeClassifier()
lr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()

# Fitting
dt_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# Prediction
dt_pred = dt_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)

# Pred_Proba
dt_pred_proba = dt_clf.predict_proba(X_test)[:, 1]
lr_pred_proba = lr_clf.predict_proba(X_test)[:, 1]
rf_pred_proba = rf_clf.predict_proba(X_test)[:, 1]

# Evaluation
get_clf_eval(y_test, dt_pred, dt_pred_proba)
get_clf_eval(y_test, lr_pred, lr_pred_proba)
get_clf_eval(y_test, rf_pred, rf_pred_proba)

In [None]:
# Plot Function
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Predicted scores/probabilities for the positive class.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    n_thresholds = thresholds.shape[0]

    plt.figure(figsize=(8, 6))
    # Trim both curves to the number of thresholds so the x and y lengths match
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Put an x tick every 0.1 across the auto-scaled threshold range
    x_min, x_max = plt.xlim()
    tick_positions = np.round(np.arange(x_min, x_max, 0.1), 2)
    plt.xticks(tick_positions)

    # Axis labels, legend and grid
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# Precision/recall vs. threshold for each baseline model, in this order:
# DecisionTreeClassifier, LogisticRegression, RandomForestClassifier
for model_proba in (dt_pred_proba, lr_pred_proba, rf_pred_proba):
    precision_recall_curve_plot(y_test, model_proba)

In [None]:
# Plot Function

def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve together with the diagonal of a random classifier.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Predicted scores/probabilities for the positive class.
    """
    # The thresholds returned by roc_curve are not needed for the plot
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)

    plt.plot(fprs, tprs, label='ROC')
    # Diagonal reference line of a random classifier
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    # Fix both axes to [0, 1] first, then place ticks every 0.1 on that final
    # range. linspace is used because np.arange(0, 1.1, 0.1) can overshoot to 1.1.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xticks(np.round(np.linspace(0, 1, 11), 2))

    # FPR equals 1 - specificity (sensitivity is the TPR on the y axis)
    plt.xlabel('FPR(1 - Specificity)')
    plt.ylabel('TPR(Recall)')
    plt.legend()

    plt.show()

# ROC curve for each baseline model: decision tree, logistic regression, random forest
for model_proba in (dt_pred_proba, lr_pred_proba, rf_pred_proba):
    roc_curve_plot(y_test, model_proba)


### Summary
1. RandomForestClassifier performed much better than the other algorithms.
2. As you can see in the last plot, we could set a custom threshold value of 0.3.

In [None]:
# Re-evaluate the random forest with a custom decision threshold of 0.46
custom_threshold = 0.46

# Binarizer expects 2-D input, so reshape the positive-class probabilities into one column
custom_pred_proba = rf_pred_proba.reshape(-1, 1)

# Probabilities above the threshold map to 1, the rest to 0
binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(custom_pred_proba)

get_clf_eval(y_test, custom_predict, custom_pred_proba)

In [None]:
# Plot
# custom_pred_proba is rf_pred_proba reshaped into a single column, so both
# curves are built from the random forest's positive-class probabilities;
# the custom threshold itself is not an input to these plots.
precision_recall_curve_plot(y_test, custom_pred_proba)
roc_curve_plot(y_test, custom_pred_proba)

## Classification

### DecisionTreeClassifier

In [None]:
# Baseline before tuning: test accuracy and current hyperparameters
# of the decision tree fitted earlier
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_params = dt_clf.get_params()
print('Accuracy Score by DecisionTreeClassifier: {0:.4f}'.format(dt_accuracy))
print('Hyperparameters of DecisionTreeClassifier:\n', dt_params)

In [None]:
# Grid-search the tree depth and minimum split size with 7-fold cross-validation
params = {
    'max_depth': range(10, 30, 5),
    'min_samples_split': range(20, 40, 10),
}

grid_cv = GridSearchCV(
    estimator=dt_clf,
    param_grid=params,
    scoring='accuracy',
    cv=7,
    refit=True,  # retrain the best configuration on the whole of X_train
    n_jobs=-1,
    verbose=1,
)
grid_cv.fit(X_train, y_train)

best_cv_score = grid_cv.best_score_
best_cv_params = grid_cv.best_params_
print('The Best Average of Accuracy Scores by GridSearchCV: {0:.4f}'.format(best_cv_score))
print('The Best Parameters for Optimization: ', best_cv_params)

In [None]:
# Put the grid-search results in a DataFrame and show only the rank,
# the searched parameters and the mean cross-validation score
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
summary_columns = ['rank_test_score', 'param_max_depth', 'param_min_samples_split', 'mean_test_score']
cv_results_df[summary_columns]

In [None]:
# Score the refitted best decision tree on the held-out test set
best_dt_clf = grid_cv.best_estimator_
best_pred = best_dt_clf.predict(X_test)
best_accuracy = accuracy_score(y_true=y_test, y_pred=best_pred)
print('Accuracy Score of DecisionTreeClassifier: {0:.4f}'.format(best_accuracy))

In [None]:
# Label the tuned tree's feature importances with the column names
# and chart the five largest
ftr_values = pd.Series(best_dt_clf.feature_importances_, index=X_train.columns)
ftr_top5 = ftr_values.sort_values(ascending=False).head(5)

plt.figure(figsize=(10, 8))
plt.title('Feature Importance Top 5')
sns.barplot(x=ftr_top5, y=ftr_top5.index)
plt.show()

### Ensemble Learning(Voting Classifier)

In [None]:
# Soft-voting ensemble of the decision tree created earlier and a new KNN model
knn_clf = KNeighborsClassifier(n_neighbors=8)
voting_members = [('DT', dt_clf), ('KNN', knn_clf)]
vo_clf = VotingClassifier(estimators=voting_members, voting='soft')

# Fit the ensemble, predict on the test set and report accuracy
vo_clf.fit(X_train, y_train)
vo_pred = vo_clf.predict(X_test)
vo_accuracy = accuracy_score(y_test, vo_pred)
print('Accuracy Score of VotingClassifier: {0:.4f}'.format(vo_accuracy))

In [None]:
# Fit and score each ensemble member on its own, for comparison with the voting result
classifiers = [dt_clf, knn_clf]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    class_name = type(classifier).__name__
    model_accuracy = accuracy_score(y_test, pred)
    print('Accuracy Score of {0}: {1:.4f}'.format(class_name, model_accuracy))

### Random Forest

In [None]:
# Tune the RandomForestClassifier created earlier with a 5-fold grid search.
# No scoring is passed, so GridSearchCV falls back to the estimator's own score method.
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=5, n_jobs=-1)
grid_cv.fit(X_train, y_train)

best_cv_score = grid_cv.best_score_
best_cv_params = grid_cv.best_params_
print('The Best Average of Accuracy Scores by GridSearchCV: {0:.4f}'.format(best_cv_score))
print('The Best Parameters for Optimization: ', best_cv_params)

In [None]:
# Score the best random forest found by the grid search on the held-out test set
best_rf_clf = grid_cv.best_estimator_
# Keep the old name bound as well: the feature-importance cell that follows reads best_dt_clf
best_dt_clf = best_rf_clf
best_pred = best_rf_clf.predict(X_test)
best_accuracy = accuracy_score(y_test, best_pred)
# The estimator here is a random forest, so label the score accordingly
print('Accuracy Score of RandomForestClassifier: {0:.4f}'.format(best_accuracy))

In [None]:
# Chart the five most important features of the estimator currently bound to
# best_dt_clf (the best estimator of the most recent grid search)
importances = best_dt_clf.feature_importances_
ftr_values = pd.Series(importances, index=X_train.columns)
ftr_top5 = ftr_values.sort_values(ascending=False).iloc[:5]

plt.figure(figsize=(10, 8))
plt.title('Feature Importance Top 5')
sns.barplot(x=ftr_top5, y=ftr_top5.index)
plt.show()

### GBM(Gradient Boosting Machine)

In [None]:
# Baseline gradient boosting model with default hyperparameters (seeded for repeatability)
gb_clf = GradientBoostingClassifier(random_state=11).fit(X_train, y_train)

# Evaluate on the held-out test set
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print('Accuracy Score of GradientBoostingClassifier: {0:.4f}'.format(gb_accuracy))

In [None]:
# Grid-search the number of trees and the learning rate with 2-fold cross-validation
# NOTE(review): a learning_rate of 1 is unusually large next to 0.05 — confirm 0.1 was not intended
params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 1],
}

grid_cv = GridSearchCV(
    estimator=gb_clf,
    param_grid=params,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_cv.fit(X_train, y_train)

best_cv_score = grid_cv.best_score_
best_cv_params = grid_cv.best_params_
print('The Best Average of Accuracy Scores by GridSearchCV: {0:.4f}'.format(best_cv_score))
print('The Best Parameters for Optimization: ', best_cv_params)

### LightGBM

In [None]:
# Train LightGBM with early stopping, then evaluate on the held-out test set
lgbm_wrapper = LGBMClassifier(n_estimators=400, num_leaves=64, n_jobs=-1, boost_from_average=False)

# Monitor early stopping on a validation split carved out of the training data.
# Using the test set here would let it pick the boosting iteration and so
# leak into the metrics reported below.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=11, stratify=y_train)
evals = [(X_val, y_val)]

# The patience has to be well below n_estimators (400); otherwise early stopping can never trigger.
# NOTE(review): passing early_stopping_rounds/verbose to fit() needs LightGBM < 4.0;
# newer releases expect callbacks instead — confirm the installed version.
lgbm_wrapper.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, preds, pred_proba)

In [None]:
# Plot Feature importance
# Create a tall figure and draw the LightGBM importance chart on its axes
fig, ax = plt.subplots(figsize=(10, 12))
plot_importance(lgbm_wrapper, ax=ax)

### SMOTE: Over Sampling

In [None]:
# Oversample with SMOTE on the training split only; the test split is left untouched
smote = SMOTE(random_state=11)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

# Compare the dataset shapes before and after resampling
shape_before = (X_train.shape, y_train.shape)
shape_after = (X_train_over.shape, y_train_over.shape)
print('Before applying SMOTE, each shape of Feature/Label datasets: ', *shape_before)
print('After appling SMOTE, each shape of Feature/Label datasets: ', *shape_after)

In [None]:
# Refit the existing random forest on the SMOTE-resampled training data
# (this replaces its previous fit) and evaluate on the original test set
rf_clf.fit(X_train_over, y_train_over)

rf_pred_proba_over = rf_clf.predict_proba(X_test)[:, 1]
rf_preds_over = rf_clf.predict(X_test)

get_clf_eval(y_test, rf_preds_over, rf_pred_proba_over)

In [None]:
# Precision/recall vs. threshold for the random forest trained on the SMOTE-resampled data
precision_recall_curve_plot(y_test, rf_pred_proba_over)

In [None]:
# Train a fresh LightGBM model on the SMOTE-resampled training data
# and evaluate it on the original test set
lgbm_clf = LGBMClassifier(
    n_estimators=1000,
    num_leaves=64,
    n_jobs=-1,
    boost_from_average=False,
)
lgbm_clf.fit(X_train_over, y_train_over)

lgbm_preds_over = lgbm_clf.predict(X_test)
lgbm_pred_proba = lgbm_clf.predict_proba(X_test)[:, 1]

get_clf_eval(y_test, lgbm_preds_over, lgbm_pred_proba)

### Stacking Ensemble

In [None]:
# Meta-model that will be trained on the base models' stacked predictions
lr_final = LogisticRegression(C=10)

# Base models of the stacking ensemble (these rebind the earlier knn/rf/dt variables)
ada_clf = AdaBoostClassifier(n_estimators=100)
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier(n_estimators=100, random_state=11)
knn_clf = KNeighborsClassifier(n_neighbors=4)

In [None]:
# Fit each base model of the stacking ensemble on the training split
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
ada_clf.fit(X_train, y_train)

In [None]:
# Predict the test set with every base model; these predictions are stacked later
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

# Report each base model's individual test accuracy
knn_accuracy = accuracy_score(y_test, knn_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)
ada_accuracy = accuracy_score(y_test, ada_pred)

print('Accuracy Score of KNN: {0:.4f}'.format(knn_accuracy))
print('Accuracy Score of RandomForestClassifier: {0:.4f}'.format(rf_accuracy))
print('Accuracy Score of DeicisionTreeClassifier: {0:.4f}'.format(dt_accuracy))
print('Accuracy Score of AdaBoostClassifier: {0:.4f}'.format(ada_accuracy))

In [None]:
# Stack the four prediction vectors: one row per base model
base_predictions = [knn_pred, rf_pred, dt_pred, ada_pred]
pred = np.array(base_predictions)
print(pred.shape)

# Transpose so that each row is a sample and each column is one model's prediction
pred = pred.T
print(pred.shape)

In [None]:
# Fit, Predict, Evaluate for final model
# The meta-model must not be trained on the rows it is scored on. Fitting it on
# the test-set predictions with y_test would make the reported accuracy a
# training score. Instead, train it on out-of-fold predictions from the
# training data and score it on the test-set meta-features in `pred`.
from sklearn.model_selection import cross_val_predict

# Same order as the columns of `pred`: KNN, random forest, decision tree, AdaBoost
base_models = [knn_clf, rf_clf, dt_clf, ada_clf]

# cross_val_predict fits clones, so the already-fitted base models are left as they are
train_meta = np.column_stack([
    cross_val_predict(model, X_train, y_train, cv=5) for model in base_models
])

lr_final.fit(train_meta, y_train)
final = lr_final.predict(pred)

print('Accuracy Score of Final Model: {0:.4f}'.format(accuracy_score(y_test, final)))