<pre>
           _.-------._
        _-'_.------._ `-_
      _- _-          `-_/
     -  -
 ___/  /______________
/___  .______________/
 ___| |_____________
/___  .____________/
    \  \
     -_ -_             /|
       -_ -._        _- |
         -._ `------'_./
            `-------'
            
  <b>Bank Loan Approval</b>
  Ensemble methods
</pre>

In [None]:
import numpy as np
from numpy import mean, std, hstack
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "white" style as the plotting theme for the figures below.
sns.set_theme(style="white")

from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

# Silence scikit-learn's ConvergenceWarning for the rest of the session so the
# cross-validation output stays readable. Other warning categories still show.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

<h2 class="list-group-item list-group-item-action active" data-toggle="list" style="color:black; background:white; border:0.2px dotted;" role="tab" aria-controls="home"><center>Prepare Dataset</center></h2>

In [None]:
path = '../input/credit-risk-analysis-for-extending-bank-loans/bankloans.csv'

df = pd.read_csv(path)

# Keep only rows that carry a `default` label; unlabeled rows cannot be used
# for supervised training or scoring.
df = df[df.default.notnull()]

# Shuffle once with a fixed seed. The KFold splitters used later take
# consecutive folds without shuffling, so this row order determines every fold.
# Seeding it makes the reported scores repeat across Restart & Run All, in line
# with the train/test split, which already pins random_state=42.
df = shuffle(df, random_state=42)

df.head()

In [None]:
def z_score(df):
    """Return a copy of ``df`` with every column standardized to z-scores.

    Each column is centered on its own mean and divided by its own standard
    deviation as computed by ``Series.std()``. The input frame is left
    untouched.

    Columns whose standard deviation is zero or undefined (a constant column,
    or a frame with a single row) are only centered: dividing by zero would
    turn the whole column into NaN, which downstream estimators reject.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    df_std = df.copy()
    # apply the z-score method column by column
    for column in df_std.columns:
        col = df_std[column]
        scale = col.std()
        # Zero/NaN spread: fall back to a scale of 1 so the column becomes 0s
        # instead of NaNs.
        if pd.isna(scale) or scale == 0:
            scale = 1.0
        df_std[column] = (col - col.mean()) / scale
    return df_std

In [None]:
# Select predictors by dropping the target column by name, so the result does
# not depend on `default` happening to be the last column of the CSV.
features = df.drop(columns=['default'])
# NOTE(review): the z-score statistics are computed over all rows before the
# train/test split (and before cross-validation), so held-out rows influence
# the scaling. Consider fitting the scaler on training rows only.
features = z_score(features).values
labels = df['default'].values

# 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

In [None]:
# Pairwise correlation between every pair of columns (target included)
corr = df.corr()

# Boolean mask that is True on and above the diagonal, so only the lower
# triangle of the symmetric matrix is drawn
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that will host the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette, returned as a matplotlib colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Render the masked matrix with square cells; the color scale is centered on 0
# and capped at 0.3
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)

<h2 class="list-group-item list-group-item-action active" data-toggle="list" style="color:black; background:white; border:0.2px dotted;" role="tab" aria-controls="home"><center>Ensemble Methods</center></h2>

In [None]:
# Shared accumulators filled by the model cells that follow:
#   estimators -> (name, model) tuples, reused by the voting and stacking cells
#   accs       -> [name, rounded mean accuracy] rows, reused by the summary plot
estimators = []
accs = []

# 1. Bagging Algorithms

The three bagging models covered in this section are as follows:
1. Bagged Decision Trees
2. Random Forest
3. Extra Trees

## Bagged Decision Trees

In [None]:
# 10 consecutive (unshuffled) folds over the standardized feature matrix
kfold = model_selection.KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
# The base learner is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, but it is the first positional parameter in every release.
model = BaggingClassifier(cart, n_estimators=num_trees)
results = model_selection.cross_val_score(model, features, labels, cv=kfold)
acc = results.mean()
print("Accuracy:{:1.3f}".format(acc))

# append model and accuracy
estimators.append(('decision_tree', model))
accs.append(['decision_tree', round(acc, 3)])

## Random Forest

In [None]:
# Set the forest size here, as the other model cells do, instead of relying on
# the value left behind by the previous cell.
num_trees = 100
max_features = 3
kfold = model_selection.KFold(n_splits=10)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = model_selection.cross_val_score(model, features, labels, cv=kfold)
acc = results.mean()
print("Accuracy:{:1.3f}".format(acc))

# append model and accuracy
estimators.append(('random_forest', model))
accs.append(['random_forest', round(acc, 3)])

## Extra Trees

In [None]:
# Extra Trees configuration: 100 trees, max_features=7
num_trees, max_features = 100, 7
kfold = model_selection.KFold(n_splits=10)
model = ExtraTreesClassifier(max_features=max_features, n_estimators=num_trees)

# Mean accuracy over the 10 folds
results = model_selection.cross_val_score(
    estimator=model, X=features, y=labels, cv=kfold)
acc = results.mean()
print("Accuracy:{:1.3f}".format(acc))

# Record the model for the ensemble cells and its score for the summary plot
estimators.append(('extra_trees', model))
accs.append(['extra_trees', round(acc, 3)])

# 2. Boosting Algorithms

The two most common boosting ensemble machine learning algorithms are:

1. AdaBoost
2. Stochastic Gradient Boosting

## AdaBoost

In [None]:
# AdaBoost with 30 boosting rounds, scored by 10-fold cross-validation
num_trees = 30
kfold = model_selection.KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees)
results = model_selection.cross_val_score(
    estimator=model, X=features, y=labels, cv=kfold)
acc = results.mean()
print("Accuracy: {:1.3f}".format(acc))

# Record the model for the ensemble cells and its score for the summary plot
estimators.append(('ada_boost', model))
accs.append(['ada_boost', round(acc, 3)])

## Stochastic Gradient Boosting

In [None]:
# Gradient boosting with 100 stages, scored by 10-fold cross-validation
num_trees = 100
kfold = model_selection.KFold(n_splits=10)
model = GradientBoostingClassifier(n_estimators=num_trees)
results = model_selection.cross_val_score(
    estimator=model, X=features, y=labels, cv=kfold)
acc = results.mean()
print("Accuracy: {:1.3f}".format(acc))

# Record the model for the ensemble cells and its score for the summary plot
estimators.append(('gradient_boost', model))
accs.append(['gradient_boost', round(acc, 3)])

# 3. Voting Ensemble

In [None]:
# Combine every model collected so far into one voting classifier. No `voting`
# argument is given, so scikit-learn's default voting rule applies.
kfold = model_selection.KFold(n_splits=10)
ensemble = VotingClassifier(estimators=estimators)

# Mean accuracy of the combined classifier over the 10 folds
results = model_selection.cross_val_score(
    estimator=ensemble, X=features, y=labels, cv=kfold)
acc = results.mean()
print("Accuracy: {:1.3f}".format(acc))

# Only the score is recorded; the voting model itself is not added to `estimators`
accs.append(['voting', round(acc, 3)])

# 4. Stacking (Blending) Ensemble

In [None]:
def fit_ensemble(estimators, X_train, X_test, y_train, y_test):
    """Fit the base models, then fit a logistic-regression blender on their outputs.

    Every model in ``estimators`` is fitted in place on ``(X_train, y_train)``.
    Each fitted model then predicts ``X_test``; the predictions are placed side
    by side, one column per model, and a ``LogisticRegression`` is trained to
    map those columns to ``y_test``.

    Note that ``(X_test, y_test)`` is training data for the blender. Scoring
    the returned blender on that same set measures fit, not generalization.

    Parameters
    ----------
    estimators : iterable of (name, model) pairs
        Models exposing ``fit`` and ``predict``; the names are not used here.
    X_train, y_train : array-like
        Data used to fit the base models.
    X_test, y_test : array-like
        Hold-out data whose base-model predictions train the blender.

    Returns
    -------
    LogisticRegression
        The fitted blender.
    """
    prediction_columns = []
    for _, base_model in estimators:
        base_model.fit(X_train, y_train)
        predicted = base_model.predict(X_test)
        # one (n_samples, 1) column per base model
        prediction_columns.append(predicted.reshape(len(predicted), 1))
    # (n_samples, n_models) matrix fed to the blender
    meta_features = hstack(prediction_columns)
    blender = LogisticRegression()
    blender.fit(meta_features, y_test)
    return blender

In [None]:
def predict_ensemble(estimators, blender, X_test):
    """Predict with the blended ensemble.

    Each already-fitted model in ``estimators`` predicts ``X_test``; the
    predictions are arranged as one column per model and handed to
    ``blender.predict``, whose result is returned unchanged.

    Parameters
    ----------
    estimators : iterable of (name, model) pairs
        Fitted models exposing ``predict``; the names are not used here.
    blender : object exposing ``predict``
        Meta-model trained on the base models' predictions.
    X_test : array-like
        Samples to predict.

    Returns
    -------
    Whatever ``blender.predict`` returns for the stacked predictions.
    """
    base_predictions = (base_model.predict(X_test) for _, base_model in estimators)
    # reshape each prediction vector into an (n_samples, 1) column
    prediction_columns = [
        predicted.reshape(len(predicted), 1) for predicted in base_predictions
    ]
    meta_features = hstack(prediction_columns)
    return blender.predict(meta_features)

In [None]:
# Blending needs three disjoint sets: one to fit the base models, one hold-out
# set whose predictions train the blender, and one untouched set for scoring.
# The hold-out set is carved out of the training data so X_test stays unseen
# until the final score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42)

blender = fit_ensemble(estimators, X_fit, X_val, y_fit, y_val)
yhat = predict_ensemble(estimators, blender, X_test)
score = mean_absolute_error(y_test, yhat)
print('Blending Mean Absolute Error: {:1.3f}'.format(score))

# cross_val_score(blender, features, labels) would clone the bare
# LogisticRegression and score it on the raw features, not on the ensemble.
# Instead, repeat the whole blending procedure inside each fold so the number
# is comparable with the other models' 10-fold accuracies.
kfold = model_selection.KFold(n_splits=10)
fold_accs = []
for train_idx, test_idx in kfold.split(features):
    X_fold_fit, X_fold_val, y_fold_fit, y_fold_val = train_test_split(
        features[train_idx], labels[train_idx], test_size=0.3, random_state=42)
    # fit_ensemble refits the shared base models in place on this fold's data
    fold_blender = fit_ensemble(
        estimators, X_fold_fit, X_fold_val, y_fold_fit, y_fold_val)
    fold_yhat = predict_ensemble(estimators, fold_blender, features[test_idx])
    fold_accs.append(np.mean(fold_yhat == labels[test_idx]))
results = np.array(fold_accs)
acc = results.mean()
print("Accuracy: {:1.3f}".format(acc))

# append accuracy
accs.append(['blending', round(acc, 3)])

<h2 class="list-group-item list-group-item-action active" data-toggle="list" style="color:black; background:white; border:0.2px dotted;" role="tab" aria-controls="home"><center>Analyze</center></h2>

In [None]:
# Build the summary table directly from the [label, accuracy] rows.
# Routing them through np.array() would coerce the mixed str/float rows into an
# all-string array, leaving "accuracy" as text that cannot be plotted as a
# number. `accs` itself stays a list, so the model cells can be re-run.
df_accs = pd.DataFrame(accs, columns=["label", "accuracy"])
df_accs = df_accs.astype({"accuracy": float})
# Positional id per model, placed first to keep the id/label/accuracy order
df_accs.insert(0, "id", np.arange(len(df_accs)))

In [None]:
# One bar per ensemble: x is the positional id, bar height is the accuracy,
# and the color/legend entry identifies the model by label
fig, ax = plt.subplots(figsize=(10, 4))
sns.barplot(data=df_accs, x="id", y="accuracy", hue="label", ax=ax)
ax.set_title('Accuracy per Ensemble')