# **Predicting Bank Failures Using Machine Learning**

***Model Development and Operational Details***

In this workbook, we perform in-depth exploratory data analysis and data wrangling, and finally fit classification models (logistic regression, a random forest, and a decision tree) to categorize banks as risky or healthy, in line with the FDIC's collected data. As discussed above, we use the Homeland Infrastructure Foundation's 2016 FDIC Insured Banks Data Set.




# Imports

#### Local imports

In [1]:
import sys
sys.path.append('../')

#### Third party imports

In [9]:
## Import packages
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10)
import numpy as np
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc, confusion_matrix
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import Image
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

#### Local application imports

In [12]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading data

In [13]:
# Run the project's extract pipeline (not defined in this notebook; presumably
# it arrives through the star imports above).
# NOTE(review): the recorded output below ends with an UnboundLocalError for
# `data_prefix`, so this step did not complete in the saved run. `dfk`, used by
# the following cells, is never assigned in this notebook -- presumably it is
# provided by the pipeline or the star imports; confirm.
extract_pipeline_func()

Dataset already present locally... skipping download...


  dfx = pd.read_csv(os.path.join(dataset_local_files, file))


UnboundLocalError: local variable 'data_prefix' referenced before assignment

# Data exploration

### `SCORE` distribution

In [None]:
# Histogram of the raw SCORE column.
px.histogram(x=dfk['SCORE'])

# Data wrangling

In [None]:
# Run the project's transform pipeline. Its effects are not visible from this
# notebook (presumably it prepares `dfk` for the cells below -- confirm).
transform_pipeline_func()

### Setting index field as dataframe index

In [None]:
# Use the existing 'index' column as the row index. This is done in place, so
# the DataFrame object bound to `dfk` is mutated rather than replaced.
dfk.set_index('index', inplace=True)

### Target variable - label

In [None]:
# SCORE is the risk score we try to predict: usually 100, sometimes lower.
# It is binarised into SCORE_T, which is 1 when SCORE is below the cutoff and
# 0 otherwise. The cutoff is 100 because most rows score exactly 100, so this
# gives the most representation to banks with a lower score. The cutoff can be
# made more permissive later, depending on how the results look.
dfk['SCORE_T'] = dfk['SCORE'].lt(100) * 1
target = ['SCORE_T']

### Adding relevant variables

In [None]:
# Branch deposits divided by the institution's domestic deposits.
# NOTE(review): this column is named 'DEP_RATIOS', while the features list
# further down uses 'DEP_RATIO' (computed again in the feature-engineering
# cell). 'DEP_RATIOS' is not referenced anywhere else in this notebook, and
# rows with DEPDOM == 0 are not handled here.
dfk['DEP_RATIOS'] = dfk['DEPSUMBR']/dfk['DEPDOM']

# Feature engineering

### Categorical features transformation

In [None]:
# Encode the string-valued categorical columns as integer category codes
# (pandas assigns code -1 to missing values):
#   - BKCLASS: bank classification, a 1-2 letter string
#   - REGAGNT: regulatory agent (not always "FDIC"), various strings
#   - STATUS:  status, a single letter whose meaning is unknown
dfk['BKCLASS_T'] = dfk['BKCLASS'].astype('category').cat.codes
dfk['REGAGNT_T'] = dfk['REGAGNT'].astype('category').cat.codes
dfk['STATUS_T'] = dfk['STATUS'].astype('category').cat.codes

# Branch deposits divided by the institution's domestic deposits.
dfk['DEP_RATIO'] = dfk['DEPSUMBR'] / dfk['DEPDOM']
# Rows with zero domestic deposits have no meaningful ratio; set them to 0.
# The write goes through a single .loc call on the frame itself: the chained
# form dfk['DEP_RATIO'][mask] = 0 assigns into an intermediate Series, which
# is not guaranteed to update dfk and does nothing under Copy-on-Write.
dfk.loc[dfk['DEPDOM'] == 0, 'DEP_RATIO'] = 0

### Features definition

In [None]:
# Columns fed to the models. Left out on purpose: the de novo flag (all 0, so
# not useful) and the report date (June 30 2014 for every row).
# The order matters: later cells map feature importances back to these names
# by position.
features = [
    # Possible labels / identifiers.
    'BRNUM',      # bank number
    'UNINUMBR',   # unique id for bank

    # Geography; may need a larger dimension to be useful.
    'CBSABR',     # statistical geo area code (numerical)
    'ZIPBR',      # zip code of the bank

    # Actual trainable features.
    'DEPSUMBR',   # total deposit $ -- this is the branch-level figure
    'ASSET',      # asset size of the bank
    'DEPDOM',     # domestic deposits $ -- this is the institution-level figure
    'DEP_RATIO',  # $ branch / $ institution

    # Categorical features after integer encoding.
    'BKCLASS_T',  # bank classification (1-2 letter string)
    'REGAGNT_T',  # regulatory agent (not always "FDIC"), various strings
    'STATUS_T',   # status, single letter; values unknown
]

In [None]:
# Keep only the model inputs and the label, as an independent copy.
dfk = dfk.loc[:, [*features, *target]].copy()

In [None]:
# Summary statistics of the feature columns.
dfk.loc[:, features].describe()

In [None]:
# Summary statistics of the label column.
dfk.loc[:, target].describe()

# Building model

### Splitting data in test and train

In [None]:
# Split into train and test sets: every column except the label is a model
# input; the split size and seed come from the project parameters.
X_train, X_test, y_train, y_test = train_test_split(
    dfk.drop(columns='SCORE_T'),
    dfk['SCORE_T'],
    random_state=random_state,
    test_size=data_test_size,
)

### Logistic regression model

In [None]:
# Logistic regression, allowing up to 1000 solver iterations.
lrm = LogisticRegression(max_iter=1000, random_state=random_state).fit(X_train, y_train)

### Random forest model

In [None]:
# Random forest with trees capped at depth 100.
rfc = RandomForestClassifier(max_depth=100, random_state=random_state).fit(X_train, y_train)

### Decision tree

In [None]:
# Single decision tree; max depth was lowered (to 2) so that it would run.
dtm = sklearn.tree.DecisionTreeClassifier(
    random_state=random_state,
    max_depth=2,
).fit(X_train, y_train)

### Models compilation

In [None]:
# Registry of the fitted models, keyed by the short name used in the
# evaluation cells; each entry stores the estimator under 'model'.
models = {
    'logistic_reg': dict(model=lrm),
    'random_forest': dict(model=rfc),
    'decision_tree': dict(model=dtm),
}

# Evaluating models

### General model metrics

In [None]:
# Empty results table with one row per metric. It starts without columns;
# a column is added for each model when that model is evaluated.
dfx = pd.DataFrame(index=pd.Index([
    'Training performance',
    'Test performance',
    'Training sensitivity',
    'Training specificity',
    'Test sensitivity',
    'Test specificity',
]))

In [None]:
# Choose the model to evaluate by leaving exactly one assignment uncommented.
# model_name = 'logistic_reg'
# model_name = 'random_forest'
model_name = 'decision_tree'

model = models[model_name]['model']

# Class predictions on both splits, used for the metrics computed below.
predicted_train, predicted_test = model.predict(X_train), model.predict(X_test)

In [None]:
# Confusion matrices for both splits (true labels first, predictions second).
cm_train = confusion_matrix(y_train, predicted_train)
cm_test = confusion_matrix(y_test, predicted_test)

def SensitivityAndSpecificity(cm):
    """Compute sensitivity and specificity from a binary confusion matrix.

    Args:
        cm: 2x2 array-like of counts laid out as [[TN, FP], [FN, TP]],
            i.e. row = actual class, column = predicted class. Nested
            lists are accepted as well as NumPy arrays.

    Returns:
        dict with keys 'Sensitivity' (TP / (TP + FN)) and 'Specificity'
        (TN / (TN + FP)). A rate whose denominator is zero (no samples of
        that actual class) is returned as NaN.
    """
    # Accept any array-like so that cm[i, j] indexing always works.
    cm = np.asarray(cm)

    # Upper row: actual negatives; lower row: actual positives.
    TN, FP = cm[0, 0], cm[0, 1]
    FN, TP = cm[1, 0], cm[1, 1]

    actual_positives = TP + FN
    actual_negatives = TN + FP

    # Guard the divisions explicitly instead of relying on NumPy's
    # divide-by-zero warning path.
    sensitivity = TP / actual_positives if actual_positives else float('nan')
    specificity = TN / actual_negatives if actual_negatives else float('nan')
    return {'Sensitivity': sensitivity, 'Specificity': specificity}

# Accuracy: share of predictions that equal the true label.
dfx.loc['Training performance', model_name] = np.mean(predicted_train == y_train)
dfx.loc['Test performance', model_name] = np.mean(predicted_test == y_test)

# Sensitivity and specificity; each confusion matrix is summarised once.
train_rates = SensitivityAndSpecificity(cm_train)
dfx.loc['Training sensitivity', model_name] = train_rates['Sensitivity']
dfx.loc['Training specificity', model_name] = train_rates['Specificity']
test_rates = SensitivityAndSpecificity(cm_test)
dfx.loc['Test sensitivity', model_name] = test_rates['Sensitivity']
dfx.loc['Test specificity', model_name] = test_rates['Specificity']

### Decision tree (CART) feature importance

In [None]:
# Bar chart of the decision tree's feature importances, in ascending order.
# Labels are looked up in `features` by position, which relies on the model
# having been trained on those columns in that order.
importance_order = np.argsort(dtm.feature_importances_)

# Open exactly one figure; a second plt.figure() call would leave an empty
# extra figure in the output.
plt.figure(figsize=[12, 4])
plt.bar(
    [features[i] for i in importance_order],
    [dtm.feature_importances_[i] for i in importance_order],
)
plt.xticks(rotation=90)
plt.show()

### Confusion matrix

In [None]:
# Choose the model to plot by leaving exactly one assignment uncommented.
# model_name = 'logistic_reg'
# model_name = 'random_forest'
model_name = 'decision_tree'

model = models[model_name]['model']

y_pred = model.predict(X_test)

# Heatmap inputs: the matrix of counts and the tick labels for each axis.
z = confusion_matrix(y_test, y_pred)
x = ['False', 'True']
y = ['False', 'True']

In [None]:
# Annotation text: every count in z rendered as a string.
z_text = [[str(count) for count in row] for row in z]

# Annotated heatmap of the confusion matrix, with the colour bar enabled.
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text)
fig['data'][0]['showscale'] = True

# Caption for the x axis, placed below the plot in paper coordinates.
fig.add_annotation(
    text="Predicted value",
    x=0.5,
    y=-0.15,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(color="black", size=14),
)

# Caption for the y axis, rotated and placed to the left of the plot.
fig.add_annotation(
    text="Real value",
    textangle=-90,
    x=-0.35,
    y=0.5,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(color="black", size=14),
)

# Title, plus margins that leave room for the y-axis caption.
fig.update_layout(
    title_text='Confusion matrix: ' + model_name,
    margin=dict(t=50, l=20),
)
fig.show()

### Decision tree visualization

In [None]:
def GetDecisionTreeGraphViz(dt):
    """Return a graphviz.Source rendering of the decision tree `dt`.

    Nodes are colour-filled, impurity values are left out, and split
    features are labelled using the notebook-level `features` list.
    """
    dot_source = sklearn.tree.export_graphviz(
        dt,
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=features,
        filled=True,
        impurity=False,
    )
    return graphviz.Source(dot_source)
# The default (full-size) visualization would be:
# display(GetDecisionTreeGraphViz(dtm))

# Zoomed-out visualization: render the tree to PNG and show it 1024 wide.
display(Image(data=GetDecisionTreeGraphViz(dtm).pipe(format='png'), width=1024))

### ROC Curve

In [None]:
# ROC curves on the test split, one trace per registered model.
fig = go.Figure()

# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line=dict(dash='dash'))

for model_name in models:
    model = models[model_name]['model']

    # Score each test row with the second predict_proba column.
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)

    # The legend entry carries the model name and its AUC.
    name = f"{model_name} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=name))

# Axis titles, fixed figure size, and a 1:1 aspect ratio between the axes.
fig.update_layout(
    title='ROC Curves',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700,
    height=500,
)
fig.show()

# *Additional notes*