# **Predicting Bank Failures Using Machine Learning**

***Model Development and Operational Details***

In this workbook, we perform in-depth exploratory data analysis and data wrangling, and finally train and compare logistic regression, decision tree, and random forest classifiers to categorize banks as risky or healthy, in line with the FDIC's collected data. As discussed above, we use the Homeland Infrastructure Foundation's 2016 FDIC Insured Banks Data Set.




# Imports

#### Standard library imports

In [1]:
import sys
sys.path.append('../')
import os
import pickle

#### Third party imports

In [2]:
## Import packages
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10)
import numpy as np
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
from sklearn.metrics import roc_curve, roc_auc_score, auc, confusion_matrix
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import Image
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

#### Local application imports

In [3]:
%load_ext autoreload
%autoreload 2

from pkg_dir.config import *
from pkg_dir.src.utils import *
from pkg_dir.src.functions import *
from pkg_dir.src.parameters import *

# Extraction and initial wrangling

#### Reading data from Google Drive

In [None]:
## Load the banks dataset from its Google Drive CSV download link
dfk = pd.read_csv(
    filepath_or_buffer='https://drive.google.com/uc?export=download&id=1vD7uj5Tpz2IvDj49YXR_4Xw2Pebhk_Ix',
)

#### Initial data wrangling

###### Setting index

###### Target variable - label

In [None]:
# Binary target: 1 where SCORE is below 100, 0 otherwise.
dfk['SCORE_T'] = dfk['SCORE'].lt(100) * 1

###### Adding relevant variables

In [None]:
# Ratio of the DEPSUMBR column to the DEPDOM column.
dfk['DEP_RATIOS'] = dfk['DEPSUMBR'] / dfk['DEPDOM']
# Rows with DEPDOM == 0 come out of the division as inf/NaN; force them to 0.
# The write goes through a single .loc call instead of chained indexing
# (dfk[col][mask] = ...), which pandas cannot guarantee lands on dfk itself:
# it triggers SettingWithCopyWarning and is a no-op under copy-on-write.
dfk.loc[dfk['DEPDOM'] == 0, 'DEP_RATIOS'] = 0

###### Data information

In [None]:
# Summarize dfk: column names, dtypes, non-null counts and memory usage.
dfk.info()

###### Generating data schema

###### Splitting data in test and train

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): only SCORE_T is removed from the features, so SCORE (the column
# SCORE_T is derived from) is still part of X here — confirm that a later
# pipeline stage drops it before model training.
X_train, X_test, y_train, y_test = train_test_split(
    dfk.drop(columns=['SCORE_T']),
    dfk['SCORE_T'],
    random_state=9,
    test_size=0.3,
)

In [None]:
## Recombine training features and label on their shared index into a single
## frame (original note: this works around a pipeline problem), then save it.
dfk_train = X_train.merge(y_train, left_index=True, right_index=True)

dfk_train.to_csv('../pkg_dir/data/dataset/banks_data_/train.csv')

In [None]:
## Recombine test features and label on their shared index into a single
## frame (original note: this works around a pipeline problem), then save it.
dfk_test = X_test.merge(y_test, left_index=True, right_index=True)

dfk_test.to_csv('../pkg_dir/data/dataset/banks_data_/test.csv')

#### Applying extract pipeline function

In [None]:
# Run the extract stage. The function is not defined in this notebook;
# presumably it is provided by the pkg_dir star imports — TODO confirm.
extract_pipeline_func()

#### Evaluating saved pickles

In [None]:
# Folder inspected for the extract stage (presumably where that stage writes
# its pickles — TODO confirm).
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/extract/'
objects = os.listdir(dataset_objs_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 2 can differ between machines — consider selecting it by file name.
obj = objects[2]

In [None]:
# Unpickle the selected file for inspection.
with open(os.path.join(dataset_objs_path, obj), 'rb') as pickle_file:
    dfx = pickle.load(pickle_file)

# Data transformation

In [4]:
# Run the transformation stage. The function is not defined in this notebook;
# presumably it is provided by the pkg_dir star imports — TODO confirm.
transform_pipeline_func()

#### Evaluating saved pickles

In [None]:
# Folder inspected for the transformation stage (presumably where that stage
# writes its pickles — TODO confirm).
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/trans/'
objects = os.listdir(dataset_objs_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 2 can differ between machines — consider selecting it by file name.
obj = objects[2]

In [None]:
# Unpickle the selected file for inspection.
with open(os.path.join(dataset_objs_path, obj), 'rb') as pickle_file:
    dfx = pickle.load(pickle_file)

# Data exploration

### `SCORE_T`

In [None]:
# Distribution of the SCORE column (the column SCORE_T is thresholded from).
px.histogram(x=dfk['SCORE'])

# Feature engineering

In [5]:
# Run the feature-engineering stage. The function is not defined in this
# notebook; presumably it is provided by the pkg_dir star imports — TODO confirm.
feateng_pipeline_func()

#### Evaluating saved pickles

In [None]:
# Folder inspected for the feature-engineering stage (presumably where that
# stage writes its pickles — TODO confirm).
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/feateng/'
objects = os.listdir(dataset_objs_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 5 can differ between machines — consider selecting it by file name.
obj = objects[5]

In [None]:
# Unpickle the selected file for inspection.
with open(os.path.join(dataset_objs_path, obj), 'rb') as pickle_file:
    dfx = pickle.load(pickle_file)

# Model training

In [6]:
# Run the model-training stage. The function is not defined in this notebook;
# presumably it is provided by the pkg_dir star imports — TODO confirm.
# NOTE(review): the recorded output of this cell reports "30 fits failed out of
# a total of 40" for logistic_regression — the parameter grid needs a look.
modtrain_pipeline_func()

Model in training:  random_forest


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Model in training:  decision_tree
Model in training:  logistic_regression


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
30 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rp_mbp/Documents/educacion/maestria/berkeley_mba_meng/academics/2023_spring/data_science_applied_to_finance_and_accounting/UCB_gclass_ds_for_finance_and_accounting/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", lin

#### Evaluating saved pickles

In [None]:
# Folder inspected for the model-training stage (presumably where that stage
# writes its pickles — TODO confirm).
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/modtrain/'
objects = os.listdir(dataset_objs_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 2 can differ between machines — consider selecting it by file name.
obj = objects[2]

In [None]:
# Unpickle the selected file for inspection.
with open(os.path.join(dataset_objs_path, obj), 'rb') as pickle_file:
    dfx = pickle.load(pickle_file)

# Models evaluation and selection

In [8]:
# Run the model evaluation/selection stage. The function is not defined in this
# notebook; presumably it is provided by the pkg_dir star imports — TODO confirm.
# NOTE(review): the recorded output of this cell is a ValueError (X has 38812
# features, RandomForestClassifier expects 128797), i.e. the evaluated data and
# the training data were encoded with different feature sets.
modevalsel_pipeline_func()

ValueError: X has 38812 features, but RandomForestClassifier is expecting 128797 features as input.

### Models compilation

In [None]:
# Folder inspected for the trained models (same modtrain folder as above).
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/modtrain/'
objects = os.listdir(dataset_objs_path)
# NOTE(review): os.listdir returns entries in arbitrary order, so the file at
# position 5 can differ between machines — consider selecting it by file name.
obj = objects[5]

In [None]:
# Unpickle the selected file; it is used below as a dict keyed by model name.
with open(os.path.join(dataset_objs_path, obj), 'rb') as pickle_file:
    models = pickle.load(pickle_file)

In [None]:
# Pull the 'best_estimator' entry of each trained model out of the models dict.
lr, rf, dt = (
    models[key]['best_estimator']
    for key in ('logistic_regression', 'random_forest', 'decision_tree')
)
# gb = models['gradient_boosting']['best_estimator']

### Datasets compilation

In [None]:
dataset_objs_path = '../pkg_dir/data/pickles/pipeline/modtrain/'
objects = os.listdir(dataset_objs_path)


def _load_pickle(file_name):
    """Unpickle and return the object stored in dataset_objs_path + file_name."""
    with open(dataset_objs_path + file_name, 'rb') as obj_content:
        return pickle.load(obj_content)


# NOTE(review): the positions below index into os.listdir output, whose order
# is arbitrary — confirm each position really maps to the named dataset, or
# select the files by name instead.
X_train = _load_pickle(objects[6])
y_train = _load_pickle(objects[4])
X_test = _load_pickle(objects[2])
y_test = _load_pickle(objects[3])
X_val = _load_pickle(objects[0])
y_val = _load_pickle(objects[1])

# Evaluating models

### General model metrics

In [None]:
# Empty results table: one row per metric; a column is added per evaluated model.
dfx = pd.DataFrame(index=[
    'Training performance',
    'Test performance',
    'Training sensitivity',
    'Training specificity',
    'Test sensitivity',
    'Test specificity',
])

In [None]:
# Pair every column label with its fitted estimator so the label and the model
# cannot drift apart. The previous version overwrote model_name / model three
# times in a row, so only the last pair ever took effect and the pairing had to
# be kept in sync by hand.
candidate_models = {
    'random_forest': rf,
    'logistic_reg': lr,
    'decision_tree': dt,
}

# Model evaluated by the cells below; switch the key to evaluate another one.
model_name = 'decision_tree'
model = candidate_models[model_name]

predicted_train = model.predict(X_train)
predicted_test = model.predict(X_test)

In [None]:
# Confusion matrices of actual vs. predicted labels for the train and test splits.
cm_train, cm_test = (
    sklearn.metrics.confusion_matrix(actual, predicted)
    for actual, predicted in (
        (y_train, predicted_train),
        (y_test, predicted_test),
    )
)

def SensitivityAndSpecificity(cm):
    """Compute sensitivity and specificity from a binary confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix with actual classes on the rows and predicted classes
        on the columns, i.e. ``[[TN, FP], [FN, TP]]``.

    Returns
    -------
    dict
        ``{'Sensitivity': TP / (TP + FN), 'Specificity': TN / (TN + FP)}``.
        A rate whose denominator is zero (no actual positives / negatives) is
        reported as ``nan``.

    Raises
    ------
    ValueError
        If ``cm`` is not a 2x2 matrix.
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)
    if cm.shape != (2, 2):
        # A larger matrix would silently be read from its top-left corner and a
        # smaller one would fail with an opaque IndexError; reject both.
        raise ValueError(
            f"expected a 2x2 confusion matrix, got shape {cm.shape}"
        )

    # Row 0 = actual negatives, row 1 = actual positives.
    (TN, FP), (FN, TP) = cm.tolist()

    positives = TP + FN
    negatives = TN + FP
    sensitivity = TP / positives if positives else float('nan')
    specificity = TN / negatives if negatives else float('nan')
    return {'Sensitivity': sensitivity, 'Specificity': specificity}

# Accuracy ("performance") is computed with accuracy_score on the same label
# objects already passed to confusion_matrix. The previous version indexed
# y_train['label'] but y_test['dummy_label'], so it depended on two different
# column names for what should be the same label column.
train_rates = SensitivityAndSpecificity(cm_train)
test_rates = SensitivityAndSpecificity(cm_test)

dfx.loc['Training performance', model_name] = sklearn.metrics.accuracy_score(y_train, predicted_train)
dfx.loc['Test performance', model_name] = sklearn.metrics.accuracy_score(y_test, predicted_test)
dfx.loc['Training sensitivity', model_name] = train_rates['Sensitivity']
dfx.loc['Training specificity', model_name] = train_rates['Specificity']
dfx.loc['Test sensitivity', model_name] = test_rates['Sensitivity']
dfx.loc['Test specificity', model_name] = test_rates['Specificity']

### CART (decision tree) feature importance

In [None]:
# NOTE(review): `dtm` and `features` are not defined anywhere in this notebook
# (the models cell binds the decision tree to `dt`) — confirm they come from the
# star imports, otherwise this cell fails on a fresh kernel.
# Feature indices ordered from least to most important.
importance_order = np.argsort(dtm.feature_importances_)

# Create the figure exactly once; the duplicated plt.figure call left an extra
# blank figure in the output.
plt.figure(figsize=[12, 4])
plt.bar(
    [features[i] for i in importance_order],
    [dtm.feature_importances_[i] for i in importance_order],
)
plt.xticks(rotation=90)
plt.show()

### Confusion matrix

In [None]:
# Model whose test-set confusion matrix is plotted in the next cell.
# model_name = 'logistic_reg'
# model_name = 'random_forest'
model_name = 'decision_tree'

# NOTE(review): this reads the 'model' entry while the models-compilation cell
# reads 'best_estimator' from the same dict — confirm which key the pickle holds.
model = models[model_name]['model']
y_pred = model.predict(X_test)

# Heatmap inputs: the matrix itself plus the tick labels for both axes.
z = confusion_matrix(y_true=y_test, y_pred=y_pred)
x, y = ['False', 'True'], ['False', 'True']

In [None]:
# Annotations must be strings, so stringify every count in the matrix.
z_text = [[str(count) for count in row] for row in z]

# Annotated heatmap of the confusion matrix.
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text)
fig.update_layout(title_text='Confusion matrix: ' + model_name)

# Axis titles are drawn as annotations positioned in paper coordinates.
fig.add_annotation({
    'font': {'color': "black", 'size': 14},
    'x': 0.5,
    'y': -0.15,
    'showarrow': False,
    'text': "Predicted value",
    'xref': "paper",
    'yref': "paper",
})
fig.add_annotation({
    'font': {'color': "black", 'size': 14},
    'x': -0.35,
    'y': 0.5,
    'showarrow': False,
    'text': "Real value",
    'textangle': -90,
    'xref': "paper",
    'yref': "paper",
})

# Adjust the margins to make room for the y-axis title.
fig.update_layout(margin={'t': 50, 'l': 20})

# Show the colorbar next to the heatmap.
fig.data[0].showscale = True
fig.show()

### Decision tree visualization

In [None]:
def GetDecisionTreeGraphViz(dt):
    """Return a graphviz ``Source`` rendering of the decision tree ``dt``.

    Node labels use the module-level ``features`` list for feature names;
    nodes are color-filled and the impurity value is left out.
    """
    dot_source = sklearn.tree.export_graphviz(
        dt,
        out_file=None,  # return the DOT text instead of writing a file
        feature_names=features,
        filled=True,
        impurity=False,
    )
    return graphviz.Source(dot_source)
# The default visualization
# display(GetDecisionTreeGraphViz(dt_fit))

# The zoomed-out visualization: render the tree to PNG and show it 1024 px wide.
# NOTE(review): `dtm` is not defined in this notebook (the models cell binds the
# decision tree to `dt`) — confirm where it comes from.
display(Image(GetDecisionTreeGraphViz(dtm).pipe(format='png'), width=1024))

### ROC Curve

In [None]:
fig = go.Figure()

# Dashed diagonal from (0, 0) to (1, 1) as the reference line.
fig.add_shape(type='line', x0=0, y0=0, x1=1, y1=1, line={'dash': 'dash'})

# One ROC trace per trained model, scored on the test set.
for model_name in models:
    model = models[model_name]['model']

    # Use the second predict_proba column as the score.
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)

    name = f"{model_name} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=name))

# Lock the axes to a 1:1 ratio so the plot area is square.
fig.update_layout(
    title='ROC Curves',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    xaxis={'constrain': 'domain'},
    yaxis={'scaleanchor': "x", 'scaleratio': 1},
    width=700,
    height=500,
)
fig.show()

# *Additional notes*