# Exercise - Neural Networks using Keras

The data set for this exercise is from the banking industry. It contains data about the home loans of 2,500 bank clients. Each row represents a single loan. The columns include the characteristics of the client who used a loan. This is a binary classification task: predict whether a loan will be bad or not (1=Yes, 0=No). This is an important task for banks to prevent bad loans from being issued.

## Description of Variables

The description of variables are provided in "Loan - Data Dictionary.docx"

## Goal

Use the **loan.csv** data set and build a model to predict **BAD**. 

Since you have a relatively small data set, I recommend using cross-validation to evaluate your accuracy.

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [2]:
#We will predict the "price" value in the data set:

loan = pd.read_csv("loan.csv")
loan.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,0,25900,61064.0,94714.0,DebtCon,Office,2.0,0.0,0.0,98.809375,0.0,23.0,34.565944
1,0,26100,113266.0,182082.0,DebtCon,Sales,18.0,0.0,0.0,304.852469,1.0,31.0,33.193949
2,1,50000,220528.0,300900.0,HomeImp,Self,5.0,0.0,0.0,0.0,0.0,2.0,
3,1,22400,51470.0,68139.0,DebtCon,Mgr,9.0,0.0,0.0,31.168696,2.0,8.0,37.95218
4,0,20900,62615.0,87904.0,DebtCon,Office,5.0,,,177.864849,,15.0,36.831076


# Split data (train/test)

In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(loan, test_size=0.3)

# Data Prep

Perform your data prep here. You can use pipelines like we do in the tutorials. Otherwise, feel free to use your own data prep steps. Eventually, you should do the following at a minimum:<br>
- Separate inputs from target<br>
- Impute/remove missing values<br>
- Standardize the continuous variables<br>
- One-hot encode categorical variables<br>

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable 

In [5]:
train_target = train['BAD']
test_target = test['BAD']

train_inputs = train.drop(['BAD'], axis=1)
test_inputs = test.drop(['BAD'], axis=1)

## Feature Engineering: Derive a new column

Examples:
- Ratio of delinquent to total number of credit lines
- Ratio of loan to value of current property
- Convert yr_renovated to a binary variable (i.e., renovated or not)
- (etc.)

In [6]:
# Loan / Mortgage Due
def lpm_col(df):
    df1 = df.copy()

    # Use the formula, though fill in 0s when the value is 0/0 (because 0/0 generates "nan" values)
    df1['loan_per_mortgagedue'] = (df1['LOAN']/df1['MORTDUE']).fillna(0)

    # Replace the infinity values with 1 (because a value divided by 0 generates infinity)
    df1['loan_per_mortgagedue'].replace(np.inf, 1, inplace=True)

    return df1[['loan_per_mortgagedue']]


In [7]:
lpm_col(train)

Unnamed: 0,loan_per_mortgagedue
1552,0.947375
2290,0.224507
1398,0.070863
1775,0.280365
2299,0.181217
...,...
1638,0.106061
1095,0.264890
1130,0.437655
1294,0.086405


##  Identify the numeric, binary, and categorical columns

In [8]:
feat_eng_columns = ['LOAN','MORTDUE']
feat_eng_columns

['LOAN', 'MORTDUE']

In [9]:
# Identify the numerical columns
numeric_columns = train_inputs.select_dtypes(include=[np.number]).columns.to_list()

# Identify the categorical columns
categorical_columns = train_inputs.select_dtypes('object').columns.to_list()

In [10]:
numeric_columns

['LOAN',
 'MORTDUE',
 'VALUE',
 'YOJ',
 'DEROG',
 'DELINQ',
 'CLAGE',
 'NINQ',
 'CLNO',
 'DEBTINC']

In [11]:
categorical_columns

['REASON', 'JOB']

# Pipeline

In [12]:
numeric_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [13]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [14]:
from sklearn.preprocessing import FunctionTransformer
# Create a pipeline for the transformed column here
lpm_column = Pipeline(steps=[('lpm_col', FunctionTransformer(lpm_col)),
                               ('scaler', StandardScaler())])


In [15]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans', lpm_column, feat_eng_columns)
        ],   
        remainder='drop')


# Transform: fit_transform() for TRAIN

In [16]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-0.31412013, -1.30301181, -0.86148829, ...,  0.        ,
         0.        ,  1.36287056],
       [ 0.45733454,  0.7398414 ,  0.58636192, ...,  0.        ,
         0.        , -0.2509229 ],
       [-1.10330939,  0.2001631 ,  0.18146318, ...,  0.        ,
         0.        , -0.59393096],
       ...,
       [-0.21657988, -0.83000156, -0.82081329, ...,  0.        ,
         0.        ,  0.22492737],
       [-0.46486414,  1.79196675,  1.36974799, ...,  0.        ,
         0.        , -0.55923357],
       [-0.31412013, -0.08740643, -0.21782887, ...,  0.        ,
         0.        , -0.27278239]])

In [17]:
train_x.shape

(1750, 21)

# Tranform: transform() for TEST

In [18]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 0.06717356,  0.36706438,  0.32127798, ...,  0.        ,
         0.        , -0.27077813],
       [ 0.32432512,  0.57631513,  0.42769944, ...,  1.        ,
         0.        , -0.24888772],
       [-0.33185472,  0.41209537,  0.12227549, ...,  0.        ,
         0.        , -0.39335905],
       ...,
       [-0.8993616 , -0.51071616, -0.32326299, ...,  0.        ,
         0.        , -0.39266904],
       [-0.5446698 , -0.82706576, -0.8366813 , ...,  0.        ,
         0.        , -0.00796942],
       [-0.5446698 , -0.06422056, -0.11380525, ...,  0.        ,
         0.        , -0.36387298]])

In [19]:
test_x.shape

(750, 21)

# Calculate the Baseline

In [20]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target)

In [21]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_target, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.6034285714285714


In [22]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_target, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5773333333333334


# Train a shallow (one-layer) Keras model

In [23]:
import tensorflow as tf
from tensorflow import keras

np.random.seed(42)
tf.random.set_seed(42)

In [24]:
train_x.shape

(1750, 21)

In [25]:
train_x.shape[1]

21

In [26]:
train_target.value_counts()

0    1056
1     694
Name: BAD, dtype: int64

In [27]:
model = keras.models.Sequential()

model.add(keras.layers.Input(shape=train_x.shape[1]))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [28]:
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [29]:
history = model.fit(train_x, train_target, 
                    validation_data=(test_x, test_target), 
                    epochs=10, batch_size=250)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Train values
train_scores = model.evaluate(train_x, train_target, verbose=0)
train_scores


[0.4817656874656677, 0.7708571553230286]

In [31]:
# Print the values
print(f"Train {model.metrics_names[0]}: {train_scores[0]:.2f}")
print(f"Train {model.metrics_names[1]}: {train_scores[1]*100:.2f}%")


Train loss: 0.48
Train accuracy: 77.09%


In [32]:
# Test values
test_scores = model.evaluate(test_x, test_target, verbose=0)
test_scores


[0.5191351771354675, 0.7400000095367432]

In [33]:
# Print the values
print(f"Test {model.metrics_names[0]}: {test_scores[0]:.2f}")
print(f"Test {model.metrics_names[1]}: {test_scores[1]*100:.2f}%")


Test loss: 0.52
Test accuracy: 74.00%


# Train a deep (multi-layered) Keras model 

In [34]:
model = keras.models.Sequential()

model.add(keras.layers.Input(shape=train_x.shape[1]))
model.add(keras.layers.Dense(15, activation='relu'))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(5, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

#final layer: there has to be 1 nodes with sigmoid this is binary

In [35]:
# Compile model
#Optimizer:
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [36]:
history = model.fit(train_x, train_target, 
                    validation_data=(test_x, test_target), 
                    epochs=10, batch_size=250)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
train_scores = model.evaluate(train_x, train_target, verbose=0)
train_scores

[0.42401570081710815, 0.8045714497566223]

In [38]:
print(f"Train {model.metrics_names[0]}: {train_scores[0]:.2f}")
print(f"Train {model.metrics_names[1]}: {train_scores[1]*100:.2f}%")

Train loss: 0.42
Train accuracy: 80.46%


In [39]:
test_scores = model.evaluate(test_x, test_target, verbose=0)
test_scores

[0.4915168881416321, 0.7760000228881836]

In [40]:
print(f"Test {model.metrics_names[0]}: {test_scores[0]:.2f}")
print(f"Test {model.metrics_names[1]}: {test_scores[1]*100:.2f}%")

Test loss: 0.49
Test accuracy: 77.60%


# Optional: try different activation functions, optimizers, or configurations (such as wide and deep) to build other models

In [41]:
# Trying inverse funnel
xavier = keras.initializers.glorot_normal(seed=None)
model = keras.models.Sequential()

inputlayer = keras.layers.Input(shape=train_x.shape[1])

hidden1 = keras.layers.Dense(20, activation='relu', kernel_initializer=xavier)(inputlayer)
hidden2 = keras.layers.Dense(40, activation='relu', kernel_initializer=xavier)(hidden1)
hidden3 = keras.layers.Dense(50, activation='relu', kernel_initializer=xavier)(hidden2)
hidden4 = keras.layers.Dense(30, activation='relu', kernel_initializer=xavier)(hidden3)
hidden5 = keras.layers.Dense(20, activation='relu', kernel_initializer=xavier)(hidden2)

concat = keras.layers.Concatenate()([inputlayer, hidden5])

output = keras.layers.Dense(1, activation='sigmoid')(concat)

model = keras.Model(inputs =[inputlayer], outputs = output)



In [42]:
nadam = keras.optimizers.Nadam(learning_rate=0.01)

model.compile(loss='binary_crossentropy', optimizer=nadam, metrics=['accuracy'])

In [43]:
from tensorflow.keras.callbacks import EarlyStopping
earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')
callback = [earlystop]

history = model.fit(train_x, train_target, 
                    validation_data=(test_x, test_target), 
                    epochs=25, batch_size=100, callbacks=callback)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 18: early stopping


In [44]:
train_scores = model.evaluate(train_x, train_target, verbose=0)
train_scores

[0.21057990193367004, 0.9177142977714539]

In [45]:
print(f"Train {model.metrics_names[0]}: {train_scores[0]:.2f}")
print(f"Train {model.metrics_names[1]}: {train_scores[1]*100:.2f}%")

Train loss: 0.21
Train accuracy: 91.77%


In [46]:
test_scores = model.evaluate(test_x, test_target, verbose=0)
test_scores


[0.45275676250457764, 0.8333333134651184]

In [47]:
print(f"Test {model.metrics_names[0]}: {test_scores[0]:.2f}")
print(f"Test {model.metrics_names[1]}: {test_scores[1]*100:.2f}%")

# Best model, some overfitting - Early stopping did not help that much (2 percentage point from original)

Test loss: 0.45
Test accuracy: 83.33%
