# Prepare Credit Risk Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import sqlite3


In [2]:
# Establish connection
conn = sqlite3.Connection("Resources/credit_risk.sqlite")

In [3]:
# Load data 
df = pd.read_sql('SELECT * FROM credit_risk', conn)

# Display data
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [4]:
# Generate our categorical variable lists
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()
# Check the number of unique values in each column
df[df_cat].nunique()

person_home_ownership        4
loan_intent                  6
loan_grade                   7
cb_person_default_on_file    2
dtype: int64

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]).toarray())

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(df_cat)
encode_df.head()

Unnamed: 0,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df, left_index=True, right_index=True)
df = df.drop(columns=df_cat)
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE_x,person_home_ownership_OTHER_x,...,loan_intent_VENTURE_y,loan_grade_A_y,loan_grade_B_y,loan_grade_C_y,loan_grade_D_y,loan_grade_E_y,loan_grade_F_y,loan_grade_G_y,cb_person_default_on_file_N_y,cb_person_default_on_file_Y_y
0,21,9600,5.0,1000,11.14,0,0.1,2,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,25,9600,1.0,5500,12.87,1,0.57,3,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,23,65500,4.0,35000,15.23,1,0.53,2,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,24,54400,8.0,35000,14.27,1,0.55,4,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,21,9900,2.0,2500,7.14,1,0.25,2,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# Split data into features and target
y = df['loan_status'].values
X = df.drop('loan_status', axis=1).values

display(X)
display(y)


array([[2.10e+01, 9.60e+03, 5.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [2.50e+01, 9.60e+03, 1.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [2.30e+01, 6.55e+04, 4.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       ...,
       [6.50e+01, 7.60e+04, 3.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [5.60e+01, 1.50e+05, 5.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00],
       [6.60e+01, 4.20e+04, 2.00e+00, ..., 0.00e+00, 1.00e+00, 0.00e+00]])

array([0, 1, 1, ..., 1, 0, 0], dtype=int64)

In [17]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [21]:
# Create a StandardScaler instance
scaler = StandardScaler()

#Fit
X_scaler = scaler.fit(X_train)

#Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
X_train.shape

(21477, 45)

In [23]:
X_test.shape

(7159, 45)

# Train Model


In [24]:
# Import tensorflow
import tensorflow as tf

In [26]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 5

In [27]:
# Define the TensorFlow model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"),
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid")
])
# Check the structure
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [28]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [35]:
# Train the model
history = model.fit(X_train_scaled, y_train, epochs=200)

Epoch 1/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9253 - loss: 0.2242
Epoch 2/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 974us/step - accuracy: 0.9222 - loss: 0.2330
Epoch 3/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 883us/step - accuracy: 0.9240 - loss: 0.2279
Epoch 4/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9231 - loss: 0.2266
Epoch 5/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9204 - loss: 0.2365
Epoch 6/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9237 - loss: 0.2289
Epoch 7/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9260 - loss: 0.2243
Epoch 8/200
[1m672/672[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9235 - loss: 0.2265
Epoch 9/200
[1m672/672[0m 

In [36]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

224/224 - 0s - 2ms/step - accuracy: 0.9236 - loss: 0.2267
Loss: 0.22665786743164062, Accuracy: 0.9235926866531372


In [38]:
model.save('Resources/tensorflowmodel.h5')

