In [40]:
#imports
import pandas as pd
pd.set_option('display.max_columns', None) # Display any number of columns

In [49]:
#get csv and read
df = pd.read_csv('output/LC_Final.csv.gz',compression='gzip',low_memory=True)
df.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,open_acc,pub_rec,revol_bal,revol_util,initial_list_status,application_type,mo_sin_old_il_acct,mort_acc,earliest_cr_year,fico_score,charged_off
0,0,3600.0,36,13.99,C4,10.0,MORTGAGE,4.740371,Not Verified,debt_consolidation,PA,5.91,7.0,0.0,3.441852,29.7,w,Individual,148.0,1.0,2003,677.0,0
1,1,24700.0,36,11.99,C1,10.0,MORTGAGE,4.81292,Not Verified,small_business,SD,16.06,22.0,0.0,4.331852,19.2,w,Individual,113.0,4.0,1999,717.0,0
2,2,20000.0,60,10.78,B4,10.0,MORTGAGE,4.799347,Not Verified,home_improvement,IL,10.78,6.0,0.0,3.895975,56.2,w,Joint App,125.0,5.0,2000,697.0,0
3,4,10400.0,60,22.45,F1,3.0,MORTGAGE,5.018842,Source Verified,major_purchase,PA,25.37,12.0,0.0,4.341039,64.5,w,Individual,128.0,6.0,1998,697.0,0
4,5,11950.0,36,13.44,C3,4.0,RENT,4.531492,Source Verified,debt_consolidation,GA,10.2,5.0,0.0,3.945616,68.4,w,Individual,338.0,0.0,1987,692.0,0


In [50]:
#drop columns to be used later
def drop_cols(cols):
    df.drop(labels=cols, axis=1, inplace=True)

In [51]:
# 0 fully paid, 1 charged off
df["charged_off"].value_counts()

0    1076751
1     268559
Name: charged_off, dtype: int64

In [52]:
#drop unnamed
drop_cols('Unnamed: 0')
df.head()

Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,open_acc,pub_rec,revol_bal,revol_util,initial_list_status,application_type,mo_sin_old_il_acct,mort_acc,earliest_cr_year,fico_score,charged_off
0,3600.0,36,13.99,C4,10.0,MORTGAGE,4.740371,Not Verified,debt_consolidation,PA,5.91,7.0,0.0,3.441852,29.7,w,Individual,148.0,1.0,2003,677.0,0
1,24700.0,36,11.99,C1,10.0,MORTGAGE,4.81292,Not Verified,small_business,SD,16.06,22.0,0.0,4.331852,19.2,w,Individual,113.0,4.0,1999,717.0,0
2,20000.0,60,10.78,B4,10.0,MORTGAGE,4.799347,Not Verified,home_improvement,IL,10.78,6.0,0.0,3.895975,56.2,w,Joint App,125.0,5.0,2000,697.0,0
3,10400.0,60,22.45,F1,3.0,MORTGAGE,5.018842,Source Verified,major_purchase,PA,25.37,12.0,0.0,4.341039,64.5,w,Individual,128.0,6.0,1998,697.0,0
4,11950.0,36,13.44,C3,4.0,RENT,4.531492,Source Verified,debt_consolidation,GA,10.2,5.0,0.0,3.945616,68.4,w,Individual,338.0,0.0,1987,692.0,0


In [53]:
#check column types
print(df.dtypes)

loan_amnt              float64
term                     int64
int_rate               float64
sub_grade               object
emp_length             float64
home_ownership          object
annual_inc             float64
verification_status     object
purpose                 object
addr_state              object
dti                    float64
open_acc               float64
pub_rec                float64
revol_bal              float64
revol_util             float64
initial_list_status     object
application_type        object
mo_sin_old_il_acct     float64
mort_acc               float64
earliest_cr_year         int64
fico_score             float64
charged_off              int64
dtype: object


In [54]:
#code column obj
df['sub_grade'] = df['sub_grade'].astype("category")
df['sub_grade'] = df.sub_grade.cat.codes

df['home_ownership'] = df['home_ownership'].astype("category")
df['home_ownership'] = df.home_ownership.cat.codes

df['verification_status'] = df['verification_status'].astype("category")
df['verification_status'] = df.verification_status.cat.codes

df['purpose'] = df['purpose'].astype("category")
df['purpose'] = df.purpose.cat.codes

df['addr_state'] = df['addr_state'].astype("category")
df['addr_state'] = df.addr_state.cat.codes

df['initial_list_status'] = df['initial_list_status'].astype("category")
df['initial_list_status'] = df.initial_list_status.cat.codes

df['application_type'] = df['application_type'].astype("category")
df['application_type'] = df.application_type.cat.codes


In [55]:
#double check
print(df.dtypes)

loan_amnt              float64
term                     int64
int_rate               float64
sub_grade                 int8
emp_length             float64
home_ownership            int8
annual_inc             float64
verification_status       int8
purpose                   int8
addr_state                int8
dti                    float64
open_acc               float64
pub_rec                float64
revol_bal              float64
revol_util             float64
initial_list_status       int8
application_type          int8
mo_sin_old_il_acct     float64
mort_acc               float64
earliest_cr_year         int64
fico_score             float64
charged_off              int64
dtype: object


In [56]:
#drop any NA rows
df = df.dropna()

In [57]:
#check df size
print(len(df.index))

1167952


In [58]:
#split data
from sklearn.model_selection import train_test_split
y = df["charged_off"]
X = df.drop(columns=["charged_off"])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [61]:
#scale data
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [72]:
#imports for ML model
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder


In [75]:
#get model type
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

#fit training data for labels
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

LabelEncoder()

In [76]:
#fit training data
classifier.fit(X_train_scaled, y_train)

LogisticRegression()

In [77]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.8041814503792394
Testing Data Score: 0.8039508472950944


In [78]:
#set y data as category
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [79]:
#model, 21 inputs, 2 outputs
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=21))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 100)               2200      
_________________________________________________________________
dense_13 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 202       
Total params: 12,502
Trainable params: 12,502
Non-trainable params: 0
_________________________________________________________________


In [80]:
# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [81]:
#run model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=10,
    shuffle=True,
    verbose=2
)

Epoch 1/10
 - 28s - loss: 0.4513 - accuracy: 0.8038
Epoch 2/10
 - 27s - loss: 0.4490 - accuracy: 0.8046
Epoch 3/10
 - 26s - loss: 0.4484 - accuracy: 0.8047
Epoch 4/10
 - 26s - loss: 0.4481 - accuracy: 0.8050
Epoch 5/10
 - 26s - loss: 0.4478 - accuracy: 0.8049
Epoch 6/10
 - 26s - loss: 0.4475 - accuracy: 0.8052
Epoch 7/10
 - 26s - loss: 0.4474 - accuracy: 0.8051
Epoch 8/10
 - 26s - loss: 0.4472 - accuracy: 0.8054
Epoch 9/10
 - 26s - loss: 0.4471 - accuracy: 0.8052
Epoch 10/10
 - 26s - loss: 0.4470 - accuracy: 0.8054


<keras.callbacks.callbacks.History at 0x135ac37a668>

In [82]:
#evaluate model loss and acc
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.4476959362301707, Accuracy: 0.8047419786453247


In [92]:
#load test data to predict classes with model
encoded_predictions = model.predict_classes(X_test_scaled[:10])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)


In [93]:
#print 10 tests
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:10])}")

Predicted classes: [0 0 0 0 0 0 0 0 0 0]
Actual Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [94]:
# save fitted model to file
import joblib
filename = 'deep_learning_log_reg.sav'
joblib.dump(model, filename)

['deep_learning_log_reg.sav']