### data source: https://www.kaggle.com/datasets/laotse/credit-risk-dataset

In [1]:
# Import our dependencies
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input

In [None]:
# Connect to database
# The connection string is read from the DATABASE_URL environment variable when it
# is set, so real credentials do not have to be written into the notebook. When
# the variable is absent, the original localhost connection string is used.
database_url = os.environ.get(
    'DATABASE_URL',
    'postgresql://postgres:postgres@localhost:5432/loan_approval',
)
engine = create_engine(database_url)

In [5]:
# Load every row of the `loan` table into a DataFrame and preview it
query = 'SELECT * FROM loan'
df1 = pd.read_sql(sql=query, con=engine)
df1.head()

Unnamed: 0,age,income,home_ownership,employment_duration,loan_purpose,loan_grade,loan_amount,int_rate,loan_status,loan_income_pct,past_default_status,credit_history_length
0,21.0,9600.0,OWN,5.0,EDUCATION,B,1000.0,11.14,0.0,0.1,N,2.0
1,25.0,9600.0,MORTGAGE,1.0,MEDICAL,C,5500.0,12.87,1.0,0.57,N,3.0
2,23.0,65500.0,RENT,4.0,MEDICAL,C,35000.0,15.23,1.0,0.53,N,2.0
3,24.0,54400.0,RENT,8.0,MEDICAL,C,35000.0,14.27,1.0,0.55,Y,4.0
4,21.0,9900.0,OWN,2.0,VENTURE,A,2500.0,7.14,1.0,0.25,N,2.0


In [6]:
# One-hot encode the categorical (text) columns so that every feature is numeric
categorical_cols = [
    'home_ownership',
    'loan_purpose',
    'loan_grade',
    'past_default_status',
]

# drop_first=False keeps an indicator column for every category level.
# NOTE(review): df1 is overwritten with the encoded frame, so this cell expects
# the un-encoded frame from the load cell; re-run that cell first before
# re-running this one.
df1 = pd.get_dummies(data=df1, columns=categorical_cols, drop_first=False)


In [7]:
# Split the frame into the target vector (y) and the feature matrix (X)
target_col = "loan_status"
y = df1[target_col]
X = df1.drop(columns=[target_col])

In [8]:
# Display the first rows of the dataframe (df1 still contains the loan_status column;
# only X had it dropped)
df1.head()

Unnamed: 0,age,income,employment_duration,loan_amount,int_rate,loan_status,loan_income_pct,credit_history_length,home_ownership_MORTGAGE,home_ownership_OTHER,...,loan_purpose_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,past_default_status_N,past_default_status_Y
0,21.0,9600.0,5.0,1000.0,11.14,0.0,0.1,2.0,False,False,...,False,False,True,False,False,False,False,False,True,False
1,25.0,9600.0,1.0,5500.0,12.87,1.0,0.57,3.0,True,False,...,False,False,False,True,False,False,False,False,True,False
2,23.0,65500.0,4.0,35000.0,15.23,1.0,0.53,2.0,False,False,...,False,False,False,True,False,False,False,False,True,False
3,24.0,54400.0,8.0,35000.0,14.27,1.0,0.55,4.0,False,False,...,False,False,False,True,False,False,False,False,False,True
4,21.0,9900.0,2.0,2500.0,7.14,1.0,0.25,2.0,False,False,...,True,True,False,False,False,False,False,False,True,False


In [9]:
# Hold out part of the data for testing.
# stratify=y keeps the loan_status class proportions the same in both subsets and
# random_state=1 makes the split repeatable.
# (train_test_split is already imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)
X_train.shape

(21474, 26)

In [10]:
# Standardize the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to both splits.
# (StandardScaler is already imported in the notebook's import cell.)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Build the (not yet fitted) logistic regression classifier
from sklearn.linear_model import LogisticRegression

lr_params = {"solver": "lbfgs", "max_iter": 200, "random_state": 1}
classifier = LogisticRegression(**lr_params)
classifier

In [12]:
# Fit (train) our model using the scaled training data
classifier.fit(X=X_train_scaled, y=y_train)

In [13]:
# Score the fitted model on the training split and on the held-out test split
train_score = classifier.score(X_train_scaled, y_train)
print(f"Training Data Score: {train_score}")
test_score = classifier.score(X_test_scaled, y_test)
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8687249697308372
Testing Data Score: 0.8667225481978206


In [14]:
# Predict loan_status for the test rows and place the predictions next to the
# true labels for a side-by-side look
predictions = classifier.predict(X_test_scaled)
comparison = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# Discard the original row labels carried over from y_test; number rows from 0
results = comparison.reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0.0,0.0
1,0.0,1.0
2,0.0,0.0
3,1.0,1.0
4,0.0,0.0
5,0.0,0.0
6,1.0,1.0
7,0.0,0.0
8,0.0,0.0
9,0.0,0.0


In [15]:
# Accuracy on the test dataset: compare the predictions against y_test
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=predictions)

0.8667225481978206

In [None]:
# Summarise test-set performance: confusion matrix, per-class report and accuracy.
# accuracy_score is imported here as well, so this cell no longer relies on the
# import made in a different cell.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# TODO: this needs to be saved (original author's note)
print('Confusion Matrix:')
print(confusion_matrix(y_test, predictions))
# Display names used in the report for the two loan_status classes
target_names = ["Loan Status 0", "Loan Status 1"]
print(classification_report(y_test, predictions, target_names=target_names))
print('\nAccuracy Score:')
print(accuracy_score(y_test, predictions))

Confusion Matrix:
[[5350  257]
 [ 697  854]]
               precision    recall  f1-score   support

Loan Status 0       0.88      0.95      0.92      5607
Loan Status 1       0.77      0.55      0.64      1551

     accuracy                           0.87      7158
    macro avg       0.83      0.75      0.78      7158
 weighted avg       0.86      0.87      0.86      7158


Accuracy Score:
0.8667225481978206


In [None]:
import joblib

# Write the fitted classifier and the fitted scaler to the current working
# directory. Both are saved because the classifier was trained on scaled features.
joblib.dump(value=classifier, filename='logistic_regression_model.pkl')
joblib.dump(value=scaler, filename='scaler_model.pkl')

['scaler_model.pkl']

In [20]:
# Close engine
# dispose() releases the engine's pooled database connections; nothing after the
# load cell reads from the database.
engine.dispose()