In [72]:
# usual imports
import numpy as np 
import pandas as pd
import seaborn as sns

In [73]:
df = pd.read_csv("loan_data.csv")
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


# Data Cleaning & Preprocessing

In this exercise, we will train a logistic regression model. This dataset is Loan Approval Classification Dataset where we Build predictive models to classify the loan_status variable (approved/not approved) for potential applicants. With no null values.

- float64(6)
- int64(3)
- object(5)
 
Our target variable is loan_status already numeric nothing to there but there are 5 columns non-numeric which needs to be converted.

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [75]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80319.05,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,80422.5,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47204.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67048.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95789.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,7200766.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


## Converting Categorical Data

In [76]:
# binary conversion
df['person_gender'].unique()

array(['female', 'male'], dtype=object)

In [77]:
df.person_gender =[1 if value == "male" else 0 for value in df.person_gender]


In [78]:
# ordinal categories
df['person_education'].unique()

array(['Master', 'High School', 'Bachelor', 'Associate', 'Doctorate'],
      dtype=object)

In [79]:
# ordinal categories
category_mapper = {'Doctorate': 4, 'Master': 3, 'Bachelor': 2, 'Associate': 1, 'High School': 0}
df['person_education'] = df['person_education'].map(category_mapper)

In [80]:
# Nominal Categories
df['person_home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER'], dtype=object)

In [81]:
# Nominal Categories
df['loan_intent'].unique()

array(['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT',
       'DEBTCONSOLIDATION'], dtype=object)

In [82]:
# Binary Categories
df['previous_loan_defaults_on_file'].unique()

array(['No', 'Yes'], dtype=object)

In [83]:
# convert binary categories
df.previous_loan_defaults_on_file =[1 if value == "Yes" else 0 for value in df.previous_loan_defaults_on_file]

In [84]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,0,3,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,0,1
1,21.0,0,0,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,1,0
2,25.0,0,0,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,0,1
3,23.0,0,2,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,0,1
4,24.0,1,3,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,0,1


In [85]:
# One-hot encode nominal variables
from sklearn.preprocessing import OneHotEncoder

variables = ['person_home_ownership', 'loan_intent']

# use encoder
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df,one_hot_encoded],axis=1).drop(columns=variables)


In [86]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,...,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22.0,0,3,71948.0,0,35000.0,16.02,0.49,3.0,561,...,0,0,0,1,0,0,0,0,1,0
1,21.0,0,0,12282.0,0,1000.0,11.14,0.08,2.0,504,...,0,0,1,0,0,1,0,0,0,0
2,25.0,0,0,12438.0,3,5500.0,12.87,0.44,3.0,635,...,1,0,0,0,0,0,0,1,0,0
3,23.0,0,2,79753.0,0,35000.0,15.23,0.44,2.0,675,...,0,0,0,1,0,0,0,1,0,0
4,24.0,1,3,66135.0,1,35000.0,14.27,0.53,4.0,586,...,0,0,0,1,0,0,0,1,0,0


# Logistic Regression
## Preprocessing
Once our dataset is clean that that we know that our variables are reliable, we can proceed to train our model. The first thing to do is to do is to separate the target variable (here called "y") and the predictors (here called "X")

In [87]:
# Prepare the model
y = df["loan_status"] # our target variable
X = df.drop(["loan_status"], axis=1) # our predictors

## Normalize the data
This will improve the performance of our machine learning algorithm.

In [88]:
from sklearn.preprocessing import StandardScaler

# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to the data and transform the data
X_scaled = scaler.fit_transform(X)

## Train Test Split

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

## Logistic Regression Model

In [90]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression model
lr = LogisticRegression()

# Train the model on the training data
lr.fit(X_train, y_train)

# Predict the target variable on the test data
y_pred = lr.predict(X_test)

# Evaluate the Model

Accuracy is 89% which looks good here will check the other scores now.

In [91]:
from sklearn.metrics import accuracy_score

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.89


In [92]:
# importing all the metrics for further check
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [93]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.8942962962962963
Precision: 0.7709190672153635
Recall: 0.747588959095444
F1 Score: 0.7590747931791322


In [94]:
# The AUC score is a super sensitive metric
# you often get low scores, even 0.5

# in binary logistic regression, AUC values are often interpreted as follows:
# A binary classifier is useful only when it achieves ROC-AUC score greater than 0.5 and as near to 1 as possible. 
# If a classifier yields a score less than 0.5, it simply means that the model is performing worse 
# than a random classifier, and therefore is useless.

# In multinomial logistic regression , AUC values are often interpreted as follows: 
# 0.5-0.6 (failed)
# 0.6-0.7 (worthless)
# 0.7-0.8 (poor)
# 0.8-0.9 (good)
# > 0.9 (excellent)

# basically 0.5 means you could get the same result with just random guessing
print("ROC-AUC Score:", roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))


ROC-AUC Score: 0.9530489406353883


In [95]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[9825  668]
 [ 759 2248]]


# Analysis of the Scores.

- Accuracy (89%): The model is correct 89 out of 100 times.
- Precision (77%): When it predicts approval (1), it's right 77% of the time.
- Recall (75%): It catches 75% of all actual approved cases.
- F1 Score (76%): Balanced score between precision and recall.
- ROC-AUC (0.95): Excellent! The model is very good at separating approved vs. rejected.

Read: https://www.kdnuggets.com/2022/10/classification-metrics-walkthrough-logistic-regression-accuracy-precision-recall-roc.html

In [110]:
tester_row = {
    'person_age': 30,
    'person_gender': 1,
    'person_education': 3,
    'person_income': 60000,
    'person_emp_exp': 5,
    'loan_amnt': 12000,
    'loan_int_rate': 10.5,
    'loan_percent_income': 0.2,
    'cb_person_cred_hist_length': 4,
    'credit_score': 700,
    'previous_loan_defaults_on_file': 0,
    'person_home_ownership_MORTGAGE': 0,
    'person_home_ownership_OTHER': 0,
    'person_home_ownership_OWN': 1,
    'person_home_ownership_RENT': 0,
    'loan_intent_DEBTCONSOLIDATION': 1,
    'loan_intent_EDUCATION': 0,
    'loan_intent_HOMEIMPROVEMENT': 0,
    'loan_intent_MEDICAL': 0,
    'loan_intent_PERSONAL': 0,
    'loan_intent_VENTURE': 0
}

# create a pandas DataFrame and scale the values
tester_row = pd.DataFrame([tester_row])
tester_row = scaler.transform(tester_row)

print("All probabilities by category:")
print(lr.predict_proba(tester_row))
print()

# change these based on your original data
labels = ["No", "Yes"]

print("Probability of Approval: (Yes/No):")
result = labels[lr.predict(tester_row)[0]]
print(result)
print("-------------------")


All probabilities by category:
[[0.88176263 0.11823737]]

Probability of Approval: (Yes/No):
No
-------------------


In [112]:
# Making the income to a lot and check and the model works
tester_row = {
    'person_age': 30,
    'person_gender': 1,
    'person_education': 3,
    'person_income': 6000000,
    'person_emp_exp': 5,
    'loan_amnt': 12000,
    'loan_int_rate': 10.5,
    'loan_percent_income': 0.2,
    'cb_person_cred_hist_length': 4,
    'credit_score': 700,
    'previous_loan_defaults_on_file': 0,
    'person_home_ownership_MORTGAGE': 0,
    'person_home_ownership_OTHER': 0,
    'person_home_ownership_OWN': 1,
    'person_home_ownership_RENT': 0,
    'loan_intent_DEBTCONSOLIDATION': 1,
    'loan_intent_EDUCATION': 0,
    'loan_intent_HOMEIMPROVEMENT': 0,
    'loan_intent_MEDICAL': 0,
    'loan_intent_PERSONAL': 0,
    'loan_intent_VENTURE': 0
}

# create a pandas DataFrame and scale the values
tester_row = pd.DataFrame([tester_row])
tester_row = scaler.transform(tester_row)

print("All probabilities by category:")
print(lr.predict_proba(tester_row))
print()

# change these based on your original data
labels = ["No", "Yes"]

print("Probability of Approval: (Yes/No):")
result = labels[lr.predict(tester_row)[0]]
print(result)
print("-------------------")


All probabilities by category:
[[0.4023118 0.5976882]]

Probability of Approval: (Yes/No):
Yes
-------------------


# Personal Analysis

Logistic regressions was easy to use after cleaning and encoding the data correctly and most time spent there. I learned how to prepare features properly using one-hot and ordinal encoding. The model gave high accuracy and a good ROC-AUC score, which means it can predict loan approvals well learned throughts different scores and evaluate them. This kind of model could be useful in banks or fintech companies for making quick loan decisions easily. Testing with new values also worked well. The main challenge was making sure the test inputs matched the training features as it was failed also I had to use the same scaler here which was used to train the model.



# Build WebApp in Flask

Will save the model and scaler in pickle to use in flask.

In [None]:
# https://medium.com/@agulyamova/deploying-classification-model-with-flask-1a694d3534a2
import joblib

# Save model, scaler, and column orders here
joblib.dump(lr, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Last time in HTML format I had to write the column names by one and one to build the form,
# which took a lot time this time use this method
joblib.dump(X.columns.tolist(), 'columns.pkl')

['columns.pkl']

# Experimenting with Different Solvers in Logistic Regression.

An online solver is a type of optimization algorithm that updates its parameters incrementally as it processes each data point. Comparing different solvers accuracy score below. liblinear is showing highest with 90%

https://www.geeksforgeeks.org/comparing-various-online-solvers-in-scikit-learn/

In [122]:
# Import the necessary libraries 
from sklearn.datasets import load_digits 
from sklearn.linear_model import LogisticRegression, SGDClassifier 
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 

target = df["loan_status"]
data = df.drop(["loan_status"], axis=1)

# split data into train and test sets 
X_train, X_test, y_train, y_test = train_test_split( 
	data, target, test_size=0.3) 

# define solvers to compare 
solvers = [ 
	('SAG', LogisticRegression(penalty='l2', 
							solver='sag', 
							max_iter=100)), 
	('SAGA', LogisticRegression(penalty='l1', 
								solver='saga', 
								max_iter=100)), 
	('L-BFGS', LogisticRegression(penalty='l2', 
								solver='lbfgs', 
								max_iter=100)), 
	('liblinear', LogisticRegression(penalty='l1', 
									solver='liblinear', 
									max_iter=100)), 
	('Passive-Aggressive', PassiveAggressiveClassifier(max_iter=100)), 
	('Perceptron', Perceptron(max_iter=100)) 
] 

# train and evaluate each solver 
for name, clf in solvers: 
	clf.fit(X_train, y_train) 
	y_pred = clf.predict(X_test) 
	acc = accuracy_score(y_test, y_pred) 
	print(f"{name} accuracy: {acc}") 




SAG accuracy: 0.7830370370370371




SAGA accuracy: 0.7765925925925926


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


L-BFGS accuracy: 0.8325185185185185
liblinear accuracy: 0.9005925925925926
Passive-Aggressive accuracy: 0.5875555555555556
Perceptron accuracy: 0.808962962962963
