In [6]:
import pandas as pd

# Load the raw loan-approval records; the path is relative to the working directory.
df = pd.read_csv("loan_approval_dataset.csv")

# First rows, then the schema, then the per-column null counts.
print(df.head(2))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()
print(df.isnull().sum())

   loan_id   no_of_dependents      education  self_employed   income_annum  \
0        1                  2       Graduate             No        9600000   
1        2                  0   Not Graduate            Yes        4100000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0                  17600000              22700000            8000000   
1                   2200000               8800000            3300000   

   loan_status  
0     Approved  
1     Rejected  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no

In [7]:
# Column headers in the CSV carry a leading space (e.g. " loan_status"),
# so normalise them before any column is looked up by name.
df.columns = df.columns.str.strip()

print(df["loan_status"].unique())   # labels exactly as currently stored

# Encode the target only while it is still text. Once the column holds 0/1 the
# .str accessor would raise, so this guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df["loan_status"]):
    # The raw labels are padded with a leading space (" Approved", " Rejected").
    status_labels = df["loan_status"].str.strip()
    status_codes = status_labels.map({
        "Approved": 1,
        "Rejected": 0
    })
    # map() turns every label missing from the dict into NaN; stop here instead
    # of letting a partially-missing target flow into model training.
    unmapped = status_labels[status_codes.isnull()].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected loan_status labels: {list(unmapped)}")
    df["loan_status"] = status_codes

print(df["loan_status"].isnull().sum())
print(df["loan_status"].value_counts())


[' Approved' ' Rejected']
0
loan_status
1    2656
0    1613
Name: count, dtype: int64


In [8]:
# Target: the loan_status column.
y = df["loan_status"]

# Features: every remaining column except the target itself and loan_id.
x = df.drop(columns=["loan_status", "loan_id"])

print(x.shape, y.shape)




(4269, 11) (4269,)


In [9]:

# One-hot encode the two text columns; drop_first keeps a single indicator
# column for each of them. The category labels still carry the CSV's leading
# space, which is why the generated names look like "education_ Not Graduate".
categorical_columns = ['education', 'self_employed']
x_encoded = pd.get_dummies(data=x, columns=categorical_columns, drop_first=True)
print(x_encoded.head(2))

   no_of_dependents  income_annum  loan_amount  loan_term  cibil_score  \
0                 2       9600000     29900000         12          778   
1                 0       4100000     12200000          8          417   

   residential_assets_value  commercial_assets_value  luxury_assets_value  \
0                   2400000                 17600000             22700000   
1                   2700000                  2200000              8800000   

   bank_asset_value  education_ Not Graduate  self_employed_ Yes  
0           8000000                    False               False  
1           3300000                     True                True  


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_encoded,
    y,
    test_size=0.2,
    random_state=7,
)

# Shapes of the four partitions, printed on one line.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

(3415, 11) (3415,) (854, 11) (854,)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Logistic regression is trained and evaluated on the scaled features.
model = LogisticRegression()
model.fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9192037470725996
Confusion Matrix:
 [[294  39]
 [ 30 491]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.89       333
           1       0.93      0.94      0.93       521

    accuracy                           0.92       854
   macro avg       0.92      0.91      0.91       854
weighted avg       0.92      0.92      0.92       854



In [12]:
from sklearn.tree import DecisionTreeClassifier

# The tree is fitted on the unscaled feature frame (x_train), not the scaled copy.
# random_state is pinned (same seed as the train/test split) because the
# estimator permutes features at each split; without it, ties between equally
# good splits can resolve differently and the metrics below change run to run.
dt = DecisionTreeClassifier(random_state=7)
dt.fit(x_train, y_train)

y_pred_dt = dt.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))




Accuracy: 0.977751756440281
Confusion Matrix:
 [[325   8]
 [ 11 510]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97       333
           1       0.98      0.98      0.98       521

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [13]:
from sklearn.ensemble import RandomForestClassifier

# The forest is fitted on the unscaled feature frame (x_train), not the scaled copy.
# random_state is pinned (same seed as the train/test split): bootstrap sampling
# and per-split feature sampling are random, so an unseeded forest yields a
# different model, and different metrics, on every run.
rf = RandomForestClassifier(random_state=7)
rf.fit(x_train, y_train)

y_pred_rf = rf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))



Accuracy: 0.9812646370023419
Confusion Matrix:
 [[324   9]
 [  7 514]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       333
           1       0.98      0.99      0.98       521

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [14]:
import joblib

# Persist the fitted random forest and the fitted StandardScaler as two files.
# NOTE(review): rf was fitted on the unscaled x_train, while scaler was fitted
# for the logistic-regression inputs. Whoever loads these artifacts must NOT
# run scaler.transform() on data before rf.predict() - confirm how the
# consumer uses scaler.pkl.
model_path = "loan_approval_model.pkl"
scaler_path = "scaler.pkl"

joblib.dump(rf, model_path)
joblib.dump(scaler, scaler_path)


['scaler.pkl']

In [15]:
# Number of rows per loan_status value, shown as the cell's output.
class_counts = df["loan_status"].value_counts()
class_counts


loan_status
1    2656
0    1613
Name: count, dtype: int64