<a href="https://colab.research.google.com/github/rameenhamad/Elvovo_ML_Internship_Tasks/blob/main/4_Loan_Approval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [5]:
# Location of the raw loan-approval CSV (Google Drive mount used by Colab).
DATA_PATH = "/content/drive/MyDrive/Elvovo_Tasks/4.Loan_Approval_Prediction/loan_approval_dataset.csv"

# Read the dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [7]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be assigned back for the de-duplication to take effect.
df = df.drop_duplicates()

# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
loan_id,0
no_of_dependents,0
education,0
self_employed,0
income_annum,0
loan_amount,0
loan_term,0
cibil_score,0
residential_assets_value,0
commercial_assets_value,0


In [8]:
# Print the column names, non-null counts and dtypes of the loaded frame.
# The recorded output shows most column names with a leading space.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


Preprocessing

In [9]:
# Remove leading whitespace from every column name so columns can be
# addressed by their plain names (e.g. 'loan_status').
df.columns = df.columns.map(str.lstrip)

In [14]:
# Features: every column except the target and loan_id
# (loan_id appears to be a plain row identifier -- TODO confirm).
X = df.drop(['loan_status','loan_id'], axis=1)

# Encode the target explicitly. The recorded outputs below show 0/1 labels,
# but no visible cell performs that encoding, so on a fresh kernel the target
# would otherwise remain the raw strings. LabelEncoder assigns codes in sorted
# label order; inspect target_encoder.classes_ for the exact mapping.
# Wrapping the result in a named Series keeps y.value_counts() working.
target_encoder = LabelEncoder()
y = pd.Series(
  target_encoder.fit_transform(df['loan_status']),
  index=df.index,
  name='loan_status',
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Class balance of the target: number of rows per label.
target_counts = y.value_counts()
target_counts

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,2656
1,1613


Encoding

In [16]:
# Work on explicit copies so the assignments below do not write into frames
# derived from the original DataFrame.
X_train = X_train.copy()
X_test = X_test.copy()

# Label-encode every text column of the FEATURE matrix.
# - Iterate over X_train's columns (not df's): df still contains the target and
#   loan_id, which are not present in X_train.
# - Use one encoder per column and keep it, so each mapping stays available.
# - Pass a 1-D Series, which is the input shape LabelEncoder expects.
# Each encoder is fitted on the training split only and then applied to the
# test split; a category that occurs only in the test split raises ValueError.
encoders = {}
for col in X_train.columns:
  if X_train[col].dtype == "object":
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    encoders[col] = encoder
X_train.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
1675,5,1,0,7900000,29900000,6,568,5800000,13900000,15900000,8700000
1164,0,1,1,9600000,34000000,12,710,23800000,10300000,38100000,7800000
192,1,0,0,800000,2900000,8,682,2200000,1100000,2900000,700000
910,2,0,1,4900000,13100000,18,754,8200000,3300000,16500000,7200000
567,5,0,1,3000000,11100000,12,441,8500000,2500000,7300000,2000000


Standardization (feature scaling)

In [17]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
# Baseline: decision tree trained on the original (unbalanced) training split.
# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, reproducible across runs; 42 matches the seed used for the split.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [34]:
# Per-class precision / recall / F1 of the baseline tree on the test split.
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       536
           1       0.97      0.97      0.97       318

    accuracy                           0.98       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854



Trying to Balance Target Variable with SMOTE

In [22]:
# Oversample the minority class of the TRAINING split only, so the test split
# stays untouched. SMOTE generates synthetic samples using randomness, so the
# seed is fixed to make the resampled set reproducible.
smote = SMOTE(random_state=42)
X_train_resample, y_train_resample = smote.fit_resample(X_train, y_train)

In [23]:
# Class balance of the training labels after SMOTE.
resampled_counts = y_train_resample.value_counts()
resampled_counts

Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,2120
1,2120


Now training logistic regression vs. decision tree on balanced data

In [25]:
# Logistic regression fitted on the SMOTE-balanced training data
# (fit() returns the estimator itself, so construction and fitting are chained).
model_lg = LogisticRegression().fit(X_train_resample, y_train_resample)

# Predictions on the untouched test split.
y_pred_lg = model_lg.predict(X_test)

In [26]:
# Per-class precision / recall / F1 of logistic regression on the test split.
report_lg = classification_report(y_test, y_pred_lg)
print(report_lg)

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       536
           1       0.85      0.92      0.88       318

    accuracy                           0.91       854
   macro avg       0.90      0.91      0.90       854
weighted avg       0.91      0.91      0.91       854



In [27]:
# Decision tree fitted on the SMOTE-balanced training data.
# A fixed random_state keeps the fitted tree, and the comparison against
# logistic regression, reproducible from run to run.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train_resample, y_train_resample)

# Predictions on the untouched test split.
y_pred_dt = model_dt.predict(X_test)

In [28]:
# Per-class precision / recall / F1 of the decision tree on the test split.
report_dt = classification_report(y_test, y_pred_dt)
print(report_dt)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       536
           1       0.97      0.96      0.96       318

    accuracy                           0.97       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.97      0.97      0.97       854



Comparing logistic regression vs. decision tree

In [30]:
# Test-set accuracy of each model trained on the balanced data.
accuracy_lg = accuracy_score(y_test, y_pred_lg)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Pick the message for the model with the higher accuracy (or report a tie).
if accuracy_lg > accuracy_dt:
  verdict = "Logistic Regression performed well!"
elif accuracy_lg < accuracy_dt:
  verdict = "Decision Tree performed well!"
else:
  verdict = "Both models performed same!"
print(verdict)

Decision Tree performed well!
