In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [3]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [4]:
import copy

In [5]:
data = pd.read_csv('loan_prediction.csv')
df = copy.deepcopy(data)

In [6]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [8]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
df['Gender'].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

In [10]:
cat_cols_with_na = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for col in cat_cols_with_na:
    df[col].fillna(df[col].mode()[0], inplace=True)

num_cols_with_na = ['LoanAmount', 'Loan_Amount_Term']
for col in num_cols_with_na:
    df[col].fillna(df[col].median(), inplace=True)

In [11]:
print("Missing Values After Imputation:")
print(df.isnull().sum())
print("-" * 30)


Missing Values After Imputation:
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64
------------------------------


In [12]:
df['Dependents'].value_counts()

Dependents
0     360
1     102
2     101
3+     51
Name: count, dtype: int64

In [13]:
df = df.drop('Loan_ID', axis=1)

In [14]:
df['Loan_Status'].value_counts()

Loan_Status
Y    422
N    192
Name: count, dtype: int64

In [15]:
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 57.7+ KB


In [17]:
categorical_cols = df.select_dtypes(include='object').columns
print(f"Categorical columns to encode: {list(categorical_cols)}")

Categorical columns to encode: ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']


In [18]:
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [19]:
print("Data Head After Preprocessing and Encoding:")
df.head()

Data Head After Preprocessing and Encoding:


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,128.0,360.0,1.0,1,True,False,False,False,False,False,False,False,True
1,4583,1508.0,128.0,360.0,1.0,0,True,True,True,False,False,False,False,False,False
2,3000,0.0,66.0,360.0,1.0,1,True,True,False,False,False,False,True,False,True
3,2583,2358.0,120.0,360.0,1.0,1,True,True,False,False,False,True,False,False,True
4,6000,0.0,141.0,360.0,1.0,1,True,False,False,False,False,False,False,False,True


In [20]:
print("Final Data Info:")
df.info()
print("-" * 30)

Final Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          614 non-null    int64  
 1   CoapplicantIncome        614 non-null    float64
 2   LoanAmount               614 non-null    float64
 3   Loan_Amount_Term         614 non-null    float64
 4   Credit_History           614 non-null    float64
 5   Loan_Status              614 non-null    int64  
 6   Gender_Male              614 non-null    bool   
 7   Married_Yes              614 non-null    bool   
 8   Dependents_1             614 non-null    bool   
 9   Dependents_2             614 non-null    bool   
 10  Dependents_3+            614 non-null    bool   
 11  Education_Not Graduate   614 non-null    bool   
 12  Self_Employed_Yes        614 non-null    bool   
 13  Property_Area_Semiurban  614 non-null    bool   
 14  Property_

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score

In [22]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [24]:
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y,
                                                    shuffle=True)

In [26]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [27]:
x_train = copy.deepcopy(X_train_scaled)
x_test = copy.deepcopy(X_test_scaled)

In [28]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((491, 14), (123, 14), (491,), (123,))

Voting Classifier

In [29]:
print("\n--- Training Voting Classifier ---")
clf1 = LogisticRegression(random_state=42, max_iter=1000)
clf2 = DecisionTreeClassifier(random_state=42)
clf3 = SVC(probability=True, random_state=42) # SVC with probability=True for soft voting


--- Training Voting Classifier ---


In [37]:
voting_clf = VotingClassifier(
    estimators=[('lr', clf1), ('dt', clf2), ('svc', clf3)],
    voting='soft',
    weights=[0.7, 0.2, 0.1] # 'soft' uses predicted probabilities, often better
)

In [39]:
voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)
print("--- Voting Classifier Results ---")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_voting)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_voting):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, voting_clf.predict_proba(X_test)[:, 1]):.4f}")
print(classification_report(y_test, y_pred_voting))

--- Voting Classifier Results ---
Confusion Matrix:
[[23 15]
 [ 3 82]]
Accuracy: 0.8537
ROC AUC: 0.8266
              precision    recall  f1-score   support

           0       0.88      0.61      0.72        38
           1       0.85      0.96      0.90        85

    accuracy                           0.85       123
   macro avg       0.86      0.78      0.81       123
weighted avg       0.86      0.85      0.84       123



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Bagging Classifier

In [40]:
print("\n--- Training Bagging Classifier ---")
bagging_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), 
                                n_estimators=100,
                                max_samples=0.8, 
                                bootstrap=True, 
                                random_state=42, 
                                n_jobs=-1)


--- Training Bagging Classifier ---


In [41]:
bagging_clf.fit(X_train, y_train)
y_pred_bagging = bagging_clf.predict(X_test)
print("--- Bagging Classifier Results ---")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_bagging)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_bagging):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, bagging_clf.predict_proba(X_test)[:, 1]):.4f}")
print(classification_report(y_test, y_pred_bagging))

--- Bagging Classifier Results ---
Confusion Matrix:
[[23 15]
 [ 4 81]]
Accuracy: 0.8455
ROC AUC: 0.8159
              precision    recall  f1-score   support

           0       0.85      0.61      0.71        38
           1       0.84      0.95      0.90        85

    accuracy                           0.85       123
   macro avg       0.85      0.78      0.80       123
weighted avg       0.85      0.85      0.84       123



Gradien Boosting Classifier

In [43]:
print("\n--- Training Gradient Boosting Classifier ---")
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_clf.fit(X_train, y_train)
y_pred_gb = gb_clf.predict(X_test)
print("--- Gradient Boosting Classifier Results ---")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_gb)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, gb_clf.predict_proba(X_test)[:, 1]):.4f}")
print(classification_report(y_test, y_pred_gb))



--- Training Gradient Boosting Classifier ---
--- Gradient Boosting Classifier Results ---
Confusion Matrix:
[[21 17]
 [ 4 81]]
Accuracy: 0.8293
ROC AUC: 0.7700
              precision    recall  f1-score   support

           0       0.84      0.55      0.67        38
           1       0.83      0.95      0.89        85

    accuracy                           0.83       123
   macro avg       0.83      0.75      0.78       123
weighted avg       0.83      0.83      0.82       123



XG Boost

In [47]:
print("\n--- Training XGBoost Classifier ---")
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print("--- XGBoost Classifier Results ---")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_xgb)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1]):.4f}")
print(classification_report(y_test, y_pred_xgb))


--- Training XGBoost Classifier ---
--- XGBoost Classifier Results ---
Confusion Matrix:
[[24 14]
 [14 71]]
Accuracy: 0.7724
ROC AUC: 0.8022
              precision    recall  f1-score   support

           0       0.63      0.63      0.63        38
           1       0.84      0.84      0.84        85

    accuracy                           0.77       123
   macro avg       0.73      0.73      0.73       123
weighted avg       0.77      0.77      0.77       123



LightGBM

In [49]:
lgbm_clf = LGBMClassifier(random_state=42, verbose=-1)
lgbm_clf.fit(X_train, y_train)
y_pred_lgbm = lgbm_clf.predict(X_test)
print("--- LightGBM Classifier Results ---")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_lgbm)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1]):.4f}")
print(classification_report(y_test, y_pred_lgbm))

--- LightGBM Classifier Results ---
Confusion Matrix:
[[24 14]
 [11 74]]
Accuracy: 0.7967
ROC AUC: 0.7904
              precision    recall  f1-score   support

           0       0.69      0.63      0.66        38
           1       0.84      0.87      0.86        85

    accuracy                           0.80       123
   macro avg       0.76      0.75      0.76       123
weighted avg       0.79      0.80      0.79       123

