In [1]:
# Loan Approval Prediction Project
# Author: Sucheta Nandy
# Date: 27-August-2025

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
# 1. Load dataset
# Replace 'loan_data.csv' with your dataset path
data = pd.read_csv('loan_data.csv')

# Preview data
print(data.head())
print(data.info())

  Loan_ID  Gender Married Dependents Education Self_Employed  ApplicantIncome  \
0  LP1000    Male     Yes          0  Graduate            No            19854   
1  LP1001  Female     Yes          0  Graduate            No             7642   
2  LP1002    Male     Yes          2  Graduate            No            10594   
3  LP1003    Male      No          1  Graduate            No             1864   
4  LP1004    Male     Yes          2  Graduate            No            17188   

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
0               9689         169               360               1   
1               3171         266               360               1   
2               1567         105               360               1   
3               1151         415               360               1   
4               7619         553               360               1   

  Property_Area Loan_Status  
0         Urban           Y  
1     Semiurban           Y  
2 

In [4]:
# 2. Data Preprocessing
# Separate target variable
target = 'Loan_Status'
X = data.drop(columns=[target])
y = data[target]

In [5]:
# Encode target variable if necessary
le_target = LabelEncoder()
y = le_target.fit_transform(y)  # 'Y' -> 1, 'N' -> 0

In [6]:
# Handling missing values
num_features = X.select_dtypes(include=['int64', 'float64']).columns
cat_features = X.select_dtypes(include=['object']).columns

# Impute numerical features with median
imputer_num = SimpleImputer(strategy='median')
X[num_features] = imputer_num.fit_transform(X[num_features])

# Impute categorical features with mode
imputer_cat = SimpleImputer(strategy='most_frequent')
X[cat_features] = imputer_cat.fit_transform(X[cat_features])

In [7]:
# Encode categorical variables
X = pd.get_dummies(X, drop_first=True)

In [8]:
# Feature Scaling (optional for tree-based models but good for logistic regression)
scaler = StandardScaler()
X[num_features] = scaler.fit_transform(X[num_features])

In [9]:
# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# 4. Model Training
# Define individual classifiers
log_reg = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)
dt_clf = DecisionTreeClassifier(random_state=42)

# Ensemble Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf_clf), ('gb', gb_clf), ('dt', dt_clf)],
    voting='soft'
)

# Fit ensemble model
voting_clf.fit(X_train, y_train)

In [11]:
# 5. Model Evaluation
y_pred = voting_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.65      0.77        26
           1       0.89      0.99      0.94        74

    accuracy                           0.90       100
   macro avg       0.92      0.82      0.85       100
weighted avg       0.90      0.90      0.89       100


Confusion Matrix:
 [[17  9]
 [ 1 73]]


In [12]:
# 6. Feature Importance (for tree-based models)
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_clf.fit(X_train, y_train).feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:\n", feature_importances)


Feature Importances:
                Feature  Importance
4       Credit_History    0.281184
0      ApplicantIncome    0.097543
2           LoanAmount    0.081822
1    CoapplicantIncome    0.069134
3     Loan_Amount_Term    0.025523
..                 ...         ...
8       Loan_ID_LP1004    0.000000
501     Loan_ID_LP1497    0.000000
491     Loan_ID_LP1487    0.000000
16      Loan_ID_LP1012    0.000000
496     Loan_ID_LP1492    0.000000

[513 rows x 2 columns]


In [13]:
# 7. Save Model for future use
import joblib
joblib.dump(voting_clf, 'loan_approval_model.pkl')
print("\nModel saved as 'loan_approval_model.pkl'")


Model saved as 'loan_approval_model.pkl'
