In [57]:
import pandas as pd
import numpy as np
import joblib
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [59]:
data = pd.read_csv('loan.csv')
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [61]:
data['Loan_Status'] = data['Loan_Status'].map({'Y': 1, 'N': 0})
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [63]:
X = data.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = data['Loan_Status']

In [65]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 
                   'Self_Employed', 'Property_Area']
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 
                 'Loan_Amount_Term', 'Credit_History']

In [67]:
le = LabelEncoder()

In [69]:
label_mappings = {}
for col in categorical_cols:
    X[col] = X[col].fillna("Unknown")
    
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])  # Encode labels
    
    label_mappings[col] = {val: idx for idx, val in enumerate(le.classes_)}

In [71]:
with open("label_mappings.json", "w") as f:
    json.dump(label_mappings, f)

In [73]:
imputer = SimpleImputer(strategy='median')
X[numerical_cols] = imputer.fit_transform(X[numerical_cols])

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [79]:
y_pred = rf_model.predict(X_test)

In [81]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.76

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.42      0.55        43
           1       0.75      0.95      0.84        80

    accuracy                           0.76       123
   macro avg       0.79      0.68      0.70       123
weighted avg       0.78      0.76      0.74       123



In [83]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))


Feature Importance:
              feature  importance
9      Credit_History    0.256131
5     ApplicantIncome    0.199737
7          LoanAmount    0.177559
6   CoapplicantIncome    0.103766
2          Dependents    0.053986
10      Property_Area    0.050436
8    Loan_Amount_Term    0.050282
0              Gender    0.030824
4       Self_Employed    0.030715
1             Married    0.024233
3           Education    0.022330


In [85]:
def predict_loan_approval(new_data):
    if not isinstance(new_data, pd.DataFrame):
        new_data = pd.DataFrame([new_data])
    
    for col in categorical_cols:
        if col in new_data.columns:
            new_data[col] = new_data[col].fillna('Unknown')
            new_data[col] = le.fit_transform(new_data[col])
    
    new_data[numerical_cols] = imputer.transform(new_data[numerical_cols])
    
    prediction = rf_model.predict(new_data)
    probability = rf_model.predict_proba(new_data)
    
    return {
        'prediction': 'Approved ✅' if prediction[0] == 1 else 'Not Approved ❌',
        'probability': probability[0][1]  
    }

In [87]:
new_applicant = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': '0',
    'Education': 'Graduate',
    'Self_Employed': 'No',
    'ApplicantIncome': 5000,
    'CoapplicantIncome': 2000,
    'LoanAmount': 150,
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
    'Property_Area': 'Urban'
}

In [89]:
result = predict_loan_approval(new_applicant)
print("\nNew Applicant Prediction:")
print(f"Loan Status: {result['prediction']}")
print(f"Approval Probability: {result['probability']:.2f}")


New Applicant Prediction:
Loan Status: Approved ✅
Approval Probability: 0.79
