In [1]:
# Import required libraries
import numpy as np
from numpy.random import logseries
import random
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import time
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [2]:
data = pd.read_csv('.\loan_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [4]:
#Handle missing values
missing_values = data.isnull().sum()
print(missing_values)
print('Missing Values:', missing_values.sum())

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64
Missing Values: 0


In [5]:
categorical_cols = data.select_dtypes(include='object')
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns

In [6]:
#encoding categorical variables
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [7]:
#scale features
scaler = MinMaxScaler()
columns = data.columns
data[columns] = scaler.fit_transform(data[columns])

In [8]:
print(data)

       person_age  person_gender  person_education  person_income  \
0        0.016129            0.0              1.00       0.008891   
1        0.008065            0.0              0.75       0.000595   
2        0.040323            0.0              0.75       0.000617   
3        0.024194            0.0              0.25       0.009976   
4        0.032258            1.0              1.00       0.008082   
...           ...            ...               ...            ...   
44995    0.056452            1.0              0.00       0.005557   
44996    0.137097            0.0              0.00       0.008036   
44997    0.104839            1.0              0.00       0.006804   
44998    0.072581            1.0              0.25       0.003499   
44999    0.032258            1.0              0.75       0.006063   

       person_emp_exp  person_home_ownership  loan_amnt  loan_intent  \
0               0.000               1.000000   1.000000          0.8   
1               0.000      

In [9]:
y = data['loan_status']
X = data.drop('loan_status', axis=1)


#Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train.shape, X_test.shape, y_train.shape, y_test.shape
train_samples =  X_train.shape[0]   # Number of training samples
test_samples = X_test.shape[0]      # Number of test samples
train_test_ratio = 0.8

print(f"Train samples: {train_samples}")
print(f"Test samples: {test_samples}")
print(f"Split ratio: {train_test_ratio:.1%}")

Train samples: 36000
Test samples: 9000
Split ratio: 80.0%


In [10]:
# Import classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef
import xgboost as xgb

print("✓ Classification models imported successfully")

✓ Classification models imported successfully


In [11]:
# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    # Train the model
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Get prediction probabilities for AUC score
    try:
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_pred_proba)
    except:
        auc_score = roc_auc_score(y_test, y_pred)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    print(f"\n{'='*70}")
    print(f"Model: {model_name}")
    print(f"{'='*70}")
    print(f"Training Time:  {training_time:.4f} seconds")
    print(f"Accuracy:       {accuracy:.4f}")
    print(f"AUC Score:      {auc_score:.4f}")
    print(f"Precision:      {precision:.4f}")
    print(f"Recall:         {recall:.4f}")
    print(f"F1-Score:       {f1:.4f}")
    print(f"MCC Score:      {mcc:.4f}")
    
    return {
        'Model': model_name,
        'Accuracy': accuracy,
        'AUC Score': auc_score,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'MCC Score': mcc,
        'Training Time': training_time
    }

results = []

In [12]:
# 1. Logistic Regression
print("\n" + "="*60)
print("1. LOGISTIC REGRESSION")
print("="*60)
lr_model = LogisticRegression(random_state=42, max_iter=1000)
result_lr = evaluate_model(lr_model, X_train, X_test, y_train, y_test, "Logistic Regression")
results.append(result_lr)


1. LOGISTIC REGRESSION

Model: Logistic Regression
Training Time:  0.1459 seconds
Accuracy:       0.8894
AUC Score:      0.9481
Precision:      0.7650
Recall:         0.7289
F1-Score:       0.7465
MCC Score:      0.6762


In [13]:
# 2. Decision Tree Classifier
print("\n" + "="*60)
print("2. DECISION TREE CLASSIFIER")
print("="*60)
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
result_dt = evaluate_model(dt_model, X_train, X_test, y_train, y_test, "Decision Tree Classifier")
results.append(result_dt)


2. DECISION TREE CLASSIFIER

Model: Decision Tree Classifier
Training Time:  0.2403 seconds
Accuracy:       0.9140
AUC Score:      0.9605
Precision:      0.8527
Recall:         0.7433
F1-Score:       0.7943
MCC Score:      0.7430


In [14]:
# 3. K-Nearest Neighbor Classifier
print("\n" + "="*60)
print("3. K-NEAREST NEIGHBOR CLASSIFIER")
print("="*60)
knn_model = KNeighborsClassifier(n_neighbors=5)
result_knn = evaluate_model(knn_model, X_train, X_test, y_train, y_test, "K-Nearest Neighbor Classifier")
results.append(result_knn)


3. K-NEAREST NEIGHBOR CLASSIFIER

Model: K-Nearest Neighbor Classifier
Training Time:  0.1825 seconds
Accuracy:       0.8900
AUC Score:      0.9185
Precision:      0.7849
Recall:         0.6990
F1-Score:       0.7395
MCC Score:      0.6719


In [15]:
# 4. Naive Bayes Classifier (Gaussian)
print("\n" + "="*60)
print("4. NAIVE BAYES CLASSIFIER (GAUSSIAN)")
print("="*60)
nb_model = GaussianNB()
result_nb = evaluate_model(nb_model, X_train, X_test, y_train, y_test, "Naive Bayes Classifier (Gaussian)")
results.append(result_nb)


4. NAIVE BAYES CLASSIFIER (GAUSSIAN)

Model: Naive Bayes Classifier (Gaussian)
Training Time:  0.0358 seconds
Accuracy:       0.7369
AUC Score:      0.9360
Precision:      0.4590
Recall:         0.9965
F1-Score:       0.6285
MCC Score:      0.5490


In [16]:
# 5. Random Forest Classifier (Ensemble Model)
print("\n" + "="*60)
print("5. RANDOM FOREST CLASSIFIER (ENSEMBLE MODEL)")
print("="*60)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
result_rf = evaluate_model(rf_model, X_train, X_test, y_train, y_test, "Random Forest Classifier")
results.append(result_rf)


5. RANDOM FOREST CLASSIFIER (ENSEMBLE MODEL)



Model: Random Forest Classifier
Training Time:  1.7818 seconds
Accuracy:       0.9287
AUC Score:      0.9736
Precision:      0.8904
Recall:         0.7761
F1-Score:       0.8293
MCC Score:      0.7875


In [17]:
# 6. XGBoost Classifier (Ensemble Model)
print("\n" + "="*60)
print("6. XGBOOST CLASSIFIER (ENSEMBLE MODEL)")
print("="*60)
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbosity=0)
result_xgb = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, "XGBoost Classifier")
results.append(result_xgb)


6. XGBOOST CLASSIFIER (ENSEMBLE MODEL)

Model: XGBoost Classifier
Training Time:  3.4591 seconds
Accuracy:       0.9339
AUC Score:      0.9777
Precision:      0.8881
Recall:         0.8055
F1-Score:       0.8448
MCC Score:      0.8044


In [18]:
# Summary Comparison of All Models
print("\n" + "="*80)
print("SUMMARY: MODEL COMPARISON")
print("="*80)

results_df = pd.DataFrame(results)
print("\n", results_df.to_string(index=False))

# Find best model
best_accuracy_idx = results_df['Accuracy'].idxmax()
print(f"\n✓ Best Model (by Accuracy): {results_df.iloc[best_accuracy_idx]['Model']}")
print(f"  Accuracy: {results_df.iloc[best_accuracy_idx]['Accuracy']:.4f}")


SUMMARY: MODEL COMPARISON

                             Model  Accuracy  AUC Score  Precision   Recall  F1-Score  MCC Score  Training Time
              Logistic Regression  0.889444   0.948141   0.765013 0.728856  0.746497   0.676181       0.145922
         Decision Tree Classifier  0.914000   0.960497   0.852740 0.743284  0.794258   0.743011       0.240276
    K-Nearest Neighbor Classifier  0.890000   0.918470   0.784916 0.699005  0.739474   0.671860       0.182453
Naive Bayes Classifier (Gaussian)  0.736889   0.935957   0.458983 0.996517  0.628491   0.548962       0.035825
         Random Forest Classifier  0.928667   0.973583   0.890411 0.776119  0.829346   0.787481       1.781817
               XGBoost Classifier  0.933889   0.977740   0.888097 0.805473  0.844769   0.804442       3.459116

✓ Best Model (by Accuracy): XGBoost Classifier
  Accuracy: 0.9339
