### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)

### Loading the Clean Dataset

In [2]:
# Read the cleaned loan data from the current working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up holding a mix of types.
# NOTE(review): the saved cell output echoes this line the way Python warnings
# do, which suggests a DtypeWarning was raised by the original call - confirm.
loan_df_clean = pd.read_csv("loan_df_clean.csv", low_memory=False)

  loan_df_clean = pd.read_csv("loan_df_clean.csv")


### Defining Target and Predictors

In [3]:
# Response variable: the loan_default column of the cleaned frame
y = loan_df_clean.loc[:, "loan_default"]

# Predictor names, one per line (approval-time only)
selected_features = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "purpose",
    "initial_list_status",
    "annual_inc",
    "verification_status",
    "emp_length",
    "home_ownership",
    "dti",
    "credit_history_years",
    "open_acc",
    "total_acc",
    "delinq_2yrs",
    "pub_rec",
    "inq_last_6mths",
    "revol_util",
    "bc_util",
    "percent_bc_gt_75",
    "acc_now_delinq",
    "mort_acc",
    "num_accts_ever_120_pd",
    "num_tl_30dpd",
    "num_tl_90g_dpd_24m",
]

# Independent copy of the predictor columns, in the order listed above
X = loan_df_clean.loc[:, selected_features].copy()

### Train-Test Split

In [4]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both partitions, and the fixed seed makes the
# split repeatable.
split_options = {"test_size": 0.25, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Identifying Numeric & Categorical Columns

In [5]:
# Partition the predictors into two complementary groups by dtype.
# Matching only int64/float64/object would leave a column of any other dtype
# (int32, bool, category, a dedicated string dtype, ...) in neither group, and
# a column that is in neither group is never handed to a preprocessing
# pipeline. Selecting "number" and its complement guarantees that every
# predictor is assigned to exactly one group.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

### Preprocessing Pipelines (Imputation + Encoding + Scaling)

In [6]:
# Numeric pipeline: fill missing values with the column median, then standardize

numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

In [7]:
# Categorical pipeline: fill missing values with the most frequent level,
# then one-hot encode with the first level of each feature dropped
# NOTE(review): with drop="first" and handle_unknown="ignore", a level not seen
# during fit is presumably encoded as all zeros, i.e. the same as the dropped
# first level - confirm this is acceptable.

categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [8]:
# Combine: route each column group through its own pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

### Logistic Regression Model

In [9]:
# End-to-end estimator: preprocessing followed by a logistic regression
# that uses balanced class weights and allows up to 1000 solver iterations
logistic_classifier = LogisticRegression(class_weight="balanced", max_iter=1000)

model = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", logistic_classifier),
    ]
)

### Training the Model

In [10]:
# Fit the preprocessing steps and the classifier on the training partition only
model.fit(X=X_train, y=y_train)

### Evaluating the Model

In [11]:
# Score the held-out partition: probability of the second class, then hard labels
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Compute every metric before printing so the report section reads top to bottom
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_prob)

print("Confusion Matrix:")
print(test_confusion)

print("\nClassification Report:")
print(test_report)

print("\nROC AUC", test_auc)

Confusion Matrix:
[[318958 174728]
 [ 23208  47856]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.65      0.76    493686
           1       0.22      0.67      0.33     71064

    accuracy                           0.65    564750
   macro avg       0.57      0.66      0.54    564750
weighted avg       0.84      0.65      0.71    564750


ROC AUC 0.7191325445089725


### Interpreting Model Coefficients

In [12]:
# Pull the fitted steps out of the pipeline by name
fitted_preprocessor = model.named_steps["preprocessing"]
fitted_classifier = model.named_steps["classifier"]

# Output column names of the preprocessor and the matching coefficient row
feature_names = fitted_preprocessor.get_feature_names_out()
coefficients = fitted_classifier.coef_[0]

# One row per encoded feature, largest (most positive) coefficient first.
# Numeric inputs were standardized and each categorical feature had its first
# level dropped, so coefficients are per standard deviation or relative to
# that dropped level.
# NOTE(review): both grade and sub_grade are predictors, and the sub_grade
# labels appear to embed the grade letter, so the two encodings likely
# overlap - confirm before reading individual coefficients as effects.
coefficient_table = pd.DataFrame(
    {"feature": feature_names, "coefficient": coefficients}
)
feature_importance = coefficient_table.sort_values(
    by="coefficient", ascending=False
)

feature_importance.head(15)

Unnamed: 0,feature,coefficient
26,cat__grade_G,6.639078
25,cat__grade_F,6.10719
24,cat__grade_E,5.117782
23,cat__grade_D,4.164688
22,cat__grade_C,3.147807
21,cat__grade_B,2.079572
55,cat__sub_grade_F5,1.58986
30,cat__sub_grade_A5,1.560674
50,cat__sub_grade_E5,1.452209
54,cat__sub_grade_F4,1.445131
