# Advanced Loan Creditworthiness Prediction

This notebook upgrades the baseline into a leakage-safe, reproducible, portfolio-grade ML workflow:
- Proper preprocessing with `ColumnTransformer` + `Pipeline`
- One-Hot encoding for categorical variables
- Robust imputation
- Stratified splitting + cross-validation
- Multiple evaluation metrics (ROC-AUC, F1, etc.)
- Hyperparameter tuning
- Threshold tuning
- Model export


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_validate, cross_val_predict, RandomizedSearchCV
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    accuracy_score, precision_score, recall_score, f1_score,
    brier_score_loss
)
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
import joblib


## Load data

In [None]:
# Read the raw loan table from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='loan.csv')
df.head(n=5)

## Basic checks

In [None]:
# Sanity snapshot: (rows, cols), missing values per column, and target class counts.
(
    df.shape,
    df.isnull().sum(),
    df['Loan_Status'].value_counts(),
)

## Feature engineering (simple but effective)
We add:
- `TotalIncome = ApplicantIncome + CoapplicantIncome`
- `LoanAmount_to_Income = LoanAmount / TotalIncome`


In [None]:
# Build the derived columns on a fresh frame (assign returns a new object).
# - Missing incomes count as 0 when summing applicant + co-applicant income.
# - A zero total income is replaced by NaN so the ratio becomes NaN rather than inf.
df = df.assign(
    TotalIncome=lambda d: d['ApplicantIncome'].fillna(0) + d['CoapplicantIncome'].fillna(0),
    LoanAmount_to_Income=lambda d: d['LoanAmount'] / d['TotalIncome'].replace(0, np.nan),
)
df.head()

## Split X/y and train/test
Key upgrade: **stratified split** so the class ratio is preserved.

In [None]:
TARGET = 'Loan_Status'
ID_COL = 'Loan_ID'

# Features: everything except the label and the row identifier.
X = df.drop(columns=[TARGET, ID_COL])
# Binary target: approved ('Y') -> 1, rejected ('N') -> 0.
y = df[TARGET].map({'Y': 1, 'N': 0})

# .map() silently converts missing or unexpected labels to NaN; fail fast here
# with a clear message instead of a confusing error later in split/fit.
unmapped = y.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a {TARGET} value other than 'Y'/'N': "
        f"{df.loc[unmapped, TARGET].unique().tolist()!r}"
    )

# Stratify on y so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape

## Preprocessing pipeline
Leakage-safe preprocessing with `ColumnTransformer`:
- Numeric: median imputation + scaling
- Categorical: most-frequent imputation + one-hot encoding


In [None]:
# Route columns by whether they are numeric rather than by `dtype == 'object'`:
# the object check sends `category` and dedicated string dtypes down the numeric
# branch, where median imputation / scaling cannot handle them.
numeric_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
_numeric_set = set(numeric_cols)
categorical_cols = [c for c in X_train.columns if c not in _numeric_set]

# Numeric branch: fill gaps with the median, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories that were
# not present when the encoder was fitted.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# verbose_feature_names_out=False keeps output names free of the 'num__'/'cat__' prefixes.
preprocess = ColumnTransformer([
    ('num', numeric_pipe, numeric_cols),
    ('cat', categorical_pipe, categorical_cols),
], verbose_feature_names_out=False)

preprocess

## Model benchmarking (cross-validation)
We compare several scikit-learn models using 5-fold Stratified CV.


In [None]:
# Shared CV splitter and the metrics collected for every candidate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {metric: metric for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')}

# Candidate estimators; each is wrapped with the same preprocessing below.
models = {
    'logreg_balanced': LogisticRegression(max_iter=20000, class_weight='balanced'),
    'random_forest': RandomForestClassifier(
        n_estimators=600,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1,
    ),
    'grad_boosting': GradientBoostingClassifier(random_state=42),
    'hist_gbdt': HistGradientBoostingClassifier(random_state=42),
}

rows = []
for model_name, estimator in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on each CV fold.
    candidate = Pipeline(steps=[('preprocess', preprocess), ('model', estimator)])
    fold_scores = cross_validate(candidate, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    # Keep only the validation-fold metrics, averaged across folds.
    mean_scores = {
        key.replace('test_', ''): float(np.mean(values))
        for key, values in fold_scores.items()
        if key.startswith('test_')
    }
    rows.append({'model': model_name, **mean_scores})

# One row per model, best ROC-AUC first.
bench = (
    pd.DataFrame(rows)
    .sort_values('roc_auc', ascending=False)
    .reset_index(drop=True)
)
bench

## Hyperparameter tuning (Logistic Regression)
Logistic Regression is a strong and explainable baseline. We tune `C` for ROC-AUC.


In [None]:
# Full pipeline under tuning: shared preprocessing + class-weighted logistic regression.
pipe = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(max_iter=20000, class_weight='balanced')),
])

# 80 log-spaced candidates for the inverse regularization strength C (1e-3 .. 1e3);
# the randomized search evaluates 25 of them.
param_dist = {'model__C': np.logspace(-3, 3, 80)}
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=25,
    scoring='roc_auc',
    cv=cv,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

search.best_params_, search.best_score_

## Evaluate best model on holdout set

In [None]:
# RandomizedSearchCV was created with its default refit=True, so best_estimator_
# has already been refit on all of X_train with the best C; fitting it again on
# the same data would only repeat that work.
best_model = search.best_estimator_

# Positive-class probabilities on the untouched holdout set, cut at the default 0.5.
proba = best_model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print('ROC-AUC:', roc_auc_score(y_test, proba))
print('Accuracy:', accuracy_score(y_test, pred))
print('Precision:', precision_score(y_test, pred, zero_division=0))
print('Recall:', recall_score(y_test, pred, zero_division=0))
print('F1:', f1_score(y_test, pred, zero_division=0))

print('\nConfusion matrix:\n', confusion_matrix(y_test, pred))
print('\nClassification report:\n', classification_report(y_test, pred, digits=4))

## Threshold tuning
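A fixed 0.5 threshold is rarely the best choice when classes are imbalanced or when false positives and false negatives have different costs. Tune the threshold for the metric that matches your objective.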


In [None]:
def best_threshold(y_true, proba, metric='f1', thresholds=None):
    """Grid-search the probability cut-off that maximizes a classification metric.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    proba : array-like of positive-class probabilities, aligned with ``y_true``.
    metric : {'f1', 'recall', 'precision'}, default 'f1'
        Metric to maximize.
    thresholds : iterable of float, optional
        Candidate cut-offs. Defaults to 91 evenly spaced values in [0.05, 0.95]
        (a step of 0.01).

    Returns
    -------
    (best_t, best_s) : tuple of float
        The winning threshold and its score. On ties the first (lowest-index)
        candidate wins because only strict improvements replace the incumbent.
        If ``thresholds`` is empty, ``(0.5, -1.0)`` is returned.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # Validate before doing any work so a typo fails immediately.
    if metric not in ('f1', 'recall', 'precision'):
        raise ValueError('metric must be f1/recall/precision')
    scorer = {'f1': f1_score, 'recall': recall_score, 'precision': precision_score}[metric]

    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 91)
    # Accept plain lists as well as arrays/Series for the comparison below.
    proba = np.asarray(proba)

    best_t, best_s = 0.5, -1.0
    for t in thresholds:
        # zero_division=0 keeps degenerate cut-offs (no predicted positives) quiet.
        s = scorer(y_true, (proba >= t).astype(int), zero_division=0)
        if s > best_s:
            best_t, best_s = float(t), float(s)
    return best_t, best_s

# Pick the threshold from out-of-fold predictions on the TRAINING data.
# Tuning it on (X_test, y_test) and then scoring on the same rows would leak the
# holdout labels into model selection and make the tuned metrics optimistic.
oof_proba = cross_val_predict(
    best_model, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]

# s is the out-of-fold F1 at the chosen threshold t, not a holdout score.
t, s = best_threshold(y_train.to_numpy(), oof_proba, metric='f1')
t, s

In [None]:
# Re-score the holdout predictions using the tuned cut-off `t` instead of 0.5.
pred_tuned = (proba >= t).astype(int)

# ROC-AUC is computed from the raw probabilities, so the threshold does not affect it.
summary = [
    ('Threshold:', t),
    ('Accuracy:', accuracy_score(y_test, pred_tuned)),
    ('Precision:', precision_score(y_test, pred_tuned, zero_division=0)),
    ('Recall:', recall_score(y_test, pred_tuned, zero_division=0)),
    ('F1:', f1_score(y_test, pred_tuned, zero_division=0)),
    ('ROC-AUC:', roc_auc_score(y_test, proba)),
]
for label, value in summary:
    print(label, value)
print('\nConfusion matrix (tuned):\n', confusion_matrix(y_test, pred_tuned))

## Optional: Probability calibration
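If the predicted probabilities themselves drive business decisions (rather than only the yes/no prediction), they should be calibrated so that a predicted 0.8 corresponds to roughly an 80% observed approval rate.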


In [None]:
# Sigmoid (Platt) calibration fitted with 3-fold CV on the training data.
cal = CalibratedClassifierCV(best_model, method='sigmoid', cv=3)
cal.fit(X_train, y_train)
proba_cal = cal.predict_proba(X_test)[:, 1]
# Recomputed here so the comparison does not rely on a variable from another cell.
proba_uncal = best_model.predict_proba(X_test)[:, 1]

# ROC-AUC only measures ranking quality, so it says little about calibration.
# The Brier score (mean squared error of the probabilities, lower is better)
# shows whether calibration actually helped.
print('ROC-AUC (calibrated):', roc_auc_score(y_test, proba_cal))
print('Brier (uncalibrated):', brier_score_loss(y_test, proba_uncal))
print('Brier (calibrated):', brier_score_loss(y_test, proba_cal))

## Permutation importance (Top 15)
A model-agnostic explainability method: it measures how much the score drops when a single feature's values are randomly shuffled.


In [None]:
# best_model is already fitted on X_train (refit by the search), so it is used
# as-is; another fit on the same data would be redundant.
pre = best_model.named_steps['preprocess']
mdl = best_model.named_steps['model']

# Importance is computed on the TRANSFORMED holdout features, so each one-hot
# column is permuted and scored separately.
X_test_tr = pre.transform(X_test)
feature_names = pre.get_feature_names_out()

# No `scoring` is passed, so the model's default scorer is used.
r = permutation_importance(mdl, X_test_tr, y_test, n_repeats=10, random_state=42, n_jobs=-1)
imp = pd.DataFrame({
    'feature': feature_names,
    'importance_mean': r.importances_mean,
    # Spread across the 10 repeats, to judge whether a mean is distinguishable from noise.
    'importance_std': r.importances_std,
})
imp = imp.sort_values('importance_mean', ascending=False).head(15)
imp

## Export model
Export the trained pipeline for later inference (API, batch scoring, etc.).


In [None]:
# Persist the whole fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(value=best_model, filename='loan_model.joblib')
print('Saved: loan_model.joblib')