In [None]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Notebook-wide settings: silence all warnings so cell outputs stay readable,
# and let pandas show up to 200 columns when displaying wide frames.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 200

---
Load All Datasets

In [None]:
# Read the seven raw CSV tables from the working directory. The target names
# on the left line up one-to-one with the file names on the right.
(
    application_train,
    bureau,
    bureau_balance,
    pos_cash,
    credit_card,
    previous_application,
    installments_payments,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "application_train.csv",
        "bureau.csv",
        "bureau_balance.csv",
        "POS_CASH_balance.csv",
        "credit_card_balance.csv",
        "previous_application.csv",
        "installments_payments.csv",
    )
)

---
Quick Data Overview

In [None]:
# Report (rows, columns) for every loaded table, then preview the main one.
for _label, _frame in [
    ("application_train", application_train),
    ("bureau", bureau),
    ("bureau_balance", bureau_balance),
    ("pos_cash", pos_cash),
    ("credit_card", credit_card),
    ("previous_application", previous_application),
    ("installments_payments", installments_payments),
]:
    print(f"{_label} shape:", _frame.shape)

application_train.head()

---
Understand Target and Class Balance

In [None]:
# Class balance: fraction of rows for each TARGET value.
application_train.loc[:, "TARGET"].value_counts(normalize=True)

In [None]:
# Bar chart of raw row counts per TARGET class.
ax = sns.countplot(x="TARGET", data=application_train)
ax.set_title("Target Distribution (0: Non-Defaulter, 1: Defaulter)")
plt.show()

---
Remove Columns with Extreme Missingness (more than 60% missing)

In [None]:
# Fraction of missing values per column, most incomplete columns first.
missing = application_train.isna().mean().sort_values(ascending=False)
missing.iloc[:20]

In [None]:
# Drop every column whose missing fraction is above 0.6 and report the effect.
cols_to_drop = missing.loc[missing.gt(0.6)].index
application_train_reduced = application_train.drop(cols_to_drop, axis=1)
print("Dropped columns:", len(cols_to_drop))
print("New shape:", application_train_reduced.shape)

In [None]:
# Summary statistics for a handful of key numeric application columns.
numeric_cols_example = ["AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "DAYS_BIRTH"]
application_train_reduced.loc[:, numeric_cols_example].describe()

In [None]:
# Three side-by-side histograms (with KDE overlay), each split by TARGET.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, column, panel_title in [
    (axes[0], "AMT_INCOME_TOTAL", "Income vs Target"),
    (axes[1], "AMT_CREDIT", "Credit Amount vs Target"),
]:
    sns.histplot(data=application_train_reduced, x=column, hue="TARGET", kde=True, ax=panel)
    panel.set_title(panel_title)

# DAYS_BIRTH is negated and divided by 365 to plot age in positive years.
age_years = -application_train_reduced["DAYS_BIRTH"] / 365
sns.histplot(data=application_train_reduced, x=age_years, hue="TARGET", kde=True, ax=axes[2])
axes[2].set_title("Age (years) vs Target")
# The derived series keeps the name "DAYS_BIRTH", which seaborn would use as
# the axis label even though the values are years; label the axis explicitly.
axes[2].set_xlabel("Age (years)")
plt.tight_layout()
plt.show()

These plots help understand how income, loan size, and age differ between defaulters and non-defaulters.

---
Correlation analysis 

In [None]:
# Work on a copy so the reduced application frame stays untouched.
app = application_train_reduced.copy()

# Turn age and employment into positive years
app["AGE_YEARS"] = -app["DAYS_BIRTH"] / 365
if "DAYS_EMPLOYED" in app.columns:
    # NOTE(review): DAYS_EMPLOYED may contain placeholder values that turn into
    # large negative "years" here -- confirm against the data dictionary.
    app["EMPLOYED_YEARS"] = -app["DAYS_EMPLOYED"] / 365

# Select numeric columns
num_cols = app.select_dtypes(exclude=["object"]).columns

# Compute correlation of every numeric column with TARGET. TARGET itself is
# removed from the result: its self-correlation of 1.0 would otherwise always
# occupy the first slot of the "top positively correlated" list.
corr_with_target = (
    app[num_cols]
    .corr()["TARGET"]
    .drop("TARGET")
    .sort_values(ascending=False)
)

# Show top 15 positively and negatively correlated features
print("Top 15 positively correlated with TARGET (more likely to default):")
print(corr_with_target.head(15))

print("\nTop 15 negatively correlated with TARGET (less likely to default):")
print(corr_with_target.tail(15))

In [None]:
# Plot top 15 absolute correlations with TARGET.
# TARGET's own entry (correlation 1.0 with itself) is dropped if present so the
# chart only ranks actual features; errors="ignore" keeps this cell working
# whether or not the upstream series still contains that entry.
top_corr = (
    corr_with_target
    .drop(labels="TARGET", errors="ignore")
    .abs()
    .sort_values(ascending=False)
    .head(15)
)

plt.figure(figsize=(6, 8))
ax = sns.barplot(x=top_corr.values, y=top_corr.index)
ax.set_title("Features most correlated with TARGET")
ax.set_xlabel("Absolute correlation")
plt.tight_layout()
plt.show()

---
Compare distributions of key features (defaulters vs non-defaulters)

In [None]:
# One density plot per feature, with a separate (independently normalised)
# curve for each TARGET class.
features_to_plot = ["AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "AGE_YEARS"]

for col in features_to_plot:
    plt.figure(figsize=(6, 4))
    ax = sns.kdeplot(x=col, hue="TARGET", data=app, common_norm=False)
    ax.set_title(f"{col} distribution by TARGET")
    plt.tight_layout()
    plt.show()

In [None]:
# Income density for rows with TARGET == 0 only.
sns.kdeplot(x="AMT_INCOME_TOTAL", data=app.loc[app["TARGET"].eq(0)], common_norm=False)

In [None]:
# Income density for rows with TARGET == 1 only.
sns.kdeplot(x="AMT_INCOME_TOTAL", data=app.loc[app["TARGET"].eq(1)], common_norm=False)

---
Categorical feature analysis (bar plots and default rates)

In [None]:
# For each categorical feature still present in `app`: plot class counts per
# category, then print the mean TARGET (default rate) per category.
cat_features = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "ORGANIZATION_TYPE"]
for col in cat_features:
    if col in app.columns:
        plt.figure(figsize=(8, 4))
        ax = sns.countplot(x=col, hue="TARGET", data=app)
        plt.xticks(rotation=45, ha="right")
        ax.set_title(f"{col} count by TARGET")
        plt.tight_layout()
        plt.show()

        # Default rate per category, highest first
        default_rate = app.groupby(col)["TARGET"].mean()
        default_rate = default_rate.sort_values(ascending=False)
        print(f"\nDefault rate by {col}:")
        print(default_rate)

---
Income vs credit and simple risk segmentation

In [None]:
# Scatter of income against credit amount, coloured by TARGET.
# Plot at most 20000 rows for speed; capping at len(app) avoids the ValueError
# that DataFrame.sample raises when asked for more rows than exist.
scatter_sample = app.sample(n=min(20000, len(app)), random_state=42)

plt.figure(figsize=(6, 5))
ax = sns.scatterplot(
    data=scatter_sample,
    x="AMT_INCOME_TOTAL",
    y="AMT_CREDIT",
    hue="TARGET",
    alpha=0.4
)
ax.set_title("Income vs Credit Amount colored by TARGET")
plt.tight_layout()
# Clip the x-axis at 3,000,000 so extreme incomes do not squash the plot.
ax.set_xlim(0, 0.3 * 10**7)
plt.show()

In [None]:
# Split income into (up to) five quantile bands; duplicate bin edges are dropped.
app["INCOME_BAND"] = pd.qcut(app["AMT_INCOME_TOTAL"], q=5, duplicates="drop")

# Mean TARGET per band = default rate per income band.
income_default = app["TARGET"].groupby(app["INCOME_BAND"]).mean()
print("Default rate by income band:")
print(income_default)

---
Active vs closed bureau credits and default

In [None]:
# Count bureau credits per client and CREDIT_ACTIVE value (one column per value).
bureau_status = (
    bureau
    .groupby(["SK_ID_CURR", "CREDIT_ACTIVE"])
    .size()
    .unstack(fill_value=0)
)
bureau_status.columns = ["BUREAU_{}_COUNT".format(c) for c in bureau_status.columns]
bureau_status = bureau_status.reset_index()

# Attach the counts to each application row.
app_bureau = application_train[["SK_ID_CURR", "TARGET"]].merge(
    bureau_status, how="left", on="SK_ID_CURR"
)

# Clients without any bureau record get NaN from the left join -> treat as 0.
bureau_count_cols = [c for c in app_bureau.columns if "BUREAU_" in c]
app_bureau = app_bureau.fillna({c: 0 for c in bureau_count_cols})

# Default rate by whether the client has at least one active bureau credit.
app_bureau["HAS_ACTIVE_CREDIT"] = app_bureau["BUREAU_Active_COUNT"].gt(0).astype(int)

print(app_bureau.groupby("HAS_ACTIVE_CREDIT")["TARGET"].mean())

---
Number of previous bureau credits per client vs default

In [None]:
# Number of distinct bureau credits recorded for each client.
credits_per_client = (
    bureau
    .groupby("SK_ID_CURR")["SK_ID_BUREAU"]
    .nunique()
    .rename("N_PREV_CREDITS")
    .reset_index()
)

# Left join onto the applications; clients with no bureau rows count as 0.
app_bureau2 = application_train[["SK_ID_CURR", "TARGET"]].merge(
    credits_per_client, how="left", on="SK_ID_CURR"
)
app_bureau2["N_PREV_CREDITS"] = app_bureau2["N_PREV_CREDITS"].fillna(0)

plt.figure(figsize=(6, 4))
ax = sns.boxplot(x="TARGET", y="N_PREV_CREDITS", data=app_bureau2)
ax.set_title("Number of previous bureau credits vs TARGET")
plt.tight_layout()
plt.show()

---
EDA on previous_application

Status of previous applications vs current default

In [None]:
# Count previous applications per client and contract status (one column per status).
prev_status = (
    previous_application
    .groupby(["SK_ID_CURR", "NAME_CONTRACT_STATUS"])
    .size()
    .unstack(fill_value=0)
)
prev_status.columns = ["PREV_STATUS_{}_COUNT".format(c) for c in prev_status.columns]
prev_status = prev_status.reset_index()

app_prev = application_train[["SK_ID_CURR", "TARGET"]].merge(
    prev_status, how="left", on="SK_ID_CURR"
)

# Clients with no previous application get NaN from the left join -> treat as 0.
status_cols = [c for c in app_prev.columns if "PREV_STATUS_" in c]
app_prev = app_prev.fillna({c: 0 for c in status_cols})

# For each status: default rate among clients who had it at least once
# (NaN when no client ever had that status).
default_rates = {}
for col in status_cols:
    ever_had = app_prev[col] > 0
    if ever_had.any():
        default_rates[col] = app_prev.loc[ever_had, "TARGET"].mean()
    else:
        default_rates[col] = np.nan

print("Default rate if client ever had that previous status:")
print(pd.Series(default_rates).sort_values(ascending=False))

---
Ratio of previous credit to current income

In [None]:
# Mean credit amount over each client's previous applications.
prev_agg2 = (
    previous_application
    .groupby("SK_ID_CURR")["AMT_CREDIT"]
    .mean()
    .rename("PREV_MEAN_AMT_CREDIT")
    .reset_index()
)

app_prev2 = application_train[["SK_ID_CURR", "TARGET", "AMT_INCOME_TOTAL"]].merge(
    prev_agg2, how="left", on="SK_ID_CURR"
)

# Ratio of mean previous credit to the income stated on the current application.
app_prev2["PREV_CREDIT_TO_INCOME"] = app_prev2["PREV_MEAN_AMT_CREDIT"].div(
    app_prev2["AMT_INCOME_TOTAL"]
)

plt.figure(figsize=(6, 4))
ax = sns.boxplot(x="TARGET", y="PREV_CREDIT_TO_INCOME", data=app_prev2)
# Cut the y-axis at the 95th percentile so outliers do not flatten the boxes.
ax.set_ylim(0, app_prev2["PREV_CREDIT_TO_INCOME"].quantile(0.95))
ax.set_title("Previous credit / income vs TARGET")
plt.tight_layout()
plt.show()

---
EDA on installments_payments: late payments

In [None]:
# Derive per-instalment lateness columns on a fresh frame (the raw table is
# left unmodified):
#   DAYS_PAYMENT_DELAY - payment entry day minus instalment day
#   AMT_PAYMENT_DIFF   - amount paid minus amount due
#   IS_LATE            - 1 when the payment day is after the instalment day
inst = installments_payments.assign(
    DAYS_PAYMENT_DELAY=lambda d: d["DAYS_ENTRY_PAYMENT"].sub(d["DAYS_INSTALMENT"]),
    AMT_PAYMENT_DIFF=lambda d: d["AMT_PAYMENT"].sub(d["AMT_INSTALMENT"]),
    IS_LATE=lambda d: d["DAYS_PAYMENT_DELAY"].gt(0).astype(int),
)

# Aggregate at client level
inst_eda = (
    inst
    .groupby("SK_ID_CURR")
    .agg(
        LATE_PAYMENT_RATIO=("IS_LATE", "mean"),
        MEAN_DAYS_DELAY=("DAYS_PAYMENT_DELAY", "mean"),
        MAX_DAYS_DELAY=("DAYS_PAYMENT_DELAY", "max"),
    )
    .reset_index()
)

app_inst = application_train[["SK_ID_CURR", "TARGET"]].merge(
    inst_eda, how="left", on="SK_ID_CURR"
)

In [None]:
# Distribution of each client's share of late instalments, per TARGET class.
plt.figure(figsize=(6, 4))
ax = sns.boxplot(x="TARGET", y="LATE_PAYMENT_RATIO", data=app_inst)
ax.set_title("Late payment ratio vs TARGET")
plt.tight_layout()
plt.show()

---
EDA on POS_CASH_balance and credit_card_balance

POS: months on book and default

In [None]:
pos = pos_cash.copy()

# Per client: earliest and latest MONTHS_BALANCE and the number of POS rows.
pos_agg2 = (
    pos
    .groupby("SK_ID_CURR")["MONTHS_BALANCE"]
    .agg(POS_MONTHS_MIN="min", POS_MONTHS_MAX="max", POS_RECORDS="size")
    .reset_index()
)

app_pos = application_train[["SK_ID_CURR", "TARGET"]].merge(
    pos_agg2, how="left", on="SK_ID_CURR"
)

plt.figure(figsize=(6, 4))
ax = sns.boxplot(x="TARGET", y="POS_RECORDS", data=app_pos)
# Cut the y-axis at the 95th percentile so outliers do not flatten the boxes.
ax.set_ylim(0, app_pos["POS_RECORDS"].quantile(0.95))
ax.set_title("Number of POS records vs TARGET")
plt.tight_layout()
plt.show()

---
Credit card balance: mean utilization

In [None]:
# Balance as a fraction of the credit limit; a limit of 0 is turned into NaN
# first so the division yields NaN rather than an infinite ratio.
cc = credit_card.assign(
    LIMIT_USED_RATIO=lambda d: d["AMT_BALANCE"] / d["AMT_CREDIT_LIMIT_ACTUAL"].replace(0, np.nan)
)

# Mean utilisation per client.
cc_eda = (
    cc
    .groupby("SK_ID_CURR")
    .agg(CC_MEAN_LIMIT_USED=("LIMIT_USED_RATIO", "mean"))
    .reset_index()
)

app_cc = application_train[["SK_ID_CURR", "TARGET"]].merge(
    cc_eda, how="left", on="SK_ID_CURR"
)

plt.figure(figsize=(6, 4))
ax = sns.boxplot(x="TARGET", y="CC_MEAN_LIMIT_USED", data=app_cc)
# Cut the y-axis at the 95th percentile so outliers do not flatten the boxes.
ax.set_ylim(0, app_cc["CC_MEAN_LIMIT_USED"].quantile(0.95))
ax.set_title("Credit card limit usage vs TARGET")
plt.tight_layout()
plt.show()

---
Feature Engineering from Secondary Tables

Bureau Aggregations

In [None]:
# Per-client (SK_ID_CURR) summary of the bureau table: number of bureau
# credits plus mean/max of credit sum, debt, overdue amount and days overdue.
bureau_agg = bureau.groupby("SK_ID_CURR").agg({
    "SK_ID_BUREAU": ["count"],
    "AMT_CREDIT_SUM": ["mean", "max"],
    "AMT_CREDIT_SUM_DEBT": ["mean", "max"],
    "AMT_CREDIT_SUM_OVERDUE": ["mean", "max"],
    "CREDIT_DAY_OVERDUE": ["mean", "max"],
})

# Flatten the (column, statistic) pairs into names like BUREAU_AMT_CREDIT_SUM_MEAN.
bureau_agg.columns = [
    f"BUREAU_{field}_{stat}".upper() for field, stat in bureau_agg.columns
]
bureau_agg = bureau_agg.reset_index()

bureau_agg.head()

This creates features like number of previous credits, average debt and overdue amounts per client.

---
Bureau Balance Aggregations

In [None]:
# Level 1: per bureau credit, the min / max MONTHS_BALANCE and the row count.
bb_agg_level1 = (
    bureau_balance
    .groupby("SK_ID_BUREAU")["MONTHS_BALANCE"]
    .agg(["min", "max", "size"])
)
bb_agg_level1.columns = [
    "BB_MONTHS_BALANCE_" + stat.upper() for stat in bb_agg_level1.columns
]
bb_agg_level1 = bb_agg_level1.reset_index()

# Bring in SK_ID_CURR by joining onto the bureau table.
bureau_bb = bureau.merge(bb_agg_level1, how="left", on="SK_ID_BUREAU")

# Level 2: roll the per-credit values up to one row per client.
bureau_bb_agg = bureau_bb.groupby("SK_ID_CURR").agg({
    "BB_MONTHS_BALANCE_MIN": "min",
    "BB_MONTHS_BALANCE_MAX": "max",
    "BB_MONTHS_BALANCE_SIZE": "sum",
})

bureau_bb_agg.columns = [name.upper() for name in bureau_bb_agg.columns]
bureau_bb_agg = bureau_bb_agg.reset_index()

bureau_bb_agg.head()

This gives history length features of bureau credits per client.

---
POS_CASH, Credit Card, Previous Application and Installment Aggregations

In [None]:
# Per-client POS_CASH summary: distinct previous loans and the min / max /
# row count of MONTHS_BALANCE.
pos_agg = pos_cash.groupby("SK_ID_CURR").agg({
    "SK_ID_PREV": ["nunique"],
    "MONTHS_BALANCE": ["min", "max", "size"],
})
# Flatten (column, statistic) pairs into names like POS_MONTHS_BALANCE_MIN.
pos_agg.columns = [f"POS_{field}_{stat}".upper() for field, stat in pos_agg.columns]
pos_agg = pos_agg.reset_index()

pos_agg.head()

In [None]:
# Per-client credit card summary: distinct previous loans, MONTHS_BALANCE
# range and row count, balance mean / max, and mean credit limit.
cc_agg = credit_card.groupby("SK_ID_CURR").agg({
    "SK_ID_PREV": ["nunique"],
    "MONTHS_BALANCE": ["min", "max", "size"],
    "AMT_BALANCE": ["mean", "max"],
    "AMT_CREDIT_LIMIT_ACTUAL": ["mean"],
})
# Flatten (column, statistic) pairs into names like CC_AMT_BALANCE_MEAN.
cc_agg.columns = [f"CC_{field}_{stat}".upper() for field, stat in cc_agg.columns]
cc_agg = cc_agg.reset_index()

cc_agg.head()

In [None]:
# Per-client summary of previous applications: distinct applications plus
# mean / max of credit amount and goods price.
prev_agg = previous_application.groupby("SK_ID_CURR").agg({
    "SK_ID_PREV": ["nunique"],
    "AMT_CREDIT": ["mean", "max"],
    "AMT_GOODS_PRICE": ["mean", "max"],
})
# Flatten (column, statistic) pairs into names like PREV_AMT_CREDIT_MEAN.
prev_agg.columns = [f"PREV_{field}_{stat}".upper() for field, stat in prev_agg.columns]
prev_agg = prev_agg.reset_index()

prev_agg.head()

In [None]:
# Per-client instalment summary: mean and sum of paid and due amounts.
agg_dict = {
    "AMT_PAYMENT": ["mean", "sum"],
    "AMT_INSTALMENT": ["mean", "sum"],
}

# DAYS_PAST_DUE is aggregated only when the table actually has that column.
if "DAYS_PAST_DUE" in installments_payments.columns:
    agg_dict.update({"DAYS_PAST_DUE": ["mean", "max"]})

inst_agg = installments_payments.groupby("SK_ID_CURR").agg(agg_dict)

# Flatten (column, statistic) pairs into names like INST_AMT_PAYMENT_MEAN.
inst_agg.columns = [f"INST_{field}_{stat}".upper() for field, stat in inst_agg.columns]
inst_agg = inst_agg.reset_index()

inst_agg.head()

In [None]:
# Start from the reduced application table and left-join every per-client
# aggregate onto it, so applications without history keep NaN in those columns.
data = application_train_reduced.copy()

for df in (bureau_agg, bureau_bb_agg, pos_agg, cc_agg, prev_agg, inst_agg):
    data = pd.merge(data, df, how="left", on="SK_ID_CURR")

print("Final merged data shape:", data.shape)

In [None]:
# Feature matrix and label vector.
# SK_ID_CURR is only the join key used to merge the tables above; it is dropped
# together with TARGET so the models are not trained on a row identifier.
X = data.drop(columns=["TARGET", "SK_ID_CURR"])
y = data["TARGET"]

In [None]:
# Object-dtype columns are treated as categorical; every other column as numeric.
categorical_cols = list(X.select_dtypes(include=["object"]).columns)
_categorical_set = set(categorical_cols)
numeric_cols = [name for name in X.columns if name not in _categorical_set]

len(categorical_cols), len(numeric_cols)

In [None]:
# Numeric columns: fill gaps with the median, then standardise.
# Scaling matters for the regularised logistic regression that shares this
# preprocessor: without it the penalty strength depends on each feature's units.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored (encoded as all zeros).
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Apply each sub-pipeline to its own column group; output order is numeric
# features first, then the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

In [None]:
# Step 1: hold out 20% of all rows as the test set, stratified on the target.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: take 20% of the remaining rows as validation.
# 0.8 * 0.2 = 0.16 of all rows => 64/16/20 train/val/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val
)

X_train.shape, X_val.shape, X_test.shape

In [None]:
from sklearn.metrics import precision_recall_curve

def find_best_threshold(y_true, y_proba):
    """Pick the probability threshold that maximises F1 on the given data.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_proba : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    tuple
        ``(threshold, f1, precision, recall)`` where the last three values are
        all measured at the returned threshold.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_proba)

    # precision_recall_curve returns one more precision/recall value than
    # thresholds: element i belongs to thresholds[i], and the final element
    # (precision 1, recall 0) has no threshold. Drop that last point so that
    # index i refers to the same operating point in all three arrays.
    precisions = precisions[:-1]
    recalls = recalls[:-1]

    # The small epsilon avoids 0/0 when precision and recall are both zero.
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = f1_scores.argmax()
    return thresholds[best_idx], f1_scores[best_idx], precisions[best_idx], recalls[best_idx]

In [None]:
# Logistic regression: shared preprocessing followed by a class-weighted
# liblinear model.
lr_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", LogisticRegression(class_weight="balanced", solver="liblinear")),
])

# Search space: regularisation strength and penalty type.
lr_param_dist = dict(
    model__C=[0.01, 0.1, 1, 10],
    model__penalty=["l1", "l2"],
)

# Randomised search with 3-fold CV on the TRAIN split only, scored by ROC-AUC.
lr_search = RandomizedSearchCV(
    estimator=lr_pipe,
    param_distributions=lr_param_dist,
    n_iter=6,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
lr_search.fit(X_train, y_train)

print("Best LR params:", lr_search.best_params_)
print("Best CV ROC-AUC (LR, train only):", lr_search.best_score_)

best_lr = lr_search.best_estimator_

# Pick the decision threshold on the VALIDATION split.
y_val_proba_lr = best_lr.predict_proba(X_val)[:, 1]
(best_thr_lr,
 best_f1_lr,
 best_prec_lr,
 best_rec_lr) = find_best_threshold(y_val, y_val_proba_lr)

print("\n[Logistic Regression] Validation threshold selection")
print("Best threshold (val):", best_thr_lr)
print("Validation F1:", best_f1_lr)
print("Validation Precision:", best_prec_lr)
print("Validation Recall:", best_rec_lr)

# Refit the chosen configuration on TRAIN+VAL, then score TEST once using the
# threshold fixed above.
best_lr.fit(X_train_val, y_train_val)

y_test_proba_lr = best_lr.predict_proba(X_test)[:, 1]
y_test_pred_lr = (y_test_proba_lr >= best_thr_lr).astype(int)
test_auc_lr = roc_auc_score(y_test, y_test_proba_lr)

print("\n[Logistic Regression] Test performance (fixed threshold from val)")
print("Test ROC-AUC:", test_auc_lr)
print(classification_report(y_test, y_test_pred_lr))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_test_pred_lr))

In [None]:
# Random forest: shared preprocessing followed by a class-weighted forest.
rf_pipe = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(random_state=42, n_jobs=-1, class_weight="balanced")),
])

# Search space: forest size, tree depth, split / leaf sizes and feature subsampling.
rf_param_dist = dict(
    model__n_estimators=[100, 200, 400],
    model__max_depth=[5, 10, 15, None],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=["sqrt", "log2"],
)

# Randomised search with 3-fold CV on the TRAIN split only, scored by ROC-AUC.
rf_search = RandomizedSearchCV(
    estimator=rf_pipe,
    param_distributions=rf_param_dist,
    n_iter=12,
    cv=3,
    scoring="roc_auc",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rf_search.fit(X_train, y_train)

print("Best RF params:", rf_search.best_params_)
print("Best CV ROC-AUC (RF, train only):", rf_search.best_score_)

best_rf = rf_search.best_estimator_

# Pick the decision threshold on the VALIDATION split.
y_val_proba_rf = best_rf.predict_proba(X_val)[:, 1]
(best_thr_rf,
 best_f1_rf,
 best_prec_rf,
 best_rec_rf) = find_best_threshold(y_val, y_val_proba_rf)

print("\n[Random Forest] Validation threshold selection")
print("Best threshold (val):", best_thr_rf)
print("Validation F1:", best_f1_rf)
print("Validation Precision:", best_prec_rf)
print("Validation Recall:", best_rec_rf)

# Refit the chosen configuration on TRAIN+VAL, then score TEST once using the
# threshold fixed above.
best_rf.fit(X_train_val, y_train_val)

y_test_proba_rf = best_rf.predict_proba(X_test)[:, 1]
y_test_pred_rf = (y_test_proba_rf >= best_thr_rf).astype(int)
test_auc_rf = roc_auc_score(y_test, y_test_proba_rf)

print("\n[Random Forest] Test performance (fixed threshold from val)")
print("Test ROC-AUC:", test_auc_rf)
print(classification_report(y_test, y_test_pred_rf))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_test_pred_rf))

In [None]:
import time

print("=== [GB] Starting Gradient Boosting hyperparameter tuning ===")

start_time = time.time()

# Gradient boosting: shared preprocessing followed by the boosted model.
gb_pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", GradientBoostingClassifier(
        random_state=42,
        verbose=1  # progress inside each GB fit
    ))
])

gb_param_dist = {
    "model__n_estimators": [100, 200, 300],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__max_depth": [2, 3, 4],
    "model__subsample": [0.7, 0.9, 1.0],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

gb_search = RandomizedSearchCV(
    gb_pipe,
    param_distributions=gb_param_dist,
    n_iter=5,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    verbose=10,
    random_state=42
)

# Tune on a random subsample of TRAIN to keep the search affordable.
# The size is capped at len(X_train): DataFrame.sample raises a ValueError when
# asked for more rows than exist (without replacement).
# NOTE(review): the subsample is not stratified on the target.
X_train_small = X_train.sample(n=min(30000, len(X_train)), random_state=42)
y_train_small = y_train.loc[X_train_small.index]

print("=== [GB] Fitting RandomizedSearchCV on TRAIN (this will show per-fit progress) ===")
gb_search.fit(X_train_small, y_train_small)

print("=== [GB] Hyperparameter search finished in {:.1f} minutes ==="
      .format((time.time() - start_time) / 60.0))
print("Best GB params:", gb_search.best_params_)
print("Best CV ROC-AUC (GB, train only):", gb_search.best_score_)

# best_estimator_ at this point has been fitted on the subsample only.
best_gb = gb_search.best_estimator_

print("=== [GB] Computing validation probabilities and best threshold ===")
y_val_proba_gb = best_gb.predict_proba(X_val)[:, 1]
best_thr_gb, best_f1_gb, best_prec_gb, best_rec_gb = find_best_threshold(y_val, y_val_proba_gb)

print("[GB] Best threshold (val):", best_thr_gb)
print("[GB] Validation F1:", best_f1_gb)
print("[GB] Validation Precision:", best_prec_gb)
print("[GB] Validation Recall:", best_rec_gb)

# Refit the chosen configuration on the full TRAIN+VAL data before testing.
print("=== [GB] Re-fitting best model on TRAIN+VAL ===")
best_gb.fit(X_train_val, y_train_val)

print("=== [GB] Evaluating on TEST with fixed threshold ===")
y_test_proba_gb = best_gb.predict_proba(X_test)[:, 1]
test_auc_gb = roc_auc_score(y_test, y_test_proba_gb)
y_test_pred_gb = (y_test_proba_gb >= best_thr_gb).astype(int)

print("[GB] Test ROC-AUC:", test_auc_gb)
print("\n[GB] Classification report (test):")
print(classification_report(y_test, y_test_pred_gb))
print("[GB] Confusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred_gb))

print("=== [GB] Done ===")

In [None]:
# One row per model: test ROC-AUC plus the threshold chosen on validation and
# the F1 / precision / recall measured at that threshold on validation.
results_final = pd.DataFrame([
    {
        "Model": model_name,
        "Test ROC_AUC": test_auc,
        "Best Threshold (val)": thr,
        "Best F1 (val)": f1,
        "Precision@BestThr (val)": prec,
        "Recall@BestThr (val)": rec,
    }
    for model_name, test_auc, thr, f1, prec, rec in [
        ("Logistic Regression (tuned + val thr)",
         test_auc_lr, best_thr_lr, best_f1_lr, best_prec_lr, best_rec_lr),
        ("Random Forest (tuned + val thr)",
         test_auc_rf, best_thr_rf, best_f1_rf, best_prec_rf, best_rec_rf),
        ("Gradient Boosting (tuned + val thr)",
         test_auc_gb, best_thr_gb, best_f1_gb, best_prec_gb, best_rec_gb),
    ]
])

results_final

In [None]:
# Recover the feature names that the fitted random forest actually saw, in the
# order the ColumnTransformer emits them (numeric block, then one-hot block).
fitted_preprocessor = best_rf.named_steps["preprocess"]

numeric_cols_fitted = fitted_preprocessor.transformers_[0][2]
categorical_cols_fitted = fitted_preprocessor.transformers_[1][2]

# SimpleImputer discards columns whose fitted statistic is NaN (columns with no
# observed value at fit time), so keep only the numeric names that survive;
# otherwise names and importances would be misaligned.
num_imputer = fitted_preprocessor.named_transformers_["num"].named_steps["imputer"]
kept_numeric_cols = [
    name
    for name, stat in zip(numeric_cols_fitted, num_imputer.statistics_)
    if not pd.isna(stat)
]

ohe = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_cols_fitted)

all_features = list(kept_numeric_cols) + list(cat_feature_names)

rf_model = best_rf.named_steps["model"]
importances = rf_model.feature_importances_

# Fail loudly rather than silently pairing importances with the wrong names.
if len(all_features) != len(importances):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match "
        f"importance count ({len(importances)})"
    )

feat_imp = pd.DataFrame({
    "feature": all_features,
    "importance": importances
}).sort_values("importance", ascending=False)

# Top 30 most important factors. A bare `top30` in the middle of a cell is not
# rendered by Jupyter, so it is displayed explicitly.
top30 = feat_imp.head(30)
display(top30)

plt.figure(figsize=(8, 10))
ax = sns.barplot(data=top30, x="importance", y="feature")
ax.set_title("Top 30 important features (Random Forest)")
plt.tight_layout()
plt.show()