# **PREPARATION**

In [1]:
!pip install optuna
!pip install xgboost
!pip install catboost



In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
import joblib
import warnings
warnings.filterwarnings('ignore')

# **PREPROCESSING**

In [3]:
# Load datasets
train_imbalanced = pd.read_csv('/kaggle/input/credit-risk-analysis/train_set_imbalanced.csv')
train_oversampled = pd.read_csv('/kaggle/input/credit-risk-analysis/train_set_oversampled.csv')
train_undersampled = pd.read_csv('/kaggle/input/credit-risk-analysis/train_set_oversampled.csv')
test_set = pd.read_csv('/kaggle/input/credit-risk-analysis/test_set.csv')

# Features to scale
numeric_features = [
    'age',
    'income',
    'education',
    'emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score'
]

# All features except target
all_features = [
    'age', 'sex', 'education', 'income', 'emp_exp', 'loan_amnt',
    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
    'credit_score', 'previous_loan_defaults_on_file',
    'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT',
    'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
    'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE'
]

# Features to passthrough
passthrough_features = list(set(all_features) - set(numeric_features))

preprocessor = ColumnTransformer(
    transformers=[
        ('robust', RobustScaler(), numeric_features),
        ('pass', 'passthrough', passthrough_features)
    ]
)

# Separate X and y
X_train_imbal = train_imbalanced.drop(columns='loan_status')
y_train_imbal = train_imbalanced['loan_status']

X_train_over = train_oversampled.drop(columns='loan_status')
y_train_over = train_oversampled['loan_status']

X_train_under = train_undersampled.drop(columns='loan_status')
y_train_under = train_undersampled['loan_status']

X_test = test_set.drop(columns='loan_status')
y_test = test_set['loan_status']

# Preprocessing pipelines (fit only on train sets)
pipeline_imbal = Pipeline(steps=[('preprocessor', preprocessor)])
pipeline_over = Pipeline(steps=[('preprocessor', preprocessor)])
pipeline_under = Pipeline(steps=[('preprocessor', preprocessor)])


# Transform (fit_transform for train, transform for test)
X_train_imbal_processed = pipeline_imbal.fit_transform(X_train_imbal)
X_train_over_processed = pipeline_over.fit_transform(X_train_over)
X_train_under_processed = pipeline_under.fit_transform(X_train_under)
X_test_processed = pipeline_imbal.transform(X_test)  # use imbal pipeline for test to maintain consistency

# **MODELING**

## **XGBoost Classifier**

### With Imbalanced Data

In [4]:
# Define Optuna objective function for XGBoost on imbalanced data
def objective_xgb_imbal(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 12)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.5, 1.0)
    colsample_bynode = trial.suggest_float("colsample_bynode", 0.5, 1.0)
    gamma = trial.suggest_float("gamma", 0, 5.0)
    lambda_ = trial.suggest_float("lambda", 1e-3, 10.0, log=True)  # L2 reg
    alpha = trial.suggest_float("alpha", 1e-3, 10.0, log=True)    # L1 reg
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

    # Build pipeline with preprocessing and XGBoost
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            gamma=gamma,
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
            tree_method="gpu_hist"
        ))
    ])

    # Cross-validation with F1 score
    scores = cross_val_score(
        pipeline,
        X_train_imbal,
        y_train_imbal,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_xgb_imbal = optuna.create_study(direction="maximize")
study_xgb_imbal.optimize(objective_xgb_imbal, n_trials=50)

# Train final model using best parameters
best_params_xgb_imbal = study_xgb_imbal.best_params
final_pipeline_xgb_imbal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=best_params_xgb_imbal['n_estimators'],
        max_depth=best_params_xgb_imbal['max_depth'],
        learning_rate=best_params_xgb_imbal['learning_rate'],
        subsample=best_params_xgb_imbal['subsample'],
        colsample_bytree=best_params_xgb_imbal['colsample_bytree'],
        gamma=best_params_xgb_imbal['gamma'],
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])

# Evaluate with Stratified K-Fold CV on imbalanced data
f1_scores_xgb_imbal = cross_val_score(
    final_pipeline_xgb_imbal,
    X_train_imbal,
    y_train_imbal,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (XGBoost - Imbalanced):", f1_scores_xgb_imbal)
print("Average F1 score (XGBoost - Imbalanced):", f1_scores_xgb_imbal.mean())

[I 2025-05-20 02:24:00,896] A new study created in memory with name: no-name-57a78e91-5fd9-4047-a6d6-71b5215f487a
[I 2025-05-20 02:24:09,223] Trial 0 finished with value: 0.8419621633518032 and parameters: {'n_estimators': 997, 'max_depth': 5, 'learning_rate': 0.05475157912345233, 'subsample': 0.5756991666545062, 'colsample_bytree': 0.966485916260738, 'colsample_bylevel': 0.8259929135371735, 'colsample_bynode': 0.9066496156817594, 'gamma': 1.6248168595335732, 'lambda': 0.1572290366483366, 'alpha': 0.022548546147135425, 'min_child_weight': 1}. Best is trial 0 with value: 0.8419621633518032.
[I 2025-05-20 02:24:11,729] Trial 1 finished with value: 0.8359194647351009 and parameters: {'n_estimators': 521, 'max_depth': 9, 'learning_rate': 0.2120821145629258, 'subsample': 0.8498004838762102, 'colsample_bytree': 0.7252848988932687, 'colsample_bylevel': 0.642020006789453, 'colsample_bynode': 0.857113698279572, 'gamma': 4.199384745646734, 'lambda': 0.002008150520996979, 'alpha': 0.0204084197413

Cross-validated F1 scores (XGBoost - Imbalanced): [0.84460338 0.85760623 0.84663383 0.83261518 0.8390276 ]
Average F1 score (XGBoost - Imbalanced): 0.8440972421142344


In [5]:
# Fit the final pipeline on imbalanced data
final_pipeline_xgb_imbal.fit(X_train_imbal, y_train_imbal)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_xgb_imbal, '/kaggle/working/xgb_best_model_imbalanced.joblib')

print("XGBoost model saved to /kaggle/working successfully.")

XGBoost model saved to /kaggle/working successfully.


### With Oversampled Data

In [9]:
# Define Optuna objective function for XGBoost on oversampled data
def objective_xgb_over(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 12)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.5, 1.0)
    colsample_bynode = trial.suggest_float("colsample_bynode", 0.5, 1.0)
    gamma = trial.suggest_float("gamma", 0, 5.0)
    lambda_ = trial.suggest_float("lambda", 1e-3, 10.0, log=True)  # L2 reg
    alpha = trial.suggest_float("alpha", 1e-3, 10.0, log=True)    # L1 reg
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

    # Build pipeline with preprocessing and classifier
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            gamma=gamma,
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
            tree_method="gpu_hist"
        ))
    ])

    # Cross-validation with F1 score
    scores = cross_val_score(
        pipeline,
        X_train_over,
        y_train_over,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_xgb_over = optuna.create_study(direction="maximize")
study_xgb_over.optimize(objective_xgb_over, n_trials=50)

# Train final model using best parameters on full oversampled training set
best_params_xgb_over = study_xgb_over.best_params
final_pipeline_xgb_over = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=best_params_xgb_over['n_estimators'],
        max_depth=best_params_xgb_over['max_depth'],
        learning_rate=best_params_xgb_over['learning_rate'],
        subsample=best_params_xgb_over['subsample'],
        colsample_bytree=best_params_xgb_over['colsample_bytree'],
        gamma=best_params_xgb_over['gamma'],
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])

# Evaluate with Stratified K-Fold CV on oversampled data
f1_scores_xgb_over = cross_val_score(
    final_pipeline_xgb_over,
    X_train_over,
    y_train_over,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (XGBoost - Oversampled):", f1_scores_xgb_over)
print("Average F1 score (XGBoost - Oversampled):", f1_scores_xgb_over.mean())

[I 2025-05-20 02:35:55,543] A new study created in memory with name: no-name-bf87674e-1ec8-426c-a239-ada0588bf70a
[I 2025-05-20 02:36:06,986] Trial 0 finished with value: 0.9564582530359397 and parameters: {'n_estimators': 837, 'max_depth': 8, 'learning_rate': 0.15242778842727944, 'subsample': 0.7307531653476633, 'colsample_bytree': 0.6821413980537764, 'colsample_bylevel': 0.7919815151068064, 'colsample_bynode': 0.6709083927673998, 'gamma': 0.11038311967216108, 'lambda': 0.6646464056943682, 'alpha': 6.054266055434956, 'min_child_weight': 2}. Best is trial 0 with value: 0.9564582530359397.
[I 2025-05-20 02:36:09,465] Trial 1 finished with value: 0.9515530241438886 and parameters: {'n_estimators': 184, 'max_depth': 6, 'learning_rate': 0.09855626499390428, 'subsample': 0.6008686215623333, 'colsample_bytree': 0.9798238040928574, 'colsample_bylevel': 0.9586486770953437, 'colsample_bynode': 0.5514105161326766, 'gamma': 1.0510364981315097, 'lambda': 0.7851085199656385, 'alpha': 0.322462240230

Cross-validated F1 scores (XGBoost - Oversampled): [0.95546782 0.96118743 0.95789667 0.95513588 0.95808931]
Average F1 score (XGBoost - Oversampled): 0.9575554217364555


In [10]:
# Fit the final pipeline on oversampled data
final_pipeline_xgb_over.fit(X_train_over, y_train_over)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_xgb_over, '/kaggle/working/xgb_best_model_oversampled.joblib')

print("XGBoost model saved to /kaggle/working successfully.")

XGBoost model saved to /kaggle/working successfully.


### With Undersampled Data

In [11]:
# Define Optuna objective function for XGBoost on undersampled data
def objective_xgb_under(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 12)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log=True)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    colsample_bylevel = trial.suggest_float("colsample_bylevel", 0.5, 1.0)
    colsample_bynode = trial.suggest_float("colsample_bynode", 0.5, 1.0)
    gamma = trial.suggest_float("gamma", 0, 5.0)
    lambda_ = trial.suggest_float("lambda", 1e-3, 10.0, log=True)  # L2 reg
    alpha = trial.suggest_float("alpha", 1e-3, 10.0, log=True)    # L1 reg
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

    # Build pipeline with preprocessing and classifier
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            gamma=gamma,
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,
            tree_method="gpu_hist"
        ))
    ])

    # Cross-validation with F1 score
    scores = cross_val_score(
        pipeline,
        X_train_under,
        y_train_under,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_xgb_under = optuna.create_study(direction="maximize")
study_xgb_under.optimize(objective_xgb_under, n_trials=50)

# Train final model using best parameters on full undersampled training set
best_params_xgb_under = study_xgb_under.best_params
final_pipeline_xgb_under = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=best_params_xgb_under['n_estimators'],
        max_depth=best_params_xgb_under['max_depth'],
        learning_rate=best_params_xgb_under['learning_rate'],
        subsample=best_params_xgb_under['subsample'],
        colsample_bytree=best_params_xgb_under['colsample_bytree'],
        gamma=best_params_xgb_under['gamma'],
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ))
])

# Evaluate with Stratified K-Fold CV on undersampled data
f1_scores_xgb_under = cross_val_score(
    final_pipeline_xgb_under,
    X_train_under,
    y_train_under,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (XGBoost - Undersampled):", f1_scores_xgb_under)
print("Average F1 score (XGBoost - Undersampled):", f1_scores_xgb_under.mean())

[I 2025-05-20 02:43:19,532] A new study created in memory with name: no-name-849a049d-65b5-4ced-a046-3bbdabe09490
[I 2025-05-20 02:43:22,160] Trial 0 finished with value: 0.9521370279781362 and parameters: {'n_estimators': 333, 'max_depth': 7, 'learning_rate': 0.20509968180937663, 'subsample': 0.6835041093100979, 'colsample_bytree': 0.9525740517032344, 'colsample_bylevel': 0.7870746044050193, 'colsample_bynode': 0.9263959317246226, 'gamma': 3.2091647842252335, 'lambda': 2.6559914347936306, 'alpha': 0.5063189917252139, 'min_child_weight': 8}. Best is trial 0 with value: 0.9521370279781362.
[I 2025-05-20 02:43:23,863] Trial 1 finished with value: 0.9462078072972219 and parameters: {'n_estimators': 198, 'max_depth': 3, 'learning_rate': 0.20196069321664997, 'subsample': 0.9537948071632953, 'colsample_bytree': 0.948733057327922, 'colsample_bylevel': 0.8507308472349977, 'colsample_bynode': 0.6645106233145655, 'gamma': 3.2537610923383813, 'lambda': 0.003280782851836808, 'alpha': 8.19227989866

Cross-validated F1 scores (XGBoost - Undersampled): [0.9553941  0.9604353  0.95854827 0.95492672 0.96029344]
Average F1 score (XGBoost - Undersampled): 0.957919566463738


In [12]:
# Fit the final pipeline on undersampled data
final_pipeline_xgb_under.fit(X_train_under, y_train_under)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_xgb_under, '/kaggle/working/xgb_best_model_undersampled.joblib')

print("XGBoost model saved to /kaggle/working successfully.")

XGBoost model saved to /kaggle/working successfully.


## **CatBoost Classifier**

### With Imbalanced Data

In [13]:
# Define Optuna objective function for CatBoost on imbalanced data
def objective_cat_imbal(trial):
    iterations = trial.suggest_int("iterations", 100, 200)
    depth = trial.suggest_int("depth", 4, 9)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2, log=True)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1, 15.0)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    random_strength = trial.suggest_float("random_strength", 0.0, 5.0)

    # Set bootstrap_type that supports subsample
    bootstrap_type = "Bernoulli"

    # Build pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=iterations,
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            subsample=subsample,
            random_strength=random_strength,
            bootstrap_type=bootstrap_type,
            verbose=0,
            random_state=42
        ))
    ])

    # Cross-validation
    scores = cross_val_score(
        pipeline,
        X_train_imbal,
        y_train_imbal,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_cat_imbal = optuna.create_study(direction="maximize")
study_cat_imbal.optimize(objective_cat_imbal, n_trials=50)

# Train final model with best params
best_params_cat_imbal = study_cat_imbal.best_params
final_pipeline_cat_imbal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        task_type="GPU",
        devices="0",
        iterations=best_params_cat_imbal['iterations'],
        depth=best_params_cat_imbal['depth'],
        learning_rate=best_params_cat_imbal['learning_rate'],
        l2_leaf_reg=best_params_cat_imbal['l2_leaf_reg'],
        subsample=best_params_cat_imbal['subsample'],
        random_strength=best_params_cat_imbal['random_strength'],
        bootstrap_type="Bernoulli",  # fixed!
        verbose=0,
        random_state=42
    ))
])

# Evaluate best model with 5-fold CV
f1_scores_cat_imbal = cross_val_score(
    final_pipeline_cat_imbal,
    X_train_imbal,
    y_train_imbal,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (CatBoost - Imbalanced):", f1_scores_cat_imbal)
print("Average F1 score (CatBoost - Imbalanced):", f1_scores_cat_imbal.mean())

[I 2025-05-20 02:53:18,998] A new study created in memory with name: no-name-1cdf0f49-9d8b-4d9d-83fd-cc82b8b1096b
[I 2025-05-20 02:53:48,768] Trial 0 finished with value: 0.8287290902267592 and parameters: {'iterations': 171, 'depth': 6, 'learning_rate': 0.11429425451462354, 'l2_leaf_reg': 7.628738474626067, 'subsample': 0.8947328410460971, 'random_strength': 4.952312608027936}. Best is trial 0 with value: 0.8287290902267592.
[I 2025-05-20 02:53:59,654] Trial 1 finished with value: 0.8254497597620192 and parameters: {'iterations': 118, 'depth': 9, 'learning_rate': 0.14191905934462176, 'l2_leaf_reg': 1.3741042469522482, 'subsample': 0.5734460501162788, 'random_strength': 3.982321543533049}. Best is trial 0 with value: 0.8287290902267592.
[I 2025-05-20 02:54:15,161] Trial 2 finished with value: 0.8211650674362971 and parameters: {'iterations': 186, 'depth': 9, 'learning_rate': 0.026939980077383782, 'l2_leaf_reg': 3.291576305561481, 'subsample': 0.9588527371281902, 'random_strength': 2.51

Cross-validated F1 scores (CatBoost - Imbalanced): [0.82701812 0.84852502 0.83817701 0.82277628 0.83482291]
Average F1 score (CatBoost - Imbalanced): 0.8342638695894147


In [14]:
# Fit the final pipeline on imbalanced data
final_pipeline_cat_imbal.fit(X_train_imbal, y_train_imbal)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_cat_imbal, '/kaggle/working/cat_best_model_imbalanced.joblib')

print("CatBoost model saved to /kaggle/working successfully.")

CatBoost model saved to /kaggle/working successfully.


### With Oversampled Data

In [15]:
# Define Optuna objective function for CatBoost on oversampled data
def objective_cat_over(trial):
    iterations = trial.suggest_int("iterations", 100, 200)
    depth = trial.suggest_int("depth", 4, 9)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2, log=True)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1, 15.0)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    random_strength = trial.suggest_float("random_strength", 0.0, 5.0)

    bootstrap_type = "Bernoulli"

    # Build pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=iterations,
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            subsample=subsample,
            random_strength=random_strength,
            bootstrap_type=bootstrap_type,
            verbose=0,
            random_state=42
        ))
    ])

    # Cross-validation
    scores = cross_val_score(
        pipeline,
        X_train_over,
        y_train_over,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_cat_over = optuna.create_study(direction="maximize")
study_cat_over.optimize(objective_cat_over, n_trials=50)

# Train final model with best params
best_params_cat_over = study_cat_over.best_params
final_pipeline_cat_over = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        task_type="GPU",
        devices="0",
        iterations=best_params_cat_over['iterations'],
        depth=best_params_cat_over['depth'],
        learning_rate=best_params_cat_over['learning_rate'],
        l2_leaf_reg=best_params_cat_over['l2_leaf_reg'],
        subsample=best_params_cat_over['subsample'],
        random_strength=best_params_cat_over['random_strength'],
        bootstrap_type="Bernoulli",
        verbose=0,
        random_state=42
    ))
])

# Evaluate best model with 5-fold CV
f1_scores_cat_over = cross_val_score(
    final_pipeline_cat_over,
    X_train_over,
    y_train_over,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (CatBoost - Oversampled):", f1_scores_cat_over)
print("Average F1 score (CatBoost - Oversampled):", f1_scores_cat_over.mean())

[I 2025-05-20 03:01:56,949] A new study created in memory with name: no-name-c7d587f3-241e-44b3-a697-078597c772f2
[I 2025-05-20 03:02:07,402] Trial 0 finished with value: 0.940879177958092 and parameters: {'iterations': 107, 'depth': 9, 'learning_rate': 0.06881349374369551, 'l2_leaf_reg': 8.17941359426381, 'subsample': 0.9716592967484465, 'random_strength': 2.258923030392172}. Best is trial 0 with value: 0.940879177958092.
[I 2025-05-20 03:02:13,787] Trial 1 finished with value: 0.9247122011623397 and parameters: {'iterations': 155, 'depth': 4, 'learning_rate': 0.03706336026630867, 'l2_leaf_reg': 14.17997794752001, 'subsample': 0.9178720426952385, 'random_strength': 0.822079644937943}. Best is trial 0 with value: 0.940879177958092.
[I 2025-05-20 03:02:20,086] Trial 2 finished with value: 0.9352036967355103 and parameters: {'iterations': 151, 'depth': 4, 'learning_rate': 0.07288788595183952, 'l2_leaf_reg': 8.682728413064668, 'subsample': 0.989264119516162, 'random_strength': 2.131205280

Cross-validated F1 scores (CatBoost - Oversampled): [0.94863267 0.9533525  0.95110454 0.94981413 0.95046041]
Average F1 score (CatBoost - Oversampled): 0.9506728474624284


In [16]:
# Fit the final pipeline on oversampled data
final_pipeline_cat_over.fit(X_train_over, y_train_over)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_cat_over, '/kaggle/working/cat_best_model_oversampled.joblib')

print("CatBoost model saved to /kaggle/working successfully.")

CatBoost model saved to /kaggle/working successfully.


### With Undersampled Data

In [17]:
# Define Optuna objective function for CatBoost on undersampled data
def objective_cat_under(trial):
    iterations = trial.suggest_int("iterations", 100, 200)
    depth = trial.suggest_int("depth", 4, 9)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.2, log=True)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1, 15.0)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    random_strength = trial.suggest_float("random_strength", 0.0, 5.0)

    bootstrap_type = "Bernoulli"

    # Build pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(
            task_type="GPU",
            devices="0",
            iterations=iterations,
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            subsample=subsample,
            random_strength=random_strength,
            bootstrap_type=bootstrap_type,
            verbose=0,
            random_state=42
        ))
    ])

    # Cross-validation
    scores = cross_val_score(
        pipeline,
        X_train_under,
        y_train_under,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring=make_scorer(f1_score)
    )
    return scores.mean()

# Run Optuna study
study_cat_under = optuna.create_study(direction="maximize")
study_cat_under.optimize(objective_cat_under, n_trials=50)

# Train final model with best params
best_params_cat_under = study_cat_under.best_params
final_pipeline_cat_under = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        task_type="GPU",
        devices="0",
        iterations=best_params_cat_under['iterations'],
        depth=best_params_cat_under['depth'],
        learning_rate=best_params_cat_under['learning_rate'],
        l2_leaf_reg=best_params_cat_under['l2_leaf_reg'],
        subsample=best_params_cat_under['subsample'],
        random_strength=best_params_cat_under['random_strength'],
        bootstrap_type="Bernoulli",
        verbose=0,
        random_state=42
    ))
])

# Evaluate best model with 5-fold CV
f1_scores_cat_under = cross_val_score(
    final_pipeline_cat_under,
    X_train_under,
    y_train_under,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring=make_scorer(f1_score)
)

print("Cross-validated F1 scores (CatBoost - Undersampled):", f1_scores_cat_under)
print("Average F1 score (CatBoost - Undersampled):", f1_scores_cat_under.mean())

[I 2025-05-20 03:09:25,455] A new study created in memory with name: no-name-71be7867-da3b-450e-ab60-821c2c6544c6
[I 2025-05-20 03:09:41,456] Trial 0 finished with value: 0.9355855923179737 and parameters: {'iterations': 191, 'depth': 9, 'learning_rate': 0.020934533025401884, 'l2_leaf_reg': 13.048096028942354, 'subsample': 0.9859983443765794, 'random_strength': 3.8857123398304627}. Best is trial 0 with value: 0.9355855923179737.
[I 2025-05-20 03:09:46,949] Trial 1 finished with value: 0.9338274370282594 and parameters: {'iterations': 108, 'depth': 4, 'learning_rate': 0.10003284679722808, 'l2_leaf_reg': 12.265390921749677, 'subsample': 0.5259586339109414, 'random_strength': 1.2019682285942324}. Best is trial 0 with value: 0.9355855923179737.
[I 2025-05-20 03:10:03,705] Trial 2 finished with value: 0.9418822152776973 and parameters: {'iterations': 199, 'depth': 9, 'learning_rate': 0.03928912982695739, 'l2_leaf_reg': 11.915274358985437, 'subsample': 0.6438637124198954, 'random_strength': 

Cross-validated F1 scores (CatBoost - Undersampled): [0.94805794 0.95349912 0.95086812 0.94995366 0.9517254 ]
Average F1 score (CatBoost - Undersampled): 0.9508208489963387


In [18]:
# Fit the final pipeline on undersampled data
final_pipeline_cat_under.fit(X_train_under, y_train_under)

# Save the model to Kaggle working directory
joblib.dump(final_pipeline_cat_under, '/kaggle/working/cat_best_model_undersampled.joblib')

print("CatBoost model saved to /kaggle/working successfully.")

CatBoost model saved to /kaggle/working successfully.


### **PREDICTING**

In [19]:
# Predict with each pipeline
y_xgb_imbal = final_pipeline_xgb_imbal.predict(X_test)
y_xgb_under = final_pipeline_xgb_under.predict(X_test)
y_xgb_over = final_pipeline_xgb_over.predict(X_test)

y_cat_imbal = final_pipeline_cat_imbal.predict(X_test)
y_cat_under = final_pipeline_cat_under.predict(X_test)
y_cat_over = final_pipeline_cat_over.predict(X_test)

# Create DataFrame of all results
predictions_df = pd.DataFrame({
    "y_actual": y_test,
    "y_xgb_imbal": y_xgb_imbal,
    "y_xgb_under": y_xgb_under,
    "y_xgb_over": y_xgb_over,
    "y_cat_imbal": y_cat_imbal,
    "y_cat_under": y_cat_under,
    "y_cat_over": y_cat_over
})

# Save to CSV
predictions_df.to_csv("/kaggle/working/xgboost_and_catboost_predictions_output.csv", index=False)
print("All model predictions saved to 'all_model_predictions.csv'")

All model predictions saved to 'all_model_predictions.csv'
