In [0]:
!pip install optuna  xgboost

In [0]:

import pandas
import optuna
import mlflow
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from xgboost import XGBClassifier




### Load the data

In [0]:
# Read the training table through the Spark session and materialise it as a
# pandas DataFrame for use with scikit-learn / XGBoost.
train_table = "credit_catalog.gold.ml_train"
train_df = spark.table(train_table).toPandas()



In [0]:
# Separate the target column from the feature columns.
target_col = 'Credit_Score'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [0]:
# Encode the class labels as integer codes for XGBoost.
# The encoder is fitted on the original `Credit_Score` column instead of on
# `y` itself so that the cell is idempotent: re-running it never refits the
# encoder on already-encoded integers, and `label_encoder.classes_` keeps the
# original label values for decoding predictions later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['Credit_Score'])



In [0]:
# train test split
# Hold out 20% of the rows for testing with a fixed seed. The split is
# stratified on the encoded target so both partitions keep the same class
# proportions, consistent with the stratified folds used during tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [0]:
# Sanity check: report the (rows, columns) of each partition.
train_shape, test_shape = X_train.shape, X_test.shape
print("The size of train data is", train_shape)
print("The shape of test data is", test_shape)

In [0]:
# Column groups consumed by the ColumnTransformer, one list per transformer.

# Columns that are one-hot encoded.
ohe_encode = [
    'Occupation',
    'Income_Group',
    'Payment_of_Min_Amount',
]

# Columns that are ordinal encoded.
ordinal_encode = [
    'Credit_Mix',
]

# Columns that are standardised with StandardScaler.
standard_scale = [
    'Age',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
    'has_auto_loan',
    'has_credit_builder_loan',
    'has_debt_consolidation_loan',
    'has_home_equity_loan',
    'has_mortgage_loan',
    'has_payday_loan',
    'has_personal_loan',
    'has_student_loan',
    'spend_level',
    'txn_value_level',
    'Debt_to_Income_Ratio',
    'EMI_to_Salary_Ratio',
    'Saving_Capacity',
]

In [0]:
# Preprocessing pipeline
# Routes each column group to its transformer:
#   * `ohe_encode`     -> one-hot encoding (first level dropped)
#   * `ordinal_encode` -> ordinal encoding with an explicit category order
#   * `standard_scale` -> zero-mean / unit-variance scaling
# Any column not listed in a group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe',
         # NOTE(review): with drop='first' and handle_unknown='ignore', an
         # unseen category is encoded as all zeros, i.e. the same row as the
         # dropped first category.
         OneHotEncoder(drop='first',
                       sparse_output=False,
                       handle_unknown='ignore'),
         ohe_encode),

        ('ordinal',
         # Categories are listed from worst to best so the integer codes
         # (0, 1, 2) are monotonic in credit-mix quality. The previous
         # alphabetical order ('bad', 'good', 'standard') placed 'standard'
         # above 'good', which is not an ordinal ranking.
         OrdinalEncoder(categories=[['bad', 'standard', 'good']]),
         ordinal_encode),

        ('standard',
         StandardScaler(),
         standard_scale)
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False
)

# Optional: uncomment to get pandas DataFrames out of the transformer.
# preprocessor.set_output(transform="pandas")

In [0]:
# Fit the preprocessor on the training partition only, then apply the fitted
# transformation to the held-out partition so no test statistics leak into
# the fitted encoders/scaler.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)



In [0]:
# mlflow experiment
# All runs started below are recorded under this experiment path.
experiment_path = "/Users/souravraj664@gmail.com/Exp 2 - XGBoost Hyperparameter Tuning"
mlflow.set_experiment(experiment_path)

In [0]:
import pandas as pd


# Names of the feature columns. The target `Credit_Score` is intentionally
# NOT listed: it is dropped from `X` before the split, so including it would
# make the DataFrame construction below fail with a column-count mismatch.
feature_names = ['Age', 'Occupation', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance',
       'Income_Group', 'has_auto_loan', 'has_credit_builder_loan',
       'has_debt_consolidation_loan', 'has_home_equity_loan',
       'has_mortgage_loan', 'has_payday_loan', 'has_personal_loan',
       'has_student_loan', 'spend_level', 'txn_value_level',
       'Debt_to_Income_Ratio', 'EMI_to_Salary_Ratio', 'Saving_Capacity']

# Ensure pandas objects: the tuning objective slices both with `.iloc` and
# calls `Series.map` on the target, which plain NumPy arrays do not support.
if not isinstance(X_train, pd.DataFrame):
    X_train = pd.DataFrame(X_train, columns=feature_names)

# `y_train` is a NumPy array after label encoding, so it is wrapped in a
# Series (with a fresh positional index) here.
if not isinstance(y_train, pd.Series):
    y_train = pd.Series(y_train)


In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
import numpy as np


def objective(trial):
    """Optuna objective: mean cross-validated ROC-AUC of an XGBoost pipeline.

    Samples one XGBoost hyperparameter configuration from ``trial``, evaluates
    it with stratified 5-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``, and returns the mean one-vs-rest ROC-AUC.

    Each fold trains a fresh pipeline (a clone of the notebook-level
    ``preprocessor`` followed by an ``XGBClassifier``) using balanced sample
    weights computed from that fold's training labels only.

    Side effects: opens an MLflow run (nested if a run is already active),
    logs the sampled parameters and the ``cv_roc_auc_ovr`` metric to it.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        The mean of the per-fold multiclass (OvR) ROC-AUC scores.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from sklearn.base import clone

    with mlflow.start_run(nested=True):

        # XGBoost params: tuned values come from the trial, the rest are fixed.
        xgb_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15),
            "max_depth": trial.suggest_int("max_depth", 4, 10),

            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),

            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),

            "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 100.0),

            "objective": "multi:softprob",
            "eval_metric": "mlogloss",
            "num_class": 3,
            "random_state": 42,
            "n_jobs": -1,
            "tree_method": "hist"
        }

        mlflow.log_params(xgb_params)

        # Stratified CV keeps the class proportions in every fold, which
        # matters for imbalanced classes.
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = []

        for train_idx, val_idx in cv.split(X_train, y_train):
            # Positional slicing: X_train and y_train need not share an index.
            X_tr = X_train.iloc[train_idx]
            X_val = X_train.iloc[val_idx]

            y_tr = y_train.iloc[train_idx]
            y_val = y_train.iloc[val_idx]

            # Compute class weights per fold, from the fold's training labels
            # only, so validation labels never influence the weighting.
            classes = np.unique(y_tr)
            class_weights = compute_class_weight(
                class_weight="balanced",
                classes=classes,
                y=y_tr
            )
            class_weight_dict = dict(zip(classes, class_weights))

            # Convert class weights to one weight per training row.
            sample_weight = y_tr.map(class_weight_dict).values

            # Build the pipeline around a CLONE of the shared preprocessor.
            # Pipeline.fit fits its steps in place, so reusing the global
            # instance would (a) overwrite the preprocessor fitted on the full
            # training set and (b) let trials running in parallel fit and
            # transform the same object concurrently.
            full_pipe = Pipeline([
                ("preprocessor", clone(preprocessor)),
                ("model", XGBClassifier(**xgb_params))
            ])

            # Fit with sample weights routed to the model step.
            full_pipe.fit(
                X_tr,
                y_tr,
                model__sample_weight=sample_weight
            )

            # Predict class probabilities for the validation rows.
            y_val_proba = full_pipe.predict_proba(X_val)

            # Multiclass ROC-AUC, one-vs-rest.
            fold_score = roc_auc_score(
                y_val,
                y_val_proba,
                multi_class="ovr"
            )

            scores.append(fold_score)

        mean_auc = np.mean(scores)
        mlflow.log_metric("cv_roc_auc_ovr", mean_auc)

        return mean_auc


In [0]:

# create optuna study
# The objective returns a ROC-AUC, so larger values are better.
study = optuna.create_study(direction="maximize")

# Run 10 trials; n_jobs=-1 asks Optuna to run trials in parallel.
optimize_options = {
    "n_trials": 10,
    "n_jobs": -1,
    "show_progress_bar": True,
}
study.optimize(objective, **optimize_options)

