In [0]:
!pip install optuna xgboost

In [0]:

import pandas
import optuna
import mlflow
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from xgboost import XGBClassifier




### load the data

In [0]:
# Read the gold-layer training table through Spark and materialise it as a pandas frame.
train_df = (
    spark
    .table("credit_catalog.gold.ml_train")
    .toPandas()
)

In [0]:
# Separate the label column from the predictor columns.
y = train_df['Credit_Score']
X = train_df.drop(columns=['Credit_Score'])

In [0]:
# Learn the label -> integer-code mapping, then replace y with the encoded values.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Report the dimensions of both partitions.
for message, frame in (
    ("The size of train data is", X_train),
    ("The shape of test data is", X_test),
):
    print(message, frame.shape)

In [0]:
# Columns that get one-hot encoded.
ohe_encode = [
    'Occupation',
    'Income_Group',
    'Payment_of_Min_Amount',
]

# Columns that get ordinal encoded.
ordinal_encode = [
    'Credit_Mix',
]

# Columns that get standard-scaled.
standard_scale = [
    'Age',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Num_Credit_Inquiries',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Amount_invested_monthly',
    'Monthly_Balance',
    'has_auto_loan',
    'has_credit_builder_loan',
    'has_debt_consolidation_loan',
    'has_home_equity_loan',
    'has_mortgage_loan',
    'has_payday_loan',
    'has_personal_loan',
    'has_student_loan',
    'spend_level',
    'txn_value_level',
    'Debt_to_Income_Ratio',
    'EMI_to_Salary_Ratio',
    'Saving_Capacity',
]

In [0]:
# Preprocessing pipeline: one transformer per feature family.
categorical_encoder = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# NOTE(review): the explicit category order is bad < good < standard —
# confirm this is the intended ranking for Credit_Mix.
credit_mix_encoder = OrdinalEncoder(categories=[['bad', 'good', 'standard']])

numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', categorical_encoder, ohe_encode),
        ('ordinal', credit_mix_encoder, ordinal_encode),
        ('standard', numeric_scaler, standard_scale),
    ],
    # Columns not listed above are passed through unchanged.
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

# Ask the transformer to emit pandas DataFrames from transform().
preprocessor.set_output(transform="pandas")

In [0]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transformer to the held-out rows.
X_train_trans = preprocessor.fit_transform(X=X_train)
X_test_trans = preprocessor.transform(X=X_test)



In [0]:
# Title: Derive balanced class weights and per-row sample weights for imbalanced classification
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Distinct encoded labels present in the training split.
classes = np.unique(y_train)

class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Lookup table: encoded label -> weight.
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

# Expand to one weight per training row, in y_train order.
sample_weight = np.array(list(map(class_weight_dict.__getitem__, y_train)))

In [0]:
TARGET = "Credit_Score"

# Fixed hyperparameters for the final model (described in the original notebook
# as the "best parameters"; presumably the outcome of an earlier tuning run).
xgb_params = {'n_estimators': 160,
              'learning_rate': 0.13517507315287902,
              'max_depth': 10,
              'min_child_weight': 3,
              'gamma': 0.19187053861391912,
              'subsample': 0.7453261407003142,
              'colsample_bytree': 0.9022795822470242,
              'reg_lambda': 52.61564564741222}

# ---------------------------
# Train the final pipeline

best_xgb = XGBClassifier(**xgb_params)

# Preprocessing and classifier are chained so the model can be fed raw feature frames.
full_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', best_xgb),
    ])

# The `model__` prefix routes the per-row weights to the classifier step only.
full_pipe.fit(X_train, y_train, model__sample_weight=sample_weight)

# Class probabilities are needed for ROC-AUC; hard labels are taken as their
# argmax. The earlier separate predict() calls were removed: their results were
# immediately overwritten by these argmax values, so they only cost extra
# inference passes.
y_train_proba = full_pipe.predict_proba(X_train)
y_test_proba = full_pipe.predict_proba(X_test)

y_pred_train = y_train_proba.argmax(axis=1)
y_pred_test = y_test_proba.argmax(axis=1)

# 5-fold cross-validated one-vs-rest ROC-AUC on the training split.
# NOTE(review): the CV fits do not receive `sample_weight`, unlike the final
# fit above — confirm whether the CV estimate should also be weighted.
cv_scores = cross_val_score(full_pipe,
                         X_train,
                         y_train,
                         scoring="roc_auc_ovr",
                         cv=5,n_jobs=-1)

with mlflow.start_run(run_name="final_model_training") as run:
    # Tag the run.
    mlflow.set_tag("model"," Credit Score Classification")

    # Hyperparameters used for the final fit.
    mlflow.log_params(xgb_params)

    # Mean cross-validated one-vs-rest ROC-AUC.
    mlflow.log_metric("cv_roc_auc_ovr", cv_scores.mean())

    # ROC-AUC is computed from the class probabilities.
    mlflow.log_metric(
        "train_roc_auc_ovr",
        roc_auc_score(y_train, y_train_proba, multi_class="ovr")
    )

    mlflow.log_metric(
        "test_roc_auc_ovr",
        roc_auc_score(y_test, y_test_proba, multi_class="ovr")
    )

    # Macro-averaged F1 (computed from hard labels).
    mlflow.log_metric(
        "train_f1_macro",
        f1_score(y_train, y_pred_train, average="macro")
    )

    mlflow.log_metric(
        "test_f1_macro",
        f1_score(y_test, y_pred_test, average="macro")
    )

    # Accuracy (computed from hard labels).
    mlflow.log_metric("train_accuracy", accuracy_score(y_train, y_pred_train))
    mlflow.log_metric("test_accuracy", accuracy_score(y_test, y_pred_test))

    # Per-fold CV scores. "roc_auc_ovr" is a greater-is-better scorer, so the
    # fold scores are logged as-is; negating them (as is done for "neg_*"
    # scorers) would record negative AUC values.
    mlflow.log_metrics({f"CV {num}": score for num, score in enumerate(cv_scores)})

    # Wrap the training and validation frames as MLflow datasets.
    train_data_input = mlflow.data.from_pandas(train_df, targets=TARGET)
    val_df = X_test.copy()
    val_df[TARGET] = y_test
    test_data_input = mlflow.data.from_pandas(val_df, targets=TARGET)

    # Record both datasets as inputs of this run.
    mlflow.log_input(dataset=train_data_input,context="training")
    mlflow.log_input(dataset=test_data_input,context="validation")

    # Infer the model signature from a fixed 20-row sample; the sample is drawn
    # once and reused for both the input and the prediction.
    signature_sample = X_train.sample(20, random_state=42)
    model_signature = mlflow.models.infer_signature(
        model_input=signature_sample,
        model_output=full_pipe.predict(signature_sample),
    )

    # Log the fitted pipeline under the artifact path "credit_score_classification".
    mlflow.sklearn.log_model(full_pipe,"credit_score_classification",signature=model_signature)

    # Artifact root URI of the current run.
    artifact_uri = mlflow.get_artifact_uri()


In [0]:
%sql
-- Make sure the volume that will hold the run-information JSON file exists.
CREATE VOLUME IF NOT EXISTS credit_catalog.credit_schema.run_information;

In [0]:
import shutil
import os
from pathlib import Path
import json

def save_model_info(root_path, save_json_path, run_id, artifact_path, model_name):
    """Write the identifying details of a run to a JSON file.

    Args:
        root_path: Directory the JSON file is written into.
        save_json_path: File name (or relative path) joined onto ``root_path``.
        run_id: Value stored under the ``"run_id"`` key.
        artifact_path: Value stored under the ``"artifact_path"`` key.
        model_name: Value stored under the ``"model_name"`` key.
    """
    destination = os.path.join(root_path, save_json_path)
    payload = dict(
        run_id=run_id,
        artifact_path=artifact_path,
        model_name=model_name,
    )
    # Mode "w" overwrites any existing file at the destination.
    with open(destination, "w") as handle:
        json.dump(payload, handle, indent=4)


# Volume path where the run-information JSON is written
# (matches the volume credit_catalog.credit_schema.run_information).
root_path = Path(
    "/Volumes/credit_catalog/credit_schema/run_information"
)

run_id = run.info.run_id
model_name = "credit_score_classification"

# The pipeline was logged under the artifact path "credit_score_classification",
# not "model", so the runs:/ URI has to end with that path; otherwise the saved
# URI points at an artifact that does not exist in the run.
artifact_uri = f"runs:/{run_id}/{model_name}"

save_json_path = "run_information.json"

# Persist the run id, model URI and model name as JSON.
save_model_info(
    root_path=root_path,
    save_json_path=save_json_path,
    run_id=run_id,
    artifact_path=artifact_uri,
    model_name=model_name
)
