In [0]:
!pip install lightgbm

In [0]:
pip install hyperopt

In [0]:
%pip install databricks-feature_engineering

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from datetime import datetime, date
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.preprocessing import LabelEncoder
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from pyspark.sql.functions import col



This pipeline performs hyperparameter optimization for a LightGBM classifier using Hyperopt. For each trial, a model is trained on the training dataset, evaluated on a validation set, and tracked as a nested MLflow run. Hyperparameters and accuracy metrics are logged for experiment comparison, while validation log loss is computed and returned as the optimization objective. This design ensures systematic experimentation, reproducibility, and seamless model selection for production deployment.

### Creating training and test datasets

In [0]:
# Read the base modelling table from the Databricks catalog.
# Only its loan_id and target_30_dpd columns are used further down.
LABEL_SOURCE_TABLE = "ispl_databricks.model_logs.base_df_500features_updated"
base_df2 = spark.table(LABEL_SOURCE_TABLE)

In [0]:
# Keep just the lookup key and the target column; this narrow frame is the
# label DataFrame handed to the feature-store join.
label_columns = ['loan_id', 'target_30_dpd']
spark_label = base_df2.select(*label_columns)


In [0]:

# Client used below to build the training set from feature-store lookups
fe = FeatureEngineeringClient()

In [0]:
# Build the Feature Store training set.
# The label frame (loan_id + target_30_dpd) is enriched with features from the
# feature table, matched on loan_id.
mw_feature_lookup = FeatureLookup(
    table_name="ispl_databricks.model_logs.mw_final_feature_store",
    lookup_key="loan_id",
)

training_set = fe.create_training_set(
    df=spark_label,
    feature_lookups=[mw_feature_lookup],
    label="target_30_dpd",
)


In [0]:

# Materialise the training set: load it as a Spark DataFrame and collect it
# into pandas for model training.
train_pd = training_set.load_df().toPandas()

# Target vector (y)
train_y = train_pd['target_30_dpd']

# Feature matrix (X): every column except the primary key and the label.
# NOTE: no rows are dropped here; missing values are passed through unchanged.
train_x = train_pd.drop(columns=['loan_id', 'target_30_dpd'])

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [0]:
# Version tag of the training data; embedded in the MLflow parent run name below
data_version = 2

### hyperopt experiments

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
mlflow.set_experiment("/Workspace/Shared/ff_mw/ff_mw/MW_LightGBM_Top50_Training")

# Close any run that is still active so the parent run below starts cleanly
if mlflow.active_run():
    mlflow.end_run()

# Estimator settings shared by every tuning trial and by the final model.
# subsample_freq=1 switches bagging on: with LightGBM's default (0) the tuned
# `subsample` fraction is ignored and that search dimension is wasted.
fixed_params = {
    'random_state': 42,
    'class_weight': 'balanced',
    'subsample_freq': 1,
}

# -----------------------------
# Hyperparameter tuning section
# -----------------------------
# scope.int casts the quantised (float) draws to integers before they reach LightGBM
search_space = {
    'num_leaves': scope.int(hp.quniform('num_leaves', 20, 150, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 800, 50)),
    'min_child_samples': scope.int(hp.quniform('min_child_samples', 10, 100, 5)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}


def binary_logloss(y_true, pred_proba, eps=1e-15):
    """Return the mean binary cross-entropy of `pred_proba` against `y_true`.

    Probabilities are clipped to [eps, 1 - eps] first so that a prediction of
    exactly 0 or 1 cannot produce log(0) and turn the objective into inf/NaN.

    Args:
        y_true: 0/1 labels (array-like, same length as `pred_proba`).
        pred_proba: predicted probability of the positive class.
        eps: clipping margin applied to both ends of the probability range.

    Returns:
        The log loss as a plain Python float.
    """
    clipped = np.clip(pred_proba, eps, 1 - eps)
    per_row = y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped)
    return float(-per_row.mean())


def objective(params):
    """Hyperopt objective: train one candidate and return its log loss.

    Each call is tracked as a nested MLflow run that records the sampled
    hyperparameters, the accuracy and the log loss on the held-out split.

    Args:
        params: one hyperparameter sample drawn from `search_space`.

    Returns:
        dict with the loss to minimise and the Hyperopt status flag.
    """
    with mlflow.start_run(nested=True):
        trial_model = lgb.LGBMClassifier(**fixed_params, **params)
        trial_model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss"
        )

        acc = accuracy_score(y_test, trial_model.predict(X_test))
        val_pred_proba = trial_model.predict_proba(X_test)[:, 1]
        val_logloss = binary_logloss(y_test, val_pred_proba)

        mlflow.log_params(params)
        mlflow.log_metric('accuracy', acc)
        mlflow.log_metric('val_logloss', val_logloss)
        return {'loss': val_logloss, 'status': STATUS_OK}


# NOTE(review): the same held-out split (X_test / y_test) is used to select the
# hyperparameters and to report the final accuracy, so that accuracy is
# optimistic. Consider carving out a separate validation split for tuning.
with mlflow.start_run(run_name=f"LGBM_{data_version}") as run:

    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=30,
        trials=trials,
        rstate=np.random.default_rng(42)
    )

    # fmin returns the raw sampled values (floats for quniform), so convert
    # integer-like floats back to ints before passing them to LightGBM
    best_params = {
        k: int(v) if isinstance(v, float) and v.is_integer() else v
        for k, v in best_params.items()
    }

    mlflow.log_params(best_params)
    mlflow.log_params(fixed_params)

    # -----------------------------
    # Train final model with best params
    # -----------------------------
    model = lgb.LGBMClassifier(**fixed_params, **best_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_names=["train", "valid"],
        eval_metric=["binary_logloss"]
    )

    # `acc` is reused further down (features.json and the registration run)
    acc = accuracy_score(y_test, model.predict(X_test))
    mlflow.log_metric("test_accuracy", acc)

    # Save the trained model locally; the signature cell and the serving
    # wrapper both load this file
    joblib.dump(model,'/Workspace/Shared/ff_mw/ff_mw/model_artifacts/mwtop50_new.pkl')
        

#### saving model as well as metrics in model artifacts folder

In [0]:
# Feature names recorded by the fitted model (its feature_name_ attribute)
trained_feature_names = model.feature_name_

In [0]:
# Sanity check: how many features the model was trained on
len(trained_feature_names)

In [0]:
# Metadata stored next to the model: the trained feature list (read back by the
# serving wrapper to avoid feature mismatches) and the accuracy from training.
features_json = dict(features=trained_feature_names, accuracy=acc)

In [0]:
# Persist the fitted model into the model_artifacts folder.
# NOTE(review): the later cells and the serving wrapper load 'mwtop50_new.pkl',
# not this file — confirm whether this second copy is still needed.
joblib.dump(model,'/Workspace/Shared/ff_mw/ff_mw/model_artifacts/mwtop50.pkl')

In [0]:
# Write the feature list and accuracy to features.json in the model_artifacts
# folder so they travel with the model when it is logged.
features_json_path = '/Workspace/Shared/ff_mw/ff_mw/model_artifacts/features.json'
with open(features_json_path, 'w') as json_file:
    json.dump(features_json, json_file, ensure_ascii=False, indent=4)

### defining signature

The signature is used for:

Model Serving input validation

Batch inference validation

Feature mismatch protection

Safe model upgrades

Automated governance

We will create an input DataFrame from the features used by the model, predict on it, and then infer the signature from that input/output pair.

In [0]:
# Build a one-row sample input for signature inference.
# Columns are selected (and ordered) by the model's trained feature names,
# so the signature matches what the model expects at inference time.
input_x = train_x[trained_feature_names].iloc[[0]]

In [0]:
# Reload the model that the training cell saved to the model_artifacts folder.
# joblib relies on pickle, so only load files produced by this pipeline.
model = joblib.load('/Workspace/Shared/ff_mw/ff_mw/model_artifacts/mwtop50_new.pkl')

In [0]:

# Score the sample row with predict_proba, which returns class probabilities
# rather than hard class labels; this array defines the signature's output side
output = model.predict_proba(input_x)


In [0]:
# Show the predicted probabilities for the sample row
print(output)

In [0]:
# Infer the MLflow model signature from the sample input and its prediction.
# The signature captures:
#  - Input schema (feature names + data types of input_x)
#  - Output schema (derived from the predict_proba array)
# It is passed to log_model further down when the model is registered.
signature = infer_signature(input_x, output)

### final model logging

creating custom wrapper class

We create a custom wrapper class to hide complexity, reuse code, and keep logic consistent when working with libraries or external systems.
It makes code cleaner, easier to change, easier to test, and lets us add common features like logging, validation, and monitoring in one place.

In [0]:
# Custom MLflow PyFunc wrapper for model loading and inference
# This allows the model to be served in a standardized way
class mlwrapper(mlflow.pyfunc.PythonModel):
    """PyFunc wrapper around the pickled LightGBM classifier.

    At load time it restores the model and the list of trained feature names
    from the logged `model_artifacts` folder; at predict time it selects those
    columns from the incoming frame and returns class probabilities.
    """

    def load_context(self, context):
        """Load the model and feature metadata from the logged artifacts.

        Args:
            context: MLflow context; `context.artifacts['model_artifacts']`
                is the local path of the logged artifacts folder.
        """
        artifacts_dir = context.artifacts['model_artifacts']

        # Trained classifier saved by the training cell
        self.model = joblib.load(os.path.join(artifacts_dir, 'mwtop50_new.pkl'))

        # Feature metadata written at training time; keeps the inference
        # columns consistent with the ones the model was fitted on
        with open(os.path.join(artifacts_dir, 'features.json'), 'r') as file:
            data = json.load(file)

        # List of trained feature columns, in training order
        self.fc = data['features']

    def predict(self, context, model_input):
        """Return class probabilities for `model_input`.

        Args:
            context: MLflow context (unused here).
            model_input: frame that contains at least the trained feature
                columns; any extra columns are ignored.

        Returns:
            The output of the wrapped model's `predict_proba`.

        Raises:
            KeyError: if a trained feature column is missing from `model_input`.
        """
        # Select and order the columns exactly as during training
        df = model_input[self.fc]
        return self.model.predict_proba(df)

running mlflow experiment and registering model into unity catalog

In [0]:
# Folder logged alongside the wrapper (pickled model + features.json) and the
# Unity Catalog name the model is registered under
model_artifacts_dir = "/Workspace/Shared/ff_mw/ff_mw/model_artifacts"
registered_name = "ispl_databricks.model_logs.final_mw_model"

with mlflow.start_run():
    # Record the accuracy computed for the trained model on this run
    mlflow.log_metric("test_accuracy", acc)

    # Log the wrapper in MLflow PyFunc format and register it, which makes the
    # model deployable via MLflow Model Serving
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=mlwrapper(),
        artifacts={"model_artifacts": model_artifacts_dir},
        registered_model_name=registered_name,
        signature=signature,
    )

### inference

#### fetching latest model

In [0]:

from mlflow.tracking import MlflowClient

# Look up every registered version of the model
client = MlflowClient()
versions = client.search_model_versions("name = 'ispl_databricks.model_logs.final_mw_model'")

# Fail with a clear message instead of an opaque error when nothing is registered yet
if not versions:
    raise RuntimeError(
        "No registered versions found for model 'ispl_databricks.model_logs.final_mw_model'"
    )

# Compare version numbers numerically (int() guards against string versions,
# where '10' would otherwise sort before '9')
latest_version = max(versions, key=lambda v: int(v.version)).version

In [0]:

# models:/<name>/<version> URI pointing at the newest registered version
model_uri = f'models:/ispl_databricks.model_logs.final_mw_model/{latest_version}'

In [0]:
# Load the registered PyFunc model.
# NOTE: this rebinds `model`, which until now held the raw LightGBM classifier.
model = mlflow.pyfunc.load_model(model_uri)

#### prediction

In [0]:
# Smoke test: score a single training row (position 3) through the registered model
model.predict(train_x[trained_feature_names].iloc[[3]])