In [0]:
pip install lightgbm

In [0]:
pip install hyperopt

In [0]:
%pip install databricks-feature_engineering

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from datetime import datetime, date
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.preprocessing import LabelEncoder
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from pyspark.sql.functions import col


This pipeline performs hyperparameter optimization for a LightGBM classifier using Hyperopt. For each trial, a model is trained on the training dataset, evaluated on a validation set, and tracked as a nested MLflow run. Hyperparameters and accuracy metrics are logged for experiment comparison, while validation log loss is computed and returned as the optimization objective. This design ensures systematic experimentation, reproducibility, and seamless model selection for production deployment.

### Creating training and test dataset

In [0]:
# Read the base training table from the Databricks catalog.
# Later cells take the loan_id key and the target label from it.
base_4 = spark.read.table('ispl_databricks.model_logs.bd_500_features_sample_training')

In [0]:
# Label DataFrame: keep just the primary key (loan_id) and the target column.
spark_label = base_4.select('loan_id', 'target')

In [0]:
# Client used by the next cell to build the training set from the feature table.
fe = FeatureEngineeringClient()

In [0]:
# Build the training set by joining the label rows (loan_id, target)
# with the feature table, matched on loan_id.
loan_feature_lookup = FeatureLookup(
    table_name="ispl_databricks.model_logs.bd_final_feature_stores",
    lookup_key="loan_id",
)

training_set = fe.create_training_set(
    df=spark_label,
    feature_lookups=[loan_feature_lookup],
    label="target",
)


In [0]:

# Materialise the joined training set as a pandas DataFrame and discard
# every row that contains a missing value.
train_pd = training_set.load_df().toPandas().dropna()

# Target vector, and the feature matrix without the key (loan_id)
# and label (target) columns.
train_y = train_pd['target']
train_x = train_pd.drop(columns=['loan_id', 'target'])

In [0]:
# Sanity check: (rows, feature columns) remaining after dropna.
train_x.shape

In [0]:
# NOTE(review): repeats the shape check from the previous cell — candidate for removal.
train_x.shape

In [0]:
# Preview the first few feature rows instead of rendering the whole frame.
train_x.head()

In [0]:
# NOTE(review): train_x is unchanged since the earlier shape check — candidate for removal.
train_x.shape

In [0]:
# Hold out 20% of the rows for validation. Stratify on the label so the
# training and hold-out sets keep the same class proportions (the model is
# trained with class_weight='balanced', i.e. the classes are expected to be
# imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
    stratify=train_y,
)

In [0]:
# (rows, columns) of the training split.
X_train.shape

In [0]:
# Version tag for the dataset; embedded in the parent MLflow run name (f"LGBM_{data_version}").
data_version = 2

### Hyperopt experiments

In [0]:
mlflow.set_experiment("/Workspace/Shared/ff_bd/LGBM_TopN_Features_Training")

# Close any run left open by an earlier execution so the parent run below
# starts cleanly.
if mlflow.active_run():
    mlflow.end_run()

# Settings shared by every trial model and by the final model.
# subsample_freq=1 switches bagging on: with LightGBM's default of 0 the tuned
# `subsample` ratio is ignored and that search dimension has no effect.
fixed_params = {
    'random_state': 42,
    'class_weight': 'balanced',
    'subsample_freq': 1,
}

# Search-space entries that LightGBM must receive as integers.
int_param_names = ('num_leaves', 'max_depth', 'n_estimators', 'min_child_samples')

# Probabilities are clipped to [eps, 1 - eps] before taking logs so a saturated
# prediction (exactly 0.0 or 1.0) cannot turn the objective into NaN/inf.
logloss_eps = 1e-15

# NOTE(review): X_test / y_test drive both the hyperparameter search and the
# final "test_accuracy", so that score is optimistic — consider a separate
# validation split for the search.

# Parent run: hyperparameter search followed by the final model fit.
with mlflow.start_run(run_name=f"LGBM_{data_version}") as run:

    # -----------------------------
    # Hyperparameter tuning section
    # -----------------------------
    search_space = {
        'num_leaves': scope.int(hp.quniform('num_leaves', 20, 150, 1)),
        'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
        'n_estimators': scope.int(hp.quniform('n_estimators', 100, 800, 50)),
        'min_child_samples': scope.int(hp.quniform('min_child_samples', 10, 100, 5)),
        'subsample': hp.uniform('subsample', 0.6, 1.0),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
    }

    trials = Trials()

    def objective(params):
        """Train one LightGBM trial and return its hold-out log loss.

        Each trial is tracked as a nested MLflow run that records the sampled
        hyperparameters, the hold-out accuracy and the hold-out log loss.

        Args:
            params: hyperparameters sampled by Hyperopt from ``search_space``.

        Returns:
            dict with ``loss`` (binary log loss on X_test / y_test) and
            ``status`` (``STATUS_OK``), the shape ``fmin`` expects.
        """
        # Number of trials Hyperopt has registered so far; used only to label the run.
        trial_id = len(trials.trials)

        with mlflow.start_run(run_name=f"trial_{trial_id}_LGBM", nested=True):
            trial_model = lgb.LGBMClassifier(**fixed_params, **params)
            trial_model.fit(
                X_train, y_train,
                eval_set=[(X_test, y_test)],
                eval_metric="binary_logloss"
            )

            accuracy = accuracy_score(y_test, trial_model.predict(X_test))

            # Binary log loss on the hold-out set, computed on clipped probabilities.
            y_true = np.asarray(y_test, dtype=float)
            val_pred_proba = np.clip(
                trial_model.predict_proba(X_test)[:, 1],
                logloss_eps,
                1 - logloss_eps,
            )
            val_logloss = float(
                -np.mean(
                    y_true * np.log(val_pred_proba)
                    + (1 - y_true) * np.log(1 - val_pred_proba)
                )
            )

            mlflow.log_params(params)
            mlflow.log_metric("accuracy", accuracy)
            # Log the value Hyperopt actually optimises so trials can be compared on it.
            mlflow.log_metric("val_logloss", val_logloss)

        return {'loss': val_logloss, 'status': STATUS_OK}

    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=30,
        trials=trials,
        rstate=np.random.default_rng(42)
    )

    # fmin returns the raw sampled values (floats for quniform); cast the
    # integer-valued hyperparameters back to int by name.
    best_params = {
        name: int(value) if name in int_param_names else value
        for name, value in best_params.items()
    }

    mlflow.log_params(best_params)
    mlflow.log_params(fixed_params)

    # -----------------------------
    # Train final model with best params
    # -----------------------------
    model = lgb.LGBMClassifier(**fixed_params, **best_params)

    model = model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_names=["train", "valid"],
        eval_metric=["binary_logloss"]
    )
    acc = accuracy_score(y_test, model.predict(X_test))
    mlflow.log_metric("test_accuracy", acc)

    # Save the trained model to the shared model_artifacts folder
    joblib.dump(model,'/Workspace/Shared/ff_bd/model_artifacts/top50model.pkl')
        

### Saving the model and its metrics

In [0]:
# NOTE(review): same shape check as the cell right after the train/test split — candidate for removal.
X_train.shape

In [0]:
# Collect the final model's hold-out accuracy in a dict so that it can be
# written to the model_artifacts folder and logged to MLflow later.
metrics_json = dict(accuracy=acc)

In [0]:
# Write the metrics dict as JSON into the model_artifacts folder so the
# accuracy is available when the best model is logged to MLflow.
metrics_text = json.dumps(metrics_json, ensure_ascii=False, indent=4)
with open('/Workspace/Shared/ff_bd/model_artifacts/model_metric.json','w') as f:
    f.write(metrics_text)

In [0]:
# Persist the trained model into the model_artifacts folder.
# NOTE(review): the training cell already dumps the model to this same path — confirm whether this second dump is still needed.
joblib.dump(model,'/Workspace/Shared/ff_bd/model_artifacts/top50model.pkl')
        