In [0]:
%pip install databricks-feature_engineering

In [0]:
%pip install lightgbm

In [0]:
%restart_python

# **Model Logging**

### 1. Create the input and output signature required to register the model in Unity Catalog


### 2. Save the features used by the model as JSON in the artifacts folder, so the same features are selected at inference time and no mismatch occurs

### 3. Create a custom wrapper class to control how the model is loaded, how inputs are processed, and how predictions are made — independent of how the model was originally trained.

### 4. Then log that class to Unity Catalog as a registered model

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from datetime import datetime, date
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from mlflow.models.signature import infer_signature
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.preprocessing import LabelEncoder
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from pyspark.sql.functions import col

## INFER SIGNATURE

The signature is used for:

Model Serving input validation

Batch inference validation

Feature mismatch protection

Safe model upgrades

Automated governance

### first we load a sample training dataset using feature stores

### then we load the features used by the model using model.feature_name_

### then we create the dataframe with those features

### create signature using that input feature dataframe and output predicted using model on that dataframe

We use the table created in the preprocessing stage only to get the key (loan_id) and the label (target). These two columns form the Spark label DataFrame, which is then used to fetch the training features by loan_id.

In [0]:
# Read the sample training table as a Spark DataFrame.
# Only its loan_id and target columns are used further down.
TRAINING_TABLE = 'ispl_databricks.model_logs.bd_500_features_sample_training'
base_4 = spark.table(TRAINING_TABLE)

In [0]:
# Label dataset: just the lookup key (loan_id) and the target column.
spark_label = base_4.select('loan_id', 'target')

In [0]:
# Client used below to build the training set from a feature table lookup.
fe = FeatureEngineeringClient()

In [0]:
# Describe which feature table to join and which key to join it on.
loan_feature_lookup = FeatureLookup(
    table_name="ispl_databricks.model_logs.bd_final_feature_stores",
    lookup_key="loan_id",
)

# Build the training set: the label rows in spark_label are matched to
# their features through the lookup above, with "target" as the label.
training_set = fe.create_training_set(
    df=spark_label,
    feature_lookups=[loan_feature_lookup],
    label="target",
)


In [0]:
# The training set loads as a Spark DataFrame; convert it to pandas and
# discard every row that contains a missing value.
train_pd = training_set.load_df().toPandas().dropna()
# Feature matrix: all columns except the key and the label.
train_x = train_pd.drop(columns=['loan_id', 'target'])
# Label vector.
train_y = train_pd['target']

In [0]:
# Take the first row of the feature matrix as a one-row DataFrame
# (the double brackets keep it 2-D). It is the example input used
# later to build the model signature.
input_X = train_x.iloc[[0]]

Loading the model to fetch the features it uses

In [0]:

# Load a previously trained model from the Databricks workspace using joblib.
# The file name suggests it was trained on the top 50 features — TODO confirm.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts from a trusted workspace location.
model = joblib.load('/Workspace/Shared/ff_bd/model_artifacts/top50model.pkl')

In [0]:
# Read the feature names stored on the loaded model. They are saved to a JSON
# file in a later cell so the serving wrapper can select the same columns at
# inference time.
# NOTE(review): presumably this guards against the names recorded by LightGBM
# differing from the column names passed at training time — confirm.
trained_feature_names = model.feature_name_

In [0]:
# Show how many feature names the model reports.
len(trained_feature_names)

In [0]:
# Wrap the feature list under a "features" key. The dict is written to the
# model artifact folder in the next cell, where the custom wrapper class
# reads it back.
feature_json = dict(features=trained_feature_names)

In [0]:
# Write the feature dictionary into the model artifact folder as
# pretty-printed JSON, keeping non-ASCII characters unescaped.
features_path = '/Workspace/Shared/ff_bd/model_artifacts/model_features.json'
with open(features_path, 'w') as features_file:
    json.dump(feature_json, features_file, ensure_ascii=False, indent=4)

### infer signature

In [0]:
# Build the MLflow model signature from a single example row.

# Restrict the example to the model's own feature names, in the model's
# order, so the recorded input schema matches what the model expects.
input_X_aligned = input_X[trained_feature_names]

# Class probabilities (not class labels) for the example row; their shape
# and dtype become the output schema.
output = model.predict_proba(input_X_aligned)

# infer_signature derives the input schema (column names and dtypes) and
# the output schema from the two example objects. The signature is passed
# to log_model when the wrapper is registered.
signature = infer_signature(model_input=input_X_aligned, model_output=output)

In [0]:
# Read the feature file back from the artifact folder to check what was written.
features_path = '/Workspace/Shared/ff_bd/model_artifacts/model_features.json'
with open(features_path) as features_file:
    data = json.load(features_file)

In [0]:
# Display the dictionary loaded from model_features.json.
data

### final model logging

creating custom wrapper class

In [0]:
# Custom MLflow PyFunc wrapper for model loading and inference
# This allows the model to be served in a standardized way
class mlwrapper(mlflow.pyfunc.PythonModel):
    """PyFunc wrapper that serves the pickled model with a fixed feature list.

    Both files are read from the directory logged under the
    ``model_artifacts`` artifact key:

    * ``top50model.pkl`` - the model, loaded with joblib.
    * ``model_features.json`` - a JSON object whose ``features`` key holds
      the list of column names passed to the model.
    """

    def load_context(self, context):
        """Load the model and its feature list from the logged artifacts.

        Args:
            context: MLflow context object; ``context.artifacts['model_artifacts']``
                is the path of the logged artifact directory.
        """
        artifacts_dir = context.artifacts['model_artifacts']

        # NOTE(review): joblib.load unpickles the file and can execute
        # arbitrary code; the artifact directory must come from a trusted run.
        self.model = joblib.load(os.path.join(artifacts_dir, 'top50model.pkl'))

        # Feature names saved alongside the model; used in predict() to
        # select the input columns.
        with open(os.path.join(artifacts_dir, 'model_features.json'), 'r') as file:
            data = json.load(file)

        # Column names, in the order stored in the JSON file.
        self.fc = data['features']

    def predict(self, context, model_input):
        """Return class probabilities for the rows of ``model_input``.

        Args:
            context: MLflow context object (not used here).
            model_input: Input indexed by column name; it must contain every
                name in ``self.fc``. Extra columns are ignored.

        Returns:
            The result of ``predict_proba`` on the selected columns.
        """
        # Select only the stored feature columns, in the stored order.
        df = model_input[self.fc]
        return self.model.predict_proba(df)

running mlflow experiment and registering model into unity catalog

In [0]:
# Compute the accuracy that is logged with the run. No cell in this notebook
# defines a `metric` variable, so the score is derived here from objects the
# notebook does create: the joblib model, the feature matrix and the labels.
# This cell must run before the later cell that rebinds `model` to the loaded
# pyfunc model.
# NOTE(review): train_x / train_y come from the *_sample_training table, so
# this is not a hold-out score; swap in a held-out set if one is available.
# NOTE(review): assumes the `target` values use the same label encoding the
# model was trained with — confirm.
sample_predictions = model.predict(train_x[trained_feature_names])
sample_accuracy = accuracy_score(train_y, sample_predictions)

# The three-level name below (catalog.schema.model) targets the Unity Catalog
# registry, so point MLflow at it explicitly instead of relying on the
# workspace default.
mlflow.set_registry_uri("databricks-uc")

with mlflow.start_run():

    # Log the evaluation metric under a name that reflects the data it was
    # computed on.
    mlflow.log_metric("train_sample_accuracy", sample_accuracy)

    # Log the wrapper in MLflow PyFunc format, bundle the artifact folder
    # (model pickle + feature JSON) with it, and register it under the
    # Unity Catalog model name.
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=mlwrapper(),
        artifacts={"model_artifacts": "/Workspace/Shared/ff_bd/model_artifacts"},
        registered_model_name="ispl_databricks.model_logs.final_bd_model",
        signature=signature
    )

### Fetch the latest model version and run inference on it to test whether the model was logged successfully

In [0]:

# Look up the registered versions of the model and keep the highest one.
from mlflow.tracking import MlflowClient

client = MlflowClient()
registered_model_name = 'ispl_databricks.model_logs.final_bd_model'
versions = client.search_model_versions(f"name = '{registered_model_name}'")

# Order by the version converted to int, so that 10 sorts after 9,
# then take the last entry.
versions_ascending = sorted(versions, key=lambda mv: int(mv.version))
latest_version = versions_ascending[-1].version

In [0]:
# Build the registry URI of the latest version, in the form
# models:/<registered model name>/<version>.
model_uri = f"models:/ispl_databricks.model_logs.final_bd_model/{latest_version}"

In [0]:
# Load the latest registered version as a generic pyfunc model.
# NOTE(review): this rebinds `model`, which earlier held the estimator loaded
# with joblib. Re-running the feature-name or signature cells after this one
# will operate on the pyfunc wrapper instead; consider a separate name.
model = mlflow.pyfunc.load_model(model_uri=model_uri)

In [0]:
# Smoke test: score the one-row example used for the signature with the
# model just loaded from the registry.
model.predict(input_X_aligned)