In [0]:
!pip install lightgbm

In [0]:
pip install hyperopt

In [0]:
%pip install databricks-feature_engineering

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from datetime import datetime, date
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.preprocessing import LabelEncoder
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from mlflow.tracking import MlflowClient
from sklearn.metrics import accuracy_score


In [0]:
# Load the base table as a Spark DataFrame; later cells read its 'type',
# 'loan_id' and 'target_30_dpd' columns.
base_df2 = spark.table("ispl_databricks.model_logs.base_df_500features_updated")

In [0]:
from pyspark.sql.functions import col

# Spark projection holding only the lookup key and the label column.
spark_label = base_df2.select('loan_id', 'target_30_dpd')


In [0]:
# Collect the whole table to the driver as a pandas DataFrame.
# NOTE: this rebinds `base_df2` from a Spark DataFrame to a pandas one, so
# re-running this cell without re-running the load cell above it fails.
base_df2 = base_df2.toPandas()

In [0]:
# Preview the first rows of the collected table.
base_df2.head()

In [0]:

# NOTE(review): this name is not referenced anywhere else in the visible
# notebook; the label actually used for training is the 'target_30_dpd'
# column — confirm whether this constant is still needed.
target_feature = "actual_label"

In [0]:
# Partition the base table by its 'type' column; each partition is an
# independent copy so later edits cannot write back into base_df2.
train_data, test_data, live_data = (
    base_df2[base_df2['type'] == split_name].copy()
    for split_name in ('train', 'test', 'live')
)

In [0]:
# Keep only the label and the lookup key; features are joined back in from
# the feature store further down.
train_data, test_data, live_data = (
    frame[['target_30_dpd', 'loan_id']]
    for frame in (train_data, test_data, live_data)
)

In [0]:
# Preview the label/key frame for the training split.
train_data.head()

In [0]:
# Convert each pandas label/key frame back to Spark so it can be passed to
# fe.create_training_set as `df`.
spark_train_label, spark_test_label, spark_live_label = map(
    spark.createDataFrame, (train_data, test_data, live_data)
)

In [0]:
# Feature Engineering client used below to build the training and test sets.
fe = FeatureEngineeringClient()

In [0]:
# Build the training set: the train labels joined to the feature-store table
# on loan_id (no feature_names filter is passed to the lookup).
train_feature_lookup = FeatureLookup(
    table_name="ispl_databricks.model_logs.mw_feature_store_500",
    lookup_key="loan_id",
)
training_set = fe.create_training_set(
    df=spark_train_label,
    feature_lookups=[train_feature_lookup],
    label="target_30_dpd",
)


In [0]:
# Materialise the joined training set as pandas, then separate the label from
# the feature matrix (the loan_id key is not a model input).
train_pd = training_set.load_df().toPandas()
y_train = train_pd['target_30_dpd']
X_train = train_pd.drop(columns=['loan_id', 'target_30_dpd'])

In [0]:
# Build the test set the same way as the training set: the test labels joined
# to the feature-store table on loan_id.
test_feature_lookup = FeatureLookup(
    table_name="ispl_databricks.model_logs.mw_feature_store_500",
    lookup_key="loan_id",
)
test_set = fe.create_training_set(
    df=spark_test_label,
    feature_lookups=[test_feature_lookup],
    label="target_30_dpd",
)

In [0]:
# Materialise the joined test set as pandas, then separate the label from the
# feature matrix (the loan_id key is not a model input).
test_pd = test_set.load_df().toPandas()
y_test = test_pd['target_30_dpd']
X_test = test_pd.drop(columns=['loan_id', 'target_30_dpd'])

In [0]:
# Preview the training feature matrix.
X_train.head()

In [0]:
# ---- Run configuration ----
# model_trainYN == 1 -> tune and train in the training cell; any other value
# makes that cell load the previously saved model instead.
model_trainYN = 1
# Sub-folder appended to every base directory (e.g. "data_v5_new/top_20").
data_version = "base"

# ---- Base directories ----
input_dir  = "/Volumes/ispl_databricks/default/training/MW_Train/input_dir"
output_dir = "/Volumes/ispl_databricks/default/training/MW_Train/OUTPUT_DIR_NEW"
model_dir  = "/Volumes/ispl_databricks/default/training/MW_Train/model_dir"

# ---- Version-specific directories ----
input_dir_version, output_dir_version, model_dir_version = (
    os.path.join(base_dir, data_version)
    for base_dir in (input_dir, output_dir, model_dir)
)

# Create the output/model directories and their version-specific children if
# missing; the input directories are never created here.
for directory in (output_dir, model_dir, output_dir_version, model_dir_version):
    os.makedirs(directory, exist_ok=True)

# ---- File names for the persisted model and its input-feature list ----
model_file_name   = "lgb_model.pickle"
feature_file_name = "model_input_feature.pickle"

In [0]:
# Ensure model directory exists
os.makedirs(model_dir_version, exist_ok=True)


def _binary_logloss(y_true, pred_proba, eps=1e-15):
    """Return the mean binary cross-entropy of ``pred_proba`` against ``y_true``.

    Probabilities are clipped to ``[eps, 1 - eps]`` before taking logarithms so
    that a predicted probability of exactly 0 or 1 cannot turn the loss into
    inf/NaN, which would corrupt the Hyperopt objective and the logged metric.
    """
    clipped = np.clip(pred_proba, eps, 1 - eps)
    return -((y_true * np.log(clipped) + (1 - y_true) * np.log(1 - clipped)).mean())


# MLflow experiment setup: close any run left open by an earlier cell so the
# start_run below does not fail on an already-active run.
if mlflow.active_run():
    mlflow.end_run()

mlflow.set_experiment("/Workspace/Shared/ff_mw/ff_mw/MW_LightGBM_Training")

with mlflow.start_run(run_name=f"LGBM_{data_version}") as run:

    if model_trainYN == 1:

        # Define search space for hyperparameter tuning
        search_space = {
            'num_leaves': scope.int(hp.quniform('num_leaves', 20, 150, 1)),
            'max_depth': scope.int(hp.quniform('max_depth', 3, 15, 1)),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'n_estimators': scope.int(hp.quniform('n_estimators', 100, 800, 50)),
            'min_child_samples': scope.int(hp.quniform('min_child_samples', 10, 100, 5)),
            'subsample': hp.uniform('subsample', 0.6, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
        }

        # Created before `objective` because the objective reads it to number
        # its nested MLflow runs.
        trials = Trials()

        # NOTE(review): hyperparameters are selected on the same X_test/y_test
        # that is later used to report validation metrics, so those metrics are
        # optimistically biased — consider a separate validation split.
        def objective(params):
            """Train one candidate model and return its validation log loss.

            Each call runs inside a nested MLflow run that records the sampled
            parameters, the validation log loss and the accuracy on X_test.
            """
            trial_id = len(trials.trials)
            with mlflow.start_run(run_name=f"trial_{trial_id}_LGBM", nested=True):
                model = lgb.LGBMClassifier(
                    random_state=42,
                    class_weight='balanced',
                    **params
                )
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_test, y_test)],
                    eval_metric="binary_logloss"
                )
                val_pred_proba = model.predict_proba(X_test)[:, 1]
                val_logloss = _binary_logloss(y_test, val_pred_proba)
                mlflow.log_metric("validation_logloss", val_logloss)
                acc = accuracy_score(y_test, model.predict(X_test))
                mlflow.log_metric("accuracy", acc)
                mlflow.log_params(params)
                return {'loss': val_logloss, 'status': STATUS_OK}

        # Run Hyperopt optimization
        best_params = fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=30,
            trials=trials,
            rstate=np.random.default_rng(42)
        )

        # fmin reports quniform draws as floats, so cast integral floats back
        # to int before handing them to LightGBM.
        best_params = {
            k: int(v) if isinstance(v, float) and v.is_integer() else v
            for k, v in best_params.items()
        }

        # Train final model with best params
        model = lgb.LGBMClassifier(
            random_state=42,
            class_weight='balanced',
            **best_params
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            eval_names=["train", "valid"],
            eval_metric=["binary_logloss"]
        )

        # Log parameters and metrics to MLflow
        mlflow.log_params(best_params)
        mlflow.log_param("class_weight", "balanced")
        mlflow.log_param("random_state", 42)
        mlflow.log_param("num_features", X_train.shape[1])

        # Evaluate on validation set
        val_pred_proba = model.predict_proba(X_test)[:, 1]
        val_logloss = _binary_logloss(y_test, val_pred_proba)
        mlflow.log_metric("validation_logloss", val_logloss)

        # Save model locally (pickle and joblib copies of the same estimator);
        # the context manager guarantees the file is flushed and closed.
        with open(os.path.join(model_dir_version, model_file_name), "wb") as model_fh:
            pickle.dump(model, model_fh)
        joblib.dump(model, os.path.join(model_dir_version, "job_model.pkl"))

        # Save input feature list
        input_feature_model = X_train.columns.tolist()
        with open(os.path.join(model_dir_version, feature_file_name), "wb") as feature_fh:
            pickle.dump(input_feature_model, feature_fh)

        # The model itself is not logged to MLflow in this cell; only params
        # and metrics are recorded on this run.

    else:
        # Load pre-trained model. Only the joblib copy is read: the pickle
        # copy written during training holds the same estimator, and loading
        # both just to overwrite one with the other is wasted work.
        model = joblib.load(os.path.join(model_dir_version, "job_model.pkl"))

        # Load input feature list.
        # NOTE: pickle can execute arbitrary code on load; only read files
        # that this notebook wrote.
        with open(os.path.join(model_dir_version, feature_file_name), "rb") as feature_fh:
            input_feature_model = pickle.load(feature_fh)

    print(f"MLflow Run ID: {run.info.run_id}")
    print(f"Model trained: {model_trainYN == 1}")
    print(f"Number of input features: {len(input_feature_model)}")


In [0]:
# Reload the trained model from the versioned model directory.
# NOTE: pickle can execute arbitrary code on load; only read files that this
# notebook wrote.
with open(os.path.join(model_dir_version, model_file_name), "rb") as model_fh:
    model = pickle.load(model_fh)

In [0]:
# Write the model into the workspace artifacts folder that is later passed to
# mlflow.pyfunc.log_model as the 'model_artifacts' artifact.
joblib.dump(model,'/Workspace/Shared/ff_mw/ff_mw/model_artifacts/500featuresmodel.pkl')

In [0]:
# Read the model back from the workspace artifacts folder (round-trip of the
# file written by the previous cell).
model = joblib.load('/Workspace/Shared/ff_mw/ff_mw/model_artifacts/500featuresmodel.pkl')

In [0]:
# Feature names recorded on the fitted model; used below to select and order
# the input columns.
training_feature = model.feature_name_

In [0]:
# Number of features the model was fitted on.
len(training_feature)

In [0]:
# Single-row example input: the row at position 1 of X_train, kept as a
# DataFrame (double brackets) with columns restricted to the model's features.
input_x = X_train[training_feature].iloc[[1]]

In [0]:
# Use class probabilities as the example output so the inferred signature
# matches what the logged pyfunc wrapper returns from predict(), which calls
# predict_proba rather than predict.
input_y = model.predict_proba(input_x)

In [0]:
# Infer the MLflow model signature from the example input/output pair.
signature = infer_signature(input_x, input_y)


In [0]:
# Accuracy of the reloaded model on the test split, using the model's own
# feature list to select the input columns.
test_predictions = model.predict(X_test[training_feature])
accuracy = accuracy_score(y_test, test_predictions)

In [0]:
# Show the test accuracy computed in the previous cell.
print(accuracy)

In [0]:
class mlwrapper(mlflow.pyfunc.PythonModel):
    """Pyfunc wrapper that serves class probabilities from the saved LightGBM model.

    The model is read from the 'model_artifacts' artifact directory, and the
    feature list is taken from that loaded model, so the wrapper does not rely
    on any variable that exists only in the training notebook.
    """

    def load_context(self, context):
        """Load the model file from the artifacts directory and cache its feature names."""
        self.model = joblib.load(context.artifacts['model_artifacts']+'/500featuresmodel.pkl')
        # Read the feature names from the loaded artifact, not from a
        # notebook-level `model` variable, so the wrapper is self-contained.
        self.fc = self.model.feature_name_
        print(self.fc)

    def predict(self, context, model_input):
        """Return predict_proba output for the model's feature columns of `model_input`."""
        # Select (and order) exactly the columns the model was fitted on.
        df = model_input[self.fc]
        return self.model.predict_proba(df)

In [0]:
# Log the wrapped model in its own MLflow run, together with the test
# accuracy, and register it under the given registered model name.
pyfunc_log_kwargs = {
    "artifact_path": "model",
    "python_model": mlwrapper(),
    "artifacts": {"model_artifacts": '/Workspace/Shared/ff_mw/ff_mw/model_artifacts'},
    "registered_model_name": "ispl_databricks.model_logs.ffmw_lgbm_all_columns_endpoint",
    "signature": signature,
}

with mlflow.start_run():
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.pyfunc.log_model(**pyfunc_log_kwargs)

In [0]:
# Registered model name; same string passed as registered_model_name when the
# model was logged above.
model_name = 'ispl_databricks.model_logs.ffmw_lgbm_all_columns_endpoint'

In [0]:

# Client used below to query the registered versions of the model.
client = MlflowClient()


In [0]:

# Fetch every registered version of the model (despite the variable name,
# this is the full search result, not only the newest version).
version_filter = f"name='{model_name}'"
latest_versions = client.search_model_versions(version_filter)

In [0]:
# Version numbers as integers so they compare numerically, not as strings.
versions = [int(model_version.version) for model_version in latest_versions]

In [0]:
# Newest (highest) version first.
versions = sorted(versions, reverse=True)

In [0]:
# Highest version number, as a string for use in the models:/ URI.
# Raises IndexError if no versions were found.
latest_version = str(versions[0])

In [0]:
# Load the newest registered version back as a generic pyfunc model.
model_uri = f"models:/{model_name}/{latest_version}"
loaded_model = mlflow.pyfunc.load_model(model_uri)


In [0]:
# Smoke test: score the row at position 2 of X_train with the registered model.
loaded_model.predict(X_train[training_feature].iloc[[2]])

In [0]:
import matplotlib.pyplot as plt

# lgb.plot_importance / lgb.plot_metric open their own figure when no `ax` is
# passed, so size and title are given through their arguments; a separate
# plt.figure(...) call would only leave an empty, unused figure behind.
# The figures are logged inside an explicit run so that log_figure does not
# implicitly start a run that is then left active for the rest of the session.
with mlflow.start_run():
    # Top-20 features ranked by total gain.
    ax = lgb.plot_importance(
        model,
        max_num_features=20,
        importance_type='gain',
        figsize=(10, 8),
        title="Top 20 Feature Importances",
    )
    # Log before showing: some notebook backends close the figure on show().
    mlflow.log_figure(ax.figure, "feature_importance.png")
    plt.show()

    # Evaluation metric recorded during fit (logloss over boosting iterations).
    ax = lgb.plot_metric(
        model,
        figsize=(10, 6),
        title="Training Evaluation Metric (Binary Logloss)",
    )
    mlflow.log_figure(ax.figure, "training_evaluation.png")
    plt.show()