In [0]:
%pip install lightgbm

In [0]:
%pip install hyperopt

In [0]:
%pip install databricks-feature-engineering

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from datetime import datetime, date
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_engineering import FeatureEngineeringClient, FeatureLookup
from sklearn.preprocessing import LabelEncoder
import json
import requests
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score



###### What this notebook does

##### Model Loading

##### Loads a trained LightGBM model

##### Uses either pickle or joblib

##### Feature Importance Extraction

##### Pulls feature importance directly from the trained model

##### Sorts features by gain/importance

##### Feature Selection

##### Saves:

##### All features with importance

##### Top 50 most important features

##### Production Readiness

##### Top 50 features can be:

##### Registered in Feature Store

##### Used for retraining

##### Used for inference pipelines

In [0]:
# ------------------------------------------------------------------
# Run configuration: flags, volume locations and artefact file names
# ------------------------------------------------------------------
model_trainYN = 1      # flag; not read by any cell shown in this notebook
data_version = "base"  # change here to switch data cut, e.g. "data_v5_new/top_20"

input_dir = "/Volumes/ispl_databricks/default/training/MW_Train/input_dir"
output_dir = "/Volumes/ispl_databricks/default/training/MW_Train/OUTPUT_DIR_NEW"
model_dir = "/Volumes/ispl_databricks/default/training/MW_Train/model_dir"

# Every root gets a sub-folder named after the selected data version.
input_dir_version, output_dir_version, model_dir_version = (
    os.path.join(root, data_version) for root in (input_dir, output_dir, model_dir)
)

# Create the output/model roots first, then their version-specific sub-folders.
# The input locations are only referenced, never created here.
for path in (output_dir, model_dir, output_dir_version, model_dir_version):
    os.makedirs(path, exist_ok=True)

# File names of the persisted model and of its input-feature list
model_file_name = "lgb_model.pickle"
feature_file_name = "model_input_feature.pickle"

In [0]:
# Read the base feature table through the Spark session and collect it into
# pandas; the rest of the notebook works on this in-memory frame.
base_table_name = "ispl_databricks.model_logs.base_df_500features_updated"
base_df2 = spark.table(base_table_name).toPandas()

### loading model and features

In [0]:
# Load the trained model from the version-specific model directory.
#
# The joblib artefact takes precedence, because it is the object every later
# cell ends up using. The pickle file is only read as a fallback, so the model
# is deserialised once and a missing pickle no longer aborts the cell when the
# joblib file is present.
#
# NOTE(security): joblib.load and pickle.load can execute arbitrary code while
# deserialising. Only point these paths at artefacts written by our own
# training pipeline.
joblib_model_path = os.path.join(model_dir_version, "job_model.pkl")
pickle_model_path = os.path.join(model_dir_version, model_file_name)

if os.path.exists(joblib_model_path):
    model = joblib.load(joblib_model_path)
else:
    # The context manager closes the file handle even if unpickling fails.
    with open(pickle_model_path, "rb") as model_file:
        model = pickle.load(model_file)

In [0]:
# Top-20 feature importances ranked by gain.
# The axes are created up front and passed to LightGBM: when `ax` is omitted,
# plot_importance / plot_metric open a figure of their own, so a preceding
# plt.figure(figsize=...) would only leave an empty extra figure behind and
# the requested size would never be applied.
fig_importance, ax_importance = plt.subplots(figsize=(10, 8))
lgb.plot_importance(model, ax=ax_importance, max_num_features=20, importance_type='gain')
ax_importance.set_title("Top 20 Feature Importances")

# Log the figure to MLflow before displaying it, while it is still open.
mlflow.log_figure(fig_importance, "feature_importance.png")
plt.show()


# Evaluation metric recorded during training, plotted over boosting iterations.
fig_metric, ax_metric = plt.subplots(figsize=(10, 6))
lgb.plot_metric(model, ax=ax_metric)
ax_metric.set_title("Training Evaluation Metric (Binary Logloss)")

# Log the evaluation-metric figure to MLflow as well.
mlflow.log_figure(fig_metric, "training_evaluation.png")
plt.show()

#### Load the features with their importance scores, sort them in descending order, and fetch the top 50 features

In [0]:
# Number of top-ranked features to keep for the reduced feature set.
TOP_N_FEATURES = 50

# Rank features by *gain* explicitly. `model.feature_importances_` follows the
# estimator's `importance_type` setting (split counts unless it was changed at
# training time), which would not match the "*_gain_*" file names written
# below nor the gain-based importance plot.
gain_importance = model.booster_.feature_importance(importance_type="gain")

# Build the importance table already sorted (descending) instead of sorting in
# place, so re-running the cell always starts from the model's values.
feature_imp_df = pd.DataFrame({
    'feature_name': model.feature_name_,
    'importance': gain_importance
}).sort_values(by="importance", ascending=False)

# Save all features with their importance
feature_imp_df.to_csv(os.path.join(output_dir_version, "all_gain_features.csv"), index=False)

# Keep only the highest-ranked features
top_50_features = feature_imp_df.head(TOP_N_FEATURES)

# Save the top features
top_50_features.to_csv(os.path.join(output_dir_version, "top50_gain_features.csv"), index=False)

# Export a QC dataset restricted to the top features
top50_feature_names = top_50_features['feature_name'].tolist()
base_df2[top50_feature_names].to_csv(os.path.join(output_dir_version, "top_50_features_qc_data.csv"), index=False)

# Display the top features
top_50_features

#### create a spark dataframe with top 50 features

In [0]:

# --------------------------------------------
# Build the pandas frame that will be written
# to the feature store:
#   - the top N important features
#   - the primary key (loan_id)
# --------------------------------------------
feature_to_store = top_50_features["feature_name"].tolist()

# Add the primary key (loan_id), which the later cells use for de-duplication
# and as the feature table's primary key. Guard against listing it twice.
if 'loan_id' not in feature_to_store:
    feature_to_store = feature_to_store + ['loan_id']

# Fail early with a clear message: without the key the column filter below
# would silently drop it and the error would only surface in a later cell.
if 'loan_id' not in base_df2.columns:
    raise KeyError("base_df2 has no 'loan_id' column; it is required as the feature table primary key")

# Keep only the columns that actually exist in the base data.
available_features = [col for col in feature_to_store if col in base_df2.columns]

# .copy() makes this an independent frame: the next cell assigns encoded
# columns into it, which must not warn about (or write through to) base_df2.
feature_store_df = base_df2.loc[:, available_features].copy()

### creating feature store table for top 50 features

In [0]:
# Precautionary step: integer-encode any column that is still of object or
# category dtype. The loan_id key column is left untouched.
from sklearn.preprocessing import LabelEncoder

label_encoders = {}

# Decide first which columns need encoding, then encode them one by one.
columns_to_encode = [
    name
    for name in feature_store_df.columns
    if name != "loan_id"
    and feature_store_df[name].dtype.name in ["category", "object"]
]

for name in columns_to_encode:
    encoder = LabelEncoder()
    # Values are cast to str before fitting, so every value (including
    # missing ones) is encoded via its string representation.
    as_text = feature_store_df[name].astype(str)
    feature_store_df[name] = encoder.fit_transform(as_text)
    # Keep the fitted encoder so the mapping can be looked up later.
    label_encoders[name] = encoder

In [0]:
# Display the dtype of every column in feature_store_df as a quick sanity check.
feature_store_df.dtypes

In [0]:
# Convert the pandas frame to a Spark DataFrame for the feature store and keep
# a single row per loan_id, since loan_id is the primary key.
# NOTE(review): this rebinds feature_store_df from pandas to Spark, so the cell
# is not meant to be re-run without re-running the cells above it first.
feature_store_df = spark.createDataFrame(feature_store_df).dropDuplicates(['loan_id'])

### finally storing features into feature store table

In [0]:
# Persist the top-feature frame in the feature store table that the
# reduced-feature model will be trained from.
FEATURE_TABLE_NAME = "ispl_databricks.model_logs.mw_final_feature_store"

fe = FeatureEngineeringClient()

# Create the table only when it does not exist yet, and create it from the
# schema alone. Passing `df=` to create_table would also write the data (the
# merge below would then write it a second time), and calling create_table on
# an existing table fails, which made this cell impossible to re-run.
if not spark.catalog.tableExists(FEATURE_TABLE_NAME):
    fe.create_table(
        name=FEATURE_TABLE_NAME,
        primary_keys=["loan_id"],
        schema=feature_store_df.schema,
        description="Feature table for the bank"
    )

# Upsert on the primary key: existing loan_id rows are updated and new ones are
# inserted, so the write is safe to repeat.
fe.write_table(
    name=FEATURE_TABLE_NAME,
    df=feature_store_df,
    mode="merge"
)