In [0]:
pip install python-dotenv

Collecting python-dotenv
  Obtaining dependency information for python-dotenv from https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl.metadata
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import mlflow
import pandas as pd
from databricks import feature_engineering
from databricks.feature_engineering import FeatureLookup
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from mlflow.models import infer_signature
# from pyspark.sql import SparkSession
from databricks.connect import DatabricksSession
from pyspark.sql import functions as F
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from utils import load_config

In [0]:
config = load_config("../project_config.yml")
parameters = config.parameters
print(config)

[32m2025-03-28 20:55:08.039[0m | [1mINFO    [0m | [36mutils[0m:[36mload_config[0m:[36m66[0m - [1mLoaded configuration from ../project_config.yml[0m


catalog_name='test01' schema_name='default' pipeline_id='4be66e88-11e8-4988-9fa3-459b9b28a83c' parameters={'learning_rate': 0.05, 'random_state': 42, 'force_col_wise': True} ab_test={'learning_rate_a': 0.05, 'learning_rate_b': 0.1, 'force_col_wise': True} num_features=[NumFeature(name='ID', dtype='int64'), NumFeature(name='LIMIT_BAL', dtype='float64'), NumFeature(name='SEX', dtype='int64'), NumFeature(name='EDUCATION', dtype='int64'), NumFeature(name='MARRIAGE', dtype='int64'), NumFeature(name='AGE', dtype='int64'), NumFeature(name='PAY_0', dtype='int64'), NumFeature(name='PAY_2', dtype='int64'), NumFeature(name='PAY_3', dtype='int64'), NumFeature(name='PAY_4', dtype='int64'), NumFeature(name='PAY_5', dtype='int64'), NumFeature(name='PAY_6', dtype='int64'), NumFeature(name='BILL_AMT1', dtype='float64'), NumFeature(name='BILL_AMT2', dtype='float64'), NumFeature(name='BILL_AMT3', dtype='float64'), NumFeature(name='BILL_AMT4', dtype='float64'), NumFeature(name='BILL_AMT5', dtype='float64'

In [0]:
# Initialize Spark and feature engineering client
spark = DatabricksSession.builder.getOrCreate()
fe = feature_engineering.FeatureEngineeringClient()

In [0]:
columns = [
    "Limit_bal",
    "Sex",
    "Education",
    "Marriage",
    "Age",
    "Pay_0",
    "Pay_2",
    "Pay_3",
    "Pay_4",
    "Pay_5",
    "Pay_6",
    "Bill_amt1",
    "Bill_amt2",
    "Bill_amt3",
    "Bill_amt4",
    "Bill_amt5",
    "Bill_amt6",
    "Pay_amt1",
    "Pay_amt2",
    "Pay_amt3",
    "Pay_amt4",
    "Pay_amt5",
    "Pay_amt6",
]

# First, create the feature table with original data
create_table_sql = f"""
CREATE OR REPLACE TABLE {config.catalog_name}.{config.schema_name}.features_balanced
(Id STRING NOT NULL,
 {', '.join([f'{col} DOUBLE' for col in columns])})
"""
spark.sql(create_table_sql)

DataFrame[]

In [0]:
# Add primary key and enable CDF
spark.sql(
    f"ALTER TABLE {config.catalog_name}.{config.schema_name}.features_balanced ADD CONSTRAINT features_balanced_pk PRIMARY KEY(Id);"
)
spark.sql(
    f"ALTER TABLE {config.catalog_name}.{config.schema_name}.features_balanced SET TBLPROPERTIES (delta.enableChangeDataFeed = true);"
)
# Convert Spark DataFrame to Pandas for SMOTE
train_pdf = spark.table(f"{config.catalog_name}.{config.schema_name}.train_set").toPandas()

In [0]:
train_pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Id                    24000 non-null  object        
 1   Limit_bal             24000 non-null  float64       
 2   Sex                   24000 non-null  int32         
 3   Education             24000 non-null  int32         
 4   Marriage              24000 non-null  int32         
 5   Age                   24000 non-null  int32         
 6   Pay_0                 24000 non-null  int32         
 7   Pay_2                 24000 non-null  int32         
 8   Pay_3                 24000 non-null  int32         
 9   Pay_4                 24000 non-null  int32         
 10  Pay_5                 24000 non-null  int32         
 11  Pay_6                 24000 non-null  int32         
 12  Bill_amt1             24000 non-null  float64       
 13  Bill_amt2       

In [0]:
# Separate features and target
X = train_pdf[columns]
y = train_pdf["Default"]

# Apply SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Create balanced DataFrame using only the train_set
balanced_df = pd.DataFrame(X_balanced, columns=columns)

# Identify the number of original samples
num_original_samples = len(train_pdf)

# Retain original Ids for the real samples and create new Ids for synthetic samples
# Start with 30001 to avoid conflicts with existing Ids
balanced_df["Id"] = train_pdf["Id"].values.tolist() + [
    str(i) for i in range(30001, 30001 + len(balanced_df) - num_original_samples)
]

In [0]:
len(balanced_df)

37354

In [0]:
# Convert back to Spark DataFrame and insert into feature table
balanced_spark_df = spark.createDataFrame(balanced_df)

# Cast columns in balanced_spark_df to match the schema of the Delta table
columns_to_cast = ["Sex", "Education", "Marriage", "Age", "Pay_0", "Pay_2", "Pay_3", "Pay_4", "Pay_5", "Pay_6"]

for column in columns_to_cast:
    balanced_spark_df = balanced_spark_df.withColumn(column, F.col(column).cast("double"))

balanced_spark_df.write.format("delta").mode("overwrite").saveAsTable(
    f"{config.catalog_name}.{config.schema_name}.features_balanced"
)

In [0]:
# Execute SQL to count rows
row_count = spark.sql(
    f"SELECT COUNT(*) AS row_count FROM {config.catalog_name}.{config.schema_name}.features_balanced"
).collect()[0]["row_count"]
print(f"The table has {row_count} rows.")

The table has 37354 rows.


In [0]:
# Check for duplicates in the 'Id' column
duplicate_ids = balanced_df[balanced_df["Id"].duplicated()]

if duplicate_ids.empty:
    print("No duplicate IDs found.")
else:
    print(f"Duplicate IDs found:\n{duplicate_ids}")

No duplicate IDs found.


In [0]:
# Now use create_training_set to create balanced training set
# Drop the original features that will be looked up from the feature store
# Define the list of columns you want to drop, including "Update_timestamp_utc"
columns_to_drop = columns + ["Update_timestamp_utc"]

# Drop the specified columns from the train_set
train_set = spark.table(f"{config.catalog_name}.{config.schema_name}.train_set").drop(*columns_to_drop)

In [0]:
mlflow.set_tracking_uri("databricks")
mlflow.set_registry_uri("databricks-uc")

training_set = fe.create_training_set(
    df=train_set,
    label="Default",
    feature_lookups=[
        FeatureLookup(
            table_name=f"{config.catalog_name}.{config.schema_name}.features_balanced",
            feature_names=columns,
            lookup_key="Id",
        )
    ],
    exclude_columns=["Update_timestamp_utc"],
)

In [0]:
# Load feature-engineered DataFrame
training_df = training_set.load_df().toPandas()
test_set = spark.table(f"{config.catalog_name}.{config.schema_name}.test_set").toPandas()

# Split features and target (exclude 'Id' from features)
X_train = training_df[columns]
y_train = training_df["Default"]
X_test = test_set[columns]
y_test = test_set["Default"]

features_robust = [
    "Limit_bal",
    "Bill_amt1",
    "Bill_amt2",
    "Bill_amt3",
    "Bill_amt4",
    "Bill_amt5",
    "Bill_amt6",
    "Pay_amt1",
    "Pay_amt2",
    "Pay_amt3",
    "Pay_amt4",
    "Pay_amt5",
    "Pay_amt6",
]

# Setup preprocessing and model pipeline
preprocessor = ColumnTransformer(
    transformers=[("robust_scaler", RobustScaler(), features_robust)],
    remainder="passthrough",
)

# Create the pipeline with preprocessing and the LightGBM classifier
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LGBMClassifier(**parameters))])

In [0]:
# Set and start MLflow experiment
mlflow.set_experiment(experiment_name="/Shared/test-feature")

with mlflow.start_run(tags={"branch": "serving"}) as run:
    run_id = run.info.run_id
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Calculate and print metrics
    auc_test = roc_auc_score(y_test, y_pred)

    print("Test AUC:", auc_test)

    # Log model parameters, metrics, and model
    mlflow.log_param("model_type", "LightGBM with preprocessing")
    mlflow.log_params(parameters)
    mlflow.log_metric("AUC", auc_test)

    signature = infer_signature(model_input=X_train, model_output=y_pred)

    # Log model with feature engineering
    fe.log_model(
        model=pipeline,
        flavor=mlflow.sklearn,
        artifact_path="lightgbm-pipeline-model-feature",
        training_set=training_set,
        signature=signature,
    )

2025/03/28 20:58:58 INFO mlflow.tracking.fluent: Experiment with name '/Shared/test-feature' does not exist. Creating a new experiment.


[LightGBM] [Info] Number of positive: 5323, number of negative: 18677
[LightGBM] [Info] Total Bins 3249
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221792 -> initscore=-1.255256
[LightGBM] [Info] Start training from score -1.255256




Test AUC: 0.6558141322330031


2025/03/28 20:59:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-fox-568 at: https://adb-4478913909061743.3.azuredatabricks.net/ml/experiments/327870133523236/runs/b82a208266be4959bf9ceb1a86eac961.
2025/03/28 20:59:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://adb-4478913909061743.3.azuredatabricks.net/ml/experiments/327870133523236.


In [0]:
print(training_df.columns)

Index(['Id', 'Limit_bal', 'Sex', 'Education', 'Marriage', 'Age', 'Pay_0',
       'Pay_2', 'Pay_3', 'Pay_4', 'Pay_5', 'Pay_6', 'Bill_amt1', 'Bill_amt2',
       'Bill_amt3', 'Bill_amt4', 'Bill_amt5', 'Bill_amt6', 'Pay_amt1',
       'Pay_amt2', 'Pay_amt3', 'Pay_amt4', 'Pay_amt5', 'Pay_amt6', 'Default'],
      dtype='object')


In [0]:
mlflow.register_model(
    model_uri=f"runs:/{run_id}/lightgbm-pipeline-model-feature",
    name=f"{config.catalog_name}.{config.schema_name}.credit_model_feature",
)

Successfully registered model 'test01.default.credit_model_feature'.
Created version '1' of model 'test01.default.credit_model_feature'.


<ModelVersion: aliases=[], creation_timestamp=1743195587817, current_stage=None, description='', last_updated_timestamp=1743195589475, name='test01.default.credit_model_feature', run_id='b82a208266be4959bf9ceb1a86eac961', run_link=None, source='dbfs:/databricks/mlflow-tracking/327870133523236/b82a208266be4959bf9ceb1a86eac961/artifacts/lightgbm-pipeline-model-feature', status='READY', status_message='', tags={}, user_id='ak36804n@pace.edu', version='1'>