In [0]:
import time

import numpy as np
import pandas as pd
from databricks.sdk import WorkspaceClient
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sklearn.ensemble import RandomForestClassifier

from credit_default.utils import load_config

spark = SparkSession.builder.getOrCreate()

# Load configuration
config = load_config("../project_config.yml")
catalog_name = config.catalog_name
schema_name = config.schema_name
parameters = config.parameters
target = config.target[0].new_name
pipeline_id = config.pipeline_id

# Load train/test set and convert to Pandas
train_set = spark.table(f"{catalog_name}.{schema_name}.train_set").toPandas()

test_set = spark.table(f"{catalog_name}.{schema_name}.test_set").toPandas()

[32m2025-04-04 19:05:11.544[0m | [1mINFO    [0m | [36mcredit_default.utils[0m:[36mload_config[0m:[36m66[0m - [1mLoaded configuration from ../project_config.yml[0m


In [0]:
# Define features and target (adjust columns accordingly)
X = train_set.drop(columns=["Id", target, "Update_timestamp_utc"])
y = train_set[target]

# Train a Random Forest model
model = RandomForestClassifier(random_state=parameters["random_state"])
model.fit(X, y)

# Identify the most important features
feature_importances = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_}).sort_values(
    by="Importance", ascending=False
)

print("Top 5 important features:")
print(feature_importances.head(4))

Top 5 important features:
      Feature  Importance
5       Pay_0    0.097989
4         Age    0.067260
11  Bill_amt1    0.061741
0   Limit_bal    0.061183


In [0]:
# Get Existing IDs
features_balanced = spark.table(f"{catalog_name}.{schema_name}.features_balanced").toPandas()
existing_ids = set(int(id) for id in features_balanced["Id"])

In [0]:
len(list(existing_ids))

37654

In [0]:
# Define function to create synthetic data without random state
# This will add some data drift in the above columns (if drift=True)

def create_synthetic_data(df, drift=False, num_rows=100):
    synthetic_data = pd.DataFrame()

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]) and column != "Id":
            # Check if the column has a small set of discrete values
            unique_values = df[column].unique()
            if len(unique_values) <= 10:  # Assume discrete values if there are 10 or fewer unique values
                # This includes all above columns except "Age"
                synthetic_data[column] = np.random.choice(unique_values, num_rows)
            elif column.startswith("Pay_amt"):  # Ensure positive values for "Pay_amt" columns
                mean, std = df[column].mean(), df[column].std()
                synthetic_data[column] = np.abs(np.random.normal(mean, std, num_rows)).astype(int).astype(float)
            else:
                # This will add some data drift in the Bill_amt columns
                mean, std = df[column].mean(), df[column].std()
                synthetic_data[column] = np.round(np.random.normal(mean, std, num_rows)).astype(int).astype(float)

        elif pd.api.types.is_datetime64_any_dtype(df[column]):
            min_date, max_date = df[column].min(), df[column].max()
            if min_date < max_date:
                # Ensure the timestamp is between max_date and current time
                current_time = pd.to_datetime("now")
                if max_date < current_time:
                    timestamp_range_start = max_date.value
                    timestamp_range_end = current_time.value
                    synthetic_data[column] = pd.to_datetime(
                        np.random.randint(timestamp_range_start, timestamp_range_end, num_rows)
                    )
                else:
                    synthetic_data[column] = [max_date] * num_rows
            else:
                synthetic_data[column] = [min_date] * num_rows

    new_ids = []
    # The first synthetic Id must be one greater than the maximum existing Id of the whole dataframe (train + test). If no existing_ids, then starts from 1.
    i = max(existing_ids) + 1 if existing_ids else 1

    while len(new_ids) < num_rows:
        if i not in existing_ids:
            new_ids.append(str(i))  # Convert numeric ID to string
        i += 1

    synthetic_data["Id"] = new_ids

    # Move "Id" to the first position
    columns = ["Id"] + [col for col in synthetic_data.columns if col != "Id"]
    synthetic_data = synthetic_data[columns]

    if drift:
        # Skew the top features to introduce drift
        top_features = ["Limit_bal", "Age", "Pay_0", "Bill_amt1"]  # Select top 4 features
        for feature in top_features:
            if feature in synthetic_data.columns:
                synthetic_data[feature] = synthetic_data[feature] * 1.5

    return synthetic_data

In [0]:
# Create synthetic data normal
combined_set = pd.concat([train_set, test_set], ignore_index=True)

synthetic_data_normal = create_synthetic_data(combined_set, drift=False, num_rows=200)
print(synthetic_data_normal.dtypes)
print(synthetic_data_normal)

Id                              object
Limit_bal                      float64
Sex                              int32
Education                        int32
Marriage                         int32
Age                            float64
Pay_0                            int32
Pay_2                            int32
Pay_3                            int32
Pay_4                            int32
Pay_5                            int32
Pay_6                            int32
Bill_amt1                      float64
Bill_amt2                      float64
Bill_amt3                      float64
Bill_amt4                      float64
Bill_amt5                      float64
Bill_amt6                      float64
Pay_amt1                       float64
Pay_amt2                       float64
Pay_amt3                       float64
Pay_amt4                       float64
Pay_amt5                       float64
Pay_amt6                       float64
Default                          int32
Update_timestamp_utc    d

In [0]:
print(f"Before: {len(existing_ids)}")

Before: 37654


In [0]:
# Update existing_ids with the IDs from synthetic_data_normal
existing_ids.update(int(id) for id in synthetic_data_normal["Id"])

In [0]:
print(f"After: {len(existing_ids)}")

After: 37854


In [0]:
# Create synthetic data skewed
synthetic_data_skewed = create_synthetic_data(combined_set, drift=True, num_rows=200)
print(synthetic_data_normal.dtypes)
print(synthetic_data_skewed)


Id                              object
Limit_bal                      float64
Sex                              int32
Education                        int32
Marriage                         int32
Age                            float64
Pay_0                            int32
Pay_2                            int32
Pay_3                            int32
Pay_4                            int32
Pay_5                            int32
Pay_6                            int32
Bill_amt1                      float64
Bill_amt2                      float64
Bill_amt3                      float64
Bill_amt4                      float64
Bill_amt5                      float64
Bill_amt6                      float64
Pay_amt1                       float64
Pay_amt2                       float64
Pay_amt3                       float64
Pay_amt4                       float64
Pay_amt5                       float64
Pay_amt6                       float64
Default                          int32
Update_timestamp_utc    d

In [0]:
# Cast columns to match the schema of the Delta table
columns_to_cast = ["Sex", "Education", "Marriage", "Age", "Pay_0", "Pay_2", "Pay_3", "Pay_4", "Pay_5", "Pay_6"]

##  Write normal data to Delta Lake
synthetic_normal_df = spark.createDataFrame(synthetic_data_normal)
for column in columns_to_cast:
    synthetic_normal_df = synthetic_normal_df.withColumn(column, F.col(column).cast("double"))

synthetic_normal_df.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.inference_set_normal")

##  Write synthetic data to Delta Lake
synthetic_skewed_df = spark.createDataFrame(synthetic_data_skewed)
for column in columns_to_cast:
    synthetic_skewed_df = synthetic_skewed_df.withColumn(column, F.col(column).cast("double"))

synthetic_skewed_df.write.mode("append").saveAsTable(f"{catalog_name}.{schema_name}.inference_set_skewed")

In [0]:
# Update offline table
workspace = WorkspaceClient()

columns = config.features.clean
columns_str = ", ".join(columns)

# Write normal into feature table; update online table
spark.sql(f"""
    INSERT INTO {catalog_name}.{schema_name}.features_balanced
    SELECT {columns_str}
    FROM {catalog_name}.{schema_name}.inference_set_normal
""")

# Write skewed into feature table; update online table
spark.sql(f"""
    INSERT INTO {catalog_name}.{schema_name}.features_balanced
    SELECT {columns_str}
    FROM {catalog_name}.{schema_name}.inference_set_skewed
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Update online table
update_response = workspace.pipelines.start_update(pipeline_id=pipeline_id, full_refresh=False)

while True:
    update_info = workspace.pipelines.get_update(pipeline_id=pipeline_id, update_id=update_response.update_id)
    state = update_info.update.state.value
    if state == "COMPLETED":
        break
    elif state in ["FAILED", "CANCELED"]:
        raise SystemError("Online table failed to update.")
    elif state == "WAITING_FOR_RESOURCES":
        print("Pipeline is waiting for resources.")
    else:
        print(f"Pipeline is in {state} state.")
    time.sleep(30)

Pipeline is in CREATED state.
Pipeline is waiting for resources.
Pipeline is in RUNNING state.
Pipeline is in RUNNING state.
