In [0]:
# Declare the notebook's text widgets used for environment configuration.
# Each entry is (widget name, default value, label shown next to the widget).
for widget_name, widget_default, widget_label in (
    ("Environment", "dev1", "Set the current environment/catalog name"),
    ("RunType", "once", "Set once to run as a batch"),
    ("ProcessingTime", "5 seconds", "Set the microbatch interval"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
# Read the current widget values into notebook-level variables.
env = dbutils.widgets.get("Environment")

# Batch mode is selected only when RunType is exactly the string "once";
# any other value leaves `once` False.
run_type = dbutils.widgets.get("RunType")
once = run_type == "once"

# Micro-batch interval string, e.g. the widget default "5 seconds".
processing_time = dbutils.widgets.get("ProcessingTime")

In [0]:
# Announce which mode was selected through the widgets before doing any work.
startup_message = (
    "Starting SBIT pipeline in batch mode..."
    if once
    else f"Starting SBIT pipeline in streaming mode with microbatch = {processing_time}..."
)
print(startup_message)

In [0]:
# Spark optimisations, applied to the current session in the order listed.
spark_settings = {
    # Use the cluster's default parallelism as the shuffle partition count.
    "spark.sql.shuffle.partitions": sc.defaultParallelism,
    "spark.databricks.delta.optimizeWrite.enabled": True,
    "spark.databricks.delta.autoCompact.enabled": True,
    # Use the RocksDB-backed provider for streaming state.
    "spark.sql.streaming.stateStore.providerClass": "com.databricks.sql.streaming.state.RocksDBStateStoreProvider",
}
for conf_key, conf_value in spark_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
%run ./02-setup


In [0]:
%run ./03-history-loader


In [0]:
# Run parameters -- review these before running.
# The catalog is named after the selected environment, and the database
# name is derived from it with an "sbit_" prefix.
catalog = env
db_name = "sbit_" + env

# Test data location under `base_dir_data`, which is not defined in this
# notebook -- presumably it comes from one of the %run notebooks above; verify.
test_data_dir = base_dir_data + "/test_data"

# Number of test data sets; adjust to match your test dataset.
sets = 2

In [0]:
# One-time setup: create and load the database only when it is not already
# present in the target catalog.
from pyspark.sql.functions import col

# Rows of SHOW DATABASES whose name equals the target database.
matching_databases = spark.sql(f"SHOW DATABASES IN {catalog}").filter(
    col("databaseName") == db_name
)

# Setup is needed unless exactly one matching database was found.
setup_required = matching_databases.count() != 1

if not setup_required:
    # Database already exists: just make it the current one.
    spark.sql(f"USE {catalog}.{db_name}")
else:
    # NOTE(review): this branch never issues USE itself -- presumably
    # create_all_tables selects the database; confirm in its notebook.
    create_all_tables(catalog, db_name)
    validate_setup(catalog, db_name)
    load_history(catalog, db_name)
    validate(catalog, db_name)

In [0]:
%run ./04-bronze


In [0]:
%run ./05-silver

In [0]:
%run ./06-gold

In [0]:
# Bronze, Silver, and Gold execution
#
# Each layer is run with the mode selected through the notebook widgets:
# `once` and `processing_time` are derived from the "RunType" and
# "ProcessingTime" widgets earlier in this notebook. Passing them through
# (instead of hard-coding once=True / "5 seconds") makes the run match the
# mode announced in the startup message. With the widget defaults
# ("once", "5 seconds") the calls are identical to the previous hard-coded ones.

# Bronze
consume_bronze(once=once, processing_time=processing_time)

# Silver
upsert_silver(once=once, processing_time=processing_time)

# Gold
upsert_gold(once=once, processing_time=processing_time)

In [0]:
# # Final validations

# #validate_bronze(catalog, db_name, sets)
# validate_silver(catalog, db_name, sets)
# validate_gold(catalog, db_name, test_data_dir, sets)

# print("SBIT pipeline complete.")