In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run ../01_init/03_utilities

In [0]:
# Notebook parameters: the target catalog and the data source to process.
# The data source value is used both as the S3 sub-folder and as the table name.
dbutils.widgets.text(name="catalog", defaultValue="fmcg", label="Field#1")
dbutils.widgets.text(name="data_source", defaultValue="products", label="Field#2")

In [0]:
# Resolve the widget values once so the rest of the notebook uses plain variables.
catalog, data_source = (
    dbutils.widgets.get(widget_name) for widget_name in ("catalog", "data_source")
)

# Glob for the landing files of this data source.
# NOTE(review): the pattern is "*csv" (no dot), so it matches any object whose
# name merely ends in "csv" — confirm this is intended rather than "*.csv".
base_path = f"s3://xxxxx/{data_source}/*csv"

In [0]:
# Load every landing CSV for the data source and tag each row with ingestion
# metadata (read time plus the originating file's name and size).
df = (
    spark.read.format("csv")
    .option("header", True)
    # Was misspelled "interSchema"; Spark ignores unknown option keys, so the
    # typo meant every column was loaded as a string.
    # NOTE(review): if the bronze table already exists with all-string columns,
    # the first overwrite after this fix may need a schema overwrite — verify.
    .option("inferSchema", True)
    .load(base_path)
    .withColumn("read_timestamp", F.current_timestamp())
    # "_metadata" is the hidden file-source column; surface the two fields we keep.
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:
# Persist the raw ingest as the bronze Delta table, replacing whatever the table
# held before, and request change data feed through the "delta."-prefixed option.
(
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "True")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)


In [0]:
# Read the bronze table back so the silver stage works from the persisted data
# rather than from the in-memory ingest DataFrame.
df_bronze = spark.sql(
    f"SELECT * FROM {catalog}.{bronze_schema}.{data_source}"
)


In [0]:
# Diagnostic only: product names that occur more than once in bronze.
# The result is not consumed by any later cell in this notebook.
df_duplicates = (
    df_bronze
    .groupBy("product_name")
    .count()
    .filter("count >1")
)

In [0]:
# Start the silver DataFrame from bronze with fully identical rows collapsed.
# NOTE(review): rows are compared on every column, including read_timestamp,
# file_name and file_size, so the same product arriving in two different files
# is NOT removed here — confirm that is intended, since the duplicate check in
# the previous cell groups by product_name only.
df_silver = df_bronze.distinct()

In [0]:
# Title-case the category; a null category stays null because a `when` with no
# `otherwise` yields null for rows that do not match.
df_silver = df_silver.withColumn(
    "category",
    F.when(F.col("category").isNotNull(), F.initcap(F.col("category"))),
)

In [0]:
# Repair the recurring "Protien" misspelling (matched case-insensitively) in
# both text columns; the replacement is always written as "Protein".
df_silver = df_silver.withColumn(
    "product_name",
    F.regexp_replace(F.col("product_name"), "(?i)Protien", "Protein"),
)
df_silver = df_silver.withColumn(
    "category",
    F.regexp_replace(F.col("category"), "(?i)Protien", "Protein"),
)

In [0]:
# Derive the silver-level product attributes in one pass:
#   division     - roll-up of category; anything unmapped (or null) is "Other"
#   variant      - text inside the first pair of parentheses in the product
#                  name (empty string when there is none)
#   product_code - SHA-256 hex digest of the product name
#   product_id   - kept only when purely numeric, otherwise the "999999" sentinel
# Finally product_name is renamed to product.
df_silver = (
    df_silver
    .withColumn(
        "division",
        F.when(F.col("category").isin("Energy Bars", "Protein Bars"), "Nutrition Bars")
        .when(F.col("category") == "Granola & Cereals", "Breakfast Foods")
        .when(F.col("category") == "Recovery Dairy", "Dairy & Recovery")
        .when(F.col("category") == "Healthy Snacks", "Healthy Snacks")
        .when(F.col("category") == "Electrolyte Mix", "Hydration & Electrolytes")
        .otherwise("Other"),
    )
    .withColumn("variant", F.regexp_extract("product_name", r"\((.*?)\)", 1))
    .withColumn("product_code", F.sha2(F.col("product_name").cast("string"), 256))
    .withColumn(
        "product_id",
        F.when(
            F.col("product_id").cast("string").rlike("^[0-9]+$"),
            F.col("product_id").cast("string"),
        ).otherwise(F.lit(999999).cast("string")),
    )
    .withColumnRenamed("product_name", "product")
)

In [0]:
# Fix the silver column set and order: business attributes first, then the
# ingestion metadata carried over from the bronze read.
df_silver = df_silver.select(
    [
        "product_code", "division", "category", "product", "variant",
        "product_id",
        "read_timestamp", "file_name", "file_size",
    ]
)

In [0]:
# Persist the cleaned products as the silver Delta table, replacing previous
# contents.
(
    df_silver.write
    .format("delta")
    # The key needs the "delta." prefix (as the bronze write uses) to be treated
    # as a table property; the bare "enableChangeDataFeed" key is not a
    # recognised writer option, so change data feed was not being requested.
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")
)

In [0]:
# Re-read silver from the persisted table, then keep only the dimension
# attributes for gold (the ingestion metadata columns are dropped).
df_silver = spark.sql(
    f"SELECT * FROM {catalog}.{silver_schema}.{data_source}"
)
df_gold = df_silver.select(
    "product_code",
    "product_id",
    "division",
    "category",
    "product",
    "variant",
)

In [0]:
# Persist the gold projection as the "sb_dim_<data_source>" staging table,
# replacing previous contents.
(
    df_gold.write
    .format("delta")
    # The key needs the "delta." prefix (as the bronze write uses) to be treated
    # as a table property; the bare "enableChangeDataFeed" key is not a
    # recognised writer option, so change data feed was not being requested.
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
)

In [0]:
# Resolve the merge target (dim_<data_source>) and the staging source
# (sb_dim_<data_source>) from the same catalog/schema/data_source values used
# when the staging table was written. These names were previously hard-coded
# to fmcg.gold.*products, so changing a widget would have merged from or into
# the wrong table.
delta_table = DeltaTable.forName(
    spark, f"{catalog}.{gold_schema}.dim_{data_source}"
)
# NOTE(review): product_id exists in the staging table but is not selected
# here, so it is never propagated to the dimension — confirm that is intended.
df_child_products = spark.sql(
    f"SELECT product_code, division, category, product, variant "
    f"FROM {catalog}.{gold_schema}.sb_dim_{data_source}"
)

In [0]:
# Upsert the staged products into the dimension, keyed on product_code:
# matching rows get their descriptive attributes refreshed, unmatched source
# rows are inserted with the key plus the same attributes.
(
    delta_table.alias("target")
    .merge(
        source=df_child_products.alias("source"),
        condition="target.product_code = source.product_code",
    )
    .whenMatchedUpdate(
        set=dict(
            division="source.division",
            category="source.category",
            product="source.product",
            variant="source.variant",
        )
    )
    .whenNotMatchedInsert(
        values=dict(
            product_code="source.product_code",
            division="source.division",
            category="source.category",
            product="source.product",
            variant="source.variant",
        )
    )
    .execute()
)