In [0]:
from pyspark.sql import functions as f
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
%run /Workspace/Databricks_FMCG/setup/utilities

In [0]:
# Notebook parameters: declare each widget with its default, then read its
# current value back into a variable used by the rest of the notebook.
dbutils.widgets.text("catalog", "fmcg", "Catalog")
catalog = dbutils.widgets.get("catalog")

dbutils.widgets.text("data_source", "gross_price", "Data Source")
data_source = dbutils.widgets.get("data_source")

# Glob matching every CSV file in the S3 folder named after the data source.
base_path = f"s3://fmcg-child-sports-data/{data_source}/*.csv"
print(base_path)

## Bronze

In [0]:
# Read every CSV matched by base_path (first row is the header, column types
# are inferred), stamp each row with the load time, and keep the source file
# name and size from the file metadata next to the payload columns.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
raw_with_load_time = csv_reader.load(base_path).withColumn(
    "read_timestamp", f.current_timestamp()
)
df = raw_with_load_time.select("*", "_metadata.file_name", "_metadata.file_size")



In [0]:
# Inspect the column names and the types Spark inferred for the ingested CSVs.
df.printSchema()

In [0]:
# Preview the first ten ingested rows.
bronze_preview = df.limit(10)
display(bronze_preview)

In [0]:
# Persist the raw ingest as the bronze Delta table, replacing any previous
# contents; the change data feed option is passed on the write.
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
bronze_writer = (
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
bronze_writer.saveAsTable(bronze_table_name)

## Silver

In [0]:
# Load the persisted bronze table as the input of the silver stage and print
# a sample of it.
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source};"
df_bronze = spark.sql(bronze_query)
df_bronze.show()

In [0]:
# List the distinct raw "month" values to see which date layouts occur.
raw_months = df_bronze.select("month").distinct()
raw_months.show()

In [0]:
# Normalise "month" to a single date type. The raw column mixes several date
# layouts, so each layout is tried in turn and the first successful parse wins;
# a value matching none of them becomes NULL.
# The list is the single source of truth for the accepted layouts and is kept
# in the order in which they were previously tried.
date_formats = ["yyyy/MM/dd", "dd/MM/yyyy", "yyyy-MM-dd", "dd-MM-yyyy"]

df_silver = df_bronze.withColumn(
    "month",
    f.coalesce(*[f.try_to_date(f.col("month"), fmt) for fmt in date_formats]),
)

In [0]:
# Check the distinct "month" values after normalisation.
parsed_months = df_silver.select('month').distinct()
parsed_months.show()

In [0]:
# Clean "gross_price":
#   * values that are not a plain (optionally signed) decimal number become 0,
#   * negative numbers are flipped to positive,
#   * everything else is converted to double unchanged.
price_raw = f.col("gross_price")
price_as_double = price_raw.cast("double")
looks_numeric = price_raw.rlike(r'^-?\d+(\.\d+)?$')

# Sign correction applied only to values that passed the numeric check.
non_negative_price = (
    f.when(price_as_double < 0, -1 * price_as_double)
    .otherwise(price_as_double)
)

df_silver = df_silver.withColumn(
    "gross_price",
    f.when(looks_numeric, non_negative_price).otherwise(0),
)

In [0]:
# Print the first ten rows after the month and gross_price clean-up.
df_silver.show(10)

In [0]:
# Fetch the correct product_code for every price row with an inner join on
# product_id; price rows whose product_id has no match in the products table
# are dropped by the inner join.
# The products table is resolved through the catalog widget and silver_schema,
# like every other table in this notebook, instead of a hard-coded
# "fmcg.silver" prefix.
# NOTE(review): assumes silver_schema (not defined in this notebook, presumably
# set by the utilities notebook) names the schema that holds "products" - confirm.
df_products = spark.table(f"{catalog}.{silver_schema}.products")
product_lookup = df_products.select("product_id", "product_code")

df_joined = df_silver.join(product_lookup, on="product_id", how="inner").select(
    "product_id",
    "product_code",
    "month",
    "gross_price",
    "read_timestamp",
    "file_name",
    "file_size",
)
df_joined.show(5)

In [0]:
# Persist the cleaned, product-enriched prices as the silver Delta table,
# replacing any previous contents.
silver_table_name = f"{catalog}.{silver_schema}.{data_source}"
silver_writer = (
    df_joined.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable(silver_table_name)

## Gold

In [0]:
# Load the persisted silver table as the input of the gold stage.
silver_query = f"select * from {catalog}.{silver_schema}.{data_source};"
df_silver = spark.sql(silver_query)


In [0]:
# Keep only the columns written to the gold staging table.
gold_columns = ["product_code", "month", "gross_price"]
df_gold = df_silver.select(*gold_columns)
df_gold.show(5)

In [0]:
# Persist the gold projection as the "sb_dim_<data_source>" staging table,
# replacing any previous contents.
gold_staging_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
gold_writer = (
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
)
gold_writer.saveAsTable(gold_staging_table_name)
    

### Merge the data into dim_gross_price

In [0]:
# Read back the gold staging table written by the previous step. The name is
# built from the same catalog / gold_schema / data_source values used for the
# write, so a non-default widget value cannot make this cell read a different
# (stale) table than the one that was just produced.
staging_table_name = f"{catalog}.{gold_schema}.sb_dim_{data_source}"
df_gold_price = spark.table(staging_table_name)
df_gold_price.show(5)

In [0]:

# Reduce the monthly staging prices to one row per product per year, with the
# columns product_code, price_inr and year (as string).
df_gold_latest_price = (
    df_gold_price
    # A NULL month (a date that could not be parsed upstream) has no year and
    # would otherwise form a "year = NULL" group that ends up in the dimension.
    .filter(f.col("month").isNotNull())
    # 1. derive the calendar year of each monthly price
    .withColumn("year", f.year("month"))
    # 2. flag zero prices so they sort after real prices
    .withColumn(
        "is_zero",
        f.when(f.col("gross_price") == 0, 1).otherwise(0)
    )
)

# Within each product/year: non-zero prices first, then the most recent month.
# gross_price is a final tie-breaker so duplicate rows for the same month
# always resolve to the same winner.
w = (
    Window
    .partitionBy("product_code", "year")
    .orderBy(
        f.col("is_zero"),
        f.col("month").desc(),
        f.col("gross_price").desc(),
    )
)

df_gold_latest_price = (
    df_gold_latest_price
    # 3. keep the latest non-zero price per product per year (a zero price is
    #    kept only when the product has no non-zero price in that year)
    .withColumn("rn", f.row_number().over(w))
    .filter(f.col("rn") == 1)
    # 4. align the schema with the parent dimension table
    .select(
        f.col("product_code"),
        f.col("gross_price").alias("price_inr"),
        f.col("year").cast("string")
    )
)




In [0]:
# Print five rows of the per-product, per-year price selection.
df_gold_latest_price.show(5)


In [0]:
# Check the column names and types before merging into the dimension table.
df_gold_latest_price.printSchema()

In [0]:
# Upsert the per-product, per-year prices into the dim_gross_price table.
# The target is resolved through the catalog widget and gold_schema, like the
# other gold tables in this notebook.
# NOTE(review): assumes gold_schema (presumably set by the utilities notebook)
# names the schema that holds dim_gross_price - confirm.
target_table_name = f"{catalog}.{gold_schema}.dim_gross_price"
delta_table = DeltaTable.forName(spark, target_table_name)

# The source holds one row per (product_code, year), so both columns form the
# match key. Matching on product_code alone lets several source rows (one per
# year) hit the same target row, which Delta rejects, and lets a single source
# row overwrite every existing year of that product.
# "<=>" is the NULL-safe equality operator: a source row without a year matches
# the existing NULL-year row instead of being inserted again on every run.
merge_condition = (
    "target.product_code = source.product_code "
    "AND target.year <=> source.year"
)

(
    delta_table.alias("target")
    .merge(
        source=df_gold_latest_price.alias("source"),
        condition=merge_condition,
    )
    # year is part of the match key, so only the price can differ on a match.
    .whenMatchedUpdate(
        set={
            "price_inr": "source.price_inr"
        }
    )
    .whenNotMatchedInsert(
        values={
            "product_code": "source.product_code",
            "price_inr": "source.price_inr",
            "year": "source.year"
        }
    )
    .execute()
)