In [0]:
# --- Notebook configuration -------------------------------------------------
# None of these constants are referenced in the cells visible in this part of
# the notebook; they are kept for any cells that use them elsewhere.

# Location of the raw input data.
RAW_PATH = "/Volumes/ecommerce/ecommerce/data"

# Google Cloud / BigQuery identifiers.
GCP_PROJECT = "regal-elf-481622-u5"
BQ_DATASET = "ecommerce"
TEMP_GCS_BUCKET = "ecom-databricks-temp"

# Names used to look up the GCP credential.
# NOTE(review): presumably a Databricks secret scope/key pair -- confirm.
GCP_SECRET_SCOPE = "gcp-secrets"
GCP_SECRET_KEY = "gcp-sa-key"

In [0]:
from pyspark.sql.functions import to_timestamp, coalesce, lit, when, date_format, current_timestamp, col
from pyspark.sql.types import DoubleType

# Load the bronze-layer source tables.
# Fact table first, then the dimension-style tables it is joined to later.
txn = spark.read.table("ecommerce.bronze.transactions")

cust = spark.read.table("ecommerce.bronze.customers")
prod = spark.read.table("ecommerce.bronze.products")
store = spark.read.table("ecommerce.bronze.stores")
promo = spark.read.table("ecommerce.bronze.promotions")
feedback = spark.read.table("ecommerce.bronze.feedback")


In [0]:
# Clean the bronze transactions:
#   * keep a single row per transaction_id,
#   * cast total_amount to double and replace nulls (including any produced
#     by the cast) with 0.0,
#   * render transaction_date as a "yyyy-MM-dd" string,
#   * cast quantity to int,
#   * stamp each row with the time this cleaning step ran.
txn_clean = (
    txn
    .dropDuplicates(["transaction_id"])
    .withColumn(
        "total_amount",
        coalesce(col("total_amount").cast(DoubleType()), lit(0.0)),
    )
    .withColumn("transaction_date", date_format("transaction_date", "yyyy-MM-dd"))
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("_ingest_time", current_timestamp())
)

display(txn_clean)

In [0]:
# Enrich the cleaned transactions with customer, product, store, promotion and
# feedback attributes, then derive validity flags and the discounted amount.
#
# All joins are left joins, so every transaction is kept even when a lookup
# finds no match; the flags below record whether a match was found.
#
# NOTE(review): promotions are matched on product_id and feedback on
# customer_id. If either table can hold more than one row per key, these joins
# will emit several rows per transaction -- confirm key uniqueness upstream.
silver_enriched = (
    txn_clean.alias("t")
    .join(cust.alias("c"), on=col("t.customer_id") == col("c.customer_id"), how="left")
    .join(prod.alias("p"), on=col("t.product_id") == col("p.product_id"), how="left")
    .join(store.alias("s"), on=col("t.store_id") == col("s.store_id"), how="left")
    .join(promo.alias("pr"), on=col("t.product_id") == col("pr.product_id"), how="left")
    .join(feedback.alias("f"), on=col("t.customer_id") == col("f.customer_id"), how="left")
    .withColumn("transaction_date", date_format("t.transaction_date", "yyyy-MM-dd"))
    # A non-null attribute from the joined side means the lookup matched.
    .withColumn("is_valid_store", col("s.city").isNotNull())
    .withColumn("has_valid_customer", col("c.first_name").isNotNull())
    .withColumn("has_promo_active", col("pr.discount_percent").isNotNull())
    # Apply the percentage discount when a promotion matched; otherwise the
    # final amount equals the original total.
    .withColumn(
        "final_amount",
        when(
            col("pr.discount_percent").isNotNull(),
            col("t.total_amount") * (1 - col("pr.discount_percent") / 100),
        ).otherwise(col("t.total_amount")),
    )
)

display(silver_enriched)

In [0]:
# Project the enriched join down to the silver-layer schema.
# Every column taken from a joined table is qualified with its join alias
# (t/c/p/s/pr/f) so that names shared between tables resolve unambiguously.
silver = silver_enriched.select(
    # Fact
    col("t.transaction_id"),
    col("transaction_date"),
    col("t.customer_id"),
    col("t.product_id"),
    col("t.store_id"),
    col("t.quantity"),
    col("t.total_amount"),
    col("final_amount"),

    # Flags
    col("is_valid_store"),
    col("has_valid_customer"),
    col("has_promo_active"),

    # Customer
    col("c.first_name"),
    col("c.last_name"),
    col("c.email"),
    col("c.country").alias("customer_country"),
    # Qualified with "c." -- an unqualified "city" also matches the store's
    # city column, which is either ambiguous or returns the wrong city.
    # NOTE(review): assumes the customers table has a `city` column -- confirm.
    col("c.city").alias("customer_city"),

    # Product
    col("p.product_name"),
    col("p.category"),
    col("p.price"),

    # Store
    col("s.store_name"),
    col("s.country").alias("store_country"),
    col("s.city").alias("store_city"),

    # Promo / feedback
    col("pr.promotion_id"),
    col("pr.promotion_method"),
    col("pr.discount_percent"),
    col("f.rating").alias("customer_rating"),
    col("f.comments"),
)


In [0]:
# Data-quality gate: fail the run when too large a share of rows reference a
# store that did not match during enrichment.
MAX_INVALID_STORE_RATIO = 0.10

# A single grouped aggregation replaces three separate count() actions, so the
# plan behind `silver` is evaluated once here rather than once per count
# (unless it has been cached elsewhere). The result has at most four rows,
# one per combination of the two boolean flags, so collect() is cheap.
flag_counts = (
    silver
    .groupBy("is_valid_store", "has_valid_customer")
    .count()
    .collect()
)

# `is False` mirrors the original filter(~flag): only rows whose flag is
# explicitly False are counted as invalid.
total = sum(row["count"] for row in flag_counts)
invalid_store_count = sum(
    row["count"] for row in flag_counts if row["is_valid_store"] is False
)
invalid_customer_count = sum(
    row["count"] for row in flag_counts if row["has_valid_customer"] is False
)

print(f"Total rows:{total}, invalid stores:{invalid_store_count}, invalid customers = {invalid_customer_count}")

# Guard against division by zero on an empty table; an empty table passes.
# Only the store ratio is enforced; the customer count is reported but not gated.
if total > 0 and (invalid_store_count / total) > MAX_INVALID_STORE_RATIO:
    raise Exception("DQ check failed: >10% of stores are invalid")

In [0]:
# Persist the silver table as Delta, replacing any existing contents and
# partitioning the stored data by transaction_date.
(
    silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("transaction_date")
    .saveAsTable("ecommerce.silver.transaction_enriched")
)