In [0]:
# Notebook-level configuration constants.
# NOTE(review): none of these names are referenced by the other cells visible
# in this notebook — presumably they are consumed by a later export step; confirm.

# Path to the raw input data (looks like a Unity Catalog volume path — confirm).
RAW_PATH = "/Volumes/ecommerce/ecommerce/data"
# Target Google Cloud project, BigQuery dataset, and GCS bucket names.
# NOTE(review): the bucket name suggests staging for a BigQuery write — confirm.
GCP_PROJECT = "regal-elf-481622-u5"
BQ_DATASET = "ecommerce"
TEMP_GCS_BUCKET = "ecommerce-data1"

# Secret scope / key names only (no credential material is stored here).
# NOTE(review): presumably looked up through a secrets utility at runtime — confirm.
GCP_SECRET_SCOPE = "gcp-secrets"
GCP_SECRET_KEY = "gcp-sa-key"

In [0]:
from pyspark.sql.functions import (
    to_timestamp, coalesce, lit, when, date_format, current_timestamp, col, to_date, avg, sum, count,
    max as fmax, row_number,concat
)
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

# Load the six bronze tables. The names on the left are bound in the same
# order as the table identifiers listed below.
txn, cust, feedback, prod, promo, store = (
    spark.read.table(bronze_table)
    for bronze_table in (
        "ecommerce.bronze.transactions",
        "ecommerce.bronze.customers",
        "ecommerce.bronze.feedback",
        "ecommerce.bronze.products",
        "ecommerce.bronze.promotions",
        "ecommerce.bronze.stores",
    )
)


In [0]:
# Make sure both time columns exist on the transactions frame, deriving the
# missing one from the other. At most one branch can do useful work: if
# neither column exists, the first branch fails on the missing source column.
if "transaction_ts" not in txn.columns:
    # Only the date is available: parse it into a timestamp.
    txn = txn.withColumn("transaction_ts", to_timestamp("transaction_date"))
elif "transaction_date" not in txn.columns:
    # Only the timestamp is available: truncate it to a date.
    txn = txn.withColumn("transaction_date", to_date("transaction_ts"))

In [0]:
# Silver projections: each frame keeps only the listed columns of its bronze
# source, in the listed order.
silver_txn = txn.select([
    "transaction_id",
    "customer_id",
    "product_id",
    "store_id",
    "promotion_id",
    "quantity",
    "total_amount",
    "transaction_ts",
    "transaction_date",
])

silver_cust = cust.select([
    "customer_id", "first_name", "last_name", "email", "join_date", "country",
])
silver_prod = prod.select([
    "product_id", "product_name", "category", "price", "supplier_name",
])
silver_store = store.select([
    "store_id", "store_name", "city", "country",
])
silver_promo = promo.select([
    "promotion_id", "product_id", "promotion_method",
    "discount_percent", "start_date", "end_date",
])
silver_fb = feedback.select([
    "feedback_id", "product_id", "customer_id",
    "rating", "review_date", "comments",
])

In [0]:
# Feedback rollup with one row per product_id: mean rating, number of
# feedback rows, and the most recent review_date.
fb_by_product = silver_fb.groupBy("product_id").agg(
    avg("rating").alias("avg_rating"),
    count("*").alias("review_count"),
    fmax("review_date").alias("latest_review_date"),
)

In [0]:
# Pair every transaction ("t") with each promotion ("p") on the same product
# whose [start_date, end_date] window contains the transaction timestamp.
# The left join keeps transactions that match no promotion; their p.* columns
# are NULL.
# NOTE(review): if p.end_date is a DATE rather than a TIMESTAMP, it is compared
# as midnight, so transactions later on the promotion's final day would not
# match — confirm against the bronze schema.
txn_promo_candidates = (silver_txn.alias("t")
    .join(
        silver_promo.alias("p"),
        (col("t.product_id") == col("p.product_id")) &
        (col("t.transaction_ts") >= col("p.start_date")) &
        (col("t.transaction_ts") <= col("p.end_date")),
        "left"
    )
)

# If multiple promos match the same transaction, keep the one with max discount.
# promotion_id is a secondary sort key so that promotions with equal discounts
# resolve to the same winner on every run (row_number is otherwise arbitrary
# among ties).
w = Window.partitionBy("t.transaction_id").orderBy(
    col("p.discount_percent").desc_nulls_last(),
    col("p.promotion_id").asc_nulls_last(),
)

txn_with_best_promo = (txn_promo_candidates
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

# Compute discount amount + net_sales (optional but very useful for gold).
# discount_percent_applied stays NULL when no promotion matched, so "no promo"
# remains distinguishable from a 0% promo. The discount *rate* is coalesced to
# 0 so that unmatched transactions get discount_amount = 0 and
# net_sales = total_amount instead of NULL.
txn_with_best_promo = (txn_with_best_promo
    .withColumn("discount_percent_applied", col("p.discount_percent"))
    .withColumn(
        "discount_amount",
        col("t.total_amount") * coalesce(col("p.discount_percent") / 100.0, lit(0.0))
    )
    .withColumn(
        "net_sales",
        col("t.total_amount") - col("discount_amount")
    )
)

In [0]:
# Denormalised transaction view: the promo-resolved transactions left-joined
# to the customer ("c"), product ("pr") and store ("s") projections and to the
# per-product feedback rollup ("f"). Left joins keep transactions whose
# lookup keys have no match; the corresponding columns are then NULL.
silver_enriched = (txn_with_best_promo
    .join(silver_cust.alias("c"), col("t.customer_id") == col("c.customer_id"), "left")
    .join(silver_prod.alias("pr"), col("t.product_id") == col("pr.product_id"), "left")
    .join(silver_store.alias("s"), col("t.store_id") == col("s.store_id"), "left")
    .join(fb_by_product.alias("f"), col("t.product_id") == col("f.product_id"), "left")
    .select(
        col("t.transaction_id"),
        col("t.transaction_ts"),
        col("t.transaction_date"),
        col("t.customer_id"),
        # Separate first and last name with a space ("John Smith", not
        # "JohnSmith"). concat yields NULL if either name part is NULL.
        concat(col("c.first_name"), lit(" "), col("c.last_name")).alias("customer_name"),
        col("c.email"),
        # Both customers and stores carry a "country" column; alias each one
        # so the output has no duplicate column names.
        col("c.country").alias("customer_country"),
        col("t.product_id"),
        col("pr.product_name"), col("pr.category"), col("pr.price"), col("pr.supplier_name"),
        col("t.store_id"),
        col("s.store_name"), col("s.city"), col("s.country").alias("store_country"),
        col("t.quantity"),
        col("t.total_amount"),
        col("discount_percent_applied"),
        col("discount_amount"),
        col("net_sales"),
        col("f.avg_rating"),
        col("f.review_count"),
        col("f.latest_review_date"),
        # Promotion columns come from the matched promotion row ("p"), not
        # from the transaction's own promotion_id column.
        col("p.promotion_id"),
        col("p.promotion_method"),
        col("p.start_date").alias("promo_start_date"),
        col("p.end_date").alias("promo_end_date")
    )
)

In [0]:
spark.sql("CREATE SCHEMA IF NOT EXISTS ecommerce.silver")

# Persist each silver frame as a Delta table, replacing both the data and the
# schema of any existing table. Tables are written in the order listed.
for silver_df, target_table in [
    (silver_txn, "ecommerce.silver.fact_transactions"),
    (silver_cust, "ecommerce.silver.dim_customers"),
    (silver_prod, "ecommerce.silver.dim_products"),
    (silver_store, "ecommerce.silver.dim_stores"),
    (silver_promo, "ecommerce.silver.dim_promotions"),
    (fb_by_product, "ecommerce.silver.agg_feedback_product"),
    (silver_enriched, "ecommerce.silver.transaction_enriched"),
]:
    (
        silver_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table)
    )


In [0]:
# Read the two written tables back and print their row counts side by side.
fact_cnt = (
    spark.table("ecommerce.silver.fact_transactions")
    .selectExpr("count(*) as c")
    .first()["c"]
)
enr_cnt = (
    spark.table("ecommerce.silver.transaction_enriched")
    .selectExpr("count(*) as c")
    .first()["c"]
)

print("fact_transactions rows:", fact_cnt)
print("transaction_enriched rows:", enr_cnt)