In [0]:
# Notebook-wide configuration. None of these names are referenced by the
# cells visible in this section; they are presumably consumed elsewhere in
# the notebook -- TODO confirm.

# Secret lookup coordinates (a scope name and a key name, not the secret value).
GCP_SECRET_KEY = "gcp-sa-key"
GCP_SECRET_SCOPE = "gcp-secrets"

# Google Cloud / BigQuery identifiers.
BQ_DATASET = "ecommerce"
GCP_PROJECT = "regal-elf-481622-u5"

# Storage locations: a Volumes path and a GCS bucket name.
TEMP_GCS_BUCKET = "ecom-databricks-temp"
RAW_PATH = "/Volumes/ecommerce/ecommerce/data"

In [0]:
from pyspark.sql.functions import sum as _sum, countDistinct, avg, desc

# Read the enriched silver transaction table once; `silver` is reused by the
# DataFrame-API cells, and the `silver_txn` temp view exposes the same data
# to the Spark SQL cells.
_silver_reader = spark.read
silver = _silver_reader.table("ecommerce.silver.transaction_enriched")

silver.createOrReplaceTempView("silver_txn")


In [0]:
# Sales aggregated per transaction_date / store / category:
#   gross_sales     = sum(total_amount)
#   net_sales       = sum(final_amount)
#   unique_customer = count(distinct customer_id)
#   txn_count       = number of rows in the group
# `store_location` is the select-list alias of store_city and is also used
# as a grouping key.
# NOTE(review): if transaction_date carries a time component, both the
# grouping and the table partitioning are per-timestamp rather than
# per-day -- confirm the column type.
_daily_store_cat_sql = """
                                 select transaction_date,store_name,
                                 store_city as store_location,category, sum(total_amount)as gross_sales, sum(final_amount) as net_sales, count(distinct customer_id) as unique_customer,
                                 count(*) as txn_count
                                 from silver_txn
                                 group by transaction_date, store_name,store_location, category 
                                 """
gold_daily_store_cat = spark.sql(_daily_store_cat_sql)

# Replace the gold Delta table, partitioned by transaction_date.
(
    gold_daily_store_cat.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("transaction_date")
    .saveAsTable("ecommerce.gold.daily_store_category")
)

In [0]:
# Spend summary per customer and store:
#   total_spent = sum(final_amount)
#   txn_count   = number of rows in the group
#
# customer_name is built with concat_ws rather than concat: concat yields
# NULL as soon as either name part is NULL, whereas concat_ws skips NULL
# parts (and returns an empty string when both are NULL). This matches how
# the RFM cell of this notebook builds the customer name, so the two gold
# tables agree on the display name for the same customer.
gold_top_customers = spark.sql("""
    select
        customer_id,
        concat_ws(' ', first_name, last_name) as customer_name,
        store_country,
        store_name,
        sum(final_amount) as total_spent,
        count(*) as txn_count
    from silver_txn
    group by customer_id, customer_name, store_country, store_name
""")

# Replace the gold Delta table with the fresh aggregate.
(
    gold_top_customers.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.gold.top_customers")
)

In [0]:
# Promotion performance, restricted to rows where has_promo_active = true:
#   promo_txn      = number of rows per (promotion_id, promotion_method)
#   promo_sales    = sum(final_amount)
#   avg_promo_sale = avg(final_amount)
_promotion_impact_sql = """
                                  select promotion_id, promotion_method, count(*) as promo_txn, sum(final_amount) as promo_sales, avg(final_amount) as avg_promo_sale
                                  from silver_txn
                                  where has_promo_active = true
                                  group by promotion_id,promotion_method
                             """
gold_promotion_impact = spark.sql(_promotion_impact_sql)

# Replace the gold Delta table with the fresh aggregate.
(
    gold_promotion_impact.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.gold.promo_impact")
)

In [0]:
# Rating summary per product, using only rows that have a customer_rating:
#   avg_rating   = avg(customer_rating) rounded to 1 decimal place
#   rating_count = count of non-null customer_rating values
_product_impact_sql = """
                                select product_id, product_name, category, round(avg(customer_rating),1) as avg_rating, count(customer_rating) as rating_count
                                from silver_txn
                                where customer_rating is not null
                                group by product_id,product_name,category
                                """
gold_product_impact = spark.sql(_product_impact_sql)

# Replace the gold Delta table with the fresh aggregate.
(
    gold_product_impact.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.gold.product_impact")
)

In [0]:
from pyspark.sql.functions import max as _max, count as _count, sum as _sum, datediff, current_date, to_timestamp, concat_ws,col, when

# --- RFM (recency / frequency / monetary) table per customer ---------------

# Grouping key: customer_id plus a space-joined "first last" display name.
_customer_name = concat_ws(" ", col("first_name"), col("last_name")).alias("name")

# Base aggregates per customer:
#   last_txn  = most recent transaction_date
#   frequency = count of transaction_id values
#   monetary  = sum(final_amount)
_rfm_aggregated = silver.groupBy("customer_id", _customer_name).agg(
    _max("transaction_date").alias("last_txn"),
    _count("transaction_id").alias("frequency"),
    _sum("final_amount").alias("monetary"),
)

# recency = number of days between today and the last transaction.
# current_date() is evaluated at run time, so this value changes from day
# to day.
_rfm_with_recency = _rfm_aggregated.withColumn(
    "recency", datediff(current_date(), to_timestamp("last_txn"))
)

# Bucket definitions. Branches are evaluated in order, so each later branch
# only sees rows the earlier ones did not match; anything unmatched
# (including a NULL input) lands in the otherwise() label.
_recency_bucket = (
    when(col("recency") <= 30, "0-30")
    .when(col("recency") <= 90, "31-90")
    .otherwise("90+")
)
_frequency_bucket = (
    when(col("frequency") >= 10, "high")
    .when(col("frequency") >= 5, "medium")
    .otherwise("low")
)
_monetary_bucket = (
    when(col("monetary") >= 1000, "high")
    .when(col("monetary") >= 500, "medium")
    .otherwise("low")
)

rfm = (
    _rfm_with_recency
    .withColumn("recency_bucket", _recency_bucket)
    .withColumn("frequency_bucket", _frequency_bucket)
    .withColumn("monetary_bucket", _monetary_bucket)
)

# Replace the gold Delta table with the freshly scored customers.
(
    rfm.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.gold.rfm")
)


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, unix_timestamp

# --- Suspicious-transaction detection --------------------------------------
# A row is flagged when, for the same customer, it follows the previous
# transaction by less than 30 minutes, has total_amount > 1000, and happened
# at a different store than that previous transaction.

# Order each customer's rows chronologically. transaction_id is added as a
# tie-breaker: ordering by transaction_date alone leaves the relative order
# of rows with the same transaction_date undefined, so lag() could pick a
# different "previous" row from one run to the next and change which rows
# get flagged.
w = Window.partitionBy("customer_id").orderBy("transaction_date", "transaction_id")

# Attach the previous transaction's date and store, then the gap in minutes.
# The first row of each customer has no predecessor, so prev_txn, prev_store
# and time_diff_mins are NULL there and the row is dropped by the filter.
txn_with_prev = (
    silver
    .withColumn("prev_txn", lag("transaction_date").over(w))
    .withColumn("prev_store", lag("store_id").over(w))
)
txn_with_prev = txn_with_prev.withColumn(
    "time_diff_mins",
    (unix_timestamp("transaction_date") - unix_timestamp("prev_txn")) / 60,
)

suspects = txn_with_prev.filter(
    (col("time_diff_mins") < 30)
    & (col("total_amount") > 1000)
    & (col("store_id") != col("prev_store"))
)

# Replace the gold Delta table with the current set of flagged rows.
(
    suspects.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ecommerce.gold.suspects")
)
