In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Storage locations for the Silver (input) and Gold layers
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/"
gold_path = "/mnt/Prajwal/Retail_sales_usecase/Gold"

# Day 1 customer details: read the Delta data under one specific
# `ingestion_time=...` directory of the Silver layer.
day1_location = silver_path + "SilverCDetails_day1/ingestion_time=2025-04-30 06%3A53%3A33/"
day1_raw = spark.read.format("delta").load(day1_location)

# Column conversions: parse the registration timestamp text (M/d/yyyy H:mm)
# down to a date, and make the business key an integer.
registration_as_date = F.to_date("registration_date", "M/d/yyyy H:mm")
customer_id_as_int = F.col("customer_id").cast(IntegerType())

df_day1 = (
    day1_raw
    .withColumn("registration_date", registration_as_date)
    .withColumn("customer_id", customer_id_as_int)
)

display(df_day1)

In [0]:

from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Load the existing customer dimension table
SCD2customerdim = DeltaTable.forName(spark, "retail.CustomerDim")
target_df = SCD2customerdim.toDF().cache()

# Compare incoming rows only against the CURRENT version of each customer.
# Joining against expired (Is_active = 'N') history rows would re-flag a
# customer whose attributes merely differ from an old version, and would
# produce one match per historical version.
active_target_df = target_df.filter(col("Is_active") == "Y")

# Attributes whose change triggers a new SCD2 version.
tracked_columns = [
    "first_name", "last_name", "email", "gender", "address",
    "city", "state", "country", "zipcode", "contact_no",
]

# Null-safe inequality: a plain `!=` evaluates to NULL when either side is
# NULL, and filter() drops NULL rows, so a change to/from NULL would be missed.
change_checks = [
    ~col(f"src.{name}").eqNullSafe(col(f"tgt.{name}")) for name in tracked_columns
]
attribute_changed = change_checks[0]
for check in change_checks[1:]:
    attribute_changed = attribute_changed | check

# Keep source rows that are either brand new (no active target row) or whose
# tracked attributes differ from the active target row.
changed_records = (
    df_day1.alias("src")
    .join(active_target_df.alias("tgt"), "customer_id", "left_outer")
    .filter(col("tgt.customer_id").isNull() | attribute_changed)
    .select("src.*")
)

changed_records = changed_records.withColumn("Effective_start_date", col("registration_date").cast("date"))

# Continue the surrogate-key sequence from the highest key already in the
# dimension. This must be based on Customer_Key (the surrogate key), not on the
# business key customer_id, otherwise keys from successive runs can collide.
max_sk = target_df.selectExpr("coalesce(max(Customer_Key), 0)").first()[0]

# An un-partitioned window pulls all changed rows into a single partition;
# acceptable here because it only covers the changed rows of one load.
changed_records = changed_records.withColumn("Customer_Key", row_number().over(Window.orderBy("customer_id")) + max_sk)
display(changed_records)

In [0]:
%python
from pyspark.sql.functions import lit, col, when

# Distinct business keys already present in the dimension. Kept as a DataFrame
# and joined, rather than collect()-ing the whole dimension table to the driver
# to build an isin() list, which does not scale with the size of the dimension.
existing_ids = target_df.select(col("customer_id").alias("_tgt_customer_id")).distinct()

# Open-ended, active rows. `status` is "updated" when the customer already
# exists in the dimension and "New" otherwise (a NULL customer_id never matches
# the join and therefore falls into "New", same as the isin() check did).
changed_records = (
    changed_records
    .join(existing_ids, col("customer_id") == col("_tgt_customer_id"), "left")
    .withColumn("Effective_end_date", lit(None).cast("date"))
    .withColumn("Is_active", lit("Y"))
    .withColumn(
        "status",
        when(col("_tgt_customer_id").isNotNull(), lit("updated")).otherwise(lit("New")),
    )
    .drop("_tgt_customer_id")
)

display(changed_records)
print(changed_records.columns)

In [0]:
%python
from pyspark.sql.functions import coalesce, col, lit, expr

# Fix the column set and order of the rows that will be written to the dimension.
changed_records = changed_records.select(
    'customer_id', 'first_name', 'last_name', 'email', 'gender', 'address',
    'city', 'state', 'country', 'zipcode', 'contact_no', 'registration_date',
    'membership_status', 'Ingestion_time', 'Effective_start_date',
    'Effective_end_date', 'Is_active', 'status', 'Customer_Key'
)

# ensuring customer key is non nullable, one row per customer.
# The result is cached and materialized BEFORE the merge: the plan is lazy,
# dropDuplicates() picks an arbitrary row per key, and the merge below mutates
# the very table the plan was derived from. Without pinning it, the merge and
# the later appends could each see a different evaluation of the source.
changed_records = (
    changed_records
    .withColumn("Customer_Key", coalesce(col("Customer_Key"), lit(0)))
    .dropDuplicates(['customer_id'])
    .cache()
)
changed_records.count()  # action that forces the cache to be populated

# expire existing records: close the currently active version of every
# customer present in the change set.
# NOTE(review): the target column here is "Effective_End_Dt" while the rows
# appended below carry "Effective_end_date"; with mergeSchema enabled these
# would end up as two different columns. Confirm the table's actual column name.
# NOTE(review): Effective_start_date is derived from registration_date upstream,
# so for an updated customer the end date written here may precede the expired
# row's own start date. Confirm the intended effective-date rule.
SCD2customerdim.alias("tgt").merge(changed_records.alias("src"),
    "tgt.customer_id = src.customer_id and tgt.Is_active='Y'")\
        .whenMatchedUpdate(
            set = {
                "Effective_End_Dt": col("src.Effective_start_date") - expr("INTERVAL 1 DAY"),
                "Is_active": lit("N"),
                "status": lit("Expired")
            }
        ).execute()

new_inserts = changed_records.filter(col("status") == "New").cache()
updated_records = changed_records.filter(col("status") == "updated").cache()

# Count once and reuse, instead of triggering a separate action per use.
new_inserts_count = new_inserts.count()
updated_records_count = updated_records.count()

print("Newly inserted records count = ", new_inserts_count)
print("Updated records count = ", updated_records_count)

# Append the new active versions; skip the write entirely when there is nothing to add.
if updated_records_count > 0:
    updated_records.write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable("retail.CustomerDim")
if new_inserts_count > 0:
    new_inserts.write.format("delta").option("mergeSchema", "true").mode("append").saveAsTable("retail.CustomerDim")

In [0]:
%sql
-- Show the contents of the customer dimension table
SELECT *
FROM retail.CustomerDim