In [0]:
# Import necessary functions and types
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Schema shared by every daily customer extract. StructField defaults are
# used, so each field is nullable.
base_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("customer_id", IntegerType()),
        ("first_name", StringType()),
        ("email", StringType()),
        ("contact_no", StringType()),
    )
])

# Day 0 seed rows: (customer_id, first_name, email, contact_no)
data_day0 = [
    (101, 'John', 'john@email.com', '9876543210'),
    (102, 'Alice', 'alice@email.com', '9876541234'),
    (103, 'Bob', 'bob@email.com', '8765432190'),
]

# Materialise the seed rows as a DataFrame. Only the four business attributes
# exist at this point; no SCD2 tracking columns are present yet.
df_day0 = spark.createDataFrame(data_day0, base_schema)
display(df_day0)

In [0]:
# Attach the SCD2 tracking columns to the Day 0 snapshot:
#   load_date  - date this version became effective (fixed to 2025-04-04)
#   end_date   - NULL while the version is still open
#   is_current - 1 marks the active version of a customer
df_day0 = (
    df_day0
    .withColumn("load_date", F.lit("2025-04-04").cast("date"))
    .withColumn("end_date", F.lit(None).cast("date"))
    .withColumn("is_current", F.lit(1))
)

display(df_day0)

In [0]:
# Assign a surrogate key (1..N) to the Day 0 rows, numbered in customer_id order.
# NOTE: a window with no partitionBy moves every row into a single partition;
# acceptable for this small seed load, but it will not scale to large tables.
# The previous drop("surrogate_key") was removed: no column of that name is
# ever created, so the call was a misleading no-op.
df_day0_with_surrogate = df_day0.withColumn(
    "customer_key",
    F.row_number().over(Window.orderBy("customer_id")),
)
display(df_day0_with_surrogate)

In [0]:
# Persist the Day 0 dimension as a Delta table, replacing whatever is already
# stored at this location.
(
    df_day0_with_surrogate.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/Prajwal/Retail_sales_usecase/SCD2")
)

In [0]:
# Load the dimension back from the Delta location to confirm what was written.
df_day0_with_surrogate = (
    spark.read
    .format("delta")
    .load("/mnt/Prajwal/Retail_sales_usecase/SCD2")
)
display(df_day0_with_surrogate)

In [0]:
# Day 1 extract: (customer_id, first_name, email, contact_no)
source_day1 = [
    # 101: email changed
    (101, 'John', 'john.new@email.com', '9876543210'),
    # 102: phone changed
    (102, 'Alice', 'alice@email.com', '9999999999'),
    # 103: no change
    (103, 'Bob', 'bob@email.com', '8765432190'),
    # 104: new customer
    (104, 'Mike', 'mike@email.com', '7777777777'),
]

# Same schema as the Day 0 load so both batches are directly comparable.
df_day1 = spark.createDataFrame(source_day1, base_schema)

display(df_day1)


In [0]:
# Effective date for the Day 1 batch, pinned to 5 Apr 2025 for this example.
# (F.current_date() is the alternative the notebook mentions for live runs.)
load_date = F.lit("2025-04-05").cast("date")

# Current contents of the dimension as stored in Delta.
SCD2Skeycustomer_dim = (
    spark.read
    .format("delta")
    .load("/mnt/Prajwal/Retail_sales_usecase/SCD2")
)

# Cache the target because several later steps read it.
target_df = SCD2Skeycustomer_dim.cache()



In [0]:

# Step 3: Identify new or changed records by comparing the incoming batch with
# the current version of each customer in the dimension. Only new or modified
# customers survive the filter, which is what makes the load incremental.
current_tgt = target_df.filter("is_current = 1").alias("tgt")

# Attributes whose change opens a new SCD2 version.
tracked_columns = ["email", "contact_no", "first_name"]

# Build "any tracked attribute differs" with null-safe equality. A plain `!=`
# evaluates to NULL when either side is NULL, so a NULL -> value or
# value -> NULL change would be silently filtered out.
is_changed = F.lit(False)
for column_name in tracked_columns:
    is_changed = is_changed | ~F.col(f"src.{column_name}").eqNullSafe(
        F.col(f"tgt.{column_name}")
    )

# An explicit join condition (instead of joining on the column name) keeps
# both sides' customer_id addressable through their aliases, so the
# "no match in target" test below is unambiguous.
changed_records = (
    df_day1.alias("src")
    .join(
        current_tgt,
        F.col("src.customer_id") == F.col("tgt.customer_id"),
        "left_outer",
    )
    .filter(
        F.col("tgt.customer_id").isNull()  # new customer: no current row in target
        | is_changed                        # existing customer with a changed attribute
    )
    .select("src.*")
)

changed_records.show()

In [0]:
# Step 4: Add surrogate keys to the new/changed rows.
# Start numbering after the highest key already in the dimension
# (0 when the dimension has no keys yet).
max_sk = target_df.agg(
    F.coalesce(F.max("customer_key"), F.lit(0))
).first()[0]

# Number the incoming rows in customer_id order and offset by the current max.
window = Window.orderBy("customer_id")
changed_records = changed_records.withColumn(
    "customer_key",
    max_sk + F.row_number().over(window),
)

display(changed_records)

In [0]:
%python
# Step 5: Expire old (previous current) records by setting end_date and is_current=0.
#
# This uses the Delta Lake Python API rather than a string-built UPDATE because:
#   * `load_date` is a Column object, so interpolating it into SQL text yields
#     its repr rather than a date literal;
#   * `SCD2Skeycustomer_dim` and `changed_records` are Python DataFrames, not
#     tables or views that a SQL statement can resolve by name.
dim_table = DeltaTable.forPath(spark, "/mnt/Prajwal/Retail_sales_usecase/SCD2")

# Values applied to every row being expired: close the validity window the day
# before the new version takes effect and clear the current flag.
expire_values = {
    "end_date": F.date_sub(load_date, 1),
    "is_current": F.lit(0),
}
# Only set `status` when the target actually has that column; updating a
# column that does not exist would fail the whole statement.
if "status" in dim_table.toDF().columns:
    expire_values["status"] = F.lit("OU - Old Updated Passive")

# One row per changed customer. Brand-new customers have no current row in
# the target, so they simply match nothing below.
changed_ids = changed_records.select("customer_id").distinct()

(
    dim_table.alias("tgt")
    .merge(
        changed_ids.alias("chg"),
        "tgt.customer_id = chg.customer_id AND tgt.is_current = 1",
    )
    .whenMatchedUpdate(set=expire_values)
    .execute()
)
# NOTE(review): DataFrames read or cached from this path before the merge may
# not reflect the update - re-read the table before relying on its new state.