In [0]:
# Import necessary functions and types
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.functions import monotonically_increasing_id

In [0]:

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
from delta.tables import DeltaTable

# Window is only needed for the surrogate-key generation below; it is imported
# here so this cell stays runnable on its own.
from pyspark.sql.window import Window

# --- Configuration -----------------------------------------------------------
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source"
gold_path = "/mnt/Prajwal/Capstone_Project/gold/"
dim_customer_path = gold_path + "dim_customer"

# Start date stamped on every row version created by this run.
# NOTE(review): this is a hard-coded load date, while closed-out rows below are
# end-dated with current_date() — confirm whether it should track the run date.
effective_start_date = "2025-05-04"

# Attributes whose change closes the active row and opens a new version.
# NOTE(review): area_code and age are not tracked — confirm that is intended.
tracked_columns = [
    "name",
    "city",
    "phone_no",
    "maritial_status",
    "gender",
    "DOB",
    "email",
]

# Column order written to the dimension table.
dim_columns = [
    "customer_id",
    "name",
    "city",
    "phone_no",
    "area_code",
    "maritial_status",
    "gender",
    "DOB",
    "age",
    "email",
    "Effective_Start_Dt",
    "Effective_End_Dt",
    "Is_Active",
    "status",
    "Customer_Key",
]

# --- Load and prepare the source ---------------------------------------------
df = spark.read.format("delta").load(silver_path)

# Keep one row per business key. The merge below fails if several source rows
# match the same target row, and deduplicating before key assignment keeps the
# surrogate keys dense.
deduped_df = df.dropDuplicates(["customer_id"])

# On the very first run the gold path is not a Delta table yet, and
# DeltaTable.forPath would raise; detect that case and do an initial load.
dim_exists = DeltaTable.isDeltaTable(spark, dim_customer_path)

if dim_exists:
    deltaTable = DeltaTable.forPath(spark, dim_customer_path)
    # Continue numbering after the highest key already stored so that keys
    # stay unique across runs. Read eagerly, before the table is modified.
    max_existing_key = deltaTable.toDF().agg(F.max("Customer_Key")).first()[0] or 0
else:
    max_existing_key = 0

# monotonically_increasing_id() returns 64-bit values with the partition id in
# the upper bits, so casting it to IntegerType overflows and produces duplicate
# keys. row_number() over a total ordering yields small, unique, deterministic
# values instead. The un-partitioned window pulls all rows into one partition,
# which is acceptable for a dimension-sized table.
key_window = Window.orderBy("customer_id")

customer_df = (
    deduped_df
    .withColumn("Effective_Start_Dt", F.lit(effective_start_date).cast("date"))
    .withColumn("Effective_End_Dt", F.lit(None).cast("date"))
    .withColumn("Is_Active", F.lit("Y"))
    .withColumn("status", F.lit("New"))
    .withColumn(
        "Customer_Key",
        (F.row_number().over(key_window) + F.lit(max_existing_key)).cast(IntegerType()),
    )
    .select(*dim_columns)
)

# --- Apply SCD Type 2 ---------------------------------------------------------
if dim_exists:
    # `!=` evaluates to NULL when either side is NULL, which would silently
    # skip changes to or from NULL; `<=>` is the null-safe equality operator.
    change_condition = " OR ".join(
        f"NOT (existing.{column} <=> new.{column})" for column in tracked_columns
    )

    # Step 1: close the active row of every changed customer and insert
    # customers that have no active row at all.
    deltaTable.alias("existing").merge(
        customer_df.alias("new"),
        "existing.customer_id = new.customer_id AND existing.Is_Active = 'Y'"
    ).whenMatchedUpdate(
        condition=change_condition,
        set={
            "Is_Active": F.lit("N"),
            "Effective_End_Dt": F.current_date()
        }
    ).whenNotMatchedInsertAll().execute()

    # Step 2: customers whose active row was just closed now have no active
    # row; the anti-join selects exactly those so their new version can be
    # appended. Customers inserted in step 1 already have an active row and
    # are therefore excluded.
    updated_target_df = deltaTable.toDF().filter("Is_Active = 'Y'").select("customer_id")
    insert_df = customer_df.join(updated_target_df, on="customer_id", how="left_anti")
else:
    # Initial load: every source row becomes the first active version.
    insert_df = customer_df

# Append the new row versions to the Delta table (creates it on first load).
insert_df.write.format("delta").mode("append").save(dim_customer_path)

if not dim_exists:
    # Expose the same handle the incremental path provides.
    deltaTable = DeltaTable.forPath(spark, dim_customer_path)

print("Customer data updated in Gold layer with SCD Type 2 implementation.")

# Register the Delta location in the metastore (no-op if already registered).
spark.sql(f"CREATE TABLE IF NOT EXISTS banking.dim_customer USING DELTA LOCATION '{gold_path}dim_customer'")

display(spark.sql("DESCRIBE banking.dim_customer"))

In [0]:
%sql
-- Inspect the customer dimension; the table is registered as banking.dim_customer.
select * from banking.dim_customer;

# SCD Type 2

In [0]:
# NOTE(review): every line in this cell is commented out, so nothing here runs.
# It looks like an earlier join-based draft of the SCD Type 2 load for
# dim_customer (outer join against the silver data, then append) — TODO confirm
# and delete it if the Delta merge cell has superseded it.

# # Load the existing dimension table
# dim_customer_path = gold_path + "dim_customer"
# dim_customer_df = spark.read.format("delta").load(dim_customer_path)

# # Load the new data from the silver layer
# new_data_df = spark.read.format("delta").load(silver_path)

# # Prepare the new data with SCD Type 2 attributes
# new_data_prepared_df = new_data_df.withColumn("Effective_Start_Dt", F.lit("2025-05-04").cast("date")) \
#     .withColumn("Effective_End_Dt", F.lit(None).cast("date")) \
#     .withColumn("Is_Active", F.lit("Y")) \
#     .withColumn("status", F.lit("New")) \
#     .withColumn("Customer_Key", F.monotonically_increasing_id().cast(IntegerType()))

# # Join the new data with the existing dimension table to find changes
# join_condition = [dim_customer_df.customer_id == new_data_prepared_df.customer_id]
# changes_df = dim_customer_df.join(new_data_prepared_df, join_condition, "outer") \
#     .select(
#         new_data_prepared_df["*"],
#         dim_customer_df["Effective_End_Dt"].alias("existing_Effective_End_Dt"),
#         dim_customer_df["Is_Active"].alias("existing_Is_Active")
#     ) \
#     .filter(
#         (dim_customer_df.customer_id.isNull()) | 
#         (dim_customer_df.name != new_data_prepared_df.name) |
#         (dim_customer_df.city != new_data_prepared_df.city) |
#         (dim_customer_df.phone_no != new_data_prepared_df.phone_no) |
#         (dim_customer_df.area_code != new_data_prepared_df.area_code) |
#         (dim_customer_df.maritial_status != new_data_prepared_df.maritial_status) |
#         (dim_customer_df.gender != new_data_prepared_df.gender) |
#         (dim_customer_df.DOB != new_data_prepared_df.DOB) |
#         (dim_customer_df.age != new_data_prepared_df.age) |
#         (dim_customer_df.email != new_data_prepared_df.email)
#     )

# # Update the existing records to set Effective_End_Dt and Is_Active
# updates_df = changes_df.filter(changes_df.existing_Is_Active == "Y") \
#     .withColumn("Effective_End_Dt", F.lit("2025-05-03").cast("date")) \
#     .withColumn("Is_Active", F.lit("N"))

# # Insert the new records
# new_records_df = changes_df.filter(changes_df.existing_Is_Active.isNull()) \
#     .select(new_data_prepared_df.columns)

# # Combine the updates and new records
# final_df = updates_df.union(new_records_df)

# # Write the final DataFrame to Delta format
# final_df.write.format("delta").mode("append").option("mergeSchema", "true").save(dim_customer_path)

# # Refresh the table
# spark.sql(f"REFRESH TABLE banking.dim_customer")

# display(spark.sql("SELECT * FROM banking.dim_customer"))