In [0]:
# Import necessary functions and types
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.functions import lit, current_timestamp

In [0]:
# Locations of the Delta tables. Only silver_path is used in this cell;
# gold_path is defined but not referenced here.
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source/"
gold_path = "/mnt/Prajwal/Capstone_Project/gold/"

# Current contents of the silver table, used as the "existing" side of the comparison.
# There is deliberately no fallback: if this read fails, the read of df_new and
# DeltaTable.forPath below target the same path and would fail too, so the original
# error is allowed to surface instead of being masked.
df_existing = spark.read.format("delta").load(silver_path)

# Independent second read of the same path, used as the "new" side, so that the two
# sides of the join below are distinct DataFrames.
df_new = spark.read.format("delta").load(silver_path)

# Add SCD Type 2 columns to the new data: every row is flagged current, starts now,
# and has an open (null) end date.
df_new = df_new.withColumn("is_current", lit(True)) \
               .withColumn("start_date", current_timestamp()) \
               .withColumn("end_date", lit(None).cast("timestamp"))

# Join new data with existing data on the business key to identify changes
df_joined = df_new.join(df_existing, "customer_id", "left")

# Identify records where at least one compared attribute differs; only the df_new
# columns are kept.
# NOTE(review): `!=` yields NULL when either side is NULL, so a value changing to or
# from NULL is not selected by this filter - confirm whether that is intended.
df_changed = df_joined.filter(
    (df_new["name"] != df_existing["name"]) |
    (df_new["city"] != df_existing["city"]) |
    (df_new["phone_no"] != df_existing["phone_no"]) |
    (df_new["maritial_status"] != df_existing["maritial_status"]) |
    (df_new["gender"] != df_existing["gender"]) |
    (df_new["email"] != df_existing["email"]) |
    (df_new["DOB"] != df_existing["DOB"]) |
    (df_new["age"] != df_existing["age"]) |
    (df_new["ingestion_time"] != df_existing["ingestion_time"]) |
    (df_new["ingestion_time_formatted"] != df_existing["ingestion_time_formatted"])
).select(df_new["*"])

# Existing rows whose customer_id is NOT in df_changed (left_anti), flagged as not
# current and end-dated now.
# NOTE(review): this DataFrame is not used again in this cell - confirm it is needed.
df_existing_updated = df_existing.join(df_changed, "customer_id", "left_anti") \
                                 .withColumn("is_current", lit(False)) \
                                 .withColumn("end_date", current_timestamp())

# Create a DeltaTable object for the existing data (the merge target)
deltaTable = DeltaTable.forPath(spark, silver_path)


# Close out matched rows: when a target row and a source row share a customer_id and
# at least one listed attribute differs, set is_current to false and stamp end_date.
# This merge has no insert clause, so it never adds rows.
# df_final receives whatever execute() returns.
df_final = deltaTable.alias("existing").merge(
    df_new.alias("new"),
    "existing.customer_id = new.customer_id"
).whenMatchedUpdate(
    condition="existing.name != new.name OR "
              "existing.city != new.city OR "
              "existing.phone_no != new.phone_no OR "
              "existing.maritial_status != new.maritial_status OR "
              "existing.gender != new.gender OR "
              "existing.DOB != new.DOB OR "
              "existing.email != new.email",
    set={
        "is_current": lit(False),
        "end_date": current_timestamp(),
    }
).execute()

# Keys that still have a current row in the target after the merge
updated_target_df = deltaTable.toDF().filter("is_current = true").select("customer_id")

# Records to insert: df_new rows whose customer_id has no current row in the target.
# left_anti keeps only the non-matching rows; the default inner join would instead
# keep exactly the keys that are already current.
insert_df = df_new.join(updated_target_df, on="customer_id", how="left_anti")

display(insert_df)


In [0]:
# Table locations. Only silver_path is used in this cell.
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source"
gold_path = "/mnt/Prajwal/Capstone_Project/gold/"

# Load the silver table and stamp every row with the SCD Type 2 bookkeeping columns:
# flagged current, starting now, with an open (null) end date.
df_new = (
    spark.read.format("delta").load(silver_path)
    .withColumn("is_current", lit(True))
    .withColumn("start_date", current_timestamp())
    .withColumn("end_date", lit(None).cast("timestamp"))
)

# Pair each new row with the existing row that shares its customer_id.
# df_existing is not defined in this cell; it must already exist in the session.
df_joined = df_new.join(df_existing, on="customer_id", how="left")

# Build "any compared attribute differs" as an OR over the per-column comparisons,
# in the order listed.
compared_columns = [
    "name",
    "city",
    "phone_no",
    "maritial_status",
    "gender",
    "email",
    "DOB",
    "age",
    "ingestion_time",
    "ingestion_time_formatted",
]
row_differs = None
for column_name in compared_columns:
    column_differs = df_new[column_name] != df_existing[column_name]
    row_differs = column_differs if row_differs is None else (row_differs | column_differs)

# Changed records, keeping only the columns that came from df_new
df_changed = df_joined.filter(row_differs).select(df_new["*"])

# Existing rows whose customer_id does not appear in df_changed, flagged as not
# current and end-dated now.
df_existing_updated = (
    df_existing.join(df_changed, on="customer_id", how="left_anti")
    .withColumn("is_current", lit(False))
    .withColumn("end_date", current_timestamp())
)

# Handle on the Delta table that the merge below updates
deltaTable = DeltaTable.forPath(spark, silver_path)

# Attributes whose difference causes a matched target row to be closed out
merge_tracked_columns = ["name", "city", "phone_no", "maritial_status", "gender", "DOB", "email"]
merge_change_condition = " OR ".join(
    f"existing.{c} != new.{c}" for c in merge_tracked_columns
)

# Matched-update only: rows that share a customer_id and differ on a tracked
# attribute get is_current = false and end_date = now. Nothing is inserted here.
df_final = (
    deltaTable.alias("existing")
    .merge(df_new.alias("new"), "existing.customer_id = new.customer_id")
    .whenMatchedUpdate(
        condition=merge_change_condition,
        set={"is_current": lit(False), "end_date": current_timestamp()},
    )
    .execute()
)

# Candidate inserts: df_new rows whose customer_id has no is_current = true row
# in the target after the merge.
updated_target_df = deltaTable.toDF().where("is_current = true").select("customer_id")
insert_df = df_new.join(updated_target_df, on="customer_id", how="left_anti")

In [0]:
# Render insert_df (defined in an earlier cell) in the notebook output for inspection.
display(insert_df)

In [0]:

# Handle on the Delta table at silver_path; silver_path and df_new are not defined
# in this cell and must already exist in the session.
deltaTable = DeltaTable.forPath(spark, silver_path)

# Attributes whose difference causes a matched target row to be closed out
merge_tracked_columns = ["name", "city", "phone_no", "maritial_status", "gender", "DOB", "email"]
merge_change_condition = " OR ".join(
    f"existing.{c} != new.{c}" for c in merge_tracked_columns
)

# Matched-update only: rows that share a customer_id and differ on a tracked
# attribute get is_current = false and end_date = now. This merge has no insert clause.
df_final = (
    deltaTable.alias("existing")
    .merge(df_new.alias("new"), "existing.customer_id = new.customer_id")
    .whenMatchedUpdate(
        condition=merge_change_condition,
        set={"is_current": lit(False), "end_date": current_timestamp()},
    )
    .execute()
)

# Candidate inserts: df_new rows whose customer_id has no is_current = true row
# in the target after the merge.
updated_target_df = deltaTable.toDF().where("is_current = true").select("customer_id")
insert_df = df_new.join(updated_target_df, on="customer_id", how="left_anti")

# NOTE(review): insert_df is only computed here; nothing in this cell writes it to
# the table, although the message below reports an insert - confirm whether a write
# step is missing.
print("data is updated and inserted in silver path")