In [30]:
import pyspark, os, sys
from pyspark.sql import *
from pyspark import SparkConf,SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import types
from pyspark import StorageLevel
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import hash

spark = SparkSession.builder.getOrCreate()

##################### First Method ###################################
# Compare an incoming snapshot (new.csv) with the stored one (old.csv) by
# hashing the key and attribute columns of each side, full-joining on the key,
# and splitting the joined rows into unchanged / new / updated sets.

# Column lists shared by the casts, the hashes and the final projections.
new_side_columns = ["id", "dim1", "dim2", "dim3", "dim4"]
old_side_columns = ["id_old", "dim1_old", "dim2_old", "dim3_old", "dim4_old"]

# Both files are read as header-bearing CSV, so every column starts as a string.
df_new = spark.read.load("new.csv", format="csv", header=True)
df_old = spark.read.load("old.csv", format="csv", header=True)

# The validity dates of the stored side become real dates.
for date_field in ("st_date", "end_date"):
    df_old = df_old.withColumn(date_field, col(date_field).cast("date"))

# Key and attributes are cast to int before hashing so both sides hash the
# same types; the hash is what gets compared across the join.
for field_name in new_side_columns:
    df_new = df_new.withColumn(field_name, col(field_name).cast("int"))
df_new = df_new.withColumn("new_hash", hash(*new_side_columns))  # compared with old records

for field_name in old_side_columns:
    df_old = df_old.withColumn(field_name, col(field_name).cast("int"))
df_old = df_old.withColumn("old_hash", hash(*old_side_columns))  # compared with new records

# A full join keeps keys that exist on only one side; the missing side is NULL.
join_df = df_new.join(df_old, on=df_new["id"] == df_old["id_old"], how="full")
join_df.show()

# Rows with no changes: either the key is absent from the new file
# (new_hash is NULL) or both hashes agree. The stored values are kept.
keep_old_condition = col("new_hash").isNull() | (col("new_hash") == col("old_hash"))
unchanged_df = join_df.where(keep_old_condition).select(*old_side_columns)
unchanged_df.show()

# New rows: the key has no counterpart on the stored side.
new_changes_df = join_df.where(col("old_hash").isNull()).select(*new_side_columns)
new_changes_df.show()

# Updated rows: the key exists on both sides but the hashes differ; the
# incoming values win.
updated_df = join_df.where(col("new_hash") != col("old_hash")).select(*new_side_columns)
updated_df.show()

print("final SCD result:")
# union() matches columns by position, so the result carries the column names
# of its first operand (the *_old names).
final_df = unchanged_df.union(new_changes_df)
final_df = final_df.union(updated_df)
final_df.show()

+----+----+----+----+----+----------+------+--------+--------+--------+--------+----------+----------+-----------+
|  id|dim1|dim2|dim3|dim4|  new_hash|id_old|dim1_old|dim2_old|dim3_old|dim4_old|   st_date|  end_date|   old_hash|
+----+----+----+----+----+----------+------+--------+--------+--------+--------+----------+----------+-----------+
| 111| 200| 500| 800| 400|-842040841|   111|     200|     500|     800|     400|2024-12-01|2999-12-31| -842040841|
| 222| 800|1300| 800| 500| 307381462|   222|     900|    NULL|     700|     100|2024-12-01|2999-12-31|-1540894174|
|NULL|NULL|NULL|NULL|NULL|      NULL|   333|     300|     900|     250|     650|2024-12-01|2999-12-31|-1529515503|
| 444| 100|NULL| 700| 300|-805886361|  NULL|    NULL|    NULL|    NULL|    NULL|      NULL|      NULL|       NULL|
+----+----+----+----+----+----------+------+--------+--------+--------+--------+----------+----------+-----------+

+------+--------+--------+--------+--------+
|id_old|dim1_old|dim2_old|dim3_old

In [None]:
######################### Second Method ################################
# Same hash-and-full-join comparison as above, applied to a customer dimension
# (customer_dim.csv) and its staging feed (customer_staging.csv).

old_df = spark.read.load("customer_dim.csv", format="csv", header=True)
new_df = spark.read.load("customer_staging.csv", format="csv", header=True)

# Columns that feed the change-detection hash on both sides.
hashed_columns = ("customer_id", "customer_name", "city")

# Hash first, then give each side its own prefix so the joined frame has no
# clashing names for these three columns.
old_df = old_df.withColumn("old_hash", hash(*hashed_columns))
for current_name, prefixed_name in (
    ("customer_id", "old_id"),
    ("customer_name", "old_name"),
    ("city", "old_city"),
):
    old_df = old_df.withColumnRenamed(current_name, prefixed_name)

new_df = new_df.withColumn("new_hash", hash(*hashed_columns))
for current_name, prefixed_name in (
    ("customer_id", "new_id"),
    ("customer_name", "new_name"),
    ("city", "new_city"),
):
    new_df = new_df.withColumnRenamed(current_name, prefixed_name)

join_df = new_df.join(old_df, on=new_df["new_id"] == old_df["old_id"], how="full")
join_df.show()

# No changes in data: the key is missing from the staging side (new_hash is
# NULL) or both hashes match. The old_* values are kept.
df1 = (
    join_df
    .where(col("new_hash").isNull() | (col("old_hash") == col("new_hash")))
    .select("old_id", "old_name", "old_city", "start_date", "end_date", "current_flag")
)

# Updated changes in data: hashes differ, so the new_* values are taken while
# start_date / end_date / current_flag stay as they are in the joined row.
df2 = (
    join_df
    .where(col("old_hash") != col("new_hash"))
    .select("new_id", "new_name", "new_city", "start_date", "end_date", "current_flag")
)

# New data: no counterpart on the old side. The three tracking columns are
# overwritten with today's date, the open-ended end date and the "Y" flag.
df3 = (
    join_df
    .where(col("old_hash").isNull())
    .select("new_id", "new_name", "new_city", "start_date", "end_date", "current_flag")
    .withColumn("start_date", current_date())
    .withColumn("end_date", lit("9999-12-31"))
    .withColumn("current_flag", lit("Y"))
)

# Positional union: the result keeps df1's column names.
union_df = df1.union(df2)
union_df = union_df.union(df3)
union_df.show()


In [None]:
############################### THIRD Method #############################
# SCD type 2 merge of a stored dimension (sample_scd.csv) with an incoming
# snapshot (sample_scd2.csv):
#   * unchanged rows are carried over as they are,
#   * brand-new ids get a fresh open-ended version,
#   * changed ids get their stored version closed as of yesterday and a new
#     open-ended version starting today.
#
# NOTE(review): every row of sample_scd.csv takes part in the join. If that
# file can already contain closed history rows (end_date earlier than
# 2999-12-31) they would be compared against the new snapshot as well --
# confirm whether only current rows should be matched.

spark=SparkSession.builder.getOrCreate()

old_df=spark.read.format("csv").option("header", True).load("sample_scd.csv")

new_df=spark.read.format("csv").option("header", True).load("sample_scd2.csv")

# CSV columns are read as strings; give dates and salary their real types so
# both sides hash the same values.
old_df=old_df.withColumn("start_date", col("start_date").cast("date"))\
            .withColumn("end_date", col("end_date").cast("date"))\
            .withColumn("salary", col("salary").cast("Integer"))

new_df=new_df.withColumn("salary", col("salary").cast("Integer"))

# Suffix the incoming columns so they do not clash with the stored ones after
# the join.
new_df=new_df.withColumnRenamed("id", "id_new")\
            .withColumnRenamed("name", "name_new")\
            .withColumnRenamed("salary", "salary_new")

# One hash per side over key + tracked attributes; comparing the two hashes
# detects a change.
old_df=old_df.withColumn("hash_old", hash("id", "name", "salary"))
new_df=new_df.withColumn("hash_new", hash("id_new", "name_new", "salary_new"))

# Full join: ids present on only one side survive with NULLs on the other.
join_df=old_df.join(new_df, old_df.id==new_df.id_new, "full")

#unchanged
print("unchanged")
# Rows whose id is missing from the new snapshot have hash_new = NULL. They
# match neither the "new" nor the "updated" filter below, so they must be kept
# here or they would silently disappear from the final result (the first two
# methods keep such rows the same way).
df1=join_df.filter((col("hash_new")==col("hash_old")) | (col("hash_new").isNull()))
df1=df1.select("id", "name", "salary", "start_date", "end_date")
df1.show()

#new
print("new")
# Ids with no stored counterpart: open a version starting today.
df2=join_df.filter(col("hash_old").isNull())
df2=df2.select("id_new", "name_new", "salary_new")\
        .withColumn("start_date", current_date())\
        .withColumn("end_date", lit("2999-12-31").cast("date"))
df2.show()

#updated
print("updated")
df3=join_df.filter(col("hash_old")!=col("hash_new"))
# Close the stored version as of yesterday. date_sub states the date
# arithmetic explicitly instead of relying on implicit date - int coercion.
df3_a=df3.select("id", "name", "salary", "start_date", "end_date")\
        .withColumn("end_date", date_sub(current_date(), 1))
df3_a.show()

# Open the replacement version starting today.
df3_b=df3.select("id_new", "name_new", "salary_new")\
        .withColumn("start_date", current_date())\
        .withColumn("end_date", lit("2999-12-31").cast("date"))
df3_b.show()

print("final SCD2 result:")
# union() aligns columns by position, so the result uses df1's column names.
final_df=df1.union(df2)\
            .union(df3_a)\
            .union(df3_b)
final_df.show()