In [None]:
# ! pip install delta-spark==2.1.0

In [19]:
import os
import re
from datetime import datetime

from pyspark.sql.types import _parse_datatype_string
from pyspark.sql.functions import input_file_name, monotonically_increasing_id, row_number, regexp_extract, col, concat, sha2, to_timestamp, lit, desc, lag, when
from pyspark.sql.window import Window

import pyspark
from delta import *

# Spark settings that switch on Delta Lake's SQL extension and catalog.
_delta_conf = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = pyspark.sql.SparkSession.builder.appName("MyApp")
for _conf_key, _conf_value in _delta_conf.items():
    builder = builder.config(_conf_key, _conf_value)

# `configure_spark_with_delta_pip` is provided by the `from delta import *` line above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [20]:
# Take a single execution timestamp and derive every time-based value from it,
# so the audit columns and the partition values of one run always agree.
exec_time = datetime.now()
exec_time_str = f"{exec_time:%Y-%m-%d %H:%M:%S}"

# Far-future timestamp string used elsewhere in the notebook as an open-ended end_tmst.
high_date = "2999-12-31 23:59:59"

# Zero-padded string parts of the execution time (used as partition column values).
year, month, day, hour, minute = (
    exec_time.strftime(part_format) for part_format in ("%Y", "%m", "%d", "%H", "%M")
)

In [21]:
# Target table layout: business columns, lineage/hash, SCD2 bookkeeping columns,
# then the string-typed partition columns.
delta_schema_str = (
        "id int, first_name string,last_name string, source string, hash string, is_active string, start_tmst timestamp, "
        "end_tmst timestamp, create_tmst timestamp, update_tmst timestamp, created_by_exec_id int, updated_by_exec_id int, "
        "year string, month string, day string, hour string, minute string"
    )
delta_schema = _parse_datatype_string(delta_schema_str)

# The table lives in a "delta" folder under the current working directory.
delta_path = f"{os.getcwd()}/delta"
partition_columns = ["year", "month", "day", "hour", "minute"]

# Bootstrap: write an empty, partitioned table the first time so forPath() below succeeds.
_table_exists = DeltaTable.isDeltaTable(spark, delta_path)
if not _table_exists:
    print("Not a delta table. Creating delta table...")
    empty_df = spark.createDataFrame([], delta_schema)
    (
        empty_df.write
        .format("delta")
        .mode("append")
        .partitionBy(*partition_columns)
        .save(delta_path)
    )

delta_table = DeltaTable.forPath(spark, delta_path)
delta_df = delta_table.toDF()
delta_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        N|2023-07-01 00:58:00|2023-07-02 00:58:00|2023-07-17 19:02:44|2023-07-18 11:20:31|                 1|                 2|2023|   07| 17|  19|    02|
|  2|      Lexi|     Walls|file:///Users/sai...|83b9e894466f70135...|        Y|2023-07-01 00:58:00|2

In [22]:
def create_staging_df_from_raw(raw_schema_str, path):
    """Read raw CSV files into a staging DataFrame with lineage and change-detection columns.

    Uses the notebook-level ``spark`` session.

    Args:
        raw_schema_str: DDL-style schema string for the CSV columns
            (e.g. ``"id int,first_name string"``).
        path: File or directory path passed to ``spark.read.csv``; files are
            read with a header row.

    Returns:
        A DataFrame holding the raw columns plus:
          * ``extract_tmst`` - timestamp parsed from a ``yyyy-MM-dd-HH-mm-ss``
            fragment of the input file name,
          * ``source`` - the input file name,
          * ``hash`` - SHA-256 over the concatenated raw column values.
    """
    # Local import: the notebook's import cell does not bring `coalesce` into scope.
    from pyspark.sql.functions import coalesce

    raw_schema = _parse_datatype_string(raw_schema_str)
    raw_df = spark.read.csv(path, header=True, schema=raw_schema)

    timestamp_pattern = r"(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})"

    # concat() returns NULL as soon as one input is NULL, which would leave the
    # row with a NULL hash; NULL hashes never compare equal/unequal, so such a
    # row would silently fall out of hash-based change detection and merges.
    # Coalescing each value to "" keeps the hash defined. Rows without NULLs
    # hash exactly as before (concat already casts its inputs to string).
    # NOTE(review): values are still joined without a separator, and NULL is
    # not distinguished from "" - adding either would change existing hashes.
    hash_inputs = [coalesce(col(name).cast("string"), lit("")) for name in raw_df.columns]

    staging_df = raw_df.withColumn("extract_tmst", to_timestamp(regexp_extract(input_file_name(), timestamp_pattern, 1), "yyyy-MM-dd-HH-mm-ss")) \
                       .withColumn("source", input_file_name()) \
                       .withColumn("hash", sha2(concat(*hash_inputs), 256))

    return staging_df

In [23]:
# Inputs for this load: the raw CSV layout and the folder holding the extracts.
raw_schema_str = "id int,first_name string,last_name string"
path = "./raw/first_load/"

staging_df = create_staging_df_from_raw(raw_schema_str=raw_schema_str, path=path)

# Inspect the staged rows and their schema.
staging_df.printSchema()
staging_df.show()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- extract_tmst: timestamp (nullable = true)
 |-- source: string (nullable = false)
 |-- hash: string (nullable = true)

+---+----------+----------+-------------------+--------------------+--------------------+
| id|first_name| last_name|       extract_tmst|              source|                hash|
+---+----------+----------+-------------------+--------------------+--------------------+
|  1|   Belinda|  Sullivan|2023-07-01 00:58:00|file:///Users/sai...|1863cefdfa2cde755...|
|  1|   Belinda|   Hendrix|2023-07-01 00:58:00|file:///Users/sai...|e974f0bffc47aec5a...|
|  2|      Lexi|     Walls|2023-07-01 00:58:00|file:///Users/sai...|83b9e894466f70135...|
|  3|   Sherlyn|Williamson|2023-07-01 00:58:00|file:///Users/sai...|bf96251ef7d0ab4d8...|
|  1|   Belinda|   Hendrix|2023-07-02 00:58:00|file:///Users/sai...|e974f0bffc47aec5a...|
|  1|   Belinda|    Waters|2023-07

In [24]:
# Within each id, sort the newest extract first; the generated id only breaks
# ties between rows that share the same extract_tmst.
_newest_first = [desc("extract_tmst"), desc(monotonically_increasing_id())]
window_spec = Window.partitionBy("id").orderBy(*_newest_first)

# lag() reads the row that sorts just before the current one, i.e. the next
# newer version of the same id; the newest version gets NULL.
_newer_version_tmst = lag("extract_tmst").over(window_spec)
staging_df = staging_df.withColumn("eff_end_tmst", _newer_version_tmst)

staging_df.printSchema()
staging_df.show()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- extract_tmst: timestamp (nullable = true)
 |-- source: string (nullable = false)
 |-- hash: string (nullable = true)
 |-- eff_end_tmst: timestamp (nullable = true)

+---+----------+----------+-------------------+--------------------+--------------------+-------------------+
| id|first_name| last_name|       extract_tmst|              source|                hash|       eff_end_tmst|
+---+----------+----------+-------------------+--------------------+--------------------+-------------------+
|  1|   Belinda|    Waters|2023-07-02 00:58:00|file:///Users/sai...|5877452deb0c15143...|               null|
|  1|   Belinda|   Hendrix|2023-07-02 00:58:00|file:///Users/sai...|e974f0bffc47aec5a...|2023-07-02 00:58:00|
|  1|   Belinda|   Hendrix|2023-07-01 00:58:00|file:///Users/sai...|e974f0bffc47aec5a...|2023-07-02 00:58:00|
|  1|   Belinda|  Sullivan|2023-07-01 00:58:00|

In [25]:
# Shared building blocks for the SCD2 bookkeeping columns.
_ts_format = "yyyy-MM-dd HH:mm:ss"
_is_latest_version = col("eff_end_tmst").isNull()
_run_tmst = to_timestamp(lit(exec_time_str), _ts_format)

staging_df = (
    staging_df
    # The latest version of an id is active and open-ended; older versions
    # are closed at the timestamp of the version that follows them.
    .withColumn("is_active", when(_is_latest_version, lit("Y")).otherwise(lit("N")))
    .withColumn("start_tmst", col("extract_tmst"))
    .withColumn(
        "end_tmst",
        when(_is_latest_version, to_timestamp(lit(high_date), _ts_format)).otherwise(col("eff_end_tmst")),
    )
    # Audit columns: when, and by which execution, the row was written.
    .withColumn("create_tmst", _run_tmst)
    .withColumn("update_tmst", _run_tmst)
    .withColumn("created_by_exec_id", lit(2))
    .withColumn("updated_by_exec_id", lit(2))
    # Partition values come from the execution time.
    .withColumn("year", lit(year))
    .withColumn("month", lit(month))
    .withColumn("day", lit(day))
    .withColumn("hour", lit(hour))
    .withColumn("minute", lit(minute))
    # Helper columns are not part of the target table.
    .drop("extract_tmst", "eff_end_tmst")
)

staging_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|    Waters|file:///Users/sai...|5877452deb0c15143...|        Y|2023-07-02 00:58:00|2999-12-31 23:59:59|2023-07-18 11:28:50|2023-07-18 11:28:50|                 2|                 2|2023|   07| 18|  11|    28|
|  1|   Belinda|   Hendrix|file:///Users/sai...|e974f0bffc47aec5a...|        N|2023-07-02 00:58:00|2

In [26]:
# Name both sides so later conditions can use "stage.<col>" / "target.<col>".
staging_df, delta_df = staging_df.alias("stage"), delta_df.alias("target")

In [27]:
# Join predicates (combined with AND by join()): same id, different content
# hash, and both the staged and the stored version are flagged active.
cond = [
    col("stage.id") == col("target.id"),
    col("stage.hash") != col("target.hash"),
    col("stage.is_active") == "Y",
    col("target.is_active") == "Y",
]

In [28]:
# Pair each stored row with the staged rows that satisfy every predicate in `cond`.
joined_df = delta_df.join(staging_df, on=cond, how="inner")
joined_df.show()

+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|source|hash|is_active|start_tmst|end_tmst|create_tmst|update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute| id|first_name|last_name|source|hash|is_active|start_tmst|end_tmst|create_tmst|update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
+---+----------+---------+------+----+---------+----------

In [29]:
# Content changed (or the stored hash is missing) ...
_hash_differs = (col("stage.hash") != col("target.hash")) | col("target.hash").isNull()
# ... and both versions are currently flagged active.
_stage_is_active = col("stage.is_active") == "Y"
_target_is_active = col("target.is_active") == "Y"

joined_condition = _hash_differs & _stage_is_active & _target_is_active
joined_condition

Column<'((((NOT (stage.hash = target.hash)) OR (target.hash IS NULL)) AND (stage.is_active = Y)) AND (target.is_active = Y))'>

In [12]:
# Join on id only, then keep the pairs that pass `joined_condition`.
_same_id = col("stage.id") == col("target.id")
joined_df = delta_df.join(staging_df, on=_same_id, how="inner").filter(joined_condition)
# joined_df.printSchema()
joined_df.show()

+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute| id|first_name|last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+--------------------+--------------------+---------+-------------------+----------

In [13]:
# target_cols = [f"target.{col}" for col in delta_df.columns]
# staging_cols 


# Target column -> stage column whose value overwrites it when a stored row is closed.
_stage_source_for = {
    "end_tmst": "start_tmst",
    "update_tmst": "create_tmst",
    "updated_by_exec_id": "created_by_exec_id",
}
_helper_names = [f"stage_{stage_name}" for stage_name in _stage_source_for.values()]

# Start from the stored rows, carrying the needed stage values under
# temporary "stage_*" names, and flag every row inactive.
_closing_rows = joined_df.select(
    "target.*",
    *[col(f"stage.{stage_name}").alias(f"stage_{stage_name}") for stage_name in _stage_source_for.values()],
).withColumn("is_active", lit("N"))

# Overwrite the bookkeeping columns in place so the column order stays that of the target.
for _target_name, _stage_name in _stage_source_for.items():
    _closing_rows = _closing_rows.withColumn(_target_name, col(f"stage_{_stage_name}"))

updated_df = _closing_rows.drop(*_helper_names)

updated_df.show()

+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda| Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        N|2023-07-01 00:58:00|2023-07-02 00:58:00|2023-07-17 19:02:44|2023-07-18 11:20:31|                 1|                 2|2023|   07| 17|  19|    02|
+---+----------+---------+--------------------+--------------------+---------+-------------------+------

In [14]:
# Stack the staged versions on top of the closed-out stored rows.
# unionByName matches columns by name; a plain union() matches by position
# and would silently mix values up if the two frames ever differed in column
# order. The result keeps staging_df's column order, as union() did.
union_df = staging_df.unionByName(updated_df)
union_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|    Waters|file:///Users/sai...|5877452deb0c15143...|        Y|2023-07-02 00:58:00|2999-12-31 23:59:59|2023-07-18 11:20:31|2023-07-18 11:20:31|                 2|                 2|2023|   07| 18|  11|    20|
|  1|   Belinda|   Hendrix|file:///Users/sai...|e974f0bffc47aec5a...|        N|2023-07-02 00:58:00|2

In [15]:
# Combine by column name, then present the columns in updated_df's order.
_output_columns = list(updated_df.columns)
_stacked = staging_df.unionByName(updated_df)
union_df = _stacked.select(*_output_columns)
union_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|    Waters|file:///Users/sai...|5877452deb0c15143...|        Y|2023-07-02 00:58:00|2999-12-31 23:59:59|2023-07-18 11:20:31|2023-07-18 11:20:31|                 2|                 2|2023|   07| 18|  11|    20|
|  1|   Belinda|   Hendrix|file:///Users/sai...|e974f0bffc47aec5a...|        N|2023-07-02 00:58:00|2

In [17]:
# Re-open the Delta table at delta_path and show its current contents.
# Note: this rebinds delta_df to a fresh DataFrame, so the "target" alias
# assigned to delta_df in an earlier cell no longer applies to it.
delta_table = DeltaTable.forPath(spark, delta_path)
delta_df = delta_table.toDF()
delta_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-17 19:02:44|2023-07-17 19:02:44|                 1|                 1|2023|   07| 17|  19|    02|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2

In [18]:
# Merge the combined staged + closed-out rows (union_df) into the Delta table.
delta_table = DeltaTable.forPath(spark, delta_path)


# A source row matches a stored row when both id and content hash agree.
# NOTE(review): the staging output above shows the same id + hash arriving from
# two extracts, so several source rows can match one stored row - confirm how
# the merge should behave then (e.g. whether start_tmst belongs in the key).
condition = "target.id==updates.id AND target.hash==updates.hash"

# Matched rows are overwritten only when the active flag differs;
# unmatched source rows are inserted.
delta_table.alias("target").merge(
        union_df.alias("updates"),
        condition
    ).whenMatchedUpdateAll(
        condition="target.is_active!=updates.is_active"
    ).whenNotMatchedInsertAll(
    ).execute()

# Show the table after the merge.
delta_table_df = delta_table.toDF()
delta_table_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        N|2023-07-01 00:58:00|2023-07-02 00:58:00|2023-07-17 19:02:44|2023-07-18 11:20:31|                 1|                 2|2023|   07| 17|  19|    02|
|  2|      Lexi|     Walls|file:///Users/sai...|83b9e894466f70135...|        Y|2023-07-01 00:58:00|2

In [None]:
"""
Select the required columns from the updated df, then union them with the staging df.

Merge the union df into the delta table:
- merge condition: source and target match on both id and hash
- when matched: update only if source.is_active != target.is_active
- when not matched: insert
"""

In [63]:
# Staged rows whose id does not exist in the Delta table yet.
_new_records = (
    staging_df.alias("stage")
    .join(delta_df.alias("target"), on="id", how="leftanti")
)
_new_records.show()

+---+----------+---------+-------------------+--------------------+--------------------+
| id|first_name|last_name|       extract_tmst|              source|                hash|
+---+----------+---------+-------------------+--------------------+--------------------+
|  7|   Rodrigo|   Sparks|2023-07-02 00:58:00|file:///Users/sai...|5d664cbfd64c26514...|
+---+----------+---------+-------------------+--------------------+--------------------+



In [22]:
# Shared building blocks for the bookkeeping columns.
_ts_format = "yyyy-MM-dd HH:mm:ss"
_run_tmst = to_timestamp(lit(exec_time_str), _ts_format)

new_records = (
    _new_records
    # Brand-new ids start out active and open-ended.
    .withColumn("is_active", lit("Y"))
    .withColumn("start_tmst", col("extract_tmst"))
    .withColumn("end_tmst", to_timestamp(lit(high_date), _ts_format))
    # Audit columns: when, and by which execution, the row was written.
    .withColumn("create_tmst", _run_tmst)
    .withColumn("update_tmst", _run_tmst)
    .withColumn("created_by_exec_id", lit(1))
    .withColumn("updated_by_exec_id", lit(1))
    # Partition values come from the execution time.
    .withColumn("year", lit(year))
    .withColumn("month", lit(month))
    .withColumn("day", lit(day))
    .withColumn("hour", lit(hour))
    .withColumn("minute", lit(minute))
    # extract_tmst is not a column of the target table.
    .drop("extract_tmst")
)

new_records.printSchema()
new_records.show()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- source: string (nullable = false)
 |-- hash: string (nullable = true)
 |-- is_active: string (nullable = false)
 |-- start_tmst: timestamp (nullable = true)
 |-- end_tmst: timestamp (nullable = true)
 |-- create_tmst: timestamp (nullable = true)
 |-- update_tmst: timestamp (nullable = true)
 |-- created_by_exec_id: integer (nullable = false)
 |-- updated_by_exec_id: integer (nullable = false)
 |-- year: string (nullable = false)
 |-- month: string (nullable = false)
 |-- day: string (nullable = false)
 |-- hour: string (nullable = false)
 |-- minute: string (nullable = false)

+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|              source|                h

In [33]:
# A staged row counts as an update when its hash differs from the stored one,
# or when the stored row has no hash at all.
_hash_mismatch = col("stage.hash") != col("target.hash")
_stored_hash_missing = col("target.hash").isNull()
updates_condition = _hash_mismatch | _stored_hash_missing
updates_condition

Column<'((NOT (stage.hash = target.hash)) OR (target.hash IS NULL))'>

In [34]:
# Staged rows that share an id with a stored row and pass `updates_condition`.
updated_records = (
    staging_df.alias("stage")
    .join(delta_df.alias("target"), on="id", how="inner")
    .filter(updates_condition)
)
updated_records.show()

+---+----------+---------+-------------------+--------------------+--------------------+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|       extract_tmst|              source|                hash|first_name|last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+-------------------+--------------------+--------------------+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Hendrix|2023-07-02 00:58:00|file:///Users/sai...|5620d42bea3b

Column<'(target.is_active = Y)'>

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-17 19:02:44|2023-07-17 19:02:44|                 1|                 1|2023|   07| 17|  19|    02|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2

In [None]:
# Keep only stored rows that are currently flagged active.
# This definition has to be live code: the `filtered_df` cells further down
# use `filter_condition`, and with the definition commented out they raise
# NameError when the notebook is run top to bottom on a fresh kernel.
filter_condition = col("target.is_active") == "Y"
filter_condition

# filtered_df = delta_df.join(staging_df, col("stage.id")==col("target.id"), how="leftsemi").where(filter_condition)
# filtered_df.show()

In [None]:
# filtered_df = delta_df.alias("target").join(staging_df.alias("stage"), on="id", how="leftsemi").where(filter_condition)
# filtered_df.show()

In [None]:
# window_spec = Window.partitionBy("id").orderBy(desc("extract_tmst"), desc(monotonically_increasing_id()))
# active_records = staging_df.withColumn("row_number", row_number().over(window_spec))

# active_records.show()

# _active_records = active_records.filter(active_records["row_number"] == 1).drop("row_number")
# _active_records.show()

# _inactive_records = active_records.filter(active_records["row_number"] != 1).drop("row_number")
# _inactive_records.show()

In [57]:
# max_surrogate_key = delta_df.selectExpr("max(surrogate_key)").collect()[0][0]

# if max_surrogate_key == None:
#     max_surrogate_key = 0

# _x = staging_df.withColumn("_x", lit(max_surrogate_key +1))
# _x.show()

In [52]:
# Inner join: stored rows paired with staged rows of the same id,
# restricted by `filter_condition`.
filtered_df = (
    delta_df.alias("target")
    .join(staging_df.alias("stage"), on="id", how="inner")
    .filter(filter_condition)
)
filtered_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+----------+----------+-------------------+--------------------+--------------------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|first_name| last_name|       extract_tmst|              source|                hash|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+----------+----------+-------------------+--------------------+--------------------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|    

In [53]:
# Left join: every stored row, with staged columns where the id also exists
# in staging, restricted by `filter_condition`.
filtered_df = (
    delta_df.alias("target")
    .join(staging_df.alias("stage"), on="id", how="left")
    .filter(filter_condition)
)
filtered_df.show()

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+----------+----------+-------------------+--------------------+--------------------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|first_name| last_name|       extract_tmst|              source|                hash|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+----------+----------+-------------------+--------------------+--------------------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|    

In [55]:
# Left anti join: stored rows whose id does not appear in staging,
# restricted by `filter_condition`.
filtered_df = (
    delta_df.alias("target")
    .join(staging_df.alias("stage"), on="id", how="leftanti")
    .filter(filter_condition)
)
filtered_df.show()

+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  6|   Shyanne|      Liu|file:///Users/sai...|547811bdef1d46e0a...|        Y|2023-07-02 00:58:00|2999-12-31 23:59:59|2023-07-17 19:02:44|2023-07-17 19:02:44|                 1|                 1|2023|   07| 17|  19|    02|
|  5|    Nathan|  Mcclain|file:///Users/sai...|e2cc62e08ad845909...|        Y|2023-07-02 00:58:00|2999-1

In [14]:
# Merge `new_records` into the Delta table stored under <cwd>/delta.
delta_path = f"{os.getcwd()}/delta"

delta_table = DeltaTable.forPath(spark, delta_path)

# Rows are matched on id alone: a matched stored row has all its columns
# overwritten by the source row; unmatched source rows are inserted.
delta_table.alias("target").merge(
        new_records.alias("updates"),
        "target.id==updates.id"
    ).whenMatchedUpdateAll(
    ).whenNotMatchedInsertAll(
    ).execute()

# Show the table after the merge.
delta_table_df = delta_table.toDF()
delta_table_df.show()

23/07/17 19:03:30 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-17 19:02:44|2023-07-17 19:02:44|                 1|                 1|2023|   07| 17|  19|    02|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2

In [58]:
def delta_scd2(updates_df, condition):
    """Merge ``updates_df`` into the Delta table stored under ``<cwd>/delta``.

    If no Delta table exists at that path, an empty one is created first,
    partitioned by year/month/day/hour/minute. The merge then overwrites all
    columns of every target row matched by ``condition`` and inserts every
    source row that has no match. The resulting table is printed.

    Uses the notebook-level ``spark`` session.

    Args:
        updates_df: DataFrame to merge. Its columns need to line up with the
            target table's columns, because the merge uses update-all /
            insert-all clauses.
        condition: SQL merge condition. The table is aliased ``target`` and
            ``updates_df`` is aliased ``updates``,
            e.g. ``"target.id==updates.id"``.

    Returns:
        None.
    """
    # Must describe the same columns as the notebook-level table schema;
    # `is_active` was missing here, so a table created by this function could
    # not take the SCD2 rows (which carry `is_active`) built elsewhere.
    delta_schema_str = (
        "id int,first_name string,last_name string, source string, hash string, is_active string, "
        "start_tmst timestamp, end_tmst timestamp, "
        "create_tmst timestamp, update_tmst timestamp, created_by_exec_id int, updated_by_exec_id int, "
        "year string, month string, day string, hour string, minute string"
    )

    delta_schema = _parse_datatype_string(delta_schema_str)
    delta_path = f"{os.getcwd()}/delta"

    partition_columns = ["year", "month", "day", "hour", "minute"]

    # Bootstrap an empty table on first use so forPath() below succeeds.
    if not DeltaTable.isDeltaTable(spark, delta_path):
        print("Not a delta table. Creating delta table...")
        empty_df = spark.createDataFrame([], delta_schema)

        empty_df.write.format("delta") \
                    .mode("append") \
                    .partitionBy(*partition_columns) \
                    .save(delta_path)

    delta_table = DeltaTable.forPath(spark, delta_path)

    # Upsert: matched rows are fully overwritten, unmatched rows are inserted.
    delta_table.alias("target").merge(
            updates_df.alias("updates"),
            condition
        ).whenMatchedUpdateAll(
        ).whenNotMatchedInsertAll(
        ).execute()

    delta_table_df = delta_table.toDF()
    delta_table_df.show()

In [60]:
# Merge the staged rows into the Delta table, matching stored rows on id.
delta_scd2(updates_df=staging_df, condition="target.id==updates.id")

AnalysisException: cannot resolve hash in UPDATE clause given columns updates.id, updates.first_name, updates.last_name, updates.extract_date, updates.source