In [None]:
# ! pip install delta-spark==2.1.0

In [1]:
import os
import re
from datetime import datetime

from pyspark.sql.types import _parse_datatype_string
from pyspark.sql.functions import input_file_name, monotonically_increasing_id, row_number, regexp_extract, col, concat, sha2, to_timestamp, lit, desc, lag, when
from pyspark.sql.window import Window

import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/07/18 12:47:51 WARN Utils: Your hostname, Sais-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.44 instead (on interface en0)
23/07/18 12:47:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/anaconda3/envs/aws/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sai/.ivy2/cache
The jars for the packages stored in: /Users/sai/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f69bf692-8fef-4267-8b71-10a058ab282b;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 134ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted

23/07/18 12:47:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
def create_staging_df_from_raw(raw_path):
    raw_schema_str = "id int,first_name string,last_name string"
    raw_schema = _parse_datatype_string(raw_schema_str)

    raw_df = spark.read.csv(raw_path, header=True, schema=raw_schema)

    timestamp_pattern = r"(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})"

    staging_df = raw_df.withColumn("extract_tmst", to_timestamp(regexp_extract(input_file_name(), timestamp_pattern, 1), "yyyy-MM-dd-HH-mm-ss")) \
                       .withColumn("source", input_file_name()) \
                       .withColumn("hash", sha2(concat(*raw_df.columns), 256))

    return staging_df

In [3]:
def scd2_run(raw_path, run_number):

    exec_time = datetime.now()
    exec_time_str = exec_time.strftime("%Y-%m-%d %H:%M:%S")

    high_date = "2999-12-31 23:59:59"

    year, month, day, hour, minute = map(str, exec_time.strftime("%Y %m %d %H %M").split())

    delta_schema_str = (
        "id int, first_name string,last_name string, source string, hash string, is_active string, start_tmst timestamp, "
        "end_tmst timestamp, create_tmst timestamp, update_tmst timestamp, created_by_exec_id int, updated_by_exec_id int, "
        "year string, month string, day string, hour string, minute string"
    )

    delta_schema = _parse_datatype_string(delta_schema_str)
    delta_path = f"{os.getcwd()}/delta"

    partition_columns = ["year", "month", "day", "hour", "minute"]

    if not DeltaTable.isDeltaTable(spark, delta_path):
        print("Not a delta table. Creating delta table...")
        empty_df = spark.createDataFrame([], delta_schema)

        empty_df.write.format("delta") \
                    .mode("append") \
                    .partitionBy(*partition_columns) \
                    .save(delta_path)
    
    delta_table = DeltaTable.forPath(spark, delta_path)
    delta_df = delta_table.toDF()
    print("Current delta table")
    delta_df.show()


    staging_df = create_staging_df_from_raw(raw_path)

    window_spec = Window.partitionBy("id").orderBy(desc("extract_tmst"), desc(monotonically_increasing_id()))
    staging_df = staging_df.withColumn("eff_end_tmst", lag("extract_tmst").over(window_spec))

    staging_df = staging_df.withColumn("is_active", when(col("eff_end_tmst").isNull(), lit("Y")).otherwise(lit("N"))) \
                .withColumn("start_tmst", col("extract_tmst")) \
                .withColumn("end_tmst", when(col("eff_end_tmst").isNull(), to_timestamp(lit(high_date), "yyyy-MM-dd HH:mm:ss")).otherwise(col("eff_end_tmst"))) \
                .withColumn("create_tmst", to_timestamp(lit(exec_time_str), "yyyy-MM-dd HH:mm:ss")) \
                .withColumn("update_tmst", to_timestamp(lit(exec_time_str), "yyyy-MM-dd HH:mm:ss")) \
                .withColumn("created_by_exec_id", lit(run_number)) \
                .withColumn("updated_by_exec_id", lit(run_number)) \
                .withColumn("year", lit(year)) \
                .withColumn("month", lit(month)) \
                .withColumn("day", lit(day)) \
                .withColumn("hour", lit(hour)) \
                .withColumn("minute", lit(minute)) \
                .drop("extract_tmst") \
                .drop("eff_end_tmst")

    print("Current staging table")
    staging_df.show()

    staging_df = staging_df.alias("stage")
    delta_df = delta_df.alias("target")

    # cond = [col("stage.id") == col("target.id"), col("stage.hash") != col("target.hash"), col("stage.is_active") == "Y", col("target.is_active") == "Y"]
    cond = [col("stage.id") == col("target.id"), col("target.is_active") == "Y"]

    filtered_target_df = delta_df.join(staging_df, cond, how="inner")
    print("Filtered target records")
    filtered_target_df.show()

    updated_df = (
        filtered_target_df
        .select(
            "target.*",
            col("stage.start_tmst").alias("stage_start_tmst"),
            col("stage.create_tmst").alias("stage_create_tmst"),
            col("stage.created_by_exec_id").alias("stage_created_by_exec_id")
        )
        .withColumn("is_active", lit("N"))
        .withColumn("end_tmst", col("stage_start_tmst"))
        .withColumn("update_tmst", col("stage_create_tmst"))
        .withColumn("updated_by_exec_id", col("stage_created_by_exec_id"))
        .drop("stage_start_tmst", "stage_create_tmst", "stage_created_by_exec_id")
    )

    print("Updated target records")
    updated_df.show()

    union_df = staging_df.unionByName(updated_df).select(*updated_df.columns)
    print("Uninoned staging & target records")
    union_df.show()

    condition = "target.id==updates.id AND target.hash==updates.hash"

    delta_table.alias("target").merge(
            union_df.alias("updates"),
            condition
        ).whenMatchedUpdateAll(
            condition="target.is_active!=updates.is_active AND target.start_tmst==updates.start_tmst"
        ).whenNotMatchedInsertAll(
        ).execute()

    delta_table_df = delta_table.toDF()
    print("Updated delta table")
    delta_table_df.show()

In [4]:
scd2_run("./data/2023-07-01-00-58-00.csv", 1)

Not a delta table. Creating delta table...


                                                                                

Current delta table
+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|source|hash|is_active|start_tmst|end_tmst|create_tmst|update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
+---+----------+---------+------+----+---------+----------+--------+-----------+-----------+------------------+------------------+----+-----+---+----+------+

Current staging table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst| 

                                                                                

Updated delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-18 12:47:56|2023-07-18 12:47:56|                 1|                 1|2023|   07| 18|  12|    47|
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2

In [5]:
scd2_run("./data/2023-07-02-00-58-00.csv", 2)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-18 12:47:56|2023-07-18 12:47:56|                 1|                 1|2023|   07| 18|  12|    47|
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2

In [6]:
scd2_run("./data/2023-07-03-00-58-00.csv", 3)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96251ef7d0ab4d8...|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-18 12:47:56|2023-07-18 12:47:56|                 1|                 1|2023|   07| 18|  12|    47|
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        Y|2

In [7]:
scd2_run("./data/2023-07-04-00-58-00.csv", 4)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|    Waters|file:///Users/sai...|5877452deb0c15143...|        Y|2023-07-03 00:58:00|2999-12-31 23:59:59|2023-07-18 12:48:37|2023-07-18 12:48:37|                 3|                 3|2023|   07| 18|  12|    48|
|  4|      Zoey|    Tucker|file:///Users/sai...|1818ea471269caa72...|        Y|2

Py4JJavaError: An error occurred while calling o636.execute.
: org.apache.spark.sql.delta.DeltaUnsupportedOperationException: Cannot perform Merge as multiple source rows matched and attempted to modify the same
target row in the Delta table in possibly conflicting ways. By SQL semantics of Merge,
when multiple source rows match on the same target row, the result may be ambiguous
as it is unclear which source row should be used to update or delete the matching
target row. You can preprocess the source table to eliminate the possibility of
multiple matches. Please refer to
https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge
	at org.apache.spark.sql.delta.DeltaErrorsBase.multipleSourceRowMatchingTargetRowInMergeException(DeltaErrors.scala:1039)
	at org.apache.spark.sql.delta.DeltaErrorsBase.multipleSourceRowMatchingTargetRowInMergeException$(DeltaErrors.scala:1036)
	at org.apache.spark.sql.delta.DeltaErrors$.multipleSourceRowMatchingTargetRowInMergeException(DeltaErrors.scala:2293)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$findTouchedFiles$1(MergeIntoCommand.scala:455)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:970)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.findTouchedFiles(MergeIntoCommand.scala:391)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:336)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:319)
	at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:221)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:319)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:139)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:137)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordFrameProfile(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:132)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordOperation(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:131)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:121)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:109)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordDeltaOperation(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:317)
	at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:230)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:104)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:90)
	at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:122)
	at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:206)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [10]:
scd2_run("./data/2023-07-05-00-58-00.csv", 5)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-18 12:06:11|2023-07-18 12:10:42|                 1|                 3|2023|   07| 18|  12|    06|
|  2|      Lexi|     Walls|file:///Users/sai...|83b9e894466f70135...|        Y|2

In [11]:
scd2_run("./data/2023-07-06-00-58-00.csv", 6)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  7|   Rodrigo|     Cline|file:///Users/sai...|9dc9e42700e242c08...|        N|2023-07-05 00:58:00|2023-07-05 00:58:00|2023-07-18 12:17:51|2023-07-18 12:17:51|                 5|                 5|2023|   07| 18|  12|    17|
|  7|   Rodrigo|  McKinney|file:///Users/sai...|a607afbdf3766b3ea...|        N|2

In [14]:
scd2_run("./data/2023-07-07-00-58-00.csv", 7)

Current delta table
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|is_active|         start_tmst|           end_tmst|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  7|   Rodrigo|   Carillo|file:///Users/sai...|238173817f6ca8671...|        Y|2023-07-06 00:58:00|2999-12-31 23:59:59|2023-07-18 12:20:52|2023-07-18 12:20:52|                 6|                 6|2023|   07| 18|  12|    20|
|  7|   Rodrigo|   Carillo|file:///Users/sai...|238173817f6ca8671...|        N|2

Py4JJavaError: An error occurred while calling o1376.execute.
: org.apache.spark.sql.delta.DeltaUnsupportedOperationException: Cannot perform Merge as multiple source rows matched and attempted to modify the same
target row in the Delta table in possibly conflicting ways. By SQL semantics of Merge,
when multiple source rows match on the same target row, the result may be ambiguous
as it is unclear which source row should be used to update or delete the matching
target row. You can preprocess the source table to eliminate the possibility of
multiple matches. Please refer to
https://docs.delta.io/latest/delta-update.html#upsert-into-a-table-using-merge
	at org.apache.spark.sql.delta.DeltaErrorsBase.multipleSourceRowMatchingTargetRowInMergeException(DeltaErrors.scala:1039)
	at org.apache.spark.sql.delta.DeltaErrorsBase.multipleSourceRowMatchingTargetRowInMergeException$(DeltaErrors.scala:1036)
	at org.apache.spark.sql.delta.DeltaErrors$.multipleSourceRowMatchingTargetRowInMergeException(DeltaErrors.scala:2293)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$findTouchedFiles$1(MergeIntoCommand.scala:455)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordMergeOperation(MergeIntoCommand.scala:970)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.findTouchedFiles(MergeIntoCommand.scala:391)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2(MergeIntoCommand.scala:336)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$2$adapted(MergeIntoCommand.scala:319)
	at org.apache.spark.sql.delta.DeltaLog.withNewTransaction(DeltaLog.scala:221)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.$anonfun$run$1(MergeIntoCommand.scala:319)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:139)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:137)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordFrameProfile(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:132)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:77)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:67)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordOperation(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:131)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:121)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:109)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.recordDeltaOperation(MergeIntoCommand.scala:215)
	at org.apache.spark.sql.delta.commands.MergeIntoCommand.run(MergeIntoCommand.scala:317)
	at io.delta.tables.DeltaMergeBuilder.$anonfun$execute$1(DeltaMergeBuilder.scala:230)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError(AnalysisHelper.scala:104)
	at org.apache.spark.sql.delta.util.AnalysisHelper.improveUnsupportedOpError$(AnalysisHelper.scala:90)
	at io.delta.tables.DeltaMergeBuilder.improveUnsupportedOpError(DeltaMergeBuilder.scala:122)
	at io.delta.tables.DeltaMergeBuilder.execute(DeltaMergeBuilder.scala:206)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
"""
Get the required columns from the updated df. Union them with the staging df

merge union df into delta
condition source and target id and hash match
when match update condition source.is_active != target.is_active
when does not match insert
"""