In [None]:
# ! pip install delta-spark==2.1.0

In [1]:
import os
import re
from datetime import datetime

from pyspark.sql.types import _parse_datatype_string
from pyspark.sql.functions import input_file_name, monotonically_increasing_id, row_number, regexp_extract, col, concat, sha2, to_timestamp, lit, desc, lag, when, lead,asc,count
from pyspark.sql.window import Window

import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/opt/anaconda3/envs/aws/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sai/.ivy2/cache
The jars for the packages stored in: /Users/sai/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8af9a167-f5c9-4995-80f2-40ec5652a809;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 102ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted

23/07/19 18:50:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
def create_staging_df_from_raw(raw_path):
    raw_schema_str = "id int,first_name string,last_name string"
    raw_schema = _parse_datatype_string(raw_schema_str)

    raw_df = spark.read.csv(raw_path, header=True, schema=raw_schema)

    timestamp_pattern = r"(\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2})"

    staging_df = raw_df.withColumn("extract_tmst", to_timestamp(regexp_extract(input_file_name(), timestamp_pattern, 1), "yyyy-MM-dd-HH-mm-ss")) \
                       .withColumn("source", input_file_name()) \
                       .withColumn("hash", sha2(concat(*raw_df.columns), 256))

    return staging_df

In [7]:
def scd2_run(raw_path, run_number):

    exec_time = datetime.now()
    exec_time_str = exec_time.strftime("%Y-%m-%d %H:%M:%S")

    high_date = "2999-12-31 23:59:59"

    year, month, day, hour, minute = map(str, exec_time.strftime("%Y %m %d %H %M").split())

    delta_schema_str = (
        "id int, first_name string,last_name string, source string, hash string, action string, is_active string, eff_start timestamp, "
        "eff_end timestamp, create_tmst timestamp, update_tmst timestamp, created_by_exec_id int, updated_by_exec_id int, "
        "year string, month string, day string, hour string, minute string"
    )

    delta_schema = _parse_datatype_string(delta_schema_str)
    delta_path = f"{os.getcwd()}/delta"

    partition_columns = ["year", "month", "day", "hour", "minute"]

    if not DeltaTable.isDeltaTable(spark, delta_path):
        print("Not a delta table. Creating delta table...")
        empty_df = spark.createDataFrame([], delta_schema)

        empty_df.write.format("delta") \
                    .mode("append") \
                    .partitionBy(*partition_columns) \
                    .save(delta_path)
    
    delta_table = DeltaTable.forPath(spark, delta_path)
    delta_df = delta_table.toDF()
    print("Current delta table")
    delta_df.show()


    staging_df = create_staging_df_from_raw(raw_path)


    window_spec = Window.partitionBy("id").orderBy(desc("extract_tmst"), desc(monotonically_increasing_id()))
    staging_df = staging_df = staging_df.withColumn("eff_end_tmst", lag("extract_tmst").over(window_spec))
    staging_df.show()


    staging_df = (
        staging_df
        .withColumn("eff_start", col("extract_tmst"))
        .withColumn("eff_end", when(col("eff_end_tmst").isNull(), to_timestamp(lit(high_date), "yyyy-MM-dd HH:mm:ss")).otherwise(col("eff_end_tmst")))
        .withColumn("create_tmst", to_timestamp(lit(exec_time_str), "yyyy-MM-dd HH:mm:ss"))
        .withColumn("created_by_exec_id", lit(run_number))
        .withColumn("action", lit("insert"))
        .drop("extract_tmst")
        .drop("eff_end_tmst")
    )
    print("Staging records")
    staging_df.show()

    staging_df = staging_df.alias("stage")
    delta_df = delta_df.alias("target")
    cond = [col("stage.id") == col("target.id"), col("target.is_active") == "Y"]

    filtered_target_df = (
        delta_df
        .join(staging_df, cond, how="leftsemi")
        .drop("is_active", "update_tmst", "updated_by_exec_id", "year", "month", "hour", "day", "minute")
        .withColumn("action", lit("update"))
    )
    print("Filtered target records")
    filtered_target_df.show()

    union_df = staging_df.unionByName(filtered_target_df).select(*filtered_target_df.columns)
    print("Unioned staging & target records")
    union_df.show()

    window_spec = Window.partitionBy("id").orderBy(desc("eff_start"), asc(monotonically_increasing_id()))
    union_df = (
        union_df
        .withColumn("next_hash", lag("hash").over(window_spec))
        .withColumn("next_eff_start", lag("eff_start").over(window_spec))
    )
    print("Unioned records with prev_hash, next_eff_start")
    union_df.show()

    union_df = union_df.filter((col("hash") != col("next_hash")) | col("next_hash").isNull())
    print("Filter consecutive duplicates")
    union_df.show()

    window_spec = Window.partitionBy("id")
    union_df = union_df.withColumn("count", count("id").over(window_spec))
    print("Count of records with the same id")
    union_df.show()

    filtered_union_df = (
        union_df
        .filter(~((union_df["action"] == "update") & (union_df["count"] == 1)))
        .persist()
    )
    print("Remove records which are set for update but only have a count==1, i.e. no insert for the id")
    filtered_union_df.show()

    updated_union_df = (
        filtered_union_df
        .withColumn("eff_end", when(col("next_eff_start").isNull(), to_timestamp(lit(high_date), "yyyy-MM-dd HH:mm:ss")).otherwise(col("next_eff_start")))
        .withColumn("is_active", when(col("eff_end") == to_timestamp(lit(high_date), "yyyy-MM-dd HH:mm:ss"), "Y").otherwise("N"))
        .withColumn("update_tmst", to_timestamp(lit(exec_time_str), "yyyy-MM-dd HH:mm:ss"))
        .withColumn("updated_by_exec_id", lit(run_number))
        .withColumn("year", lit(year))
        .withColumn("month", lit(month))
        .withColumn("day", lit(day))
        .withColumn("hour", lit(hour))
        .withColumn("minute", lit(minute))
        .drop("prev_hash", "next_eff_start", "count")
    )
    print("Union records with updated metadata")
    updated_union_df.show()

    reordered_union_df = updated_union_df.select(*delta_df.columns)
    print("Reorder union records")
    reordered_union_df.show()

    # condition = "target.id==updates.id AND target.hash==updates.hash AND updates.action=='update'"

    # delta_table.alias("target").merge(
    #         reordered_union_df.alias("updates"),
    #         condition
    #     ).whenMatchedUpdateAll(
    #     ).whenNotMatchedInsertAll(
    #     ).execute()

    # delta_table_df = delta_table.toDF()
    # print("Updated delta table")
    # delta_table_df.show()

In [4]:
scd2_run("./data/delta_load/2023-07-01-00-58-00.csv", 1)

Not a delta table. Creating delta table...


                                                                                

Current delta table
+---+----------+---------+------+----+------+---------+---------+-------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
| id|first_name|last_name|source|hash|action|is_active|eff_start|eff_end|create_tmst|update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+---------+------+----+------+---------+---------+-------+-----------+-----------+------------------+------------------+----+-----+---+----+------+
+---+----------+---------+------+----+------+---------+---------+-------+-----------+-----------+------------------+------------------+----+-----+---+----+------+

+---+----------+----------+-------------------+--------------------+--------------------+------------+
| id|first_name| last_name|       extract_tmst|              source|                hash|eff_end_tmst|
+---+----------+----------+-------------------+--------------------+--------------------+------------+
|  1|   Belinda|  

                                                                                

Updated delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|insert|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-19 18:50:27|2023-07-19 18:50:27|                 1|                 1|2023|   07| 19|  18|    50|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96

In [5]:
scd2_run("./data/delta_load/2023-07-02-00-58-00.csv", 2)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|insert|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-19 18:50:27|2023-07-19 18:50:27|                 1|                 1|2023|   07| 19|  18|    50|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96

In [6]:
scd2_run("./data/delta_load/2023-07-03-00-58-00.csv", 3)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|insert|        Y|2023-07-01 00:58:00|2999-12-31 23:59:59|2023-07-19 18:50:27|2023-07-19 18:50:27|                 1|                 1|2023|   07| 19|  18|    50|
|  3|   Sherlyn|Williamson|file:///Users/sai...|bf96

In [8]:
scd2_run("./data/delta_load/2023-07-04-00-58-00.csv", 4)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|update|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-19 18:50:27|2023-07-19 18:52:26|                 1|                 3|2023|   07| 19|  18|    52|
|  1|   Belinda|    Waters|file:///Users/sai...|5877

In [9]:
scd2_run("./data/delta_load/2023-07-05-00-58-00.csv", 5)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|update|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-19 18:35:49|2023-07-19 18:37:09|                 1|                 3|2023|   07| 19|  18|    37|
|  1|   Belinda|    Waters|file:///Users/sai...|5877

In [14]:
scd2_run("./data/delta_load/2023-07-06-00-58-00.csv", 6)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|update|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-19 18:35:49|2023-07-19 18:37:09|                 1|                 3|2023|   07| 19|  18|    37|
|  1|   Belinda|    Waters|file:///Users/sai...|5877

In [10]:
scd2_run("./data/delta_load/2023-07-07-00-58-00.csv", 7)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|update|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-19 16:11:42|2023-07-19 16:12:21|                 1|                 3|2023|   07| 19|  16|    12|
|  1|   Belinda|    Waters|file:///Users/sai...|5877

In [11]:
scd2_run("./data/delta_load/2023-07-08-00-58-00.csv", 8)

Current delta table
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
| id|first_name| last_name|              source|                hash|action|is_active|          eff_start|            eff_end|        create_tmst|        update_tmst|created_by_exec_id|updated_by_exec_id|year|month|day|hour|minute|
+---+----------+----------+--------------------+--------------------+------+---------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+----+-----+---+----+------+
|  1|   Belinda|  Sullivan|file:///Users/sai...|1863cefdfa2cde755...|update|        N|2023-07-01 00:58:00|2023-07-03 00:58:00|2023-07-19 16:11:42|2023-07-19 16:12:21|                 1|                 3|2023|   07| 19|  16|    12|
|  1|   Belinda|    Waters|file:///Users/sai...|5877

In [71]:
"""
Get the required columns from the updated df. Union them with the staging df

merge union df into delta
condition source and target id and hash match
when match update condition source.is_active != target.is_active
when does not match insert
"""

'\nGet the required columns from the updated df. Union them with the staging df\n\nmerge union df into delta\ncondition source and target id and hash match\nwhen match update condition source.is_active != target.is_active\nwhen does not match insert\n'

In [None]:
""""
1. What if the current active record in target is same as the most recent staging record but there are other older records with the same id
""""