This notebook uses the data generated by gen_daily_csv.

In [2]:
from pyspark.sql import SparkSession

# Local Spark session with the Delta Lake package, SQL extension and catalog
# configured; managed tables are stored under ../delta-data-tmp.
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .config("spark.sql.warehouse.dir", "../delta-data-tmp")
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:3.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .master("local")
    .getOrCreate()
)

# Report the Spark version and the address of the Spark web UI.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [3]:
import shutil
import os

# Directory under the local warehouse that holds the files of
# integration.property_test1.
folder_path = "../delta-data-tmp/integration.db/property_test1"

# Wipe any files left over from a previous run; nothing to do when the
# directory is absent.
if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

In [4]:
from pyspark.sql.types import StructType
import json

# (Re)create the target table from scratch so the notebook can be re-run.
spark.sql("CREATE SCHEMA IF NOT EXISTS integration")
spark.sql("DROP TABLE IF EXISTS integration.property_test1")

# JSON string representing the schema (with metadata field added).
# valid_from / valid_to delimit the validity period of a row version,
# property_id is the business key, and the remaining columns are the
# tracked attributes.
json_schema = '''
{
    "fields":[
        {"metadata":{},"name":"valid_from","nullable":false,"type":"date"},
        {"metadata":{},"name":"valid_to","nullable":false,"type":"date"},
        {"metadata":{},"name":"property_id","nullable":false,"type":"string"},
        {"metadata":{},"name":"street","nullable":true,"type":"string"},
        {"metadata":{},"name":"street_number","nullable":true,"type":"integer"},
        {"metadata":{},"name":"city","nullable":true,"type":"string"},
        {"metadata":{},"name":"zip_code","nullable":true,"type":"integer"},
        {"metadata":{},"name":"category","nullable":true,"type":"string"},
        {"metadata":{},"name":"property_value","nullable":true,"type":"double"},
        {"metadata":{},"name":"energy_label","nullable":true,"type":"string"}
    ],
    "type":"struct"}
'''

# Deserialize the JSON string into a StructType schema.
schema_dict = json.loads(json_schema)
schema = StructType.fromJson(schema_dict)

# Create the empty Delta table directly from the schema; no intermediate
# DataFrame is needed for this.
spark.catalog.createTable("integration.property_test1", source="delta", schema=schema)

DataFrame[valid_from: date, valid_to: date, property_id: string, street: string, street_number: int, city: string, zip_code: int, category: string, property_value: double, energy_label: string]

In [5]:
from datetime import date, datetime, timedelta

from delta.tables import DeltaTable
from pyspark.sql.functions import coalesce, col, concat, concat_ws, lit, sha2

# Inclusive range of daily snapshot files to load.
start_date = date.fromisoformat('2022-01-01')
end_date = date.fromisoformat('2022-01-03')
# valid_to value carried by the currently open version of each key.
max_date = date(9999, 12, 31)

key_cols = ["property_id"]
tl_cols = ["valid_from", "valid_to"]
merge_cols = key_cols + ["valid_from"]
target_schema = spark.table("integration.property_test1").schema
data_cols = [c for c in target_schema.fieldNames() if c not in (key_cols + tl_cols)]


def row_hash(columns):
    """Return a SHA-512 hash Column computed over the given column names.

    Every value is cast to string, NULLs are replaced by an explicit marker
    and the values are joined with a separator. A plain concat() would yield
    a NULL hash as soon as one column is NULL (such rows would never compare
    equal) and, lacking a separator, would hash ("ab", "c") and ("a", "bc")
    identically.
    """
    parts = [coalesce(col(c).cast("string"), lit("<NULL>")) for c in columns]
    return sha2(concat_ws("||", *parts), 512)


merge_tgt = DeltaTable.forName(spark, "integration.property_test1")
# Loop-invariant merge condition: match on the key columns plus valid_from.
merge_condition = " and ".join(f"tgt.{c}=src.{c}" for c in merge_cols)

current_date = start_date

# Load one snapshot file per day.
# NOTE(review): keys that disappear from a snapshot are not closed by this
# loop; only keys present in the snapshot are processed.
while current_date <= end_date:
    file_path = f"../resources/generated/commercial_property/{current_date.strftime('%Y')}/{current_date.strftime('%m')}/commercial_property_snapshot_{current_date.strftime('%Y%m%d')}.csv"
    print(file_path)

    # Currently open versions in the target. Change detection must compare
    # against these only: comparing against closed versions too would hide a
    # value that changes back to an earlier state (A -> B -> A).
    current_rows = spark.table("integration.property_test1")\
        .where(col("valid_to") == lit(max_date))

    # Read the snapshot and align column types with the target table so that
    # the string form used for hashing is the same on both sides even when
    # CSV type inference picks a different type.
    snapshot = spark.read.option("header", True).option("inferSchema", True).csv(file_path)\
        .select(*[col(f.name).cast(f.dataType).alias(f.name)
                  for f in target_schema.fields if f.name not in tl_cols])

    # New or changed rows: snapshot rows without an identical open version.
    inp = (
        snapshot
        .withColumn("hash", row_hash(data_cols))
        .join(
            current_rows.select(*key_cols, row_hash(data_cols).alias("hash")),
            on=(key_cols + ["hash"]),
            how="left_anti",
        )
        .withColumn("valid_from", lit(current_date))
        .withColumn("valid_to", lit(max_date))
        .drop("hash")
    )

    # Open versions superseded by a row in inp get valid_to set to the
    # snapshot date; they are merged together with the new versions.
    upd = (
        current_rows
        .withColumn("valid_to", lit(current_date))
        .join(inp, on=key_cols, how="left_semi")
        .unionByName(inp)
    )

    # Superseded versions match on (key, valid_from) and are updated in
    # place; the new versions carry a new valid_from and are inserted.
    (
        merge_tgt.alias("tgt")
        .merge(upd.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

    # Move to the next day
    current_date += timedelta(days=1)

../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220101.csv
../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220102.csv
../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220103.csv


In [6]:
# Sort by key and then by valid_from so that the versions of one property are
# listed in chronological order; sorting by property_id alone leaves the
# order of a property's versions unspecified.
spark.table("integration.property_test1").orderBy("property_id", "valid_from").show()

+----------+----------+-----------+-----------------+-------------+------------+--------+---------+--------------+------------+
|valid_from|  valid_to|property_id|           street|street_number|        city|zip_code| category|property_value|energy_label|
+----------+----------+-----------+-----------------+-------------+------------+--------+---------+--------------+------------+
|2022-01-01|9999-12-31|       P001|    Poplar Street|          388|Fayetteville|   27505| Workshop|     109568.45|           A|
|2022-01-01|9999-12-31|       P002|     Maple Street|          401|Indian Trail|   27572|   Office|     381282.69|           F|
|2022-01-01|9999-12-31|       P003|   Asheville Road|          162|     Raleigh|   28727| Workshop|     217940.83|           F|
|2022-01-01|9999-12-31|       P004|Greensboro Street|          563|     Sanford|   28881| Workshop|     395346.61|           A|
|2022-01-01|9999-12-31|       P005|  Magnolia Street|          776|Fayetteville|   28872|Warehouse|     

In [7]:
# Load the sparksql_magic IPython extension, which provides the %%sparksql
# cell magic used in the next cell.
%load_ext sparksql_magic

In [8]:
%%sparksql
-- List the Delta commit history of the table (one row per table version).
DESCRIBE HISTORY integration.property_test1

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-04-14 13:26:35.707000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""((property_id#193 = property_id#4158) AND (valid_from#191 = valid_from#4156))""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,2,Serializable,False,"{'numOutputRows': '106', 'numTargetBytesAdded': '6039', 'numTargetRowsInserted': '3', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesAdded': '1', 'materializeSourceTimeMs': '1088', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '3', 'executionTimeMs': '2778', 'numTargetDeletionVectorsUpdated': '0', 'numTargetRowsCopied': '100', 'rewriteTimeMs': '303', 'numTargetRowsUpdated': '3', 'numTargetDeletionVectorsRemoved': '0', 'numTargetRowsDeleted': '0', 'scanTimeMs': '1381', 'numSourceRows': '6', 'numTargetDeletionVectorsAdded': '0', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '5972'}",,Apache-Spark/3.5.4 Delta-Lake/3.3.0
2,2025-04-14 13:26:30.866000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""((property_id#193 = property_id#2268) AND (valid_from#191 = valid_from#2266))""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,1,Serializable,False,"{'numOutputRows': '103', 'numTargetBytesAdded': '5972', 'numTargetRowsInserted': '3', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesAdded': '1', 'materializeSourceTimeMs': '1339', 'numTargetFilesRemoved': '1', 'numTargetRowsMatchedUpdated': '3', 'executionTimeMs': '3544', 'numTargetDeletionVectorsUpdated': '0', 'numTargetRowsCopied': '97', 'rewriteTimeMs': '541', 'numTargetRowsUpdated': '3', 'numTargetDeletionVectorsRemoved': '0', 'numTargetRowsDeleted': '0', 'scanTimeMs': '1656', 'numSourceRows': '6', 'numTargetDeletionVectorsAdded': '0', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '5911'}",,Apache-Spark/3.5.4 Delta-Lake/3.3.0
1,2025-04-14 13:26:25.174000,,,MERGE,"{'matchedPredicates': '[{""actionType"":""update""}]', 'predicate': '[""((property_id#193 = property_id#332) AND (valid_from#191 = valid_from#330))""]', 'notMatchedBySourcePredicates': '[]', 'notMatchedPredicates': '[{""actionType"":""insert""}]'}",,,,0,Serializable,False,"{'numOutputRows': '100', 'numTargetBytesAdded': '5911', 'numTargetRowsInserted': '100', 'numTargetRowsMatchedDeleted': '0', 'numTargetFilesAdded': '1', 'materializeSourceTimeMs': '6102', 'numTargetFilesRemoved': '0', 'numTargetRowsMatchedUpdated': '0', 'executionTimeMs': '9924', 'numTargetDeletionVectorsUpdated': '0', 'numTargetRowsCopied': '0', 'rewriteTimeMs': '1038', 'numTargetRowsUpdated': '0', 'numTargetDeletionVectorsRemoved': '0', 'numTargetRowsDeleted': '0', 'scanTimeMs': '2756', 'numSourceRows': '100', 'numTargetDeletionVectorsAdded': '0', 'numTargetChangeFilesAdded': '0', 'numTargetRowsNotMatchedBySourceUpdated': '0', 'numTargetRowsNotMatchedBySourceDeleted': '0', 'numTargetBytesRemoved': '0'}",,Apache-Spark/3.5.4 Delta-Lake/3.3.0
0,2025-04-14 13:26:09.895000,,,CREATE TABLE,"{'partitionBy': '[]', 'description': None, 'properties': '{}', 'clusterBy': '[]', 'isManaged': 'true'}",,,,,Serializable,True,{},,Apache-Spark/3.5.4 Delta-Lake/3.3.0
