This notebook uses the data generated by gen_daily_csv.

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with Delta Lake wired in:
#  - spark.jars.packages requests the io.delta:delta-spark_2.13:3.3.0 artifact,
#  - the Delta SQL extension and DeltaCatalog are registered for spark_catalog,
#  - the SQL warehouse directory is ../delta-data-tmp.
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .config("spark.sql.warehouse.dir", "../delta-data-tmp")
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:3.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .master("local")
    .getOrCreate()
)

# Report the Spark version and the address of the Spark UI for this session.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
import shutil
import os

# Location of the Delta table used by this notebook.
folder_path = "../delta-data-tmp/append_test1"

# Start from a clean slate: if an earlier run left anything at that location,
# remove the whole directory tree.
leftover_found = os.path.exists(folder_path)
if leftover_found:
    shutil.rmtree(folder_path)

In [8]:
from datetime import date, datetime, timedelta
from delta.tables import DeltaTable
from pyspark.sql.functions import lit,sha2,concat,col

# Inclusive range of daily snapshots to load. Parsed straight into `date`
# objects so the names never hold two different types.
start_date = date.fromisoformat('2022-01-01')
end_date = date.fromisoformat('2022-01-03')
# Not used in this cell; kept because other cells may rely on it.
max_date = date(9999, 12, 31)

current_date = start_date

# Load one CSV snapshot per day and append it to the Delta table at `folder_path`.
# Rows already stored for that snapshot_date are deleted first, so re-running the
# cell for the same day does not duplicate data.
# NOTE(review): the schema is inferred per file; this assumes every daily CSV
# infers to the same column types - confirm, or pass an explicit schema.
while current_date <= end_date:
    # Format the date parts once; they feed both the file path and the partition columns.
    yyyy, mm, dd = current_date.strftime("%Y-%m-%d").split("-")

    file_path = f"../resources/generated/commercial_property/{yyyy}/{mm}/commercial_property_snapshot_{yyyy}{mm}{dd}.csv"
    print(file_path)
    inp = (
        spark.read.option("header", True).option("inferSchema", True).csv(file_path)
        .withColumn("snapshot_date", lit(current_date))
        .withColumn("yyyy", lit(yyyy))
        .withColumn("mm", lit(mm))
        .withColumn("dd", lit(dd))
    )

    # The table does not exist before the first write, so only delete when it is there.
    if DeltaTable.isDeltaTable(spark, folder_path):
        tbl = DeltaTable.forPath(spark, folder_path)
        tbl.delete(col("snapshot_date") == current_date)

    # Write to the same location that was just checked and cleaned (previously a
    # separately hard-coded copy of the path, which could drift from folder_path).
    inp.write.format('delta').partitionBy("yyyy", "mm", "dd").mode('append').save(folder_path)

    # Move to the next day
    current_date += timedelta(days=1)

../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220101.csv
../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220102.csv
../resources/generated/commercial_property/2022/01/commercial_property_snapshot_20220103.csv


In [9]:
# Read the Delta table back from folder_path ("../delta-data-tmp/append_test1"),
# rather than from a second hard-coded copy of that path, then print the total
# row count and show the first rows ordered by property and snapshot date.
df = spark.read.format("delta").load(folder_path)
print(df.count())
df.orderBy("property_id", "snapshot_date").show()

300
+-----------+-----------------+-------------+------------+--------+---------+--------------+------------+-------------+----+---+---+
|property_id|           street|street_number|        city|zip_code| category|property_value|energy_label|snapshot_date|yyyy| mm| dd|
+-----------+-----------------+-------------+------------+--------+---------+--------------+------------+-------------+----+---+---+
|       P001|    Poplar Street|          388|Fayetteville|   27505| Workshop|     109568.45|           A|   2022-01-01|2022| 01| 01|
|       P001|    Poplar Street|          388|Fayetteville|   27505| Workshop|     109568.45|           A|   2022-01-02|2022| 01| 02|
|       P001|    Poplar Street|          388|Fayetteville|   27505| Workshop|     109568.45|           A|   2022-01-03|2022| 01| 03|
|       P002|     Maple Street|          401|Indian Trail|   27572|   Office|     381282.69|           F|   2022-01-01|2022| 01| 01|
|       P002|     Maple Street|          401|Indian Trail|   2757

In [None]:
%load_ext sparksql_magic

In [None]:
%%sparksql
-- Show the history of the table integration.property_test1.
-- NOTE(review): this table is not created in the visible cells (they write to the path ../delta-data-tmp/append_test1); presumably it comes from another notebook - confirm.
DESCRIBE HISTORY integration.property_test1

In [None]:
from pyspark.sql.functions import col

# History of integration.property_test1, trimmed to the version, timestamp,
# operation and a few entries of operationMetrics.
# NOTE(review): the table is not created in the visible cells - presumably it
# is produced by another notebook; confirm before running on a fresh kernel.
th = spark.sql("DESCRIBE HISTORY integration.property_test1")
th.select(
    "version",
    col("timestamp"),
    "operation",
    col("operationMetrics.numTargetRowsInserted").alias("RowsInserted"),
    col("operationMetrics.numTargetRowsUpdated").alias("RowsUpdated"),
    "operationMetrics.numSourceRows",
    "operationMetrics.executionTimeMs",
).show()

In [None]:
# Pick one property_id that has a row with valid_from after 2022-01-01.
example_row = (
    spark.table("integration.property_test1")
    .where("valid_from>date '2022-01-01'")
    .select("property_id")
    .first()
)
# first() returns None when nothing matches; fail with a clear message instead
# of the bare IndexError that indexing an empty collect() result would raise.
if example_row is None:
    raise ValueError(
        "integration.property_test1 has no row with valid_from > 2022-01-01; "
        "cannot pick an example property"
    )
example_property = example_row[0]

# Show the open-ended row (valid_to = 9999-12-31) for that property in the
# current table, then the same query against table version 1 (time travel).
spark.sql(f"SELECT * FROM integration.property_test1 WHERE valid_to=date '9999-12-31' AND property_id='{example_property}'").show()
spark.sql(f"SELECT * FROM integration.property_test1 VERSION AS OF 1 WHERE valid_to=date '9999-12-31' AND property_id='{example_property}'").show()

In [None]:
def _show_history_metrics(delta_table):
    """Show a Delta table's history trimmed to version, timestamp, operation and row metrics.

    Args:
        delta_table: object exposing ``history()`` (here a ``DeltaTable``).
    """
    delta_table.history().select(
        "version",
        col("timestamp"),
        "operation",
        col("operationMetrics.numTargetRowsInserted").alias("RowsInserted"),
        col("operationMetrics.numTargetRowsUpdated").alias("RowsUpdated"),
        "operationMetrics.numSourceRows",
        "operationMetrics.executionTimeMs",
    ).show()


dt = DeltaTable.forName(spark, "integration.property_test1")

# History before and after compaction, so the entry added by the optimize
# run can be seen next to the earlier ones.
_show_history_metrics(dt)
print("compaction")
dt.optimize().executeCompaction().show()
_show_history_metrics(dt)

# Second optimize pass, this time Z-ordering by property_id.
print("ZOrder")
dt.optimize().executeZOrderBy("property_id").show()