This example reads snapshot data and creates a slowly changing dimension from it using hashing and window functions.

In [1]:
from pyspark.sql import SparkSession

# Get (or reuse) the Spark session for this notebook, running with the "local" master.
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .master("local")
    .getOrCreate()
)

# Report the Spark version and the URL of the Spark web UI for this session.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
# Load the snapshot CSV: comma-delimited, first row is the header,
# and column types are inferred by Spark rather than declared.
raw = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv("../resources/sourcedata/commercial_property_snapshots_100_M39.csv")
)

# Show the inferred column names and types.
raw.printSchema()

root
 |-- date: date (nullable = true)
 |-- property_id: string (nullable = true)
 |-- street: string (nullable = true)
 |-- street_number: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- zip_code: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- property_value: double (nullable = true)
 |-- energy_label: string (nullable = true)



In [3]:
# Preview up to 20 snapshot rows for property P012, ordered by snapshot date.
(
    raw
    .filter("property_id='P012'")
    .sort("date")
    .limit(20)
    .show()
)

+----------+-----------+---------------+-------------+----------+--------+--------+--------------+------------+
|      date|property_id|         street|street_number|      city|zip_code|category|property_value|energy_label|
+----------+-----------+---------------+-------------+----------+--------+--------+--------------+------------+
|2022-01-01|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     187462.36|           C|
|2022-01-02|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     187462.36|           C|
|2022-01-03|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     187462.36|           C|
|2022-01-04|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     187462.36|           C|
|2022-01-05|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     191214.34|           C|
|2022-01-06|       P012|Carolina Avenue|          847|Wilmington|   28777|  Office|     191214.34|      

In [4]:
from svn.ingest.Scd2Util import Scd2Util

# Build the slowly changing dimension from the snapshot rows.
# NOTE(review): ["property_id"] and "date" are presumably the key columns and the
# snapshot-date column — confirm against Scd2Util.fromSnapshots.
scd = Scd2Util.fromSnapshots(raw, ["property_id"], "date")

# Preview up to 20 validity periods for property P012.
# The result carries valid_from/valid_to and no "date" column, so sort on
# valid_from: it is part of the result schema and does not rely on Spark
# resolving a sort column that the result no longer exposes.
scd.where("property_id='P012'").orderBy("valid_from").limit(20).show()

+-----------+----------+----------+---------------+-------------+----------+--------+--------+--------------+------------+
|property_id|valid_from|  valid_to|         street|street_number|      city|zip_code|category|property_value|energy_label|
+-----------+----------+----------+---------------+-------------+----------+--------+--------+--------------+------------+
|       P012|2022-01-01|2022-01-05|Carolina Avenue|          847|Wilmington|   28777|  Office|     187462.36|           C|
|       P012|2022-01-05|2022-01-18|Carolina Avenue|          847|Wilmington|   28777|  Office|     191214.34|           C|
|       P012|2022-01-18|2022-02-19|Carolina Avenue|          847|Wilmington|   28777|  Office|     196199.71|           C|
|       P012|2022-02-19|2022-03-13|Carolina Avenue|          847|Wilmington|   28777|  Office|     200512.03|           C|
|       P012|2022-03-13|2022-03-29|Carolina Avenue|          847|Wilmington|   28777|  Office|     199764.19|           C|
|       P012|202