In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp, expr

In [9]:
from ntbk_logger import get_notebook_logger

# Notebook-wide logger; the log file location is named once so it is easy to
# find and change.
LOG_FILE = "logs/scd.log"
logger = get_notebook_logger(logfile=LOG_FILE)

In [10]:
# Local Spark session (all available cores) with Delta Lake's session
# extension and catalog registered; the final cell writes in "delta" format.
session_builder = (
    SparkSession.builder
    .appName("scd")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = session_builder.getOrCreate()

In [11]:
# Sample employee records, written as compact tuples and expanded into one
# dict per employee (keys in the order given by `employee_fields`).
employee_fields = ("id", "name", "age", "department", "salary")
employee_rows = [
    (1, "Alice", 29, "HR", 50000),
    (2, "Bob", 35, "Engineering", 80000),
    (3, "Charlie", 28, "Sales", 45000),
    (4, "David", 42, "Engineering", 95000),
    (5, "Eva", 31, "HR", 52000),
    (6, "Frank", 36, "Marketing", 60000),
    (7, "Grace", 26, "Sales", 47000),
    (8, "Henry", 40, "Engineering", 91000),
    (9, "Ivy", 30, "Marketing", 58000),
    (10, "Jack", 38, "Sales", 49000),
]
data = [dict(zip(employee_fields, row)) for row in employee_rows]

# No explicit schema is passed, so Spark infers it from the dicts; the printed
# table below lists the columns alphabetically rather than in key order.
df = spark.createDataFrame(data)

df.show()

+---+-----------+---+-------+------+
|age| department| id|   name|salary|
+---+-----------+---+-------+------+
| 29|         HR|  1|  Alice| 50000|
| 35|Engineering|  2|    Bob| 80000|
| 28|      Sales|  3|Charlie| 45000|
| 42|Engineering|  4|  David| 95000|
| 31|         HR|  5|    Eva| 52000|
| 36|  Marketing|  6|  Frank| 60000|
| 26|      Sales|  7|  Grace| 47000|
| 40|Engineering|  8|  Henry| 91000|
| 30|  Marketing|  9|    Ivy| 58000|
| 38|      Sales| 10|   Jack| 49000|
+---+-----------+---+-------+------+



In [12]:
# Status label for rows that are currently in effect.
cur_status = "active"

# Column expression for the load timestamp. This is not a Python datetime:
# Spark evaluates it when a query using it actually runs.
start_time = current_timestamp()

In [13]:
# Add the SCD validity columns, reusing `start_time` and `cur_status` from the
# previous cell instead of re-spelling the same expressions here.
#
# NOTE(review): the 4-year validity window is a magic number - confirm the
# intended business rule (SCD tables often use NULL or a far-future end date).
# NOTE(review): the timestamp columns are lazy expressions, so each action
# (this show(), a later write) evaluates them at its own run time; the values
# persisted later will presumably differ from the ones printed here.
scd_end_time = start_time + expr("interval 4 years")

df = (
    df.withColumn("start_time", start_time)
    .withColumn("end_time", scd_end_time)
    .withColumn("cur_status", lit(cur_status))
)

df.show()

+---+-----------+---+-------+------+--------------------+--------------------+----------+
|age| department| id|   name|salary|          start_time|            end_time|cur_status|
+---+-----------+---+-------+------+--------------------+--------------------+----------+
| 29|         HR|  1|  Alice| 50000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 35|Engineering|  2|    Bob| 80000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 28|      Sales|  3|Charlie| 45000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 42|Engineering|  4|  David| 95000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 31|         HR|  5|    Eva| 52000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 36|  Marketing|  6|  Frank| 60000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 26|      Sales|  7|  Grace| 47000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 40|Engineering|  8|  Henry| 91000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
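| 30|  Marketing|  9|    Ivy| 58000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
| 38|      Sales| 10|   Jack| 49000|2025-10-06 17:11:...|2029-10-06 17:11:...|    active|
+---+-----------+---+-------+------+--------------------+--------------------+----------+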

In [15]:
# Persist the employee DataFrame in Delta format; "overwrite" mode replaces
# whatever is already stored at the target path.
employee_writer = df.write.format("delta").mode("overwrite")
employee_writer.save("data/employee")