In [1]:
# Set the PySpark environment variables
import os
import sys

# Point PySpark at the local Spark install and make it use the interpreter
# that is running this kernel (sys.executable).
os.environ.update(
    {
        "SPARK_HOME": r"C:\_dev\spark-3.5.1-hadoop3",
        "PYSPARK_PYTHON": sys.executable,
    }
)
# Driver overrides, currently disabled:
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
# os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window


# Start the SparkSession for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("QA_SCD")
    .getOrCreate()
)

In [3]:
# Load gaddress.csv: the first line supplies the column names and Spark
# infers each column's type from the data.
source = spark.read.options(header=True, inferSchema=True).csv(
    "D:\\pyspark-tutorial\\SCD\\data\\gaddress.csv"
)

# Inspect the inferred schema, then the first 30 rows
source.printSchema()

source.show(30)

root
 |-- Id: integer (nullable = true)
 |-- PhysicalAddress_Latitude: integer (nullable = true)
 |-- __START_AT: string (nullable = true)
 |-- __END_AT: string (nullable = true)

+---+------------------------+--------------------+--------------------+
| Id|PhysicalAddress_Latitude|          __START_AT|            __END_AT|
+---+------------------------+--------------------+--------------------+
|  1|                     111|2023-02-17T08:29:...|2023-02-18T08:29:...|
|  1|                     112|2023-02-18T08:29:...|2023-02-19T08:29:...|
|  1|                     112|2023-02-19T08:29:...|2023-02-20T08:29:...|
|  1|                     112|2023-02-20T08:29:...|2023-02-21T08:29:...|
|  1|                     113|2023-02-21T08:29:...|                NULL|
|  2|                     221|2023-02-22T08:29:...|                NULL|
|  3|                     331|2023-02-23T08:29:...|                NULL|
|  4|                     441|2023-02-24T08:29:...|2023-02-25T08:29:...|
|  4|            

In [4]:
# Disabled: loading and inspection of the `target` DataFrame (location_dim.csv).
# NOTE(review): while this stays commented out, `target` is not defined by any
# cell visible here -- confirm nothing later depends on it, or remove this cell.
# target = spark.read.csv("D:\\pyspark-tutorial\\SCD\\data\\location_dim.csv", header=True, inferSchema=True)

# target.printSchema()

# target.show(30)

In [5]:
# Within each Id, order rows by __START_AT so lag/lead return the neighbouring rows.
windowPartition = Window.partitionBy("Id").orderBy("__START_AT")

# Column expressions reused below.
latitude = col("PhysicalAddress_Latitude")
prev_latitude = col("prev_PhysicalAddress_Latitude")
next_latitude = col("next_PhysicalAddress_Latitude")
not_middle = col("adjacent") != "Middle"

# Label each row by its position inside a run of identical consecutive
# latitudes. Every branch compares against both neighbours, so a row whose
# own, previous or next latitude is NULL (including the first/last row of an
# Id) matches no branch and is labelled "False".
adjacent_label = (
    when((latitude != prev_latitude) & (latitude == next_latitude), lit("First"))
    .when((latitude == prev_latitude) & (latitude == next_latitude), lit("Middle"))
    .when((latitude == prev_latitude) & (latitude != next_latitude), lit("Last"))
    .otherwise(lit("False"))
)

source_new = (
    source
    .withColumn(
        "prev_PhysicalAddress_Latitude",
        lag("PhysicalAddress_Latitude", 1).over(windowPartition),
    )
    .withColumn(
        "next_PhysicalAddress_Latitude",
        lead("PhysicalAddress_Latitude", 1).over(windowPartition),
    )
    .withColumn("adjacent", adjacent_label)
    # The *_NEW columns copy the original dates, blanked out on "Middle" rows.
    .withColumn(
        "__START_AT_NEW",
        when(not_middle, col("__START_AT")).otherwise(lit(None)),
    )
    .withColumn(
        "__END_AT_NEW",
        when(not_middle, col("__END_AT")).otherwise(lit(None)),
    )
)

source_new.show()

# Drop the "Middle" rows, i.e. keep only rows where adjacent != "Middle".
source_new = source_new.filter(not_middle)

source_new.show()

+---+------------------------+--------------------+--------------------+-----------------------------+-----------------------------+--------+--------------------+--------------------+
| Id|PhysicalAddress_Latitude|          __START_AT|            __END_AT|prev_PhysicalAddress_Latitude|next_PhysicalAddress_Latitude|adjacent|      __START_AT_NEW|        __END_AT_NEW|
+---+------------------------+--------------------+--------------------+-----------------------------+-----------------------------+--------+--------------------+--------------------+
|  1|                     111|2023-02-17T08:29:...|2023-02-18T08:29:...|                         NULL|                          112|   False|2023-02-17T08:29:...|2023-02-18T08:29:...|
|  1|                     112|2023-02-18T08:29:...|2023-02-19T08:29:...|                          111|                          112|   First|2023-02-18T08:29:...|2023-02-19T08:29:...|
|  1|                     112|2023-02-19T08:29:...|2023-02-20T08:29:...|        

In [7]:
# Build source_expected from source_new: overwrite __START_AT / __END_AT with
# their *_NEW counterparts, then drop the helper columns.
helper_columns = [
    "prev_PhysicalAddress_Latitude",
    "next_PhysicalAddress_Latitude",
    "adjacent",
    "__START_AT_NEW",
    "__END_AT_NEW",
]
source_expected = (
    source_new
    .withColumn("__START_AT", col("__START_AT_NEW"))
    .withColumn("__END_AT", col("__END_AT_NEW"))
    .drop(*helper_columns)
)
source_expected.show()

+---+------------------------+--------------------+--------------------+
| Id|PhysicalAddress_Latitude|          __START_AT|            __END_AT|
+---+------------------------+--------------------+--------------------+
|  1|                     111|2023-02-17T08:29:...|2023-02-18T08:29:...|
|  1|                     112|2023-02-18T08:29:...|2023-02-19T08:29:...|
|  1|                     112|2023-02-20T08:29:...|2023-02-21T08:29:...|
|  1|                     113|2023-02-21T08:29:...|                NULL|
|  2|                     221|2023-02-22T08:29:...|                NULL|
|  3|                     331|2023-02-23T08:29:...|                NULL|
|  4|                     441|2023-02-24T08:29:...|2023-02-25T08:29:...|
|  4|                    NULL|2023-02-25T08:29:...|2023-02-26T08:29:...|
|  4|                     442|2023-02-26T08:29:...|                NULL|
|  5|                     442|2023-02-27T08:29:...|2023-02-28T08:29:...|
|  5|                    NULL|2023-02-28T08:29:...|