In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode. The jar listed under
# "spark.jars" is handed to the session at start-up; a later cell calls into a
# JVM class (spark.udaf.LeadUnequalDateString) that presumably lives in this
# jar -- confirm against the scala-udaf build.
spark = (
    SparkSession.builder
    .master("local")
    .appName("SvnLocalSpark")
    .config("spark.jars", "../scala-udaf/target/scala-2.13/svn-local-spark_2.13-0.1.0-SNAPSHOT.jar")
    .getOrCreate()
)

# Show which Spark version is running and where its web UI can be reached.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
# Make sure the schema that holds the landing tables exists.
spark.sql("CREATE SCHEMA IF NOT EXISTS landing")

# Expose the CSV file as an external table. createTable raises when the table
# is already registered, so guard it to keep this cell re-runnable within the
# same Spark session.
if not spark.catalog.tableExists("landing.commercial_properties"):
    spark.catalog.createTable(
        tableName="landing.commercial_properties",
        source="csv",
        description="property values",
        header="true",
        delimiter=",",
        path="../../../resources/sourcedata/commercial_property_snapshots_100_M39.csv",
        inferSchema="true",
    )

# Call the Scala-side register hook with the underlying JVM SparkSession.
# NOTE(review): presumably this registers the LEAD_UNEQUAL_DATE_STRING SQL
# function used in a later cell -- confirm in the Scala source.
java_udf1 = spark.sparkContext._jvm.spark.udaf.LeadUnequalDateString.register(spark._jsparkSession)

In [3]:
# Load the sparksql_magic IPython extension; the cells below use the
# %%sparksql cell magic to run SQL against the Spark session.
%load_ext sparksql_magic

In [4]:
%%sparksql -l 150
-- For every property snapshot dated 2024-12-31, show its current Green/Non-green
-- status and the most recent earlier snapshot date on which the property had
-- the opposite status (NULL when it never had the opposite status).
WITH classified AS (
    -- Single definition of the Green/Non-green rule, shared by both sides of the join.
    SELECT property_id, city, property_value, energy_label, `date`
        ,CASE WHEN energy_label IN ('A','B') THEN 'Green' ELSE 'Non-green' END AS green
    FROM landing.commercial_properties
),
last_seen_per_status AS (
    -- Latest snapshot date before the reporting date, per property and status.
    SELECT property_id, green, MAX(`date`) AS up_or_downgrade
    FROM classified
    WHERE `date` < DATE '2024-12-31'
    GROUP BY property_id, green
)
SELECT cur.property_id, cur.city, cur.property_value, cur.energy_label
    ,cur.green
    ,prev.up_or_downgrade
FROM classified AS cur
LEFT OUTER JOIN last_seen_per_status AS prev
    ON prev.property_id = cur.property_id
    -- Only the row with the other status matches, i.e. the status the property had before it changed.
    AND prev.green <> cur.green
WHERE cur.`date` = DATE '2024-12-31'

0,1,2,3,4,5
property_id,city,property_value,energy_label,green,up_or_downgrade
P001,Raleigh,528302.48,F,Non-green,2022-10-29
P002,Concord,1152838.35,F,Non-green,2023-12-15
P003,Asheville,471126.83,A,Green,
P004,Gastonia,1413682.53,D,Non-green,
P005,Winston-Salem,593854.47,E,Non-green,
P006,Goldsboro,649366.59,F,Non-green,2024-09-26
P007,Sanford,620120.98,C,Non-green,
P008,Charlotte,185266.36,D,Non-green,
P009,Durham,132990.29,C,Non-green,2023-07-19


In [5]:
%%sparksql -l 150
-- Same report as the self-join query above, computed with the custom
-- LEAD_UNEQUAL_DATE_STRING aggregate over a per-property window instead.
WITH classified AS (
    -- Single definition of the Green/Non-green rule.
    SELECT cp.`date`, cp.property_id, cp.city, cp.property_value, cp.energy_label
        ,CASE WHEN cp.energy_label IN ('A','B') THEN 'Green' ELSE 'Non-green' END AS green
    FROM landing.commercial_properties AS cp
),
with_change_date AS (
    -- Window: the current snapshot plus all older snapshots of the same
    -- property (newest first). The aggregate receives the snapshot date and status.
    SELECT c.`date`, c.property_id, c.city, c.property_value, c.energy_label, c.green
        ,LEAD_UNEQUAL_DATE_STRING(c.`date`, c.green)
            OVER (PARTITION BY c.property_id
                  ORDER BY c.`date` DESC
                  ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS up_or_downgrade
    FROM classified AS c
)
-- Filter after the window is evaluated so the aggregate still sees older snapshots.
SELECT *
FROM with_change_date
WHERE `date` = DATE '2024-12-31'

0,1,2,3,4,5,6
date,property_id,city,property_value,energy_label,green,up_or_downgrade
2024-12-31,P001,Raleigh,528302.48,F,Non-green,2022-10-29
2024-12-31,P002,Concord,1152838.35,F,Non-green,2023-12-15
2024-12-31,P003,Asheville,471126.83,A,Green,
2024-12-31,P004,Gastonia,1413682.53,D,Non-green,
2024-12-31,P005,Winston-Salem,593854.47,E,Non-green,
2024-12-31,P006,Goldsboro,649366.59,F,Non-green,2024-09-26
2024-12-31,P007,Sanford,620120.98,C,Non-green,
2024-12-31,P008,Charlotte,185266.36,D,Non-green,
2024-12-31,P009,Durham,132990.29,C,Non-green,2023-07-19
