Simple example that creates a local Spark session, registers an (external) table, and selects data from it using Spark SQL and PySpark.

In [1]:
from pyspark.sql import SparkSession

# Build the session (or reuse an existing one) with the master set to "local".
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .master("local")
    .getOrCreate()
)

# Show which Spark version is running and where its web UI is served.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
# Make sure the target schema exists before registering the table in it.
spark.sql("CREATE SCHEMA IF NOT EXISTS landing")

# Register the CSV file as an external table, but only when the catalog does
# not already contain it: createTable raises for an existing table, which
# would make this cell fail when it is re-run in the same session.
if not spark.catalog.tableExists("landing.commercial_properties"):
    spark.catalog.createTable(
        tableName="landing.commercial_properties",
        source="csv",
        description="property values",
        # The remaining keyword arguments are options for the csv source.
        header="true",
        delimiter=",",
        path="../../../resources/sourcedata/commercial_property_snapshots_100_M39.csv",
        inferSchema="true",
    )

# Preview the first rows of the registered table.
raw = spark.table("landing.commercial_properties")
raw.limit(10).show()

+----------+-----------+-----------------+-------------+-------------+--------+---------+--------------+------------+
|      date|property_id|           street|street_number|         city|zip_code| category|property_value|energy_label|
+----------+-----------+-----------------+-------------+-------------+--------+---------+--------------+------------+
|2022-01-01|       P001|Greensboro Street|          430|      Raleigh|   28457| Workshop|     230818.13|           A|
|2022-01-01|       P002|      Pine Street|          634|      Concord|   27901|Warehouse|     495643.33|           F|
|2022-01-01|       P003|   Tar Heel Drive|          846|    Asheville|   28330|   Office|     227615.59|           A|
|2022-01-01|       P004|       Ash Street|          931|     Gastonia|   27064| Workshop|     414742.28|           F|
|2022-01-01|       P005|    Spring Street|          759|Winston-Salem|   28753|Warehouse|     286378.57|           C|
|2022-01-01|       P006|       Oak Street|          933|

In [3]:
# Load the sparksql_magic IPython extension so the %%sparksql cell magic used in the next cell is available.
%load_ext sparksql_magic

In [4]:
%%sparksql
-- Table summary: total rows plus distinct snapshot dates, properties and cities.
SELECT
    COUNT(*) AS total_rows,
    COUNT(DISTINCT `date`) AS total_snapshots,
    COUNT(DISTINCT property_id) AS total_properties,
    COUNT(DISTINCT city) AS cities
FROM landing.commercial_properties

0,1,2,3
total_rows,total_snapshots,total_properties,cities
118600,1186,100,25
