In [1]:
from pyspark.sql import SparkSession

# Local Spark session with Delta Lake enabled.
#  - spark.sql.warehouse.dir: warehouse location is ../delta-data-tmp
#  - spark.jars.packages: Delta artifact resolved at startup; the `_2.13` suffix is the
#    Scala version of the artifact and has to match the Scala build of the Spark runtime
#  - spark.sql.extensions / spark.sql.catalog.spark_catalog: register Delta's SQL
#    extension and catalog so `USING DELTA` tables work through plain Spark SQL
spark = (
    SparkSession.builder
    .appName("SvnLocalSpark")
    .config("spark.sql.warehouse.dir", "../delta-data-tmp")
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:3.3.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .master("local")
    .getOrCreate()
)

# Report the Spark version and the address of this session's web UI.
print(f"spark {spark.version} {spark.sparkContext.uiWebUrl}")

spark 3.5.4 http://DESKTOP-4GOMK6M:4040


In [2]:
import shutil
import os

# Directory of the `copytest` schema; it sits inside the warehouse dir configured on the session.
folder_path = "../delta-data-tmp/copytest.db"

# Start from a clean slate: remove whatever an earlier run left behind.
# Nothing happens when the directory is not there.
if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

In [3]:
# Load the extension that provides the %%sparksql cell magic used by the SQL cells in this notebook.
%load_ext sparksql_magic

In [4]:
%%sparksql
-- Create the schema that holds the test tables; does nothing when it already exists.
CREATE SCHEMA IF NOT EXISTS copytest

In [5]:
%%sparksql
-- Target Delta table for the commercial property snapshots; `date` is the snapshot date.
-- CREATE OR REPLACE (instead of a plain CREATE) keeps this cell re-runnable: a table
-- definition left over from an earlier run is replaced rather than raising
-- "table already exists".
CREATE OR REPLACE TABLE copytest.commercial_property(
    `date` date,
    property_id string,
    street string,
    street_number string,
    city string,
    zip_code string,
    category string,
    property_value double,
    energy_label string
)
USING DELTA
-- USING DELTA is required, otherwise it will default to hive and throw an error if hive is not enabled

In [6]:
%%sparksql 
-- Inspect the definition of the table created above.
DESCRIBE EXTENDED copytest.commercial_property

0,1,2
col_name,data_type,comment
date,date,
property_id,string,
street,string,
street_number,string,
city,string,
zip_code,string,
category,string,
property_value,double,
energy_label,string,


In [7]:
%%sparksql
-- Expose every file under the 2024/10 folder as one temporary view.
-- header: the first line of each file supplies the column names.
-- inferSchema: column types are inferred from the data.
-- NOTE(review): inferred types may differ from the types declared on copytest.commercial_property - confirm.
CREATE OR REPLACE TEMP VIEW source_cp
USING CSV
OPTIONS (
    path '../resources/generated/commercial_property/2024/10/*',
    header true,
    inferSchema true
)

In [8]:
%%sparksql
-- Append the CSV rows to the Delta table, deriving `date` from the name of the file each row came from.
-- `fn` and `datepos` are aliases that later items of the same select list refer to.
WITH src AS (
    SELECT
        *,
        -- name of the file the row was read from
        input_file_name() AS fn,
        -- 1-based position of the snapshot prefix inside that name
        locate("/commercial_property_snapshot_", fn) AS datepos,
        -- the prefix is 30 characters long; the 8 characters after it are parsed as yyyyMMdd
        to_date(substring(fn, datepos + 30, 8), 'yyyyMMdd') AS `date`
    FROM source_cp
)
INSERT INTO copytest.commercial_property
SELECT
    `date`,
    property_id,
    street,
    street_number,
    city,
    zip_code,
    category,
    property_value,
    energy_label
FROM src

In [14]:
%%sparksql
-- Remove any earlier copy so the CREATE TABLE in the next cell can be run again.
DROP TABLE IF EXISTS copytest.commercial_propertyV2;


In [None]:
%%sparksql
-- Build the V2 table straight from the CSV view; it has no `date` column and its
-- column types are whatever the view provides.
-- NOTE(review): the earlier note here said CREATE TABLE AS SELECT does not work with a CTE.
-- Spark SQL accepts a CTE placed after AS (CREATE TABLE ... AS WITH ... SELECT ...) but not
-- a WITH clause ahead of CREATE TABLE - confirm which form was tried.
CREATE TABLE copytest.commercial_propertyV2
USING delta
AS
SELECT
    property_id,
    street,
    street_number,
    city,
    zip_code,
    category,
    property_value,
    energy_label
FROM source_cp;