# Spark job to run in a Data Lake notebook

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ShortType, DateType
from pyspark.sql.functions import col, datediff

# Explicit schema for the listing records: column names, data types and
# column order. Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Rego", StringType()),
        ("Brand", StringType()),
        ("Model", StringType()),
        ("Trim", StringType()),
        ("Year", ShortType()),
        ("Odometer", IntegerType()),
        ("Price", IntegerType()),
        ("Date listed", DateType()),
        ("Date removed", DateType()),
        ("Turnover", ShortType()),
    )
])

# Read the JSON files under /mnt/bronze into a Spark DataFrame, applying the
# explicit schema instead of letting Spark infer one.
df = spark.read.json("/mnt/bronze", schema=schema)

# Derive "Turnover" as the number of days from "Date listed" to "Date removed".
# (This step previously appeared twice in a row; one application is enough,
# since it overwrites the same column with the same expression.)
# NOTE(review): the schema declares "Turnover" as ShortType while this cast
# produces an integer column — confirm which type is intended downstream.
df = df.withColumn("Turnover", datediff(col("Date removed"), col("Date listed")).cast("int"))

# Preview the first rows of the transformed DataFrame.
df.show()

##### Read the data from the Delta table

# Load the Delta table stored at /mnt/delta/usedCarSales and preview its rows.
delta_df = spark.read.load("/mnt/delta/usedCarSales", format="delta")
delta_df.show()

# `spark.conf` to optimize performance 

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ShortType, DateType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff

##### Fine-tune Spark settings

In [None]:
# Maximum size, in bytes, of a table that Spark will broadcast to every worker
# when performing a join. Setting it to -1 disables broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 

# Number of partitions used when data is shuffled for DataFrame / Spark SQL
# joins and aggregations. More partitions means more (smaller) tasks.
# Set to 5 here.
spark.conf.set("spark.sql.shuffle.partitions", 5)

# Default number of partitions for RDD operations (e.g. join, reduceByKey,
# parallelize) when the caller does not specify one. Set to 10 here.
# NOTE(review): this is a core Spark option that is normally read when the
# SparkContext starts, so setting it on an already-running session may have
# no effect — confirm, and if so move it to the cluster / session-builder config.
spark.conf.set("spark.default.parallelism", 10)

In [None]:
# Reuse the active Spark session if there is one; otherwise create a new
# session with the application name "UsedCarSalesData".
spark = (
    SparkSession.builder
    .appName("UsedCarSalesData")
    .getOrCreate()
)

# Explicit schema: fixes both the data type of each column and the order in
# which the columns appear. Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Rego", StringType()),
        ("Brand", StringType()),
        ("Model", StringType()),
        ("Trim", StringType()),
        ("Year", ShortType()),
        ("Odometer", IntegerType()),
        ("Price", IntegerType()),
        ("Date listed", DateType()),
        ("Date removed", DateType()),
        ("Turnover", ShortType()),
    )
])

# Spark read: load every JSON file matching "raw data/new_listing_*.json"
# into a single DataFrame, applying the explicit schema.
df = spark.read.schema(schema).json("raw data/new_listing_*.json")
# Spark transformation: "Turnover" is the number of days from "Date listed"
# to "Date removed".
df = df.withColumn(
    "Turnover",
    datediff(col("Date removed"), col("Date listed")).cast("int"),
)
