In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ShortType, DateType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, datediff

spark = SparkSession.builder.appName("UsedCarSalesData").getOrCreate()

# define schema (desired `dtypes` and the `order of columns`)

schema = StructType([
    StructField("Rego", StringType(), True),
    StructField("Brand", StringType(), True),
    StructField("Model", StringType(), True),
    StructField("Trim", StringType(), True),
    StructField("Year", ShortType(), True),
    StructField("Odometer", IntegerType(), True),
    StructField("Price", IntegerType(), True),
    StructField("Date listed", DateType(), True),
    StructField("Date removed", DateType(), True),
    StructField("Turnover", ShortType(), True)
])

# Spark Join
df = spark.read.json("raw data/new_listing_*.json", schema=schema)
# Spark Transformation
df = df.withColumn("Turnover", datediff(col("Date removed"), col("Date listed")).cast("int"))


In [4]:
df.show()
df.dtypes

+------+------+-----+----+----+--------+-----+-----------+------------+--------+
|  Rego| Brand|Model|Trim|Year|Odometer|Price|Date listed|Date removed|Turnover|
+------+------+-----+----+----+--------+-----+-----------+------------+--------+
|LHZ303|Toyota|    B|   2|2020|  196084|90277| 2020-05-19|  2021-01-27|     253|
|NKQ284|Nissan|    A|   2|2020|  188788|50621| 2020-08-14|  2020-08-24|      10|
|QAS328|Toyota|    D|   3|2022|  165034|98279| 2022-08-23|  2023-03-26|     215|
|FMO676|Toyota|    A|   3|2018|  160474|75542| 2018-05-23|  2018-09-26|     126|
|AQQ765|Toyota|    A|   1|2020|  144053|46287| 2020-09-17|  2021-01-06|     111|
|NYB555|Nissan|    A|   1|2021|  168031|60009| 2021-11-17|  2022-08-01|     257|
|WVX703|Nissan|    B|   1|2017|  185065|27538| 2017-12-13|  2018-07-30|     229|
|DWF187|Nissan|    D|   2|2020|  100041|49905| 2020-05-20|  2020-07-16|      57|
|KBA540|Toyota|    A|   1|2018|  165656|76923| 2018-07-22|  2018-11-18|     119|
|DVH720|Toyota|    B|   2|20

[('Rego', 'string'),
 ('Brand', 'string'),
 ('Model', 'string'),
 ('Trim', 'string'),
 ('Year', 'smallint'),
 ('Odometer', 'int'),
 ('Price', 'int'),
 ('Date listed', 'date'),
 ('Date removed', 'date'),
 ('Turnover', 'int')]

#### Write the original CSV data into Parquet format

In [None]:
df.write.parquet("silver data")