In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [7]:
spark = SparkSession.builder.appName("BMW Sales Analysis").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [8]:
schema = StructType(
    [
        StructField("Model", StringType(), True),
        StructField("Year", IntegerType(), True),
        StructField("Region", StringType(), True),
        StructField("Color", StringType(), True),
        StructField("Fuel_Type", StringType(), True),
        StructField("Transmission", StringType(), True),
        StructField("Engine_Size_L", DoubleType(), True),
        StructField("Mileage_KM", IntegerType(), True),
        StructField("Price_USD", IntegerType(), True),
        StructField("Sales_Volume", IntegerType(), True),
        StructField("Sales_Classification", StringType(), True),
    ]
)

In [9]:
df = spark.read.csv(
    path="dataset/BMW sales data (2010-2024).xls",
    schema=schema,
    header=True,
    mode="PERMISSIVE",
)

In [10]:
df.count()

50000

In [11]:
df.printSchema()

root
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Engine_Size_L: double (nullable = true)
 |-- Mileage_KM: integer (nullable = true)
 |-- Price_USD: integer (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Sales_Classification: string (nullable = true)



In [12]:
df.createOrReplaceTempView("BMW_dataset")

In [15]:
avg_price_per_region = spark.sql(
    """
    select region, round(avg(price_usd), 2) as avg_price
    from bmw_dataset
    group by region
    order by avg_price desc
    """
)
avg_price_per_region.show()

+-------------+---------+
|       region|avg_price|
+-------------+---------+
|         Asia| 75554.93|
|North America| 75070.05|
|       Europe| 74988.36|
|South America|  74973.6|
|       Africa| 74885.77|
|  Middle East| 74726.79|
+-------------+---------+



In [16]:
top_selling_models = spark.sql(
    """
    select model, sum(sales_volume) as sales_volume
    from bmw_dataset
    group by model
    order by sales_volume desc
    """
)
top_selling_models.show()

+--------+------------+
|   model|sales_volume|
+--------+------------+
|7 Series|    23786466|
|      i8|    23423891|
|      X1|    23406060|
|3 Series|    23281303|
|      i3|    23133849|
|5 Series|    23097519|
|      M5|    22779688|
|      X3|    22745529|
|      X5|    22709749|
|      X6|    22661986|
|      M3|    22349694|
+--------+------------+



In [19]:
count_by_fuel_type = spark.sql(
    """
    select fuel_type, count(model) no_of_cars
    from bmw_dataset
    group by fuel_type 
    order by fuel_type desc
    """
)
count_by_fuel_type.show()

+---------+----------+
|fuel_type|no_of_cars|
+---------+----------+
|   Petrol|     12550|
|   Hybrid|     12716|
| Electric|     12471|
|   Diesel|     12263|
+---------+----------+



In [27]:
high_mil_low_price = spark.sql(
    """
    select model, year, mileage_km, price_usd
    from bmw_dataset
    where mileage_km >= 100000 
          and
          price_usd <= 40000 
    order by price_usd asc
    """
)
high_mil_low_price.show()

+--------+----+----------+---------+
|   model|year|mileage_km|price_usd|
+--------+----+----------+---------+
|      i8|2013|    165642|    30000|
|      X5|2022|    100254|    30001|
|5 Series|2015|    190634|    30002|
|      M5|2020|    151258|    30010|
|7 Series|2015|    169744|    30025|
|      M3|2019|    118626|    30037|
|      X1|2015|    174797|    30039|
|      X5|2017|    152167|    30046|
|      X1|2017|    162189|    30046|
|      X3|2024|    134083|    30054|
|      M3|2022|    178091|    30055|
|      X1|2021|    179254|    30056|
|      i3|2015|    178470|    30059|
|      i3|2014|    151719|    30061|
|      M3|2014|    179483|    30062|
|      i3|2024|    116337|    30067|
|7 Series|2016|    114482|    30068|
|3 Series|2013|    162509|    30068|
|      X1|2013|    115667|    30069|
|      X3|2010|    115565|    30075|
+--------+----+----------+---------+
only showing top 20 rows


In [34]:
transmission_distr_per_region = spark.sql(
    """
    select region, 
           sum(case when transmission = 'Manual' then 1 else 0 end) as manuals,
           sum(case when transmission = 'Automatic' then 1 else 0 end) as automatics
    from bmw_dataset
    group by region    
    """
)
transmission_distr_per_region.show()

+-------------+-------+----------+
|       region|manuals|automatics|
+-------------+-------+----------+
|       Europe|   4217|      4117|
|       Africa|   4132|      4121|
|North America|   4163|      4172|
|South America|   4218|      4033|
|  Middle East|   4228|      4145|
|         Asia|   4196|      4258|
+-------------+-------+----------+



In [51]:
engine_size_analysis = spark.sql(
    """
    select fuel_type,
        cast(year as int) as year,
        round(avg(cast(engine_size_l as double)), 2) as avg_engine_size
    from bmw_dataset
    where cast(year as int) >= 2015
    group by fuel_type, cast(year as int)
    order by year asc
    """
)
engine_size_analysis.show()

+---------+----+---------------+
|fuel_type|year|avg_engine_size|
+---------+----+---------------+
|   Diesel|2015|           3.31|
|   Petrol|2015|           3.23|
|   Hybrid|2015|           3.27|
| Electric|2015|           3.18|
|   Diesel|2016|           3.22|
|   Hybrid|2016|           3.26|
| Electric|2016|           3.27|
|   Petrol|2016|           3.25|
| Electric|2017|           3.23|
|   Petrol|2017|            3.3|
|   Hybrid|2017|           3.21|
|   Diesel|2017|           3.26|
|   Petrol|2018|           3.26|
|   Hybrid|2018|           3.22|
|   Diesel|2018|           3.24|
| Electric|2018|           3.22|
|   Petrol|2019|           3.23|
|   Hybrid|2019|           3.19|
|   Diesel|2019|            3.3|
| Electric|2019|           3.22|
+---------+----+---------------+
only showing top 20 rows


Since I group by both fuel_type and year, I am expected to get multiple rows per year (one per fuel type). If you want only one row per year, you need to drop `fuel_type` from group by.

Besides this, I also have casted them to double and integers. I had already provided schema, but who knows what can happen?:) 

In [63]:
popular_color_per_region = spark.sql(
    """
    select region, color, count(model) no_of_models
    from bmw_dataset
    group by region, color
    order by no_of_models desc
"""
)
popular_color_per_region.show(5)

+-------------+------+------------+
|       region| color|no_of_models|
+-------------+------+------------+
|       Europe| Black|        1473|
|North America|   Red|        1461|
|         Asia| Black|        1460|
|North America|Silver|        1435|
|  Middle East|  Grey|        1429|
+-------------+------+------------+
only showing top 5 rows


In [62]:
yearly_sales = spark.sql(
    """
    select year, sum(sales_volume) as sales_volume
    from bmw_dataset
    group by year
    order by sales_volume desc
    """
)
yearly_sales.show(5)

+----+------------+
|year|sales_volume|
+----+------------+
|2022|    17920946|
|2024|    17527854|
|2019|    17191956|
|2015|    17010207|
|2014|    16958960|
+----+------------+
only showing top 5 rows


At this point, I finally understood that the data is **synthetic/unrealistic.**