In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [7]:
spark = SparkSession.builder.appName("BMW Sales Analysis").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [8]:
schema = StructType(
    [
        StructField("Model", StringType(), True),
        StructField("Year", IntegerType(), True),
        StructField("Region", StringType(), True),
        StructField("Color", StringType(), True),
        StructField("Fuel_Type", StringType(), True),
        StructField("Transmission", StringType(), True),
        StructField("Engine_Size_L", DoubleType(), True),
        StructField("Mileage_KM", IntegerType(), True),
        StructField("Price_USD", IntegerType(), True),
        StructField("Sales_Volume", IntegerType(), True),
        StructField("Sales_Classification", StringType(), True),
    ]
)

In [9]:
df = spark.read.csv(
    path="dataset/BMW sales data (2010-2024).xls",
    schema=schema,
    header=True,
    mode="PERMISSIVE",
)

In [10]:
df.count()

50000

In [11]:
df.printSchema()

root
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Engine_Size_L: double (nullable = true)
 |-- Mileage_KM: integer (nullable = true)
 |-- Price_USD: integer (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Sales_Classification: string (nullable = true)



In [12]:
df.createOrReplaceTempView("BMW_dataset")

In [15]:
avg_price_per_region = spark.sql(
    """
    select region, round(avg(price_usd), 2) as avg_price
    from bmw_dataset
    group by region
    order by avg_price desc
    """
)
avg_price_per_region.show()

+-------------+---------+
|       region|avg_price|
+-------------+---------+
|         Asia| 75554.93|
|North America| 75070.05|
|       Europe| 74988.36|
|South America|  74973.6|
|       Africa| 74885.77|
|  Middle East| 74726.79|
+-------------+---------+



In [16]:
top_selling_models = spark.sql(
    """
    select model, sum(sales_volume) as sales_volume
    from bmw_dataset
    group by model
    order by sales_volume desc
    """
)
top_selling_models.show()

+--------+------------+
|   model|sales_volume|
+--------+------------+
|7 Series|    23786466|
|      i8|    23423891|
|      X1|    23406060|
|3 Series|    23281303|
|      i3|    23133849|
|5 Series|    23097519|
|      M5|    22779688|
|      X3|    22745529|
|      X5|    22709749|
|      X6|    22661986|
|      M3|    22349694|
+--------+------------+



In [19]:
count_by_fuel_type = spark.sql(
    """
    select fuel_type, count(model) no_of_cars
    from bmw_dataset
    group by fuel_type 
    order by fuel_type desc
    """
)
count_by_fuel_type.show()

+---------+----------+
|fuel_type|no_of_cars|
+---------+----------+
|   Petrol|     12550|
|   Hybrid|     12716|
| Electric|     12471|
|   Diesel|     12263|
+---------+----------+



In [65]:
high_mil_low_price = spark.sql(
    """
    select model, year, mileage_km, price_usd
    from bmw_dataset
    where mileage_km >= 100000 
          and
          price_usd <= 40000 
    order by price_usd asc
    """
)
high_mil_low_price.show(5)

+--------+----+----------+---------+
|   model|year|mileage_km|price_usd|
+--------+----+----------+---------+
|      i8|2013|    165642|    30000|
|      X5|2022|    100254|    30001|
|5 Series|2015|    190634|    30002|
|      M5|2020|    151258|    30010|
|7 Series|2015|    169744|    30025|
+--------+----+----------+---------+
only showing top 5 rows


In [66]:
transmission_distr_per_region = spark.sql(
    """
    select region, 
           sum(case when transmission = 'Manual' then 1 else 0 end) as manuals,
           sum(case when transmission = 'Automatic' then 1 else 0 end) as automatics
    from bmw_dataset
    group by region    
    """
)
transmission_distr_per_region.show()

+-------------+-------+----------+
|       region|manuals|automatics|
+-------------+-------+----------+
|       Europe|   4217|      4117|
|       Africa|   4132|      4121|
|North America|   4163|      4172|
|South America|   4218|      4033|
|  Middle East|   4228|      4145|
|         Asia|   4196|      4258|
+-------------+-------+----------+



In [67]:
engine_size_analysis = spark.sql(
    """
    select fuel_type,
        cast(year as int) as year,
        round(avg(cast(engine_size_l as double)), 2) as avg_engine_size
    from bmw_dataset
    where cast(year as int) >= 2015
    group by fuel_type, cast(year as int)
    order by year asc
    """
)
engine_size_analysis.show(5)

+---------+----+---------------+
|fuel_type|year|avg_engine_size|
+---------+----+---------------+
|   Diesel|2015|           3.31|
|   Petrol|2015|           3.23|
|   Hybrid|2015|           3.27|
| Electric|2015|           3.18|
|   Diesel|2016|           3.22|
+---------+----+---------------+
only showing top 5 rows


Since I group by both fuel_type and year, I am expected to get multiple rows per year (one per fuel type). If you want only one row per year, you need to drop `fuel_type` from group by.

Besides this, I also have casted them to double and integers. I had already provided schema, but who knows what can happen?:) 

In [68]:
popular_color_per_region = spark.sql(
    """
    select region, color, count(model) no_of_models
    from bmw_dataset
    group by region, color
    order by no_of_models desc
"""
)
popular_color_per_region.show(5)

+-------------+------+------------+
|       region| color|no_of_models|
+-------------+------+------------+
|       Europe| Black|        1473|
|North America|   Red|        1461|
|         Asia| Black|        1460|
|North America|Silver|        1435|
|  Middle East|  Grey|        1429|
+-------------+------+------------+
only showing top 5 rows


In [69]:
yearly_sales = spark.sql(
    """
    select year, sum(sales_volume) as sales_volume
    from bmw_dataset
    group by year
    order by sales_volume desc
    """
)
yearly_sales.show(5)

+----+------------+
|year|sales_volume|
+----+------------+
|2022|    17920946|
|2024|    17527854|
|2019|    17191956|
|2015|    17010207|
|2014|    16958960|
+----+------------+
only showing top 5 rows


At this point, I finally understood that the data is **synthetic/unrealistic.** You can still analyze relative trends (which models sell more vs less in your dataset, how averages change by year, etc.), but don’t treat the absolute numbers as real-world BMW sales.

In [70]:
price_vs_mileage_corr = spark.sql(
    """
    select model, mileage_km, price_usd
    from bmw_dataset
    order by price_usd desc
    """
)
price_vs_mileage_corr.show(10)

+--------+----------+---------+
|   model|mileage_km|price_usd|
+--------+----------+---------+
|      i8|    115320|   119998|
|      i8|    163849|   119997|
|      X6|    142419|   119997|
|      X1|    172950|   119996|
|3 Series|     12264|   119994|
|      i8|     26622|   119992|
|      X6|     27540|   119988|
|5 Series|    181043|   119988|
|      X1|    146281|   119988|
|      X6|     95648|   119986|
+--------+----------+---------+
only showing top 10 rows


In [72]:
price_vs_mileage_corr = df.stat.corr("mileage_km", "price_usd")
print("Correlation between mileage and price:", price_vs_mileage_corr)

Correlation between mileage and price: -0.00423819457462334


- Close to -1 --> strong negative correlation (higher mileage --> lower price) which is quite **realistic**.
- Close to 0 --> no relationship.
- Close to +1 --> strong positive correlation (rare for price vs mileage).

In [77]:
ranking_expensiv_cars = spark.sql(
    """
    select *
    from (
        select region,
            model,
            price_usd,
            dense_rank() over (partition by region order by price_usd desc) as ranking
        from bmw_dataset
    ) t
    where ranking <= 3
    order by region, ranking
    """
)
ranking_expensiv_cars.show()

+-------------+--------+---------+-------+
|       region|   model|price_usd|ranking|
+-------------+--------+---------+-------+
|       Africa|      i8|   119997|      1|
|       Africa|      X1|   119996|      2|
|       Africa|      X6|   119988|      3|
|       Africa|      X1|   119988|      3|
|         Asia|      X6|   119997|      1|
|         Asia|5 Series|   119988|      2|
|         Asia|7 Series|   119978|      3|
|       Europe|      i8|   119985|      1|
|       Europe|5 Series|   119981|      2|
|       Europe|      X1|   119961|      3|
|  Middle East|      i8|   119998|      1|
|  Middle East|3 Series|   119994|      2|
|  Middle East|5 Series|   119978|      3|
|North America|      i8|   119992|      1|
|North America|3 Series|   119970|      2|
|North America|5 Series|   119963|      3|
|South America|      X6|   119986|      1|
|South America|3 Series|   119982|      2|
|South America|      X6|   119981|      3|
+-------------+--------+---------+-------+



In [87]:
recent_car_per_model = spark.sql(
    """
    select * from (
        select distinct model, 
            year,
            rank() over(partition by model order by year desc) 
                            as year_ranking
        from bmw_dataset) t
    where year_ranking = 1
    """
)
recent_car_per_model.show()

+--------+----+------------+
|   model|year|year_ranking|
+--------+----+------------+
|3 Series|2024|           1|
|5 Series|2024|           1|
|7 Series|2024|           1|
|      M3|2024|           1|
|      M5|2024|           1|
|      X1|2024|           1|
|      X3|2024|           1|
|      X5|2024|           1|
|      X6|2024|           1|
|      i3|2024|           1|
|      i8|2024|           1|
+--------+----+------------+



Now, I will try to find all cars whose `Price_USD` is above the overall average price of all cars.

In [99]:
above_average = spark.sql(
    """
    select model, year, price_usd
    from (
           select *, 
                avg(price_usd) over(partition by model) as avg_price
           from bmw_dataset 
    ) t
    where price_usd > avg_price
    """
)
above_average.show()

+--------+----+---------+
|   model|year|price_usd|
+--------+----+---------+
|3 Series|2012|   117995|
|3 Series|2023|    86402|
|3 Series|2010|    86660|
|3 Series|2020|    78509|
|3 Series|2017|    86684|
|3 Series|2020|    79774|
|3 Series|2024|   113482|
|3 Series|2019|   108637|
|3 Series|2024|    79650|
|3 Series|2012|    96429|
|3 Series|2013|   101189|
|3 Series|2022|   107968|
|3 Series|2023|    84635|
|3 Series|2015|    89156|
|3 Series|2021|    87186|
|3 Series|2017|   100678|
|3 Series|2023|   101460|
|3 Series|2021|   112265|
|3 Series|2012|    99478|
|3 Series|2017|   115369|
+--------+----+---------+
only showing top 20 rows
