In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [7]:
# Create the Spark session used by the cells below, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("BMW Sales Analysis")
    .getOrCreate()
)
# Keep cell output readable: restrict Spark's logging to ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [8]:
# Explicit schema for the sales file: (column name, Spark type) pairs in file
# order. Every column is declared nullable.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("Model", StringType()),
            ("Year", IntegerType()),
            ("Region", StringType()),
            ("Color", StringType()),
            ("Fuel_Type", StringType()),
            ("Transmission", StringType()),
            ("Engine_Size_L", DoubleType()),
            ("Mileage_KM", IntegerType()),
            ("Price_USD", IntegerType()),
            ("Sales_Volume", IntegerType()),
            ("Sales_Classification", StringType()),
        )
    ]
)

In [9]:
# Load the sales data with the explicit schema defined above.
# NOTE: the file name ends in ".xls", but it is parsed with Spark's CSV reader.
# The first line is treated as a header, and PERMISSIVE mode keeps rows that do
# not fit the schema instead of failing the read.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("mode", "PERMISSIVE")
    .csv("dataset/BMW sales data (2010-2024).xls")
)

In [10]:
# Row-count sanity check on the loaded frame; the value is shown as the cell output.
df.count()

50000

In [11]:
# Print the column names, types and nullability carried by `df`.
df.printSchema()

root
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Engine_Size_L: double (nullable = true)
 |-- Mileage_KM: integer (nullable = true)
 |-- Price_USD: integer (nullable = true)
 |-- Sales_Volume: integer (nullable = true)
 |-- Sales_Classification: string (nullable = true)



In [12]:
# Expose `df` to Spark SQL as a temporary view. The name is lower-case so it
# matches, character for character, the identifier used by the SQL cells below
# (`bmw_dataset`). A mixed-case name would only resolve while
# spark.sql.caseSensitive is left at its default of false.
df.createOrReplaceTempView("bmw_dataset")

In [15]:
# Average Price_USD per region, rounded to two decimals, highest average first.
avg_price_per_region_sql = """
    select region, round(avg(price_usd), 2) as avg_price
    from bmw_dataset
    group by region
    order by avg_price desc
    """
avg_price_per_region = spark.sql(avg_price_per_region_sql)
avg_price_per_region.show()

+-------------+---------+
|       region|avg_price|
+-------------+---------+
|         Asia| 75554.93|
|North America| 75070.05|
|       Europe| 74988.36|
|South America|  74973.6|
|       Africa| 74885.77|
|  Middle East| 74726.79|
+-------------+---------+



In [16]:
# Total Sales_Volume per model, largest total first.
# The query has no LIMIT, so every model is returned. The aggregate is aliased
# to `sales_volume`, the same name as the source column it sums.
top_selling_models_sql = """
    select model, sum(sales_volume) as sales_volume
    from bmw_dataset
    group by model
    order by sales_volume desc
    """
top_selling_models = spark.sql(top_selling_models_sql)
top_selling_models.show()

+--------+------------+
|   model|sales_volume|
+--------+------------+
|7 Series|    23786466|
|      i8|    23423891|
|      X1|    23406060|
|3 Series|    23281303|
|      i3|    23133849|
|5 Series|    23097519|
|      M5|    22779688|
|      X3|    22745529|
|      X5|    22709749|
|      X6|    22661986|
|      M3|    22349694|
+--------+------------+

