In [41]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [96]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Car_Analysis")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
# Tip to reader: use WARN for development, ERROR in prod

In [97]:
df = spark.read.csv(
    path="dataset/Cars Datasets 2025.csv",
    header=True,
    mode="PERMISSIVE",
)

## Data Transformations

### Renaming Column Names to Camel Case

In [98]:
def to_camel_case(column_name):
    """
    Convert column name to camel case (Python PEP8 style)
    Example: "Company Names" -> "companys"
    """

    name = column_name.lower()
    name = name.replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '')
    name = ''.join(c if c.isalnum() or c == '_' else '' for c in name)
    name = '_'.join(filter(None, name.split('_')))
    return name

In [99]:
current_columns = df.columns
column_mapping = {col: to_camel_case(col) for col in current_columns}

for old_name, new_name in column_mapping.items():
    df = df.withColumnRenamed(old_name, new_name)

In [100]:
df = df \
     .withColumnRenamed('companys', 'company') \
     .withColumnRenamed('cars_names', 'name') \
     .withColumnRenamed('engines', 'engine') \
     .withColumnRenamed('horsepower', 'horse_power') \
     .withColumnRenamed('total_speed', 'speed') \
     .withColumnRenamed('performance0_100_km_h', 'performance') \
     .withColumnRenamed('cars_prices', 'price') \
     .withColumnRenamed('fuel_types', 'fuel')

### Casting Values

In [101]:
def safe_numeric_extract(column_name):
    """Safely extract numeric value and handle empty results"""
    extracted = F.regexp_extract(F.col(column_name), r"([\d.]+)", 1)
    return F.when(extracted != "", extracted).cast(DoubleType())

df_copy = df \
    .withColumn('horse_power', safe_numeric_extract('horse_power')) \
    .withColumn('speed', safe_numeric_extract('speed')) \
    .withColumn('performance', safe_numeric_extract('performance')) \
    .withColumn('torque', safe_numeric_extract('torque'))

In [102]:
def transform_price(price: str) -> DoubleType:
    """
    Price transformation function.
    Returns None if price cannot be converted to numeric value.
    """

    numbers = [str(num) for num in range(0, 10)]
    price = price \
                .replace("$", "") \
                .replace(" ", "") \
                .replace(",", "")
    for char in price:
        if char not in numbers: return None
    
    return price

transform_price_udf = F.udf(transform_price, StringType())

df_copy_1 = df_copy \
            .withColumn('price', transform_price_udf(F.col('price'))) \
            .dropna(how='any') \
            .withColumn('price', F.col('price').cast(DoubleType()))

In [103]:
df_copy_1.printSchema()

root
 |-- company_names: string (nullable = true)
 |-- name: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- cc_battery_capacity: string (nullable = true)
 |-- horse_power: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- performance: double (nullable = true)
 |-- price: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- torque: double (nullable = true)



In [104]:
df_copy_1.show(5)

+-------------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|company_names|         name|engine|cc_battery_capacity|horse_power|speed|performance|    price|           fuel|seats|torque|
+-------------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|      FERRARI|SF90 STRADALE|    V8|            3990 cc|      963.0|340.0|        2.5|1100000.0|plug in hyrbrid|    2| 800.0|
|  ROLLS ROYCE|      PHANTOM|   V12|            6749 cc|      563.0|250.0|        5.3| 460000.0|         Petrol|    5| 900.0|
|     MERCEDES|      GT 63 S|    V8|           3,982 cc|      630.0|250.0|        3.2| 161000.0|         Petrol|    4| 900.0|
|         AUDI|   AUDI R8 Gt|   V10|           5,204 cc|      602.0|320.0|        3.6| 253290.0|         Petrol|    2| 560.0|
|          BMW| Mclaren 720s|    V8|           3,994 cc|      710.0|341.0|        2.9| 499000.0|         Petrol|    2|

In [105]:
df_copy = df_copy_1

In [106]:
df_copy.show(5)

+-------------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|company_names|         name|engine|cc_battery_capacity|horse_power|speed|performance|    price|           fuel|seats|torque|
+-------------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|      FERRARI|SF90 STRADALE|    V8|            3990 cc|      963.0|340.0|        2.5|1100000.0|plug in hyrbrid|    2| 800.0|
|  ROLLS ROYCE|      PHANTOM|   V12|            6749 cc|      563.0|250.0|        5.3| 460000.0|         Petrol|    5| 900.0|
|     MERCEDES|      GT 63 S|    V8|           3,982 cc|      630.0|250.0|        3.2| 161000.0|         Petrol|    4| 900.0|
|         AUDI|   AUDI R8 Gt|   V10|           5,204 cc|      602.0|320.0|        3.6| 253290.0|         Petrol|    2| 560.0|
|          BMW| Mclaren 720s|    V8|           3,994 cc|      710.0|341.0|        2.9| 499000.0|         Petrol|    2|

### Working with `null` and dropping unnecessary columns

In [462]:
df_copy = df_copy.dropna(how='any') # any row that contains at least one null

In [463]:
df_copy = df_copy.drop('cc_battery_capacity', 'engine', 'seats')

In [464]:
df_copy.count()

1211

In [465]:
df_copy.printSchema()

root
 |-- company: string (nullable = true)
 |-- name: string (nullable = true)
 |-- horse_power: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- performance: double (nullable = true)
 |-- price: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- torque: double (nullable = true)



In [466]:
df_copy.createOrReplaceTempView("car_dataset")

## Exploration

### I. Average horse_power by Fuel type

In [467]:
spark.sql(
    """
    select fuel, round(avg(horse_power), 2) as avg_hp
    from car_dataset
    group by fuel
    order by avg_hp desc
    """
).show(5)


+---------------+------+
|           fuel|avg_hp|
+---------------+------+
|plug in hyrbrid| 963.0|
|Hybrid (Petrol)| 567.5|
|       Electric|341.63|
|         Hybrid|317.81|
|         Petrol| 309.3|
+---------------+------+
only showing top 5 rows


### II. Most Powerful Engine per Company

In [468]:
spark.sql(
    """
    select * 
    from (
        select company, name, speed, 
            row_number() over(partition by company order by horse_power desc) as rn
        from car_dataset
    ) t
    where rn = 1
    """
).select('company', 'name', 'speed').show()

+-----------------+--------------------+-----+
|          company|                name|speed|
+-----------------+--------------------+-----+
|     ASTON MARTIN|            VALKYRIE|402.0|
|             AUDI|         R8 V10 PLUS|330.0|
|            Acura|          NSX Type S|307.0|
|          BENTLEY|Continental GT Azure|318.0|
|              BMW|        Mclaren 720s|341.0|
|          Bugatti|              Bolide|500.0|
|         Cadillac|          Escalade V|200.0|
|        Chevrolet|          Camaro ZL1|318.0|
|          FERRARI|       SF90 STRADALE|340.0|
|             Ford|Mustang Shelby GT500|290.0|
|              GMC|Hummer EV SUV Edi...|170.0|
|            HONDA|        CIVIC TYPE R|272.0|
|          HYUNDAI|             IONIQ 6|185.0|
|Jaguar Land Rover|   Jaguar F-Type SVR|320.0|
|             Jeep|Wagoneer S (Elect...|177.0|
|              KIA|       SPORTAGE PHEV|180.0|
|            KIA  |SPORTAGE 2024(BAS...|200.0|
|              Kia|  EV6 GT Performance|260.0|
|      LAMBOR

### III. Fastest Acceleration Ranking

In [469]:
spark.sql(
    """
    select company, name, performance, 
           dense_rank() over(order by performance asc) as rank
    from car_dataset
    """
).select('company', 'name', 'performance').show()

+------------+--------------------+-----------+
|     company|                name|performance|
+------------+--------------------+-----------+
|     FERRARI|       SF90 STRADALE|        2.5|
| ROLLS ROYCE|             PHANTOM|        5.3|
|        Ford|                 KA+|       10.5|
|    MERCEDES|             GT 63 S|        3.2|
|        AUDI|          AUDI R8 Gt|        3.6|
|         BMW|        Mclaren 720s|        2.9|
|ASTON MARTIN|          VANTAGE F1|        3.6|
|     BENTLEY|Continental GT Azure|        4.0|
| LAMBORGHINI|     VENENO ROADSTER|        2.9|
|     FERRARI|          F8 TRIBUTO|        2.9|
|     FERRARI|             812 GTS|        2.9|
|     FERRARI|           PORTOFINO|        3.2|
|     FERRARI|                ROMA|        3.4|
|     FERRARI|           MONZA SP2|        2.9|
|     FERRARI|           F8 SPIDER|        2.9|
|     FERRARI|         PORTOFINO M|        3.2|
|     FERRARI|         ROMA SPIDER|        3.4|
|      TOYOTA|            GR SUPRA|     

### IV. Top 3 Expensive Cars per Company

In [471]:
top3_expensive = spark.sql(
    """
    select * 
    from ( 
        select company, name, price, 
            dense_rank() over(partition by company order by price desc) as rank
        from car_dataset
    ) t
    where rank <= 3
    order by price desc
    """
)
top3_expensive.show()

+-----------------+--------------------+-------------+----+
|          company|                name|        price|rank|
+-----------------+--------------------+-------------+----+
|            Mazda|     787B (Race Car)| 5.0000007E13|   1|
|Jaguar Land Rover|       Jaguar F-PACE|   5.500065E9|   1|
|             Ford|          Expedition|   5.000075E9|   1|
|            Mazda|               CX-90|    5.00006E9|   2|
|             Ford|     Explorer Hybrid|   5.000055E9|   2|
|             Ford|    F-350 Super Duty|   4.500075E9|   3|
|            Mazda|               CX-80|    4.50005E9|   3|
|       Mitsubishi|Pajero Final Edition|    4.50005E9|   1|
|       Mitsubishi|          Pajero GLS|    4.50005E9|   1|
|       Mitsubishi|Pajero SWB (Short...|    4.50005E9|   1|
|           Toyota|        Crown Signia|   4.359048E9|   1|
|       Mitsubishi|  Lancer Evolution X|    4.00005E9|   2|
|       Mitsubishi|              Pajero|   4.000045E9|   3|
|           Toyota|      RAV4 (6th Gen)|

### V. Best Horsepower-to-Price Ratio

In [496]:
# I am having some issues witch casting prices.
# I will, for some time, omit prices that higher than 1,000,000,000. 
df_copy = df_copy.filter(F.col('price') <= 100000000)
df_copy.count()

843

In [499]:
spark.sql(
    """
    select company, 
           name, 
           horse_power, 
           price, 
           round((price / horse_power), 2) as ratio
    from car_dataset
    order by ratio desc
    limit 10
    """
).show()

+-----------------+--------------------+-----------+------------+-----------------+
|          company|                name|horse_power|       price|            ratio|
+-----------------+--------------------+-----------+------------+-----------------+
|            Mazda|     787B (Race Car)|      700.0|5.0000007E13|7.142858142857E10|
|       Mitsubishi|   i-MiEV (Electric)|       66.0|  2.300026E9|    3.484887879E7|
|            Mazda|          Carol P360|       26.0|    8.0001E8|    3.076961538E7|
|            Mazda|  Cosmo Sport (110S)|      110.0|  3.000035E9|    2.727304545E7|
|       Mitsubishi|          Delica D:5|      145.0|   3.50004E9|     2.41382069E7|
|       Mitsubishi|Pajero Final Edition|      189.0|   4.50005E9|    2.380978836E7|
|       Mitsubishi|          Pajero GLS|      189.0|   4.50005E9|    2.380978836E7|
|       Mitsubishi|Pajero SWB (Short...|      189.0|   4.50005E9|    2.380978836E7|
|            Mazda|      MX-30 Electric|      143.0|  3.300037E9|    2.30771

### VI. Cars Above Company Average Speed

In [500]:
company_avg_speed = spark.sql(
    """
    select company, round(avg(speed), 2) as avg_speed 
    from car_dataset 
    group by company
    """
)
company_avg_speed.show(10)

+-----------------+---------+
|          company|avg_speed|
+-----------------+---------+
|       Volkswagen|   203.59|
|          Peugeot|   202.13|
|         MERCEDES|    250.0|
|      LAMBORGHINI|    334.0|
|          HYUNDAI|    200.0|
|            KIA  |    200.0|
|Jaguar Land Rover|   249.58|
|             Jeep|   190.47|
|       Mitsubishi|   178.71|
|              Kia|   209.44|
+-----------------+---------+
only showing top 10 rows


In [502]:
cars_above_avg = spark.sql(
    """
    select company, name, speed
    from car_dataset c
    where speed > (
        select round(avg(speed))
        from car_dataset
        where company = c.company
    )
    order by speed desc
    """
)
cars_above_avg.show()

+------------+--------------------+-----+
|     company|                name|speed|
+------------+--------------------+-----+
|     Bugatti|              Bolide|500.0|
|     Bugatti|  Chiron Super Sport|490.0|
|ASTON MARTIN|            VALKYRIE|402.0|
|       Tesla|          Roadster 2|402.0|
|ASTON MARTIN|            VALHALLA|362.0|
| LAMBORGHINI|     VENENO ROADSTER|356.0|
| LAMBORGHINI|   AVENTADOR ULTIMAE|355.0|
| LAMBORGHINI|AVENTADOR LP 780-...|355.0|
| LAMBORGHINI|AVENTADOR LP 780-...|355.0|
|      Nissan|   R390 GT1 Road Car|354.0|
| LAMBORGHINI|         AVENTADOR S|350.0|
| LAMBORGHINI|                SIAN|350.0|
| LAMBORGHINI|        AVENTADOR SV|350.0|
| LAMBORGHINI|  AVENTADOR ROADSTER|350.0|
| LAMBORGHINI|       AVENTADOR SVJ|350.0|
| LAMBORGHINI|AVENTADOR SVJ ROA...|350.0|
| LAMBORGHINI|  AVENTADOR SVJ XAGO|350.0|
|        Ford|                  GT|348.0|
|         BMW|        Mclaren 720s|341.0|
|     FERRARI|       SF90 STRADALE|340.0|
+------------+--------------------

### Cars Above Overall Average Horsepower

In [None]:
spark.sql(
    """
    select company, name, horse_power
    from car_dataset
    where horse_power > (select round(avg(horse_power), 2) as avg_hp
                         from car_dataset)
    order by horse_power desc
    """
).show()

+------------+--------------------+-----------+
|company_name|            car_name|horse_power|
+------------+--------------------+-----------+
|     Bugatti|              Bolide|     1850.0|
|     Bugatti|  Chiron Super Sport|     1600.0|
|     Bugatti|          Centodieci|     1600.0|
|     Bugatti|             Mistral|     1600.0|
|     Bugatti|              Chiron|     1500.0|
|     Bugatti|    Chiron Pur Sport|     1500.0|
|     Bugatti|        Chiron Sport|     1500.0|
|     Bugatti|        Chiron Noire|     1500.0|
|     Bugatti|                Divo|     1500.0|
|     Bugatti|    La Voiture Noire|     1500.0|
|ASTON MARTIN|            VALKYRIE|     1160.0|
|ASTON MARTIN|            VALHALLA|     1000.0|
|     FERRARI|       SF90 STRADALE|      963.0|
|ASTON MARTIN|              VICTOR|      836.0|
|         GMC|Hummer EV SUV Edi...|      830.0|
|         GMC|Hummer EV SUV Ext...|      830.0|
|         GMC|Hummer EV Adventu...|      830.0|
| LAMBORGHINI|                SIAN|     

### Acceleration Performance Buckets

### Most Common Fuel Type per Company

In [None]:
spark.sql(
    """
    select company, fuel_type, count(fuel_type)
    from car_dataset
    group by company, fuel_type
    order by company
    """
).show()

+------------+---------------+----------------+
|company_name|      fuel_type|count(fuel_type)|
+------------+---------------+----------------+
|ASTON MARTIN|         Petrol|               9|
|ASTON MARTIN|         Hybrid|               2|
|        AUDI|         Petrol|              19|
|        AUDI|       Electric|               2|
|       Acura|         Hybrid|               7|
|       Acura|         Petrol|              20|
|     BENTLEY|         Petrol|               1|
|         BMW|         Hybrid|               1|
|         BMW|         Petrol|              30|
|         BMW|         Diesel|              10|
|     Bugatti|         Petrol|              10|
|    Cadillac|         Petrol|              17|
|    Cadillac|       Electric|               3|
|   Chevrolet|       Electric|               4|
|   Chevrolet|         Diesel|               4|
|   Chevrolet|         Petrol|              50|
|     FERRARI|plug in hyrbrid|               1|
|     FERRARI|         Petrol|          

In [None]:
most_common = spark.sql(
    """
    select company, fuel_type, cnt,
        dense_rank() over(partition by company order by cnt desc) as rank
    from (
        select company, fuel_type, count(fuel_type) as cnt
        from car_dataset
        group by company, fuel_type
    ) t
    """
)

most_common.filter(F.col('rank') == 1) \
           .select(['company', 'fuel_type', 'cnt']) \
           .show()

+-----------------+---------+---+
|     company_name|fuel_type|cnt|
+-----------------+---------+---+
|     ASTON MARTIN|   Petrol|  9|
|             AUDI|   Petrol| 19|
|            Acura|   Petrol| 20|
|          BENTLEY|   Petrol|  1|
|              BMW|   Petrol| 30|
|          Bugatti|   Petrol| 10|
|         Cadillac|   Petrol| 17|
|        Chevrolet|   Petrol| 50|
|          FERRARI|   Petrol|  8|
|             Ford|   Petrol| 32|
|              GMC|   Petrol| 48|
|            HONDA|   Petrol|  5|
|          HYUNDAI|   Petrol| 11|
|Jaguar Land Rover|   Petrol| 34|
|             Jeep|   Petrol| 14|
|              KIA|   Petrol| 11|
|            KIA  |   Petrol|  1|
|              Kia|   Petrol| 39|
|      LAMBORGHINI|   Petrol| 23|
|         MAHINDRA|   Diesel|  2|
+-----------------+---------+---+
only showing top 20 rows


### Top Speed Leaders by Seat Category

### Company Ranking by Avg Horsepower

In [None]:
spark.sql(
    """

    select company, avg_hp,
        dense_rank() over(order by avg_hp desc) as rank
    from (
            select company, round(avg(horse_power), 2) as avg_hp
            from car_dataset
            group by company
    ) t
    """
).show()

+-----------------+------+----+
|     company_name|avg_hp|rank|
+-----------------+------+----+
|          Bugatti|1565.0|   1|
|          FERRARI|709.89|   2|
|     ASTON MARTIN|701.09|   3|
|      LAMBORGHINI|691.54|   4|
|     ROLLS ROYCE | 591.0|   5|
|      ROLLS ROYCE|583.76|   6|
|          BENTLEY| 550.0|   7|
|            Tesla|514.71|   8|
|          Porsche|463.26|   9|
|            Volvo|456.67|  10|
|         MERCEDES|432.86|  11|
|             AUDI|393.19|  12|
|              GMC|377.44|  13|
|         Cadillac| 368.4|  14|
|Jaguar Land Rover|355.65|  15|
|             Jeep|318.89|  16|
|           NISSAN| 316.4|  17|
|             Ford|314.97|  18|
|            Acura|311.04|  19|
|        Chevrolet|303.67|  20|
+-----------------+------+----+
only showing top 20 rows


25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/08/26 10:19:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
