In [41]:
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [117]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Car_Analysis")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
# Tip to reader: use WARN for development, ERROR in prod

In [118]:
df = spark.read.csv(
    path="dataset/Cars Datasets 2025.csv",
    header=True,
    mode="PERMISSIVE",
)

## Data Transformations

### Renaming Column Names to Camel Case

In [121]:
def to_camel_case(column_name):
    """
    Convert column name to camel case (Python PEP8 style)
    Example: "Company Names" -> "company_names"
    """

    name = column_name.lower()
    name = name.replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '')
    name = ''.join(c if c.isalnum() or c == '_' else '' for c in name)
    name = '_'.join(filter(None, name.split('_')))
    return name

In [122]:
current_columns = df.columns
column_mapping = {col: to_camel_case(col) for col in current_columns}

for old_name, new_name in column_mapping.items():
    df = df.withColumnRenamed(old_name, new_name)

In [123]:
df = df \
     .withColumnRenamed('company_names', 'company') \
     .withColumnRenamed('cars_names', 'name') \
     .withColumnRenamed('engines', 'engine') \
     .withColumnRenamed('horsepower', 'horse_power') \
     .withColumnRenamed('total_speed', 'speed') \
     .withColumnRenamed('performance0_100_km_h', 'performance') \
     .withColumnRenamed('cars_prices', 'price') \
     .withColumnRenamed('fuel_types', 'fuel')

In [124]:
df.printSchema()

root
 |-- company: string (nullable = true)
 |-- name: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- cc_battery_capacity: string (nullable = true)
 |-- horse_power: string (nullable = true)
 |-- speed: string (nullable = true)
 |-- performance: string (nullable = true)
 |-- price: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- torque: string (nullable = true)



### Casting Values

In [125]:
def safe_numeric_extract(column_name):
    """Safely extract numeric value and handle empty results"""
    extracted = F.regexp_extract(F.col(column_name), r"([\d.]+)", 1)
    return F.when(extracted != "", extracted).cast(DoubleType())

df_copy = df \
    .withColumn('horse_power', safe_numeric_extract('horse_power')) \
    .withColumn('speed', safe_numeric_extract('speed')) \
    .withColumn('performance', safe_numeric_extract('performance')) \
    .withColumn('torque', safe_numeric_extract('torque'))

In [126]:
def transform_price(price: str) -> DoubleType:
    """
    Price transformation function.
    Returns None if price cannot be converted to numeric value.
    """

    numbers = [str(num) for num in range(0, 10)]
    price = price \
                .replace("$", "") \
                .replace(" ", "") \
                .replace(",", "")
    for char in price:
        if char not in numbers: return None
    
    return price

transform_price_udf = F.udf(transform_price, StringType())

df_copy_1 = df_copy \
            .withColumn('price', transform_price_udf(F.col('price'))) \
            .dropna(how='any') \
            .withColumn('price', F.col('price').cast(DoubleType()))

In [127]:
df_copy_1.printSchema()

root
 |-- company: string (nullable = true)
 |-- name: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- cc_battery_capacity: string (nullable = true)
 |-- horse_power: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- performance: double (nullable = true)
 |-- price: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- torque: double (nullable = true)



In [128]:
df_copy_1.show(5)

+-----------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|    company|         name|engine|cc_battery_capacity|horse_power|speed|performance|    price|           fuel|seats|torque|
+-----------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|    FERRARI|SF90 STRADALE|    V8|            3990 cc|      963.0|340.0|        2.5|1100000.0|plug in hyrbrid|    2| 800.0|
|ROLLS ROYCE|      PHANTOM|   V12|            6749 cc|      563.0|250.0|        5.3| 460000.0|         Petrol|    5| 900.0|
|   MERCEDES|      GT 63 S|    V8|           3,982 cc|      630.0|250.0|        3.2| 161000.0|         Petrol|    4| 900.0|
|       AUDI|   AUDI R8 Gt|   V10|           5,204 cc|      602.0|320.0|        3.6| 253290.0|         Petrol|    2| 560.0|
|        BMW| Mclaren 720s|    V8|           3,994 cc|      710.0|341.0|        2.9| 499000.0|         Petrol|    2| 770.0|
+-------

In [129]:
df_copy = df_copy_1

In [130]:
df_copy.show(5)

+-----------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|    company|         name|engine|cc_battery_capacity|horse_power|speed|performance|    price|           fuel|seats|torque|
+-----------+-------------+------+-------------------+-----------+-----+-----------+---------+---------------+-----+------+
|    FERRARI|SF90 STRADALE|    V8|            3990 cc|      963.0|340.0|        2.5|1100000.0|plug in hyrbrid|    2| 800.0|
|ROLLS ROYCE|      PHANTOM|   V12|            6749 cc|      563.0|250.0|        5.3| 460000.0|         Petrol|    5| 900.0|
|   MERCEDES|      GT 63 S|    V8|           3,982 cc|      630.0|250.0|        3.2| 161000.0|         Petrol|    4| 900.0|
|       AUDI|   AUDI R8 Gt|   V10|           5,204 cc|      602.0|320.0|        3.6| 253290.0|         Petrol|    2| 560.0|
|        BMW| Mclaren 720s|    V8|           3,994 cc|      710.0|341.0|        2.9| 499000.0|         Petrol|    2| 770.0|
+-------

### Working with `null` and dropping unnecessary columns

In [131]:
df_copy = df_copy \
                .dropna(how='any') \
                .drop('cc_battery_capacity', 'engine', 'seats')

In [132]:
df_copy.count()

1042

In [133]:
df_copy.printSchema()

root
 |-- company: string (nullable = true)
 |-- name: string (nullable = true)
 |-- horse_power: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- performance: double (nullable = true)
 |-- price: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- torque: double (nullable = true)



In [134]:
df_copy.createOrReplaceTempView("car_dataset")

## Exploration

### I. Average horse_power by Fuel type

In [135]:
spark.sql(
    """
    select fuel, round(avg(horse_power), 2) as avg_hp
    from car_dataset
    group by fuel
    order by avg_hp desc
    """
).show(5)


+---------------+------+
|           fuel|avg_hp|
+---------------+------+
|plug in hyrbrid| 963.0|
|Hybrid (Petrol)| 567.5|
|       Electric|346.14|
|         Petrol|336.52|
|         Hybrid|326.04|
+---------------+------+
only showing top 5 rows


### II. Most Powerful Engine per Company

In [136]:
spark.sql(
    """
    select * 
    from (
        select company, name, speed, 
            row_number() over(partition by company order by horse_power desc) as rn
        from car_dataset
    ) t
    where rn = 1
    """
).select('company', 'name', 'speed').show()

+-----------------+--------------------+-----+
|          company|                name|speed|
+-----------------+--------------------+-----+
|     ASTON MARTIN|            VALKYRIE|402.0|
|             AUDI|         R8 V10 PLUS|330.0|
|            Acura|          NSX Type S|307.0|
|          BENTLEY|Continental GT Azure|318.0|
|              BMW|        Mclaren 720s|341.0|
|          Bugatti|              Bolide|500.0|
|         Cadillac|          Escalade V|200.0|
|        Chevrolet|          Camaro ZL1|318.0|
|          FERRARI|       SF90 STRADALE|340.0|
|             Ford|Mustang Shelby GT500|290.0|
|              GMC|Hummer EV SUV Edi...|170.0|
|            HONDA|        CIVIC TYPE R|272.0|
|          HYUNDAI|             IONIQ 6|185.0|
|Jaguar Land Rover|   Jaguar F-Type SVR|320.0|
|             Jeep|Wagoneer S (Elect...|177.0|
|              KIA|       SPORTAGE PHEV|180.0|
|            KIA  |SPORTAGE 2024(BAS...|200.0|
|              Kia|  EV6 GT Performance|260.0|
|      LAMBOR

### III. Fastest Acceleration Ranking

In [137]:
spark.sql(
    """
    select company, name, performance, 
           dense_rank() over(order by performance asc) as rank
    from car_dataset
    """
).select('company', 'name', 'performance').show()

+------------+--------------------+-----------+
|     company|                name|performance|
+------------+--------------------+-----------+
|     FERRARI|       SF90 STRADALE|        2.5|
| ROLLS ROYCE|             PHANTOM|        5.3|
|    MERCEDES|             GT 63 S|        3.2|
|        AUDI|          AUDI R8 Gt|        3.6|
|         BMW|        Mclaren 720s|        2.9|
|ASTON MARTIN|          VANTAGE F1|        3.6|
|     BENTLEY|Continental GT Azure|        4.0|
| LAMBORGHINI|     VENENO ROADSTER|        2.9|
|     FERRARI|          F8 TRIBUTO|        2.9|
|     FERRARI|             812 GTS|        2.9|
|     FERRARI|           PORTOFINO|        3.2|
|     FERRARI|                ROMA|        3.4|
|     FERRARI|           MONZA SP2|        2.9|
|     FERRARI|           F8 SPIDER|        2.9|
|     FERRARI|         PORTOFINO M|        3.2|
|     FERRARI|         ROMA SPIDER|        3.4|
|      TOYOTA|            GR SUPRA|        4.1|
|      TOYOTA|           TOYOTA 86|     

### IV. Top 3 Expensive Cars per Company

In [138]:
top3_expensive = spark.sql(
    """
    select * 
    from ( 
        select company, name, price, 
            dense_rank() over(partition by company order by price desc) as rank
        from car_dataset
    ) t
    where rank <= 3
    order by price desc
    """
)
top3_expensive.show()

+------------+--------------------+---------+----+
|     company|                name|    price|rank|
+------------+--------------------+---------+----+
|     Bugatti|    La Voiture Noire|    1.8E7|   1|
|     Bugatti|          Centodieci|9000000.0|   2|
|     Bugatti|                Divo|5800000.0|   3|
| LAMBORGHINI|     VENENO ROADSTER|4500000.0|   1|
|ASTON MARTIN|            VALKYRIE|3200000.0|   1|
| LAMBORGHINI|                SIAN|2800000.0|   2|
|     FERRARI|           MONZA SP2|1700000.0|   1|
|ASTON MARTIN|              VICTOR|1300000.0|   2|
|ASTON MARTIN|       LAGONDA TARAF|1100000.0|   3|
|ASTON MARTIN|            VALHALLA|1100000.0|   3|
|     FERRARI|       SF90 STRADALE|1100000.0|   2|
|      Nissan|GT-R50 by Italdesign|1100000.0|   1|
|      Nissan|   R390 GT1 Road Car|1000000.0|   2|
|     Porsche|911 RSR (Motorspo...| 750000.0|   1|
| LAMBORGHINI|  AVENTADOR SVJ XAGO| 603000.0|   3|
|      Nissan|       GT-R LM NISMO| 600000.0|   3|
| ROLLS ROYCE|   PHANTOM CELEST

### V. Best Horsepower-to-Price Ratio

In [140]:
spark.sql(
    """
    select company, 
           name, 
           horse_power, 
           price, 
           round((price / horse_power), 2) as ratio
    from car_dataset
    order by ratio asc
    limit 10
    """
).show()

+-----------+--------------+-----------+-------+-----+
|    company|          name|horse_power|  price|ratio|
+-----------+--------------+-----------+-------+-----+
|Tata Motors|Indica V2 Xeta|       65.0| 5000.0|76.92|
|Tata Motors|    Indigo GLX|       85.0| 7200.0|84.71|
|      Tesla|    Cybertruck|      800.0|69900.0|87.38|
|Tata Motors|    Indica eV2|       70.0| 6200.0|88.57|
|     TOYOTA|         CAMRY|      301.0|27000.0| 89.7|
|     NISSAN|          370Z|      332.0|30000.0|90.36|
|    HYUNDAI|      Veloster|      275.0|25000.0|90.91|
|Tata Motors|    Indigo eCS|       70.0| 6500.0|92.86|
|  Chevrolet|   Colorado WT|      310.0|29200.0|94.19|
|     Nissan|       Rasheen|      105.0|10000.0|95.24|
+-----------+--------------+-----------+-------+-----+



### VI. Cars Above Company Average Speed

In [141]:
company_avg_speed = spark.sql(
    """
    select company, round(avg(speed), 2) as avg_speed 
    from car_dataset 
    group by company
    """
)
company_avg_speed.show(10)

+-----------------+---------+
|          company|avg_speed|
+-----------------+---------+
|       Volkswagen|   203.59|
|          Peugeot|   202.13|
|         MERCEDES|    250.0|
|      LAMBORGHINI|    334.0|
|          HYUNDAI|    200.0|
|            KIA  |    200.0|
|Jaguar Land Rover|   250.08|
|             Jeep|   190.47|
|              Kia|   209.44|
|        Chevrolet|   200.21|
+-----------------+---------+
only showing top 10 rows


In [142]:
cars_above_avg = spark.sql(
    """
    select company, name, speed
    from car_dataset c
    where speed > (
        select round(avg(speed))
        from car_dataset
        where company = c.company
    )
    order by speed desc
    """
)
cars_above_avg.show()

+------------+--------------------+-----+
|     company|                name|speed|
+------------+--------------------+-----+
|     Bugatti|              Bolide|500.0|
|     Bugatti|  Chiron Super Sport|490.0|
|ASTON MARTIN|            VALKYRIE|402.0|
|       Tesla|          Roadster 2|402.0|
|ASTON MARTIN|            VALHALLA|362.0|
| LAMBORGHINI|     VENENO ROADSTER|356.0|
| LAMBORGHINI|   AVENTADOR ULTIMAE|355.0|
| LAMBORGHINI|AVENTADOR LP 780-...|355.0|
| LAMBORGHINI|AVENTADOR LP 780-...|355.0|
|      Nissan|   R390 GT1 Road Car|354.0|
| LAMBORGHINI|         AVENTADOR S|350.0|
| LAMBORGHINI|                SIAN|350.0|
| LAMBORGHINI|        AVENTADOR SV|350.0|
| LAMBORGHINI|  AVENTADOR ROADSTER|350.0|
| LAMBORGHINI|       AVENTADOR SVJ|350.0|
| LAMBORGHINI|AVENTADOR SVJ ROA...|350.0|
| LAMBORGHINI|  AVENTADOR SVJ XAGO|350.0|
|        Ford|                  GT|348.0|
|         BMW|        Mclaren 720s|341.0|
|     FERRARI|       SF90 STRADALE|340.0|
+------------+--------------------

### VII. Acceleration Performace Buckets

In [147]:
spark.sql(
    """
    select 
          sum(case 
              when performance >=0 and performance < 3 then 1 else 0 end) as super_fast,
          sum(case 
              when performance >=3 and performance < 5 then 1 else 0 end) as fast,
          sum(case 
              when performance >=5 then 1 else 0 end) as normal
    from car_dataset
    """
).show()

+----------+----+------+
|super_fast|fast|normal|
+----------+----+------+
|        52| 206|   784|
+----------+----+------+



In [153]:
spark.sql(
    """
    select company, 
           name, 
           performance,
           (case
                when performance >=0 and performance < 3 then "super_fast"
                when performance >=3 and performance < 5 then "fast"
                when performance >=5 then "normal"
            end) as status
    from car_dataset
    """
).show()

+------------+--------------------+-----------+----------+
|     company|                name|performance|    status|
+------------+--------------------+-----------+----------+
|     FERRARI|       SF90 STRADALE|        2.5|super_fast|
| ROLLS ROYCE|             PHANTOM|        5.3|    normal|
|    MERCEDES|             GT 63 S|        3.2|      fast|
|        AUDI|          AUDI R8 Gt|        3.6|      fast|
|         BMW|        Mclaren 720s|        2.9|super_fast|
|ASTON MARTIN|          VANTAGE F1|        3.6|      fast|
|     BENTLEY|Continental GT Azure|        4.0|      fast|
| LAMBORGHINI|     VENENO ROADSTER|        2.9|super_fast|
|     FERRARI|          F8 TRIBUTO|        2.9|super_fast|
|     FERRARI|             812 GTS|        2.9|super_fast|
|     FERRARI|           PORTOFINO|        3.2|      fast|
|     FERRARI|                ROMA|        3.4|      fast|
|     FERRARI|           MONZA SP2|        2.9|super_fast|
|     FERRARI|           F8 SPIDER|        2.9|super_fas

### VIII. Most Common Fuel Type Per Company

In [156]:
spark.sql(
    """
    select company, fuel, count(*) no_of_cars
    from car_dataset
    group by company, fuel
    order by company
    """
).show()

+------------+---------------+----------+
|     company|           fuel|no_of_cars|
+------------+---------------+----------+
|ASTON MARTIN|         Petrol|         9|
|ASTON MARTIN|         Hybrid|         2|
|        AUDI|       Electric|         2|
|        AUDI|         Petrol|        19|
|       Acura|         Hybrid|         7|
|       Acura|         Petrol|        20|
|     BENTLEY|         Petrol|         1|
|         BMW|         Hybrid|         1|
|         BMW|         Petrol|        30|
|         BMW|         Diesel|        10|
|     Bugatti|         Petrol|        10|
|    Cadillac|         Petrol|        17|
|    Cadillac|       Electric|         3|
|   Chevrolet|       Electric|         4|
|   Chevrolet|         Diesel|         4|
|   Chevrolet|         Petrol|        50|
|     FERRARI|plug in hyrbrid|         1|
|     FERRARI|         Petrol|         8|
|        Ford|         Hybrid|         3|
|        Ford|         Petrol|        21|
+------------+---------------+----

In [158]:
most_common = spark.sql(
    """
    select company, fuel, cnt,
        dense_rank() over(partition by company order by cnt desc) as rank
    from (
        select company, fuel, count(fuel) as cnt
        from car_dataset
        group by company, fuel
    ) t
    """
)

most_common.filter(F.col('rank') == 1) \
           .select(['company', 'fuel', 'cnt']) \
           .show()

+-----------------+------+---+
|          company|  fuel|cnt|
+-----------------+------+---+
|     ASTON MARTIN|Petrol|  9|
|             AUDI|Petrol| 19|
|            Acura|Petrol| 20|
|          BENTLEY|Petrol|  1|
|              BMW|Petrol| 30|
|          Bugatti|Petrol| 10|
|         Cadillac|Petrol| 17|
|        Chevrolet|Petrol| 50|
|          FERRARI|Petrol|  8|
|             Ford|Petrol| 21|
|              GMC|Petrol| 48|
|            HONDA|Petrol|  5|
|          HYUNDAI|Petrol| 11|
|Jaguar Land Rover|Petrol| 33|
|             Jeep|Petrol| 14|
|              KIA|Petrol| 11|
|            KIA  |Petrol|  1|
|              Kia|Petrol| 39|
|      LAMBORGHINI|Petrol| 23|
|         MAHINDRA|Diesel|  2|
+-----------------+------+---+
only showing top 20 rows


### IX. Price Gap Analysis

In [160]:
spark.sql(
    """
    select company, 
           max(price) as most_expensive,
           min(price) least_expensive,
           max(price) - min(price) as gap
    from car_dataset
    group by company
    order by gap desc
    """
).show()

+-----------------+--------------+---------------+---------+
|          company|most_expensive|least_expensive|      gap|
+-----------------+--------------+---------------+---------+
|          Bugatti|         1.8E7|      3000000.0|    1.5E7|
|      LAMBORGHINI|     4500000.0|       211000.0|4289000.0|
|     ASTON MARTIN|     3200000.0|       142000.0|3058000.0|
|          FERRARI|     1700000.0|       210000.0|1490000.0|
|           Nissan|     1100000.0|        10000.0|1090000.0|
|          Porsche|      750000.0|        58000.0| 692000.0|
|             Ford|      500000.0|        23000.0| 477000.0|
|              BMW|      499000.0|        26000.0| 473000.0|
|             AUDI|      253290.0|        35000.0| 218290.0|
|      ROLLS ROYCE|      515000.0|       320000.0| 195000.0|
|            Tesla|      200000.0|        40240.0| 159760.0|
|         MERCEDES|      200000.0|        46000.0| 154000.0|
|           TOYOTA|      170000.0|        18000.0| 152000.0|
|       Volkswagen|     

In [None]:
### At ths point, I wanted to know the most expensive cars from each company
spark.sql(
    """
    select * from 
        (select company, name, price, 
            dense_rank() over(partition by company order by price desc) as rnk
        from car_dataset) t
    where rnk = 1
    order by price desc
    """
).select('company', 'name', 'price').show()

+------------+--------------------+---------+
|     company|                name|    price|
+------------+--------------------+---------+
|     Bugatti|    La Voiture Noire|    1.8E7|
| LAMBORGHINI|     VENENO ROADSTER|4500000.0|
|ASTON MARTIN|            VALKYRIE|3200000.0|
|     FERRARI|           MONZA SP2|1700000.0|
|      Nissan|GT-R50 by Italdesign|1100000.0|
|     Porsche|911 RSR (Motorspo...| 750000.0|
| ROLLS ROYCE|   PHANTOM CELESTIAL| 515000.0|
|        Ford|                  GT| 500000.0|
|         BMW|        Mclaren 720s| 499000.0|
|ROLLS ROYCE |               GHOST| 332000.0|
|     BENTLEY|Continental GT Azure| 311000.0|
|        AUDI|          AUDI R8 Gt| 253290.0|
|    MERCEDES|  BENZ MAYBACH S 680| 200000.0|
|       Tesla|          Roadster 2| 200000.0|
|      TOYOTA|         CENTURY SUV| 170000.0|
|       Volvo|          Volvo FH16| 160000.0|
|       Acura|          NSX Type S| 157000.0|
|       Acura|NSX 3.5L Hybrid C...| 157000.0|
|       Acura|  NSX Carbon Edition

In [170]:
### The least expensive cars from each company
spark.sql(
    """
    select * from 
        (select company, name, price, 
            dense_rank() over(partition by company order by price asc) as rnk
        from car_dataset) t
    where rnk = 1
    order by price asc
    """
).select('company', 'name', 'price').show()

+-------------+--------------------+-------+
|      company|                name|  price|
+-------------+--------------------+-------+
|  Tata Motors|           Nano GenX| 4000.0|
|       Nissan|             Clipper|10000.0|
|       Nissan|             Rasheen|10000.0|
|MARUTI SUZUKI|               SWIFT|10400.0|
|   Volkswagen|                 Fox|12000.0|
|      HYUNDAI|                 I10|12400.0|
|    Chevrolet|            Spark LS|14395.0|
|        HONDA|                CITY|15400.0|
|     MAHINDRA|             SCORPIO|16400.0|
|          Kia|                 Rio|17000.0|
|       TOYOTA|              AYGO X|18000.0|
|       NISSAN|              SENTRA|20000.0|
|      Peugeot|           301 Sedan|20000.0|
|         Ford|           Fiesta ST|23000.0|
|          BMW|                114i|26000.0|
|          KIA|SPORTAGE 2024(BAS...|26000.0|
|        KIA  |SPORTAGE 2024(BAS...|26500.0|
|        Acura| ILX Technology Plus|28000.0|
|         Jeep|            Renegade|28345.0|
|         

### X. Ranking Companies by their average horse power

In [175]:
spark.sql(
    """
    select company, avg_hp, 
           dense_rank() over(order by avg_hp desc) as rnk
    from 
        (
        select company, round(avg(horse_power), 2) as avg_hp
        from car_dataset
        group by company
        ) t
    """
).show()

+-----------------+------+---+
|          company|avg_hp|rnk|
+-----------------+------+---+
|          Bugatti|1565.0|  1|
|          FERRARI|709.89|  2|
|     ASTON MARTIN|701.09|  3|
|      LAMBORGHINI|691.54|  4|
|     ROLLS ROYCE | 591.0|  5|
|      ROLLS ROYCE|583.76|  6|
|          BENTLEY| 550.0|  7|
|          Porsche|463.26|  8|
|            Volvo|456.67|  9|
|         MERCEDES|432.86| 10|
|             AUDI|393.19| 11|
|         Cadillac| 368.4| 12|
|              GMC|364.68| 13|
|            Tesla| 360.6| 14|
|Jaguar Land Rover|357.21| 15|
|             Ford|343.91| 16|
|           Toyota| 326.0| 17|
|             Jeep|318.89| 18|
|           NISSAN| 316.4| 19|
|            Acura|311.04| 20|
+-----------------+------+---+
only showing top 20 rows


### XI. Acceleration vs Torque Correlation

In [194]:
spark.sql(
    """
    select 
        company, 
        name, 
        performance, 
        torque, 
        case
            when torque >= 0 and torque < 400 then 'Low Torque'
            when torque >= 400 and torque < 800 then 'Medium Torque'
            when torque >= 800 then 'High Torque'
        end as torque_status
    from car_dataset
    order by performance
    """
).show()


+------------+--------------------+-----------+------+-------------+
|     company|                name|performance|torque|torque_status|
+------------+--------------------+-----------+------+-------------+
|       Tesla|          Roadster 2|        1.9|  10.0|   Low Torque|
|       Tesla|       Model S Plaid|        2.1|   1.0|   Low Torque|
|     Bugatti|              Bolide|        2.2|1600.0|  High Torque|
|     Bugatti|    Chiron Pur Sport|        2.3|1600.0|  High Torque|
|     Bugatti|  Chiron Super Sport|        2.4|1600.0|  High Torque|
|     Bugatti|        Chiron Sport|        2.4|1600.0|  High Torque|
|     Bugatti|                Divo|        2.4|1600.0|  High Torque|
|     Bugatti|          Centodieci|        2.4|1600.0|  High Torque|
|     Bugatti|             Mistral|        2.4|1600.0|  High Torque|
|      Nissan|GT-R50 by Italdesign|        2.5| 780.0|Medium Torque|
|     FERRARI|       SF90 STRADALE|        2.5| 800.0|  High Torque|
|ASTON MARTIN|            VALKYRIE

In [195]:
# Grouping by torque status
spark.sql(
    """
    select 
        torque_status, 
        count(*) as no_of_cars, 
        round(avg(performance), 2) as avg_acceleration
    from ( 
        select 
            company, 
            name, 
            performance, 
            torque, 
            case
                when torque >= 0 and torque < 400 then 'Low Torque'
                when torque >= 400 and torque < 800 then 'Medium Torque'
                when torque >= 800 then 'High Torque'
            end as torque_status
        from car_dataset
        order by performance
    ) t
    group by torque_status
    """
).show()


+-------------+----------+----------------+
|torque_status|no_of_cars|avg_acceleration|
+-------------+----------+----------------+
|   Low Torque|       507|            8.89|
|  High Torque|        96|            5.32|
|Medium Torque|       439|            5.54|
+-------------+----------+----------------+



### XII. Cars Above Overall Average Horsepower

In [193]:
spark.sql(
    """
    select 
        company, 
        name, 
        performance, 
        torque, 
        horse_power,
        round(
            horse_power - (
                select round(avg(horse_power), 2) as avg_hp
                from car_dataset
            ), 
            2
        ) as gap
    from car_dataset
    where horse_power > (
        select round(avg(horse_power), 2) as avg_hp
        from car_dataset
    )
    """
).show()


+------------+--------------------+-----------+------+-----------+------+
|     company|                name|performance|torque|horse_power|   gap|
+------------+--------------------+-----------+------+-----------+------+
|     FERRARI|       SF90 STRADALE|        2.5| 800.0|      963.0|641.96|
| ROLLS ROYCE|             PHANTOM|        5.3| 900.0|      563.0|241.96|
|    MERCEDES|             GT 63 S|        3.2| 900.0|      630.0|308.96|
|        AUDI|          AUDI R8 Gt|        3.6| 560.0|      602.0|280.96|
|         BMW|        Mclaren 720s|        2.9| 770.0|      710.0|388.96|
|ASTON MARTIN|          VANTAGE F1|        3.6| 685.0|      656.0|334.96|
|     BENTLEY|Continental GT Azure|        4.0| 900.0|      550.0|228.96|
| LAMBORGHINI|     VENENO ROADSTER|        2.9| 690.0|      750.0|428.96|
|     FERRARI|          F8 TRIBUTO|        2.9| 770.0|      710.0|388.96|
|     FERRARI|             812 GTS|        2.9| 718.0|      789.0|467.96|
|     FERRARI|           PORTOFINO|   