In [170]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [171]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Car_Analysis")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")
# Tip to reader: use WARN for development, ERROR in prod

In [172]:
df = spark.read.csv(
    path="dataset/Cars Datasets 2025.csv",
    header=True,
    mode="PERMISSIVE",
)

### Data Transformations

In [209]:
df_clean = df.dropna(how="any")

In [210]:
df_clean = (
    df_clean.withColumnRenamed("Company Names", "company_name")
    .withColumnRenamed("Cars Names", "car_name")
    .withColumnRenamed("Total Speed", "total_speed")
    .withColumnRenamed("Performance(0 - 100 )KM/H", "performance")
    .withColumnRenamed("HorsePower", "horse_power")
    .withColumnRenamed("Fuel Types", "fuel_type")
    .withColumnRenamed("Cars Prices", "price")
)

In [211]:
df_clean = (
    df_clean.withColumn("horse_power", F.regexp_replace("horse_power", r"\(.*?\)", ""))
    .withColumn("horse_power", F.regexp_replace("horse_power", r"[^\d\-\./]", " "))
    .withColumn("horse_power", F.trim(F.col("horse_power")))
)

In [212]:
df_clean = df_clean.withColumn(
    "horse_power",
    F.when(
        F.col("horse_power").rlike(r"^\d+\s*-\s*\d+$"),
        (
            F.split(F.col("horse_power"), "-").getItem(0).cast("int")
            + F.split(F.col("horse_power"), "-").getItem(1).cast("int")
        )
        / 2,
    )
    .when(
        F.col("horse_power").rlike(r"^\d+\s*/\s*\d+$"),
        (
            F.split(F.col("horse_power"), "/").getItem(0).cast("int")
            + F.split(F.col("horse_power"), "/").getItem(1).cast("int")
        )
        / 2,
    )
    .when(
        F.col("horse_power").rlike(r"^\d+(\.\d+)?$"),
        F.col("horse_power").cast("double"),
    )
    .otherwise(None),
)

In [None]:
df_clean = (
    df_clean.withColumn("price", F.regexp_replace("price", r"[^\d]", ""))
    .withColumn("price", F.trim(F.col("price")))
    .withColumn("price", F.when(F.col("price") == "", None) # handling "" values.
                .otherwise(F.col("price").cast("double") # convert all other to double
    ))
)
df_clean.show(1)

+------------+-------------+-------+-------------------+-----------+-----------+-----------+-----+---------------+-----+------+
|company_name|     car_name|Engines|CC/Battery Capacity|horse_power|total_speed|performance|price|      fuel_type|Seats|Torque|
+------------+-------------+-------+-------------------+-----------+-----------+-----------+-----+---------------+-----+------+
|     FERRARI|SF90 STRADALE|     V8|            3990 cc|      963.0|   340 km/h|    2.5 sec|1.1E7|plug in hyrbrid|    2|800 Nm|
+------------+-------------+-------+-------------------+-----------+-----------+-----------+-----+---------------+-----+------+
only showing top 1 row


In [214]:
df_clean = df_clean.dropna(subset=["price"])
df_clean.count()

1208

In [215]:
df_clean.createOrReplaceTempView("car_dataset")

### Average horse_power by Fuel type

In [147]:
avg_hs_fuelt_type = spark.sql(
    """
    select fuel_type, round(avg(horse_power), 2) as avg_hp
    from car_dataset
    group by fuel_type
    order by avg_hp desc
    """
)

### Most Powerful Engine per Company

In [148]:
most_powerful = spark.sql(
    """
    select * 
    from (
        select company_name, car_name, total_speed, 
            row_number() over(partition by company_name order by horse_power desc) as rn
        from car_dataset
    ) t
    where rn = 1
    """
)

### Fastest Acceleration Ranking - in process

### Top 3 Expensive Cars per Company

In [217]:
top3_expensive = spark.sql(
    """
    select * 
    from ( 
        select company_name, car_name, price, 
            dense_rank() over(partition by company_name order by price desc) as rank
        from car_dataset
    ) t
    where rank <= 3
    """
)
top3_expensive.show()

+------------+--------------------+---------+----+
|company_name|            car_name|    price|rank|
+------------+--------------------+---------+----+
|ASTON MARTIN|            VALKYRIE|    3.2E7|   1|
|ASTON MARTIN|              VICTOR|    1.3E7|   2|
|ASTON MARTIN|            VALHALLA|    1.1E7|   3|
|ASTON MARTIN|       LAGONDA TARAF|    1.1E7|   3|
|        AUDI|          AUDI R8 Gt|2532900.0|   1|
|        AUDI|         R8 V10 PLUS|1940000.0|   2|
|        AUDI|       RS7 SPORTBACK|1140000.0|   3|
|       Acura|          NSX Type S|1570000.0|   1|
|       Acura|NSX 3.5L Hybrid C...|1570000.0|   1|
|       Acura|  NSX Carbon Edition|1570000.0|   1|
|       Acura|NSX GT3 (Track Ve...|1500000.0|   2|
|       Acura|     TLX PMC Edition| 620000.0|   3|
|     BENTLEY|Continental GT Azure|3110000.0|   1|
|         BMW|        Mclaren 720s|4990000.0|   1|
|         BMW|         I8 ROADSTER|1650000.0|   2|
|         BMW|       M8 GRAN COUPE|1460000.0|   3|
|     Bugatti|          Centodi