In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
)

In [20]:
from pyspark.sql import SparkSession

# Assemble the session step by step so each setting sits on its own statement.
session_builder = SparkSession.builder.appName("Car_Analysis")
session_builder = session_builder.config("spark.executor.memory", "4g")

# getOrCreate() hands back an already-running session when one exists.
# NOTE(review): in that case the executor-memory setting above may not be
# applied to the existing session - confirm on a fresh kernel.
spark = session_builder.getOrCreate()

# Tip to reader: use WARN for development, ERROR in prod
spark.sparkContext.setLogLevel("WARN")

In [21]:
# Raw cars dataset. The first line of the file supplies the column names and
# PERMISSIVE mode keeps malformed rows instead of failing the read. No schema
# or inferSchema is passed, so the columns are loaded without type conversion.
cars_csv_path = "dataset/Cars Datasets 2025.csv"
df = spark.read.options(header=True, mode="PERMISSIVE").csv(cars_csv_path)

In [23]:
# Preview the first ten raw rows to see the units embedded in each column.
df.show(n=10)

+-------------+--------------------+-----------+-------------------+----------+-----------+-------------------------+---------------+---------------+-----+------------+
|Company Names|          Cars Names|    Engines|CC/Battery Capacity|HorsePower|Total Speed|Performance(0 - 100 )KM/H|    Cars Prices|     Fuel Types|Seats|      Torque|
+-------------+--------------------+-----------+-------------------+----------+-----------+-------------------------+---------------+---------------+-----+------------+
|      FERRARI|       SF90 STRADALE|         V8|            3990 cc|    963 hp|   340 km/h|                  2.5 sec|    $1,100,000 |plug in hyrbrid|    2|      800 Nm|
|  ROLLS ROYCE|             PHANTOM|        V12|            6749 cc|    563 hp|   250 km/h|                  5.3 sec|      $460,000 |         Petrol|    5|      900 Nm|
|         Ford|                 KA+|1.2L Petrol|           1,200 cc|  70-85 hp|   165 km/h|                 10.5 sec|$12,000-$15,000|         Petrol|    5|

### Data Transformations

In [33]:
def _horse_power_to_double(raw_hp):
    """Return a Column expression that parses a raw horsepower string to a double.

    Parsing rules, applied in order:

    * Parenthesised annotations are removed.
    * A single-digit thousands group ("1,200") is joined into one number, so
      such values are parsed instead of becoming NULL.
    * Every character that is not a digit, '-', '.' or '/' is blanked out and
      the result is trimmed.
    * "A-B" or "A/B" (two numbers, optional surrounding spaces) becomes the
      midpoint (A + B) / 2.
    * A single number becomes that number.
    * Anything else, including NULL input, yields NULL.

    Args:
        raw_hp: Column holding the raw horsepower text (e.g. "963 hp",
            "70-85 hp").

    Returns:
        A Column of doubles (NULL where the text cannot be parsed).
    """
    # Strip "(...)" annotations first so digits inside them are not parsed.
    text = F.regexp_replace(raw_hp, r"\(.*?\)", "")

    # The dataset writes thousands with commas in other columns
    # (e.g. "1,200 cc"). Without this step "1,200 hp" would become "1 200",
    # match no pattern below and end up NULL. The lookarounds restrict the
    # rewrite to exactly d,ddd so a list like "150,200" is left untouched.
    text = F.regexp_replace(text, r"(?<!\d)(\d),(\d{3})(?!\d)", "$1$2")

    # Replace units and any other noise with spaces, then trim the edges.
    text = F.trim(F.regexp_replace(text, r"[^\d\-\./]", " "))

    number = r"\d+(\.\d+)?"

    # Splitting on the separator *and* its surrounding whitespace yields clean
    # bounds, so the cast does not depend on whitespace trimming.
    bounds = F.split(text, r"\s*[-/]\s*")
    midpoint = (
        bounds.getItem(0).cast("double") + bounds.getItem(1).cast("double")
    ) / 2

    # `midpoint` is only evaluated for rows matching the range pattern, which
    # guarantees exactly two bounds. With no otherwise(), unmatched rows are NULL.
    return F.when(text.rlike(rf"^{number}\s*[-/]\s*{number}$"), midpoint).when(
        text.rlike(rf"^{number}$"), text.cast("double")
    )


# Parse horsepower to a numeric column and switch the two columns used by the
# analysis below to snake_case names. Column order is unchanged.
df_clean = (
    df.withColumn("HorsePower", _horse_power_to_double(F.col("HorsePower")))
    .withColumnRenamed("HorsePower", "horse_power")
    .withColumnRenamed("Fuel Types", "fuel_type")
)
df_clean.show(3)

+-------------+-------------+-----------+-------------------+-----------+-----------+-------------------------+---------------+---------------+-----+------------+
|Company Names|   Cars Names|    Engines|CC/Battery Capacity|horse_power|Total Speed|Performance(0 - 100 )KM/H|    Cars Prices|      fuel_type|Seats|      Torque|
+-------------+-------------+-----------+-------------------+-----------+-----------+-------------------------+---------------+---------------+-----+------------+
|      FERRARI|SF90 STRADALE|         V8|            3990 cc|      963.0|   340 km/h|                  2.5 sec|    $1,100,000 |plug in hyrbrid|    2|      800 Nm|
|  ROLLS ROYCE|      PHANTOM|        V12|            6749 cc|      563.0|   250 km/h|                  5.3 sec|      $460,000 |         Petrol|    5|      900 Nm|
|         Ford|          KA+|1.2L Petrol|           1,200 cc|       77.5|   165 km/h|                 10.5 sec|$12,000-$15,000|         Petrol|    5|100 - 140 Nm|
+-------------+-------

### Average Horsepower by Fuel Type

In [40]:
# Keep only rows where both the parsed horsepower and the fuel type are present;
# rows whose horsepower text could not be parsed are NULL and are removed here.
required_columns = ["horse_power", "fuel_type"]
df_clean = df_clean.dropna(subset=required_columns)

In [41]:
# Expose the cleaned frame to Spark SQL under the name "car_dataset".
df_clean.createOrReplaceTempView(name="car_dataset")

In [44]:
# Mean horsepower per fuel type, rounded to two decimals, highest first.
# NOTE(review): fuel_type is grouped exactly as written in the data, so
# variants such as "plug in hyrbrid" / "Plug-in Hybrid" or "Petrol/Hybrid" /
# "Hybrid/Petrol" are reported as separate groups - consider normalising the
# labels before aggregating.
avg_hp_by_fuel_query = """
    select fuel_type, round(avg(horse_power), 2) as avg_hp
    from car_dataset
    group by fuel_type
    order by avg_hp desc
    """
avg_hs_fuelt_type = spark.sql(avg_hp_by_fuel_query)

# Print up to 21 rows of the result.
avg_hs_fuelt_type.show(n=21)

+--------------------+------+
|           fuel_type|avg_hp|
+--------------------+------+
|     plug in hyrbrid| 963.0|
|     Hybrid (Petrol)| 567.5|
|     Petrol (Hybrid)| 400.0|
|            Electric|359.38|
|              Hybrid|320.09|
|              Petrol|311.68|
|      Plug-in Hybrid| 265.6|
|Hybrid (Gas + Ele...| 240.0|
|              Diesel|227.14|
|      Petrol, Diesel| 225.5|
|       Petrol/Diesel|220.88|
|       Hybrid/Petrol| 220.5|
|          Petrol/AWD| 213.5|
|       Petrol/Hybrid|190.97|
|      Petrol, Hybrid| 187.0|
|            Hydrogen|154.67|
|       Diesel Hybrid| 150.0|
|           Petrol/EV| 147.0|
|     Hybrid/Electric| 139.0|
|       Diesel/Petrol|111.63|
|          CNG/Petrol|  86.0|
+--------------------+------+



21