***Different functions of PySpark***

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName('spark_app_3')
    .master('local[*]')
    .getOrCreate()
)

In [2]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [3]:
# Load the car price CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'car_price_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first five rows of the loaded DataFrame.
df.show(n=5)

+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [5]:
# Filtering rows with filter() / where()
# docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.filter.html

# Keep only the hybrid cars and preview five of them.
(
    df
    .filter(df['Fuel_Type'] == 'Hybrid')
    .show(n=5)
)

# where() is an alias of filter(). This second example is a different query:
# cars priced above 11444, listed from the most expensive down.
(
    df
    .where(df['Price'] > 11444)
    .orderBy(df['Price'].desc())
    .show(n=5)
)

+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand|   Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+-----+
| Chevrolet|  Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|Volkswagen|    Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
|       Kia|Sportage|2014|        2.6|   Hybrid|        Manual|  98700|    3|          4| 9926|
|       BMW|5 Series|2013|        1.3|   Hybrid|     Automatic| 296824|    2|          3| 5863|
|Volkswagen|    Golf|2009|        4.5|   Hybrid|        Manual|  42795|    4|          3|11444|
+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows

+------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
| Brand|   Model|Year

In [6]:
# Combining more than one filter condition
# Chained filter() calls keep only the rows that satisfy every condition (logical AND).
df.filter(df.Engine_Size == 5.0).filter(df.Transmission == 'Automatic').show(5)

# Conditions can also be combined inside a single filter() call, using '&' for AND
# and '|' for OR. Each comparison needs its own parentheses, and because '&' binds
# tighter than '|' the AND/OR grouping is written out explicitly below: the query
# keeps automatic Corollas plus every A3 regardless of its transmission.
# NOTE(review): if the intent was "automatic AND (Corolla OR A3)", move the
# parentheses around the two Model comparisons instead.
df.filter(
    ((df.Transmission == 'Automatic') & (df.Model == 'Corolla'))
    | (df.Model == 'A3')
).show(5)

+---------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
|    Brand|   Model|Year|Engine_Size|Fuel_Type|Transmission|Mileage|Doors|Owner_Count|Price|
+---------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
|     Ford|Explorer|2013|        5.0| Electric|   Automatic| 188195|    5|          3|12736|
|     Ford|  Fiesta|2011|        5.0|   Petrol|   Automatic| 125014|    4|          5|11399|
|Chevrolet|  Impala|2019|        5.0|   Petrol|   Automatic| 266558|    2|          2|10968|
|   Toyota|    RAV4|2011|        5.0|   Hybrid|   Automatic|  13473|    5|          4|14630|
|      Kia|  Optima|2022|        5.0| Electric|   Automatic| 260953|    5|          4|13980|
+---------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
only showing top 5 rows

+------+-------+----+-----------+---------+--------------+-------+-----+-----------+-----+
| Brand|  Model|Year|Engine_Size|Fuel_Type|  Tr

In [7]:
# Stop the SparkSession (`spark`) to release its resources.
spark.stop()