## ***Advance Usecases in Spark***

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType,DateType

In [2]:
spark = SparkSession.builder.appName('spark_app_4').master('local[*]').getOrCreate()

In [3]:
# Defining Schema for the dataset
# Like we do model in django
schema = StructType([
    StructField('Brand', StringType(), nullable=True),
    StructField('Model', StringType(), nullable=True),
    StructField('Year', DateType(), nullable=True),
    StructField('Engine_Size', FloatType(), nullable=True),
    StructField('Fuel_Type', StringType(), nullable=True),
    StructField('Transmission', StringType(), nullable=True),
    StructField('Mileage', IntegerType(), nullable=True),
    StructField('Doors', IntegerType(), nullable=True),
    StructField('Owner_Count', IntegerType(), nullable=True),
    StructField('Price', IntegerType(), nullable=True)
])

In [4]:
# Creating a dataframe
df = spark.read.csv('car_price_dataset.csv', schema=schema, header=True)
df.show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [5]:
# To know the data types of the columns of the dataframe
df.printSchema(), df.dtypes

root
 |-- Brand: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: date (nullable = true)
 |-- Engine_Size: float (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Owner_Count: integer (nullable = true)
 |-- Price: integer (nullable = true)



(None,
 [('Brand', 'string'),
  ('Model', 'string'),
  ('Year', 'date'),
  ('Engine_Size', 'float'),
  ('Fuel_Type', 'string'),
  ('Transmission', 'string'),
  ('Mileage', 'int'),
  ('Doors', 'int'),
  ('Owner_Count', 'int'),
  ('Price', 'int')])

In [6]:
# Filter data 
from pyspark.sql.functions import lower, upper
from datetime import datetime
df.filter(
    (lower(df.Model) == 'rio') & 
    (df.Year == datetime.strptime('2020', '%Y'))
).show(5)

+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|Brand|Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|  Kia|  Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
|  Kia|  Rio|2020-01-01|        1.8|   Hybrid|     Automatic|  28207|    5|          5|13835|
|  Kia|  Rio|2020-01-01|        3.1|   Diesel|Semi-Automatic| 175803|    5|          4| 9683|
|  Kia|  Rio|2020-01-01|        1.3|   Hybrid|Semi-Automatic| 232567|    5|          5| 7748|
|  Kia|  Rio|2020-01-01|        3.8|   Hybrid|        Manual| 137014|    4|          3|12159|
+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [7]:
# to get the first 'n'number of rows df.take(n) or df.show(5),
# Here df.show(5) return data in a tabular format and df.show returns data in a list of rowe format.
df.take(5),  df.show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



([Row(Brand='Kia', Model='Rio', Year=datetime.date(2020, 1, 1), Engine_Size=4.199999809265137, Fuel_Type='Diesel', Transmission='Manual', Mileage=289944, Doors=3, Owner_Count=5, Price=8501),
  Row(Brand='Chevrolet', Model='Malibu', Year=datetime.date(2012, 1, 1), Engine_Size=2.0, Fuel_Type='Hybrid', Transmission='Automatic', Mileage=5356, Doors=2, Owner_Count=3, Price=12092),
  Row(Brand='Mercedes', Model='GLA', Year=datetime.date(2020, 1, 1), Engine_Size=4.199999809265137, Fuel_Type='Diesel', Transmission='Automatic', Mileage=231440, Doors=4, Owner_Count=2, Price=11171),
  Row(Brand='Audi', Model='Q5', Year=datetime.date(2023, 1, 1), Engine_Size=2.0, Fuel_Type='Electric', Transmission='Manual', Mileage=160971, Doors=2, Owner_Count=1, Price=11780),
  Row(Brand='Volkswagen', Model='Golf', Year=datetime.date(2003, 1, 1), Engine_Size=2.5999999046325684, Fuel_Type='Hybrid', Transmission='Semi-Automatic', Mileage=286618, Doors=3, Owner_Count=3, Price=2867)],
 None)

***Manupulating columns in a DataFrame***
- df.select('<columnn Name>') or df.select(['\<Column_1\>', '\<Column_2\>', ....]).
- df.filter(df.column_name == 'some name') or df.filter((df.column_name == 'some name 1') & (df.column_name_2 = 'some name 2'))
- df.orderBy(df.column_name.desc) used for ordering of data.

In [8]:
# Selectingsingle or multipe Columns
df.select('Brand').show(2)
df.select(['Brand', 'Model']).show(2)

+---------+
|    Brand|
+---------+
|      Kia|
|Chevrolet|
+---------+
only showing top 2 rows

+---------+------+
|    Brand| Model|
+---------+------+
|      Kia|   Rio|
|Chevrolet|Malibu|
+---------+------+
only showing top 2 rows



In [9]:
# Filter command
# Filterind data where brand name is Rio.
df.filter(df.Brand == 'Kia').show(2)

+-----+--------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|Brand|   Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+-----+--------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|  Kia|     Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
|  Kia|Sportage|2001-01-01|        4.7| Electric|Semi-Automatic| 157495|    2|          2| 7950|
+-----+--------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 2 rows



In [10]:
#  Filtering data case Insensitively using lower function in pyspark.sql.functions.lower
df.filter(lower(df.Model) == 'rio').show(2)

+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|Brand|Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|  Kia|  Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
|  Kia|  Rio|2000-01-01|        3.4|   Diesel|Semi-Automatic| 257427|    3|          3| 2351|
+-----+-----+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 2 rows



In [11]:
# Filterinf data with multiple commands with AND statement
df.filter(
    (df.Brand == 'Mercedes') &
    (df.Price > '11171')
).show()

+--------+-------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|   Brand|  Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+--------+-------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|Mercedes|    GLA|2021-01-01|        1.5|   Petrol|        Manual|  34640|    2|          1|11207|
|Mercedes|    GLA|2016-01-01|        3.9|   Petrol|Semi-Automatic|  19868|    3|          4|12402|
|Mercedes|    GLA|2013-01-01|        3.3| Electric|Semi-Automatic|  80184|    3|          5|11696|
|Mercedes|C-Class|2023-01-01|        4.6| Electric|Semi-Automatic| 274137|    2|          3|12117|
|Mercedes|C-Class|2018-01-01|        4.0|   Petrol|        Manual|  66347|    3|          2|12173|
|Mercedes|E-Class|2023-01-01|        3.9|   Hybrid|Semi-Automatic|  58107|    3|          2|14737|
|Mercedes|E-Class|2014-01-01|        3.8| Electric|Semi-Automatic|   5877|    5|          5|13982|
|Mercedes|

In [12]:
# Filtering data with Or Statement
df.filter(
    (df.Brand == 'Mercedes') |
    (df.Brand == 'Kia')
).show()

+--------+--------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|   Brand|   Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+--------+--------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|     Kia|     Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
|Mercedes|     GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|     Kia|Sportage|2001-01-01|        4.7| Electric|Semi-Automatic| 157495|    2|          2| 7950|
|     Kia|Sportage|2014-01-01|        2.6|   Hybrid|        Manual|  98700|    3|          4| 9926|
|     Kia|     Rio|2000-01-01|        3.4|   Diesel|Semi-Automatic| 257427|    3|          3| 2351|
|Mercedes|     GLA|2021-01-01|        1.5|   Petrol|        Manual|  34640|    2|          1|11207|
|Mercedes|     GLA|2016-01-01|        3.9|   Petrol|Semi-Automatic|  19868|    3|          4|12402|


In [13]:
# Selecting Column after filtering
df.filter(
    (df.Brand == 'Mercedes') |
    (df.Brand == 'Kia')
).select(['Owner_Count', 'Doors']).show(5)

+-----------+-----+
|Owner_Count|Doors|
+-----------+-----+
|          5|    3|
|          2|    4|
|          2|    2|
|          4|    3|
|          3|    3|
+-----------+-----+
only showing top 5 rows



In [14]:
# Sorting of data based on column
# type 1
df.filter(
    (lower(df.Fuel_Type) == 'petrol') &
    (df.Brand == 'Mercedes')
).orderBy(df.Price.asc()).show()
# type 2
df.filter(
    (lower(df.Fuel_Type) == 'petrol') &
    (df.Brand == 'Mercedes')
).sort('Price', ascending=False).show()

+--------+-------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|   Brand|  Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+--------+-------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|Mercedes|C-Class|2002-01-01|        1.2|   Petrol|        Manual| 211790|    4|          3| 2000|
|Mercedes|    GLA|2000-01-01|        2.0|   Petrol|Semi-Automatic| 289423|    2|          2| 2000|
|Mercedes|    GLA|2003-01-01|        1.3|   Petrol|        Manual| 294888|    5|          2| 2000|
|Mercedes|    GLA|2004-01-01|        2.1|   Petrol|        Manual| 297401|    4|          3| 2000|
|Mercedes|C-Class|2003-01-01|        1.0|   Petrol|        Manual| 290535|    5|          2| 2000|
|Mercedes|E-Class|2002-01-01|        1.4|   Petrol|     Automatic| 258150|    3|          2| 2437|
|Mercedes|E-Class|2009-01-01|        1.3|   Petrol|        Manual| 277187|    2|          4| 2556|
|Mercedes|

***Manipulating columns in PySpark***

In [15]:
from pyspark.sql.functions import lit, rand

In [16]:
# I want to add a new column named new_price which has 18% GST included with it
df.withColumn('price_with_tax', df.Price+(0.18*df.Price)).select(["Brand", "Model", "Price", "price_with_tax"]).show()

+----------+--------+-----+--------------+
|     Brand|   Model|Price|price_with_tax|
+----------+--------+-----+--------------+
|       Kia|     Rio| 8501|      10031.18|
| Chevrolet|  Malibu|12092|      14268.56|
|  Mercedes|     GLA|11171|      13181.78|
|      Audi|      Q5|11780|       13900.4|
|Volkswagen|    Golf| 2867|       3383.06|
|    Toyota|   Camry| 7242|       8545.56|
|     Honda|   Civic|11208|      13225.44|
|       Kia|Sportage| 7950|        9381.0|
|       Kia|Sportage| 9926|      11712.68|
|    Toyota|    RAV4| 6545|        7723.1|
|       BMW|5 Series| 5863|       6918.34|
|Volkswagen|    Golf|11444|      13503.92|
|     Honda|    CR-V|10842|      12793.56|
|   Hyundai| Elantra| 4820|        5687.6|
|Volkswagen|    Golf| 5981|       7057.58|
|Volkswagen|    Golf| 9697|      11442.46|
|   Hyundai| Elantra|14837|      17507.66|
|Volkswagen|  Tiguan|11576|      13659.68|
|       Kia|     Rio| 2351|       2774.18|
|  Mercedes|     GLA|11207|      13224.26|
+----------

***Dropping columns with PySpark***
- dropping single column. e.g df.drop('column_name')
- dropping multiple column. e.g df.drop('column1', 'column_2',...)
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.drop.html

In [17]:
df.drop('Doors').show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+-------+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|          3| 2867|
+----------+------+----------+-----------+---------+--------------+-------+-----------+-----+
only showing top 5 rows



In [18]:
df.drop('Transmission', 'Owner_Count').show(5)

+----------+------+----------+-----------+---------+-------+-----+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|Mileage|Doors|Price|
+----------+------+----------+-----------+---------+-------+-----+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel| 289944|    3| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|   5356|    2|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel| 231440|    4|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric| 160971|    2|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid| 286618|    3| 2867|
+----------+------+----------+-----------+---------+-------+-----+-----+
only showing top 5 rows



***Renaming columns in Pyspark***
- renaimng single column: df.withColumnRenamed('Prvious_column_name', 'new_column_name')
- renaming multiple columns: df.withColumnsRenamed({'from_col_1: 'to_col_1', 'from_col_2': 'to_col_2', ...})
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnsRenamed.html

In [19]:
df.withColumnRenamed('Price', 'Price_without_tax').show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----------------+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price_without_tax|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----------------+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5|             8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|    2|          3|            12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|            11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|    2|          1|            11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3|             2867|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----------

In [20]:
df.withColumnsRenamed({'Mileage': 'KM/h', 'Doors': 'Num_of_doors'}).show(5)

+----------+------+----------+-----------+---------+--------------+------+------------+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|  KM/h|Num_of_doors|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+------+------------+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual|289944|           3|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|  5356|           2|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic|231440|           4|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual|160971|           2|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic|286618|           3|          3| 2867|
+----------+------+----------+-----------+---------+--------------+------+------------+-----------+-----+
only showing top 5 rows



***Using Pyspark Functions***
- It is present in the pyspark.sql.functions
- Docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html
- Will Handle all the functions in a separeate file.

***Group By***
- df.groupBy('column_name').sum/mean or any formula
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.groupBy.html

In [21]:
df.groupBy('Brand').avg('Price').show()

+----------+-----------------+
|     Brand|       avg(Price)|
+----------+-----------------+
|Volkswagen|8928.377450980392|
|       Kia| 8880.08606557377|
| Chevrolet|9015.683948155533|
|   Hyundai|8778.279396984924|
|     Honda|8665.596630327056|
|      Audi| 8929.37379576108|
|  Mercedes|8980.087048832273|
|       BMW|8704.068068068069|
|    Toyota|8798.184536082474|
|      Ford|8852.570610687022|
+----------+-----------------+



In [22]:
# e.g 2
df.groupBy('Fuel_Type').avg('Mileage').withColumnRenamed('avg(Mileage)', 'AverageMileage').show()

+---------+------------------+
|Fuel_Type|    AverageMileage|
+---------+------------------+
|   Diesel|150261.53304140127|
|   Hybrid|145577.58703628212|
| Electric|151059.30742857142|
|   Petrol|149917.69460580914|
+---------+------------------+



In [23]:
# e.g 3
df.groupBy('Fuel_Type').sum('Price').withColumnRenamed('sum(Price)', 'SumOfPrice').show()

+---------+----------+
|Fuel_Type|SumOfPrice|
+---------+----------+
|   Diesel|  20390749|
|   Hybrid|  22354263|
| Electric|  26334578|
|   Petrol|  19450054|
+---------+----------+



***User Defind Functions in Pyspark(UDF)***
- User defind functions can be found on pyspark.sql.functions
- We can use any function as sparks udf with adding decorator to a function.
- docs: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html

In [24]:
# from pyspark.sql.functions import udf
# from pyspark.sql.types import IntegerType

# @udf(returnType=IntegerType())
# def my_func(num):
#     # function to add 18% of GST to price
#     if num is not None:
#         return num + 2
# # This will work with spark enging not inbuilt java engine 
# df.select(my_func('Price')).show()


# # tried this with google colab with ths example 

# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName('my_app').master('local[*]').getOrCreate()

# df = spark.read.csv('/content/sample_data/california_housing_test.csv', header=True)
# df.printSchema()

# from pyspark.sql.functions import udf

# @udf
# def my_func(name):
#   if name is not None:
#     return str(float(name) + 1)

# df.select('*', my_func(df.housing_median_age).alias('housing_median_age + 1')).show(5)

***Pandas dataFrame to PySpark  dataframe and vice versa***
- to pandas: df_pd = df.toPandas()
- to spasrk: df_sp = spark_session.createDataFrame(df_pd)
- docs: https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/pandas_pyspark.html

In [25]:
# converting spark dataframe to pandas
df_pd = df.toPandas()
df_pd.head(5)

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020-01-01,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012-01-01,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020-01-01,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023-01-01,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003-01-01,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [26]:
df_pd.dtypes

Brand            object
Model            object
Year             object
Engine_Size     float32
Fuel_Type        object
Transmission     object
Mileage           int32
Doors             int32
Owner_Count       int32
Price             int32
dtype: object

In [27]:
# converting pandas dataframe to spark dataframe
# This will workj with spark engind tested in google colab
# new_sp_df = spark.createDataFrame(df_pd)
# new_sp_df.show()

***Handling null values in pyspark DataFrame***
- df.na.fill(0).show()

In [28]:
df.na.fill(0).show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [29]:
df.fillna({"Model": "classified"}).show(5)

+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|      Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020-01-01|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012-01-01|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020-01-01|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023-01-01|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003-01-01|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----------+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [30]:
spark.stop()