In [1]:
from pyspark.sql import SparkSession
import os
import sys

# Point PySpark's worker and driver Python at the interpreter running this
# kernel, so Spark does not pick up a different Python installation.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
# Create (or reuse) a Spark session running locally on all available cores.
# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName('spark_app_2')
    .master('local[*]')
    .getOrCreate()
)

***Creating a dataframe in pyspark***

- There are four ways to create a DataFrame:
  1. *Programmatically:* From a list of values (list of tuples, list of values, dictionary, etc.).
  2. *From Pandas Dataframe:* (Only valid with the python API)
  3. *From a Spark RDD*: by Defining the structure of the data.
  4. *From Datasources*: Spark sql supports reading external files through different methods.
     - *JSON File:* sc.read.json('path/to/file') method; by default every line of the file must be a complete JSON object.
     - *Parquet File:* sc.read.parquet('path/to/file') method
     - *Csv File:* sc.read.csv('path/to/file') method
     - Other kinds of files/data sources can be read with sc.read.format('...').load('...') (here `sc` is this notebook's SparkSession)

In [3]:
# Load the car-price CSV: the first line supplies the column names and Spark
# infers each column's data type from the file contents.
df = (
    sc.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('car_price_dataset.csv')
)
df.show()

+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand|   Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|     Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|  Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|     GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|      Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|    Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
|    Toyota|   Camry|2007|        2.7|   Petrol|     Automatic| 157889|    4|          4| 7242|
|     Honda|   Civic|2010|        3.4| Electric|     Automatic| 139584|    3|          1|11208|
|       Kia|Sportage|2001|        4.7| E

In [4]:
# Creating a DataFrame programmatically
# To make this example work, the Python version needs to be downgraded to 3.9.
# os.environ['PYSPARK_PYTHON'] = sys.executable
# os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# header = ['city', 'type', 'price']
# data = map(
#     lambda r: (r[0], r[1], float(r[2])), map(
#     lambda x: x.split(','),
#         ["Paris,Food,19.00", "Marseille,Clothing,12.00",
#             "Paris,Food,8.00", "Paris,Clothing,15.00",
#             "Marseille,Food,20.00", "Lyon,Book,10.00"]
# ))
# progrmdf = sc.createDataFrame(data, schema=header)

# progrmdf.show()

In [5]:
from platform import python_version

# Report the Python version of the interpreter running this kernel.
print(python_version())

3.12.2


In [6]:
# take(n): returns the first n rows (here n=5) as a list of Row objects
df.take(5)

[Row(Brand='Kia', Model='Rio', Year=2020, Engine_Size=4.2, Fuel_Type='Diesel', Transmission='Manual', Mileage=289944, Doors=3, Owner_Count=5, Price=8501),
 Row(Brand='Chevrolet', Model='Malibu', Year=2012, Engine_Size=2.0, Fuel_Type='Hybrid', Transmission='Automatic', Mileage=5356, Doors=2, Owner_Count=3, Price=12092),
 Row(Brand='Mercedes', Model='GLA', Year=2020, Engine_Size=4.2, Fuel_Type='Diesel', Transmission='Automatic', Mileage=231440, Doors=4, Owner_Count=2, Price=11171),
 Row(Brand='Audi', Model='Q5', Year=2023, Engine_Size=2.0, Fuel_Type='Electric', Transmission='Manual', Mileage=160971, Doors=2, Owner_Count=1, Price=11780),
 Row(Brand='Volkswagen', Model='Golf', Year=2003, Engine_Size=2.6, Fuel_Type='Hybrid', Transmission='Semi-Automatic', Mileage=286618, Doors=3, Owner_Count=3, Price=2867)]

In [7]:
# printSchema(): prints the DataFrame's schema as a tree, showing each column's
# name, data type and nullability
df.printSchema()

root
 |-- Brand: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Engine_Size: double (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Owner_Count: integer (nullable = true)
 |-- Price: integer (nullable = true)



In [8]:
# dtypes: returns the DataFrame's column names and data types as a list of
# (column_name, type_name) tuples
df.dtypes

[('Brand', 'string'),
 ('Model', 'string'),
 ('Year', 'int'),
 ('Engine_Size', 'double'),
 ('Fuel_Type', 'string'),
 ('Transmission', 'string'),
 ('Mileage', 'int'),
 ('Doors', 'int'),
 ('Owner_Count', 'int'),
 ('Price', 'int')]

In [9]:
# explain(): prints the physical plan Spark will run for this DataFrame
# (here a CSV file scan, including the file location and the read schema)
df.explain()

== Physical Plan ==
FileScan csv [Brand#17,Model#18,Year#19,Engine_Size#20,Fuel_Type#21,Transmission#22,Mileage#23,Doors#24,Owner_Count#25,Price#26] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/D:/codes/pyspark/notebooks/car_price_dataset.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Brand:string,Model:string,Year:int,Engine_Size:double,Fuel_Type:string,Transmission:string...




In [10]:
# select(): takes column names (here given as a list) and returns a DataFrame
# containing only those columns
df.select(['Brand', 'Model']).show()

+----------+--------+
|     Brand|   Model|
+----------+--------+
|       Kia|     Rio|
| Chevrolet|  Malibu|
|  Mercedes|     GLA|
|      Audi|      Q5|
|Volkswagen|    Golf|
|    Toyota|   Camry|
|     Honda|   Civic|
|       Kia|Sportage|
|       Kia|Sportage|
|    Toyota|    RAV4|
|       BMW|5 Series|
|Volkswagen|    Golf|
|     Honda|    CR-V|
|   Hyundai| Elantra|
|Volkswagen|    Golf|
|Volkswagen|    Golf|
|   Hyundai| Elantra|
|Volkswagen|  Tiguan|
|       Kia|     Rio|
|  Mercedes|     GLA|
+----------+--------+
only showing top 20 rows



***Basic manipulation of schema***
**Schema:**
- In PySpark, a schema defines the structure of a DataFrame. It specifies the names and data types of the columns, ensuring data consistency and integrity. Schemas are crucial when creating DataFrames, reading data from files, or performing transformations.
- A schema is defined using StructType and StructField. StructType is a collection of StructField objects, where each StructField represents a column and its properties. These properties include the column name, data type, and whether it can contain null values.

In [11]:
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType

# Explicit schema for the car-price CSV. Every column is nullable.
# Engine_Size is declared as FloatType here, whereas schema inference on the
# same file produced a double.
schema = StructType([
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in [
        ('Brand', StringType()),
        ('Model', StringType()),
        ('Year', IntegerType()),
        ('Engine_Size', FloatType()),
        ('Fuel_Type', StringType()),
        ('Transmission', StringType()),
        ('Mileage', IntegerType()),
        ('Doors', IntegerType()),
        ('Owner_Count', IntegerType()),
        ('Price', IntegerType()),
    ]
])

schema

StructType([StructField('Brand', StringType(), True), StructField('Model', StringType(), True), StructField('Year', IntegerType(), True), StructField('Engine_Size', FloatType(), True), StructField('Fuel_Type', StringType(), True), StructField('Transmission', StringType(), True), StructField('Mileage', IntegerType(), True), StructField('Doors', IntegerType(), True), StructField('Owner_Count', IntegerType(), True), StructField('Price', IntegerType(), True)])

In [12]:
# Re-read the same CSV, this time applying the hand-written schema instead of
# letting Spark infer the column types.
main_df = (
    sc.read
    .schema(schema)
    .option('header', True)
    .csv('car_price_dataset.csv')
)

In [13]:
# Display only the first 5 rows of the explicitly-typed DataFrame
main_df.show(5)

+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [14]:
# Show the 5 highest-priced cars: sort by Price in descending order, then
# display the top 5 rows of the sorted result.
# docs:  https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.orderBy.html
main_df.orderBy('Price', ascending=False).show(5)

+------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
| Brand|   Model|Year|Engine_Size|Fuel_Type|Transmission|Mileage|Doors|Owner_Count|Price|
+------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
|Toyota| Corolla|2021|        4.7| Electric|   Automatic|  14924|    5|          3|18301|
|  Audi|      A3|2023|        5.0|   Hybrid|   Automatic|  12234|    5|          5|18255|
|  Ford|Explorer|2020|        4.8| Electric|   Automatic|  19112|    4|          5|18017|
| Honda|  Accord|2022|        4.0| Electric|   Automatic|  14658|    4|          2|17906|
| Honda|    CR-V|2023|        4.6|   Hybrid|   Automatic|  10046|    4|          4|17899|
+------+--------+----+-----------+---------+------------+-------+-----+-----------+-----+
only showing top 5 rows



In [15]:
# Print the schema of the explicitly-typed DataFrame to confirm the declared
# types were applied (Engine_Size should now be float).
main_df.printSchema()

root
 |-- Brand: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Engine_Size: float (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Mileage: integer (nullable = true)
 |-- Doors: integer (nullable = true)
 |-- Owner_Count: integer (nullable = true)
 |-- Price: integer (nullable = true)



***Case Study***
- When I declared an integer column as `StringType` in the schema, that column was read as strings instead of integers.
- When I declared a column containing string data as `IntegerType`, its values became null.

In [16]:
# Shut down the Spark session created at the top of the notebook.
sc.stop()