#### PySpark Configurations ####

In [1]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Adaptive Query Execution (AQE) toggle - the disabling line below is commented out, so this cell changes no setting.
# NOTE: the bare string that follows is the only expression in this cell, so Jupyter echoes its repr as the cell output.
""" 
Adaptive Query Execution (AQE) is an optimization technique in Spark SQL that makes use of the runtime statistics to choose the most efficient query execution 
plan, which is enabled by default since Apache Spark 3.2.0.
"""
# To disable AQE, run the line below only AFTER the `spark` session exists.
# The session is built in the next cell, so uncommenting it here would hit an undefined name on a fresh kernel.
# spark.conf.set("spark.sql.adaptive.enabled", False)

' \nAdaptive Query Execution (AQE) is an optimization technique in Spark SQL that makes use of the runtime statistics to choose the most efficient query execution \nplan, which is enabled by default since Apache Spark 3.2.0.\n'

In [3]:
# Build (or reuse) the SparkSession that every later cell relies on.
# Master URL, application name and UI port are fixed here.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Ansh-Lamba-Apache-Spark")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/13 19:35:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Reading data from CSV file ####

In [4]:
# Root path under which the input datasets are read.
# This only defines the path string; no directory is created on disk.
INPUT_DATA_ROOT = "/opt/spark-data/input/ansh-lamba"

In [5]:
# Load MegaMart.csv with an inferred schema, treating the first row as the header.
df_mega_mart = (
    spark.read
    .format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load(f"{INPUT_DATA_ROOT}/MegaMart.csv")
)

                                                                                

In [6]:
# Preview the first 5 rows of the raw orders without truncating cell values.
df_mega_mart.show(n=5, truncate=False)

+--------+-------+----------+----------+----------------+------------+--------+--------------+--------------+------------+
|order_id|user_id|order_date|product_id|product_category|product_name|quantity|price_per_unit|payment_method|order_status|
+--------+-------+----------+----------+----------------+------------+--------+--------------+--------------+------------+
|1001    |U188   |2025-04-20|P940      |Fashion         |Sneakers    |2       |58.53         |PayPal        |Cancelled   |
|1002    |U062   |2025-04-16|P794      |Fashion         |T-Shirt     |3       |83.76         |UPI           |Returned    |
|1003    |U058   |2025-04-18|P326      |Fashion         |Sunglasses  |2       |78.85         |PayPal        |Processing  |
|1004    |U011   |2025-04-10|P574      |Fashion         |Sunglasses  |5       |46.49         |PayPal        |Delivered   |
|1005    |U003   |2025-04-19|P988      |Home Decor      |Photo Frame |2       |78.61         |PayPal        |Returned    |
+--------+------

In [7]:
# Print the column names and the types that schema inference assigned on read.
df_mega_mart.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_per_unit: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)



#### Transformations - Narrow & Wide ####

In [8]:
# Keep only the orders whose product_category equals 'Fashion'.
# The result is bound to a new name, so df_mega_mart stays untouched.
df_fashion = df_mega_mart.filter(
    col("product_category") == "Fashion"
)

In [9]:
# Preview the first 5 Fashion orders without truncating cell values.
df_fashion.show(n=5, truncate=False)

+--------+-------+----------+----------+----------------+------------+--------+--------------+--------------+------------+
|order_id|user_id|order_date|product_id|product_category|product_name|quantity|price_per_unit|payment_method|order_status|
+--------+-------+----------+----------+----------------+------------+--------+--------------+--------------+------------+
|1001    |U188   |2025-04-20|P940      |Fashion         |Sneakers    |2       |58.53         |PayPal        |Cancelled   |
|1002    |U062   |2025-04-16|P794      |Fashion         |T-Shirt     |3       |83.76         |UPI           |Returned    |
|1003    |U058   |2025-04-18|P326      |Fashion         |Sunglasses  |2       |78.85         |PayPal        |Processing  |
|1004    |U011   |2025-04-10|P574      |Fashion         |Sunglasses  |5       |46.49         |PayPal        |Delivered   |
|1012    |U148   |2025-04-24|P315      |Fashion         |Sunglasses  |5       |69.14         |Credit Card   |Processing  |
+--------+------

In [10]:
# Print the physical plan for the filter (simple mode, i.e. the default).
df_fashion.explain(extended=False)

== Physical Plan ==
*(1) Filter (isnotnull(product_category#21) AND (product_category#21 = Fashion))
+- FileScan csv [order_id#17,user_id#18,order_date#19,product_id#20,product_category#21,product_name#22,quantity#23,price_per_unit#24,payment_method#25,order_status#26] Batched: false, DataFilters: [isnotnull(product_category#21), (product_category#21 = Fashion)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/spark-data/input/ansh-lamba/MegaMart.csv], PartitionFilters: [], PushedFilters: [IsNotNull(product_category), EqualTo(product_category,Fashion)], ReadSchema: struct<order_id:int,user_id:string,order_date:date,product_id:string,product_category:string,prod...




In [11]:
# Count orders per product_category.
# NOTE(review): count() on a column skips nulls in Spark SQL, so total_records
# equals the row count only if order_id is never null - confirm for this dataset.
df_product_category_summary = (
    df_mega_mart
    .groupby("product_category")
    .agg(count(col("order_id")).alias("total_records"))
)

In [12]:
# Show up to 5 rows of the per-category counts without truncating cell values.
df_product_category_summary.show(n=5, truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]

+----------------+-------------+
|product_category|total_records|
+----------------+-------------+
|Kitchen         |203          |
|Fashion         |188          |
|Electronics     |223          |
|Books           |193          |
|Home Decor      |193          |
+----------------+-------------+



                                                                                

In [13]:
# Print the physical plan for the aggregation (simple mode, i.e. the default).
df_product_category_summary.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_category#21], functions=[count(order_id#17)])
   +- Exchange hashpartitioning(product_category#21, 200), ENSURE_REQUIREMENTS, [plan_id=108]
      +- HashAggregate(keys=[product_category#21], functions=[partial_count(order_id#17)])
         +- FileScan csv [order_id#17,product_category#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/spark-data/input/ansh-lamba/MegaMart.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<order_id:int,product_category:string>




#### Transformations - Repartition VS. Coalesce ####

In [14]:
# Baseline: number of partitions of the aggregated DataFrame, before any repartition/coalesce.
df_product_category_summary.rdd.getNumPartitions()

1

In [15]:
# Redistribute the summary into 3 partitions with repartition().
# NOTE: this rebinds the same name, so re-running the cell applies repartition
# to the already-repartitioned frame instead of the original aggregate.
df_product_category_summary = df_product_category_summary.repartition(numPartitions=3)

In [16]:
# Partition count after the repartition(3) step in the cell above.
df_product_category_summary.rdd.getNumPartitions()

3

In [17]:
# Print the physical plan after repartition (simple mode, i.e. the default).
df_product_category_summary.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   ShuffleQueryStage 1
   +- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=185]
      +- *(2) HashAggregate(keys=[product_category#21], functions=[count(order_id#17)])
         +- AQEShuffleRead coalesced
            +- ShuffleQueryStage 0
               +- Exchange hashpartitioning(product_category#21, 200), ENSURE_REQUIREMENTS, [plan_id=162]
                  +- *(1) HashAggregate(keys=[product_category#21], functions=[partial_count(order_id#17)])
                     +- FileScan csv [order_id#17,product_category#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/spark-data/input/ansh-lamba/MegaMart.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<order_id:int,product_category:string>
+- == Initial Plan ==
   Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=150]
   +- HashAggregate(keys=[product_category#21], func

In [18]:
# Collapse the summary back to a single partition with coalesce().
# NOTE: this rebinds the same name on top of the earlier repartition(3),
# so the plan keeps both steps rather than replacing one with the other.
df_product_category_summary = df_product_category_summary.coalesce(numPartitions=1)

In [19]:
# Partition count after the coalesce(1) step in the cell above.
df_product_category_summary.rdd.getNumPartitions()

1

In [20]:
# Print the physical plan after coalesce (simple mode, i.e. the default).
df_product_category_summary.explain(extended=False)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   Coalesce 1
   +- ShuffleQueryStage 1
      +- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=255]
         +- *(2) HashAggregate(keys=[product_category#21], functions=[count(order_id#17)])
            +- AQEShuffleRead coalesced
               +- ShuffleQueryStage 0
                  +- Exchange hashpartitioning(product_category#21, 200), ENSURE_REQUIREMENTS, [plan_id=228]
                     +- *(1) HashAggregate(keys=[product_category#21], functions=[partial_count(order_id#17)])
                        +- FileScan csv [order_id#17,product_category#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/opt/spark-data/input/ansh-lamba/MegaMart.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<order_id:int,product_category:string>
+- == Initial Plan ==
   Coalesce 1
   +- Exchange RoundRobinPartitioning(3), REPARTITION_BY_NUM, [plan_id=21

#### Transformations - JOINS ####