In [0]:
import pandas as pd 
df=pd.read_csv("https://raw.githubusercontent.com/Drushti2706/Ecommerce-sales-dataset/refs/heads/main/ecommerce_sales_data.csv")

In [0]:
events = spark.createDataFrame(df)

events.show(5)
events.printSchema()


+--------+-----------+------+---+----------------+------------+--------+-------+----------+----------------+---------+------+
|Order ID|Customer ID|Gender|Age|Product Category|Product Name|Quantity|  Price|Order Date|  Payment Method|     City|Rating|
+--------+-----------+------+---+----------------+------------+--------+-------+----------+----------------+---------+------+
| ORD0001|   CUST9376|Female| 43|            Home|        Lamp|       1|1368.69|07-06-2025|Cash on Delivery|Hyderabad|     3|
| ORD0002|   CUST3289|  Male| 57|            Toys|    Lego Set|       5| 782.44|11-12-2024|Cash on Delivery|  Chennai|     5|
| ORD0003|   CUST6409|Female| 53|        Clothing|      Jacket|       1|3676.18|05-05-2025|     Credit Card|Bangalore|     4|
| ORD0004|   CUST8815|Female| 51|          Beauty|     Perfume|       2|4836.37|25-06-2025|Cash on Delivery|   Mumbai|     5|
| ORD0005|   CUST1018|Female| 39|     Electronics|  Smartphone|       4|3580.24|25-12-2024|             UPI|  Kolkata|

Select Columns

In [0]:
events.select(
    "Order ID",
    "Product Category",
    "Product Name",
    "Quantity",
    "Price",
    "City"
).show(10)


+--------+----------------+-------------+--------+-------+---------+
|Order ID|Product Category| Product Name|Quantity|  Price|     City|
+--------+----------------+-------------+--------+-------+---------+
| ORD0001|            Home|         Lamp|       1|1368.69|Hyderabad|
| ORD0002|            Toys|     Lego Set|       5| 782.44|  Chennai|
| ORD0003|        Clothing|       Jacket|       1|3676.18|Bangalore|
| ORD0004|          Beauty|      Perfume|       2|4836.37|   Mumbai|
| ORD0005|     Electronics|   Smartphone|       4|3580.24|  Kolkata|
| ORD0006|           Books|      Fiction|       2|4593.39|   Mumbai|
| ORD0007|          Beauty|      Perfume|       4| 634.38|  Chennai|
| ORD0008|            Toys|Action Figure|       5|1509.52|Bangalore|
| ORD0009|        Clothing|        Jeans|       1|3102.59|  Kolkata|
| ORD0010|        Clothing|      T-Shirt|       1|4501.05|  Chennai|
+--------+----------------+-------------+--------+-------+---------+
only showing top 10 rows


Filter Data

In [0]:
high_value_orders = events.filter(events["Price"] > 3000)
high_value_orders.count()


41

Group By Operations
Orders by Product Category

In [0]:
events.groupBy("Product Category").count().show()


+----------------+-----+
|Product Category|count|
+----------------+-----+
|            Home|   17|
|            Toys|   12|
|        Clothing|   21|
|          Beauty|   13|
|     Electronics|   19|
|           Books|   18|
+----------------+-----+



Total Quantity Sold per Category

In [0]:
category_sales = (
    events.groupBy("Product Category")
    .sum("Quantity")
    .withColumnRenamed("sum(Quantity)", "Total_Quantity")
)

category_sales.show()


+----------------+--------------+
|Product Category|Total_Quantity|
+----------------+--------------+
|            Home|            51|
|            Toys|            36|
|        Clothing|            56|
|          Beauty|            35|
|     Electronics|            58|
|           Books|            58|
+----------------+--------------+



Order By (Sorting)

In [0]:
category_sales.orderBy("Total_Quantity", ascending=False).show()


+----------------+--------------+
|Product Category|Total_Quantity|
+----------------+--------------+
|     Electronics|            58|
|           Books|            58|
|        Clothing|            56|
|            Home|            51|
|            Toys|            36|
|          Beauty|            35|
+----------------+--------------+



Top N Analysis

Top 5 selling product categories:

In [0]:
top_5_categories = category_sales.orderBy(
    "Total_Quantity", ascending=False
).limit(5)

top_5_categories.show()


+----------------+--------------+
|Product Category|Total_Quantity|
+----------------+--------------+
|           Books|            58|
|     Electronics|            58|
|        Clothing|            56|
|            Home|            51|
|            Toys|            36|
+----------------+--------------+



Spark SQL vs DataFrame API

In [0]:
events.createOrReplaceTempView("ecommerce_tbl")


In [0]:
%sql
SELECT City,
       COUNT(*) AS total_orders,
       SUM(Quantity * Price) AS total_sales
FROM ecommerce_tbl
GROUP BY City
ORDER BY total_sales DESC


City,total_orders,total_sales
Kolkata,20,190462.98
Hyderabad,15,130424.17
Mumbai,20,118183.94999999998
Chennai,22,109181.81
Bangalore,14,90184.49
Delhi,9,55782.47


Lazy Evaluation Demo

In [0]:
filtered_df = events.filter(events["Rating"] >= 4)
# No execution yet

filtered_df.show()  # Action triggers Spark job


+--------+-----------+------+---+----------------+-------------+--------+-------+----------+----------------+---------+------+
|Order ID|Customer ID|Gender|Age|Product Category| Product Name|Quantity|  Price|Order Date|  Payment Method|     City|Rating|
+--------+-----------+------+---+----------------+-------------+--------+-------+----------+----------------+---------+------+
| ORD0002|   CUST3289|  Male| 57|            Toys|     Lego Set|       5| 782.44|11-12-2024|Cash on Delivery|  Chennai|     5|
| ORD0003|   CUST6409|Female| 53|        Clothing|       Jacket|       1|3676.18|05-05-2025|     Credit Card|Bangalore|     4|
| ORD0004|   CUST8815|Female| 51|          Beauty|      Perfume|       2|4836.37|25-06-2025|Cash on Delivery|   Mumbai|     5|
| ORD0013|   CUST2018|  Male| 27|          Beauty|     Lipstick|       3|2188.04|02-06-2025|             UPI|   Mumbai|     5|
| ORD0014|   CUST1437|  Male| 58|           Books|       Comics|       5|3434.89|09-11-2024|             UPI|  