# Ex-2130 - Order & Group & Aggregate


In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   6979      0 --:--:-- --:--:-- --:--:--  6979
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs locally under the app name "myApp".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("myApp")
    .getOrCreate()
)

In [3]:
# Read the extracted CSV into a DataFrame: the first row supplies the column
# names and Spark infers each column's data type from the values.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("amazon_sales_data 2025.csv")
)

In [4]:
# Print the DataFrame's schema (column names, inferred types, nullability)
# to verify that the CSV was loaded as expected.
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
# 2a. Order the rows by "Order ID" (ascending) and preview the result.
df_sorted_order = df.sort("Order ID")
df_sorted_order.show()

+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|        Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+---------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0001|14-03-25|  Running Shoes|       Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|
| ORD0002|20-03-25|     Headphones|    Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|
| ORD0003|15-02-25|  Running Shoes|       Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|
| ORD0004|19-02-25|  Running Shoes|       Footwear|   60|       3|        180|Olivia Wilson|           Dallas|   Credit Card|  Pending|
| ORD0005|10-03-25|     Smartwatch|    Electroni

In [6]:

# 2b. Order the rows by "Category" first, then by "Product" within each
# category, and preview the result.
df_sorted_category_product = df.sort("Category", "Product")
df_sorted_category_product.show()

+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|Product|Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0200|10-02-25|   Book|   Books|   15|       2|         30|  Chris White|      Los Angeles|    Debit Card|Completed|
| ORD0113|19-03-25|   Book|   Books|   15|       5|         75|    David Lee|    San Francisco|    Debit Card|  Pending|
| ORD0011|17-02-25|   Book|   Books|   15|       2|         30|    David Lee|           Boston|    Amazon Pay|  Pending|
| ORD0123|23-02-25|   Book|   Books|   15|       3|         45|  Chris White|           Boston|     Gift Card|Cancelled|
| ORD0025|02-03-25|   Book|   Books|   15|       5|         75|Sophia Miller|          Seattle|    Amazon Pay|Completed|
| ORD0124|10-02-25|   Book|   Bo

In [7]:
# 2c. Order the rows by "Customer Location" first, then by "Customer Name"
# within each location, and preview the result.
df_sorted_customer_location = df.sort("Customer Location", "Customer Name")
df_sorted_customer_location.show()

+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|      Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0123|23-02-25|         Book|          Books|   15|       3|         45|  Chris White|           Boston|     Gift Card|Cancelled|
| ORD0242|08-03-25|   Smartphone|    Electronics|  500|       4|       2000|  Chris White|           Boston|     Gift Card|  Pending|
| ORD0075|26-02-25|   Headphones|    Electronics|  100|       2|        200|Daniel Harris|           Boston|        PayPal|  Pending|
| ORD0011|17-02-25|         Book|          Books|   15|       2|         30|    David Lee|           Boston|    Amazon Pay|  Pending|
| ORD0054|15-03-25|   Smartwatch|    Electronics|  150|       

In [8]:
from pyspark.sql.functions import col, count, sum

# 3. Per-category summary: number of orders and summed "Total Sales",
# listed from the highest-grossing category to the lowest.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), which
# shadows Python's builtin `sum` in this notebook.
df_category_sales = (
    df.groupBy("Category")
    .agg(
        count("Order ID").alias("Transaction_Count"),
        sum("Total Sales").alias("Total_Sales"),
    )
    .sort(col("Total_Sales").desc())
)

df_category_sales.show()

+---------------+-----------------+-----------+
|       Category|Transaction_Count|Total_Sales|
+---------------+-----------------+-----------+
|    Electronics|              118|     129950|
|Home Appliances|               40|     105000|
|       Footwear|               27|       4320|
|       Clothing|               40|       3540|
|          Books|               25|       1035|
+---------------+-----------------+-----------+



In [9]:
from pyspark.sql.functions import min, max, avg, sum

# 4. Per-customer summary of "Total Sales": smallest, largest, average and
# summed order value, listed alphabetically by customer name.
# NOTE: `min`, `max` and `sum` here are the pyspark.sql.functions aggregates
# (imported above), which shadow Python's builtins in this notebook.
df_customer_sales = (
    df.groupBy("Customer Name")
    .agg(
        min("Total Sales").alias("Min_Total_Sales"),
        max("Total Sales").alias("Max_Total_Sales"),
        avg("Total Sales").alias("Avg_Total_Sales"),
        sum("Total Sales").alias("Sum_Total_Sales"),
    )
    .sort("Customer Name")
)

df_customer_sales.show()

+-------------+---------------+---------------+------------------+---------------+
|Customer Name|Min_Total_Sales|Max_Total_Sales|   Avg_Total_Sales|Sum_Total_Sales|
+-------------+---------------+---------------+------------------+---------------+
|  Chris White|             20|           3600| 858.4090909090909|          18885|
|Daniel Harris|             20|           3600|  823.695652173913|          18945|
|    David Lee|             15|           6000| 871.7307692307693|          22665|
|Emily Johnson|             15|           4800|1067.0454545454545|          23475|
|   Emma Clark|             15|           3600|           928.125|          29700|
|   Jane Smith|             15|           4800|            1039.5|          31185|
|     John Doe|             15|           3600|1033.4615384615386|          26870|
|Michael Brown|             15|           4000| 943.9583333333334|          22655|
|Olivia Wilson|             15|           6000|1247.2413793103449|          36170|
|Sop

In [10]:
# 5. Filter customers with total sales > 1000
# The cutoff is a named constant so the stated requirement and the predicate
# cannot drift apart (the literal used here previously was 2000, which
# contradicted the "> 1000" requirement above).
HIGH_SPENDER_MIN_TOTAL = 1000
df_high_spenders = df_customer_sales.filter(col("Sum_Total_Sales") > HIGH_SPENDER_MIN_TOTAL)

df_high_spenders.show()

+-------------+---------------+---------------+------------------+---------------+
|Customer Name|Min_Total_Sales|Max_Total_Sales|   Avg_Total_Sales|Sum_Total_Sales|
+-------------+---------------+---------------+------------------+---------------+
|  Chris White|             20|           3600| 858.4090909090909|          18885|
|Daniel Harris|             20|           3600|  823.695652173913|          18945|
|    David Lee|             15|           6000| 871.7307692307693|          22665|
|Emily Johnson|             15|           4800|1067.0454545454545|          23475|
|   Emma Clark|             15|           3600|           928.125|          29700|
|   Jane Smith|             15|           4800|            1039.5|          31185|
|     John Doe|             15|           3600|1033.4615384615386|          26870|
|Michael Brown|             15|           4000| 943.9583333333334|          22655|
|Olivia Wilson|             15|           6000|1247.2413793103449|          36170|
|Sop