# Ex-2150 - window function

In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   5039      0 --:--:-- --:--:-- --:--:--  5039
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running in local mode
# with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Read the extracted CSV: the first row provides the column names and Spark
#    infers each column's data type from the file contents.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("amazon_sales_data 2025.csv")
)

In [4]:
# 3. Print the column names and the types chosen by inferSchema, to confirm the
#    load produced the expected structure before transforming anything.
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
from pyspark.sql.functions import col, to_date, sum, row_number, desc
from pyspark.sql.window import Window

# The CSV load left "Date" as a string column; parse it with the
# day-month-two-digit-year pattern so it becomes a real date.
# NOTE(review): this reassigns `df`, so running the cell a second time would
# re-parse an already-converted column -- restart the kernel before re-running.
df = df.withColumn(
    "Date",
    to_date(col("Date"), "dd-MM-yy"),
)



In [6]:
# Number the rows within each Category, starting from the highest Price.
# The loaded data has no "unit_price" column (the schema lists "Price"), so
# ordering by that name would fail as soon as the expression was applied to df.
window_spec = Window.partitionBy("Category").orderBy(desc("Price"))
row_num_func = row_number().over(window_spec)

In [7]:
# Running total of "Total Sales" in date order.
# Only an ordering is given, so Spark applies its default frame (RANGE from
# the start of the data up to the current row): every row sharing a Date gets
# the total through the end of that day, which is why the values repeat for
# same-day rows in the output.
date_ordered = Window.orderBy("Date")
cumulative_sales = sum("Total Sales").over(date_ordered)

df_orders_by_date = df.withColumn("Cumulative Total Sales", cumulative_sales)

(
    df_orders_by_date
    .select("Date", "Total Sales", "Cumulative Total Sales")
    .show()
)

+----------+-----------+----------------------+
|      Date|Total Sales|Cumulative Total Sales|
+----------+-----------+----------------------+
|2025-02-02|       3600|                  3600|
|2025-02-03|        100|                  6960|
|2025-02-03|       3200|                  6960|
|2025-02-03|         60|                  6960|
|2025-02-04|         15|                 13775|
|2025-02-04|       6000|                 13775|
|2025-02-04|        800|                 13775|
|2025-02-05|        300|                 19175|
|2025-02-05|        300|                 19175|
|2025-02-05|       4800|                 19175|
|2025-02-06|       2500|                 30575|
|2025-02-06|       2400|                 30575|
|2025-02-06|        400|                 30575|
|2025-02-06|       1800|                 30575|
|2025-02-06|       1500|                 30575|
|2025-02-06|        800|                 30575|
|2025-02-06|       2000|                 30575|
|2025-02-07|         20|                

In [8]:
# ranking by category
from pyspark.sql.functions import sum

# Add up "Total Sales" within each Category.
# `sum` here is pyspark.sql.functions.sum (imported above), not the built-in.
category_total = sum("Total Sales").alias("Total Sales per Category")
df_category = df.groupBy("Category").agg(category_total)

# Display the per-category totals
df_category.show()


+---------------+------------------------+
|       Category|Total Sales per Category|
+---------------+------------------------+
|    Electronics|                  129950|
|       Clothing|                    3540|
|       Footwear|                    4320|
|          Books|                    1035|
|Home Appliances|                  105000|
+---------------+------------------------+



In [9]:
from pyspark.sql.functions import rank

# Rank the categories from the largest total to the smallest. There is no
# partitionBy, so the ranking spans the whole aggregated DataFrame.
by_category_total = Window.orderBy(desc("Total Sales per Category"))
category_rank = rank().over(by_category_total)

df_category.withColumn("Ranking", category_rank).show()

+---------------+------------------------+-------+
|       Category|Total Sales per Category|Ranking|
+---------------+------------------------+-------+
|    Electronics|                  129950|      1|
|Home Appliances|                  105000|      2|
|       Footwear|                    4320|      3|
|       Clothing|                    3540|      4|
|          Books|                    1035|      5|
+---------------+------------------------+-------+



In [10]:
# ranking by customer
from pyspark.sql.functions import sum

# Add up "Total Sales" for each "Customer Name".
# `sum` here is pyspark.sql.functions.sum (imported above), not the built-in.
customer_total = sum("Total Sales").alias("Total Sales per Customer")
df_customer = df.groupBy("Customer Name").agg(customer_total)

# Display the per-customer totals
df_customer.show()

+-------------+------------------------+
|Customer Name|Total Sales per Customer|
+-------------+------------------------+
|  Chris White|                   18885|
|Emily Johnson|                   23475|
|Sophia Miller|                   13295|
|Olivia Wilson|                   36170|
|   Emma Clark|                   29700|
|Michael Brown|                   22655|
|     John Doe|                   26870|
|    David Lee|                   22665|
|   Jane Smith|                   31185|
|Daniel Harris|                   18945|
+-------------+------------------------+



In [11]:
from pyspark.sql.functions import rank

# Rank the customers from the largest total to the smallest. There is no
# partitionBy, so the ranking spans the whole aggregated DataFrame.
by_customer_total = Window.orderBy(desc("Total Sales per Customer"))
customer_rank = rank().over(by_customer_total)

df_customer.withColumn("Ranking", customer_rank).show()

+-------------+------------------------+-------+
|Customer Name|Total Sales per Customer|Ranking|
+-------------+------------------------+-------+
|Olivia Wilson|                   36170|      1|
|   Jane Smith|                   31185|      2|
|   Emma Clark|                   29700|      3|
|     John Doe|                   26870|      4|
|Emily Johnson|                   23475|      5|
|    David Lee|                   22665|      6|
|Michael Brown|                   22655|      7|
|Daniel Harris|                   18945|      8|
|  Chris White|                   18885|      9|
|Sophia Miller|                   13295|     10|
+-------------+------------------------+-------+



In [12]:
# ranking by payment method
from pyspark.sql.functions import sum

# Add up "Total Sales" for each "Payment Method".
# `sum` here is pyspark.sql.functions.sum (imported above), not the built-in.
df_pay_meth = (
    df.groupBy("Payment Method")
    .agg(sum("Total Sales").alias("Total Sales per Payment Method"))
)

# Display the per-payment-method totals
df_pay_meth.show()

+--------------+------------------------------+
|Payment Method|Total Sales per Payment Method|
+--------------+------------------------------+
|    Amazon Pay|                         32750|
|   Credit Card|                         61595|
|        PayPal|                         69645|
|     Gift Card|                         47955|
|    Debit Card|                         31900|
+--------------+------------------------------+



In [13]:
from pyspark.sql.functions import rank

# Rank the payment methods from the largest total to the smallest. There is no
# partitionBy, so the ranking spans the whole aggregated DataFrame.
window_spec = Window.orderBy(
    desc("Total Sales per Payment Method")
)
row_num_func = rank().over(window_spec)

(
    df_pay_meth
    .withColumn("Ranking", row_num_func)
    .show()
)

+--------------+------------------------------+-------+
|Payment Method|Total Sales per Payment Method|Ranking|
+--------------+------------------------------+-------+
|        PayPal|                         69645|      1|
|   Credit Card|                         61595|      2|
|     Gift Card|                         47955|      3|
|    Amazon Pay|                         32750|      4|
|    Debit Card|                         31900|      5|
+--------------+------------------------------+-------+

