# Ex-2140 - top and distinct

In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0   2715      0  0:00:01  0:00:01 --:--:--  4100
Archive:  data.zip
replace amazon_sales_data 2025.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one named "myApp" running in local mode.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# 2. Read the sales CSV: the first row supplies the column names and
#    Spark samples the data to pick a type for each column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("amazon_sales_data 2025.csv")
)

In [4]:
# 3. Print each column's name, inferred type and nullability to verify the load
#    (note: with this reader configuration "Date" comes back as a plain string).
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
from pyspark.sql.functions import col

# Sum "Total Sales" per category and keep the three categories with the largest totals.
# The aggregated column is auto-named "sum(Total Sales)".
top_categories = (
    df.groupBy("Category")
    .agg({"Total Sales": "sum"})
    .orderBy("sum(Total Sales)", ascending=False)
    .limit(3)
)

top_categories.show()

+---------------+----------------+
|       Category|sum(Total Sales)|
+---------------+----------------+
|    Electronics|          129950|
|Home Appliances|          105000|
|       Footwear|            4320|
+---------------+----------------+



In [6]:
# Sum "Total Sales" per customer and keep the three customers with the largest totals.
top_customers = (
    df.groupBy("Customer Name")
    .sum("Total Sales")
    .orderBy("sum(Total Sales)", ascending=False)
    .limit(3)
)

top_customers.show()

+-------------+----------------+
|Customer Name|sum(Total Sales)|
+-------------+----------------+
|Olivia Wilson|           36170|
|   Jane Smith|           31185|
|   Emma Clark|           29700|
+-------------+----------------+



In [7]:
# Peek at the first three raw rows to see the value formats (e.g. how "Date" is written).
df.show(n=3)

+--------+--------+-------------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|      Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0001|14-03-25|Running Shoes|   Footwear|   60|       3|        180|   Emma Clark|         New York|    Debit Card|Cancelled|
| ORD0002|20-03-25|   Headphones|Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|
| ORD0003|15-02-25|Running Shoes|   Footwear|   60|       2|        120|     John Doe|           Denver|    Amazon Pay|Cancelled|
+--------+--------+-------------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
only showing top 3 rows



In [8]:
from pyspark.sql.functions import to_date

# "Date" was loaded as a string in day-month-two-digit-year form (e.g. 14-03-25);
# replace it with a real date column so that sorting is chronological.
df_date = df.withColumn(
    "Date",
    to_date(col("Date"), "dd-MM-yy"),
)

# Ten rows with the latest dates, newest first.
recent_transactions = df_date.orderBy("Date", ascending=False).limit(10)
recent_transactions.show()

+--------+----------+------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|      Date|     Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+----------+------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0034|2025-04-02|     T-Shirt|       Clothing|   20|       5|        100|   Jane Smith|         New York|   Credit Card|  Pending|
| ORD0035|2025-04-02|      Laptop|    Electronics|  800|       3|       2400|   Emma Clark|           Denver|    Amazon Pay|Completed|
| ORD0182|2025-04-02|     T-Shirt|       Clothing|   20|       5|        100|   Emma Clark|           Denver|        PayPal|Completed|
| ORD0227|2025-04-02|  Headphones|    Electronics|  100|       5|        500|   Emma Clark|            Miami|    Amazon Pay|Cancelled|
| ORD0017|2025-04-01|     T-Shirt|       Clothing|   20

In [9]:
# top_categories is sorted by total sales descending, so its first row is the best category.
best_category = top_categories.first()["Category"]

# Within that category, total the sales per product and keep the three smallest totals.
worst_products = (
    df.where(col("Category") == best_category)
    .groupBy("Product")
    .agg({"Total Sales": "sum"})
    .orderBy("sum(Total Sales)")
    .limit(3)
)

worst_products.show()

+----------+----------------+
|   Product|sum(Total Sales)|
+----------+----------------+
|Headphones|            7300|
|Smartwatch|           15750|
|Smartphone|           48500|
+----------+----------------+

