# Ex-2050 - Filtering


In [1]:
import requests

! curl -L -o data.zip  https://www.kaggle.com/api/v1/datasets/download/zahidmughal2343/amazon-sales-2025
! unzip data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3617  100  3617    0     0  10588      0 --:--:-- --:--:-- --:--:-- 10588
Archive:  data.zip
  inflating: amazon_sales_data 2025.csv  


In [2]:
from pyspark.sql import SparkSession

# 1. Get (or reuse) a Spark session named "myApp" with master set to local[*]
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("myApp")
    .getOrCreate()
)

In [3]:
# 2. Read the sales CSV: the first row supplies the column names and
#    Spark infers each column's data type from the values
df = spark.read.csv(
    "amazon_sales_data 2025.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# 3. Check the structure of the DataFrame
# printSchema() prints every column with its inferred data type and nullability,
# which confirms what inferSchema produced before any filtering is applied.
df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Total Sales: integer (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Customer Location: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Status: string (nullable = true)



In [5]:
# 4. Apply filters

# a. Keep only the orders dated '23-02-25'
#    (Date was inferred as a string column, so this is a plain string match)
df_date = df.where(
    df["date"] == "23-02-25"
)
df_date.show()

+--------+--------+-------------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|      Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0023|23-02-25|         Book|      Books|   15|       1|         15|   Emma Clark|          Houston|   Credit Card|  Pending|
| ORD0068|23-02-25|   Headphones|Electronics|  100|       1|        100|    David Lee|          Houston|    Debit Card|Cancelled|
| ORD0080|23-02-25|Running Shoes|   Footwear|   60|       4|        240|Sophia Miller|    San Francisco|    Debit Card|  Pending|
| ORD0114|23-02-25|Running Shoes|   Footwear|   60|       1|         60|   Emma Clark|          Houston|   Credit Card|  Pending|
| ORD0123|23-02-25|         Book|      Books|   15|       3|         45|  Chris White|    

In [6]:
# b. Keep only the rows whose product is 'Smartphone'
df_smartphone = df.where(
    df["product"] == "Smartphone"
)
df_smartphone.show()

+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|   Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0008|02-03-25|Smartphone|Electronics|  500|       1|        500|Sophia Miller|            Miami|        PayPal|Completed|
| ORD0010|12-03-25|Smartphone|Electronics|  500|       1|        500|Emily Johnson|    San Francisco|   Credit Card|Cancelled|
| ORD0018|10-02-25|Smartphone|Electronics|  500|       2|       1000|Michael Brown|      Los Angeles|    Amazon Pay|Completed|
| ORD0029|12-02-25|Smartphone|Electronics|  500|       1|        500|Sophia Miller|           Denver|   Credit Card|Cancelled|
| ORD0031|24-03-25|Smartphone|Electronics|  500|       1|        500|     John Doe|          Houston|     Gift 

In [7]:
# c. Keep only the rows that belong to the 'Electronics' category
df_electronics = df.where(
    df["category"] == "Electronics"
)
df_electronics.show()

+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|   Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0002|20-03-25|Headphones|Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|
| ORD0005|10-03-25|Smartwatch|Electronics|  150|       3|        450|   Emma Clark|         New York|    Debit Card|  Pending|
| ORD0007|18-03-25|Smartwatch|Electronics|  150|       4|        600|   Emma Clark|          Houston|        PayPal|Completed|
| ORD0008|02-03-25|Smartphone|Electronics|  500|       1|        500|Sophia Miller|            Miami|        PayPal|Completed|
| ORD0010|12-03-25|Smartphone|Electronics|  500|       1|        500|Emily Johnson|    San Francisco|   Credit 

In [8]:
# d. Keep only the rows whose unit price is strictly below 50
df_price = df.where(
    df["price"] < 50
)
df_price.show()

+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|Product|Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0006|14-03-25|T-Shirt|Clothing|   20|       1|         20|     John Doe|           Dallas|   Credit Card|  Pending|
| ORD0009|08-03-25|T-Shirt|Clothing|   20|       3|         60|Sophia Miller|           Boston|        PayPal|Completed|
| ORD0011|17-02-25|   Book|   Books|   15|       2|         30|    David Lee|           Boston|    Amazon Pay|  Pending|
| ORD0012|13-03-25|  Jeans|Clothing|   40|       4|        160|Michael Brown|           Dallas|   Credit Card|Completed|
| ORD0017|01-04-25|T-Shirt|Clothing|   20|       1|         20|   Emma Clark|         New York|    Amazon Pay|Completed|
| ORD0023|23-02-25|   Book|   Bo

In [9]:
# e. Keep only the orders whose payment method is 'PayPal'
df_paypal = df.where(
    df["Payment Method"] == "PayPal"
)
df_paypal.show()


+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|      Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0007|18-03-25|   Smartwatch|    Electronics|  150|       4|        600|   Emma Clark|          Houston|        PayPal|Completed|
| ORD0008|02-03-25|   Smartphone|    Electronics|  500|       1|        500|Sophia Miller|            Miami|        PayPal|Completed|
| ORD0009|08-03-25|      T-Shirt|       Clothing|   20|       3|         60|Sophia Miller|           Boston|        PayPal|Completed|
| ORD0032|10-03-25|   Smartphone|    Electronics|  500|       4|       2000|Michael Brown|          Seattle|        PayPal|  Pending|
| ORD0036|09-02-25| Refrigerator|Home Appliances| 1200|       

In [10]:
# 5. More complex filtering

# a. Electronics category & price < 120
# NOTE(review): the threshold applied here is 120, which matches the recorded
# output of this cell (Headphones priced at 100). If the exercise intends a
# cut-off of 50 instead, change ELECTRONICS_MAX_PRICE — confirm the intent.
ELECTRONICS_MAX_PRICE = 120  # exclusive upper bound on the unit price

df_elec_price = df.filter(
    (df["category"] == "Electronics") & (df["price"] < ELECTRONICS_MAX_PRICE)
)
df_elec_price.show()


+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|   Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0002|20-03-25|Headphones|Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|
| ORD0020|07-03-25|Headphones|Electronics|  100|       4|        400|Olivia Wilson|          Seattle|    Debit Card|  Pending|
| ORD0021|05-02-25|Headphones|Electronics|  100|       3|        300|  Chris White|            Miami|    Debit Card|Cancelled|
| ORD0028|03-02-25|Headphones|Electronics|  100|       1|        100|   Jane Smith|          Chicago|    Amazon Pay|Completed|
| ORD0037|16-02-25|Headphones|Electronics|  100|       3|        300|Michael Brown|         New York|    Debit 

In [11]:
# b. Orders placed by 'Emma Clark' that were paid by 'Credit Card'
#    (successive where() calls are combined with AND)
df_emma_credit = (
    df
    .where(df["Customer Name"] == "Emma Clark")
    .where(df["Payment Method"] == "Credit Card")
)
df_emma_credit.show()

+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|      Product|       Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------------+---------------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0023|23-02-25|         Book|          Books|   15|       1|         15|   Emma Clark|          Houston|   Credit Card|  Pending|
| ORD0056|19-03-25|   Smartwatch|    Electronics|  150|       2|        300|   Emma Clark|           Dallas|   Credit Card|Completed|
| ORD0089|26-03-25|Running Shoes|       Footwear|   60|       5|        300|   Emma Clark|      Los Angeles|   Credit Card|Cancelled|
| ORD0111|31-03-25|       Laptop|    Electronics|  800|       4|       3200|   Emma Clark|      Los Angeles|   Credit Card|Completed|
| ORD0114|23-02-25|Running Shoes|       Footwear|   60|       

In [12]:
# c. Orders placed by 'Emma Clark', paid by 'Credit Card', with a price below 50
#    (each where() narrows the previous result, i.e. the conditions are ANDed)
df_emma_credit_price = (
    df
    .where(df["Customer Name"] == "Emma Clark")
    .where(df["Payment Method"] == "Credit Card")
    .where(df["price"] < 50)
)
df_emma_credit_price.show()


+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+-------+
|Order ID|    Date|Product|Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method| Status|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+-------+
| ORD0023|23-02-25|   Book|   Books|   15|       1|         15|   Emma Clark|          Houston|   Credit Card|Pending|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+-------+



In [13]:
# d. Orders placed by 'Emma Clark', paid by either 'Credit Card' or 'Debit Card',
#    with a price below 50
df_emma_credit_debit_price = (
    df
    .where(df["Customer Name"] == "Emma Clark")
    # isin() matches when the payment method equals any of the listed values
    .where(df["Payment Method"].isin("Credit Card", "Debit Card"))
    .where(df["price"] < 50)
)
df_emma_credit_debit_price.show()

+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|Product|Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0023|23-02-25|   Book|   Books|   15|       1|         15|   Emma Clark|          Houston|   Credit Card|  Pending|
| ORD0127|18-02-25|T-Shirt|Clothing|   20|       3|         60|   Emma Clark|           Dallas|    Debit Card|Completed|
| ORD0212|09-03-25|T-Shirt|Clothing|   20|       4|         80|   Emma Clark|          Houston|    Debit Card|Completed|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+



In [14]:
# e. Orders placed by 'Emma Clark' or 'Jane Smith', paid by either 'Credit Card'
#    or 'Debit Card', with a price below 50
df_emma_jane_credit_debit_price = (
    df
    # isin() expresses each "one of these values" alternative
    .where(df["Customer Name"].isin("Emma Clark", "Jane Smith"))
    .where(df["Payment Method"].isin("Credit Card", "Debit Card"))
    .where(df["price"] < 50)
)
df_emma_jane_credit_debit_price.show()

+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|Product|Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+-------+--------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0023|23-02-25|   Book|   Books|   15|       1|         15|   Emma Clark|          Houston|   Credit Card|  Pending|
| ORD0034|02-04-25|T-Shirt|Clothing|   20|       5|        100|   Jane Smith|         New York|   Credit Card|  Pending|
| ORD0041|20-02-25|   Book|   Books|   15|       1|         15|   Jane Smith|            Miami|   Credit Card|Cancelled|
| ORD0043|08-02-25|T-Shirt|Clothing|   20|       4|         80|   Jane Smith|           Denver|   Credit Card|  Pending|
| ORD0060|12-03-25|   Book|   Books|   15|       5|         75|   Jane Smith|           Dallas|   Credit Card|  Pending|
| ORD0104|22-02-25|  Jeans|Cloth

In [15]:
# f. Orders whose category is one of 'Electronics', 'Clothing' or 'Books'
df_selected_categories = df.where(
    df["category"].isin("Electronics", "Clothing", "Books")
)
df_selected_categories.show()

+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
|Order ID|    Date|   Product|   Category|Price|Quantity|Total Sales|Customer Name|Customer Location|Payment Method|   Status|
+--------+--------+----------+-----------+-----+--------+-----------+-------------+-----------------+--------------+---------+
| ORD0002|20-03-25|Headphones|Electronics|  100|       4|        400|Emily Johnson|    San Francisco|    Debit Card|  Pending|
| ORD0005|10-03-25|Smartwatch|Electronics|  150|       3|        450|   Emma Clark|         New York|    Debit Card|  Pending|
| ORD0006|14-03-25|   T-Shirt|   Clothing|   20|       1|         20|     John Doe|           Dallas|   Credit Card|  Pending|
| ORD0007|18-03-25|Smartwatch|Electronics|  150|       4|        600|   Emma Clark|          Houston|        PayPal|Completed|
| ORD0008|02-03-25|Smartphone|Electronics|  500|       1|        500|Sophia Miller|            Miami|        Pa

In [16]:
# 6. Build the list of unique (customer name, customer location) pairs:
#    project the two columns, then drop the repeated rows
df_customers = (
    df
    .select(["Customer Name", "Customer Location"])
    .dropDuplicates()
)
df_customers.show()

+-------------+-----------------+
|Customer Name|Customer Location|
+-------------+-----------------+
|     John Doe|           Dallas|
|  Chris White|           Boston|
|Olivia Wilson|           Denver|
|Olivia Wilson|          Seattle|
|Sophia Miller|           Boston|
|   Jane Smith|          Seattle|
|Daniel Harris|    San Francisco|
|  Chris White|         New York|
|Michael Brown|           Boston|
|Olivia Wilson|           Boston|
|Olivia Wilson|           Dallas|
|   Emma Clark|          Chicago|
|Sophia Miller|           Denver|
|   Jane Smith|           Boston|
|  Chris White|           Denver|
|     John Doe|          Houston|
|     John Doe|      Los Angeles|
|  Chris White|      Los Angeles|
|Michael Brown|      Los Angeles|
|    David Lee|           Dallas|
+-------------+-----------------+
only showing top 20 rows

