# Filter & Where

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "myApp" with a local[*] master.
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
import requests

# Remote CSV to fetch and the local path it is saved to.
url = "https://raw.githubusercontent.com/Enjia/Nutrition-Facts-for-McDonald-s-Menu/refs/heads/master/menu.csv"
local_file = "McDonalds_Menu.csv"

# Fail fast: a timeout prevents the cell from hanging forever, and
# raise_for_status() stops an HTTP error page (e.g. a 404 body) from being
# silently written to disk as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()

with open(local_file, "wb") as csv_file:
    csv_file.write(response.content)


In [3]:
# Load the downloaded CSV: the first row is the header, and column types are
# inferred from the data rather than all being read as strings.
menu_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(local_file)
)

menu_df.printSchema()
menu_df.show(n=5)

root
 |-- Category: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Serving Size: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- Calories from Fat: integer (nullable = true)
 |-- Total Fat: double (nullable = true)
 |-- Total Fat (% Daily Value): integer (nullable = true)
 |-- Saturated Fat: double (nullable = true)
 |-- Saturated Fat (% Daily Value): integer (nullable = true)
 |-- Trans Fat: double (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Cholesterol (% Daily Value): integer (nullable = true)
 |-- Sodium: integer (nullable = true)
 |-- Sodium (% Daily Value): integer (nullable = true)
 |-- Carbohydrates: integer (nullable = true)
 |-- Carbohydrates (% Daily Value): integer (nullable = true)
 |-- Dietary Fiber: integer (nullable = true)
 |-- Dietary Fiber (% Daily Value): integer (nullable = true)
 |-- Sugars: integer (nullable = true)
 |-- Protein: integer (nullable = true)
 |-- Vitamin A (% Daily Value): integer (nul

## Filtering

In [4]:
# Predicate built as a Column expression (bracket access on the DataFrame).
over_1000_calories = menu_df['Calories'] > 1000

(
    menu_df
    .filter(over_1000_calories)
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+--------------+--------------------+---------------+--------+---------+-------------+-------+
|      Category|                Item|   Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+--------------+--------------------+---------------+--------+---------+-------------+-------+
|     Breakfast|Big Breakfast wit...|14.8 oz (420 g)|    1090|     56.0|          111|     36|
|     Breakfast|Big Breakfast wit...|15.3 oz (434 g)|    1150|     60.0|          116|     36|
|     Breakfast|Big Breakfast wit...|15.4 oz (437 g)|    1050|     50.0|          115|     35|
|Chicken & Fish|Chicken McNuggets...|22.8 oz (646 g)|    1880|    118.0|          118|     87|
+--------------+--------------------+---------------+--------+---------+-------------+-------+



In [5]:
# filter() also accepts a SQL expression string instead of a Column.
(
    menu_df
    .filter("Calories < 100")
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+--------------+--------------------+----------------+--------+---------+-------------+-------+
|      Category|                Item|    Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+--------------+--------------------+----------------+--------+---------+-------------+-------+
|Snacks & Sides|          Side Salad|   3.1 oz (87 g)|      20|      0.0|            4|      1|
|Snacks & Sides|        Apple Slices|   1.2 oz (34 g)|      15|      0.0|            4|      0|
|      Desserts| Kids Ice Cream Cone|     1 oz (29 g)|      45|      1.5|            7|      1|
|     Beverages|   Diet Coke (Small)|    16 fl oz cup|       0|      0.0|            0|      0|
|     Beverages|  Diet Coke (Medium)|    21 fl oz cup|       0|      0.0|            0|      0|
|     Beverages|   Diet Coke (Large)|    30 fl oz cup|       0|      0.0|            0|      0|
|     Beverages|   Diet Coke (Child)|    12 fl oz cup|       0|      0.0|            0|      0|
|     Beverages|Diet Dr Pepper (S...|   

In [6]:
# Combine two Column predicates with `&` (logical AND). Each comparison must
# be parenthesized because `&` binds tighter than `<` and `!=` in Python.
calories = menu_df['Calories']
under_100_but_not_zero = (calories < 100) & (calories != 0)

(
    menu_df
    .filter(under_100_but_not_zero)
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+--------------+--------------------+----------------+--------+---------+-------------+-------+
|      Category|                Item|    Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+--------------+--------------------+----------------+--------+---------+-------------+-------+
|Snacks & Sides|          Side Salad|   3.1 oz (87 g)|      20|      0.0|            4|      1|
|Snacks & Sides|        Apple Slices|   1.2 oz (34 g)|      15|      0.0|            4|      0|
|      Desserts| Kids Ice Cream Cone|     1 oz (29 g)|      45|      1.5|            7|      1|
|     Beverages|Minute Maid 100% ...|6 fl oz (177 ml)|      80|      0.0|           21|      0|
|  Coffee & Tea|Iced Coffee with ...|    16 fl oz cup|      80|      4.5|            9|      1|
+--------------+--------------------+----------------+--------+---------+-------------+-------+



In [7]:
# where() used with attribute-style column access; `|` is logical OR
# between two Column predicates.
is_20_calories = menu_df.Calories == 20
is_15_calories = menu_df.Calories == 15

(
    menu_df
    .where(is_20_calories | is_15_calories)
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+--------------+------------+-------------+--------+---------+-------------+-------+
|      Category|        Item| Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+--------------+------------+-------------+--------+---------+-------------+-------+
|Snacks & Sides|  Side Salad|3.1 oz (87 g)|      20|      0.0|            4|      1|
|Snacks & Sides|Apple Slices|1.2 oz (34 g)|      15|      0.0|            4|      0|
+--------------+------------+-------------+--------+---------+-------------+-------+



Operators: `<`, `>`, `<=`, `>=`, `==`, `!=`, `like`, `rlike`, `between`, `isin`, `isNull`, `isNotNull` (in SQL expression strings: `IN`, `IS NULL`, `IS NOT NULL`)

In [8]:
from pyspark.sql.functions import col, lower

In [9]:
# Case-insensitive substring match: lower-case the item name first, then
# apply a SQL LIKE pattern to it.
item_name_lower = lower(col('Item'))

(
    menu_df
    .filter(item_name_lower.like('%ice%'))
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+--------------+--------------------+----------------+--------+---------+-------------+-------+
|      Category|                Item|    Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+--------------+--------------------+----------------+--------+---------+-------------+-------+
|Snacks & Sides|        Apple Slices|   1.2 oz (34 g)|      15|      0.0|            4|      0|
|      Desserts| Kids Ice Cream Cone|     1 oz (29 g)|      45|      1.5|            7|      1|
|     Beverages|Minute Maid 100% ...|6 fl oz (177 ml)|      80|      0.0|           21|      0|
|     Beverages|Minute Maid Orang...|    12 fl oz cup|     150|      0.0|           34|      2|
|     Beverages|Minute Maid Orang...|    16 fl oz cup|     190|      0.0|           44|      3|
|     Beverages|Minute Maid Orang...|    22 fl oz cup|     280|      0.0|           65|      4|
|  Coffee & Tea|    Iced Tea (Small)|    16 fl oz cup|       0|      0.0|            0|      0|
|  Coffee & Tea|   Iced Tea (Medium)|   

In [10]:
# List each menu category once, in alphabetical order.
(
    menu_df
    .select('Category')
    .distinct()
    .orderBy('Category')
    .show()
)

+------------------+
|          Category|
+------------------+
|       Beef & Pork|
|         Beverages|
|         Breakfast|
|    Chicken & Fish|
|      Coffee & Tea|
|          Desserts|
|            Salads|
|Smoothies & Shakes|
|    Snacks & Sides|
+------------------+



In [11]:
# Keep rows whose Category is one of the listed values.
wanted_categories = ['Coffee & Tea', 'Desserts']

(
    menu_df
    .where(menu_df.Category.isin(wanted_categories))
    .select('Category', 'Item', 'Serving Size', 'Calories', 'Total Fat', 'Carbohydrates', 'Protein')
    .show()
)

+------------+--------------------+---------------+--------+---------+-------------+-------+
|    Category|                Item|   Serving Size|Calories|Total Fat|Carbohydrates|Protein|
+------------+--------------------+---------------+--------+---------+-------------+-------+
|    Desserts|     Baked Apple Pie|  2.7 oz (77 g)|     250|     13.0|           32|      2|
|    Desserts|Chocolate Chip Co...|1 cookie (33 g)|     160|      8.0|           21|      2|
|    Desserts|Oatmeal Raisin Co...|1 cookie (33 g)|     150|      6.0|           22|      2|
|    Desserts| Kids Ice Cream Cone|    1 oz (29 g)|      45|      1.5|            7|      1|
|    Desserts|    Hot Fudge Sundae| 6.3 oz (179 g)|     330|      9.0|           53|      8|
|    Desserts|  Hot Caramel Sundae| 6.4 oz (182 g)|     340|      8.0|           60|      7|
|    Desserts|   Strawberry Sundae| 6.3 oz (178 g)|     280|      6.0|           49|      6|
|Coffee & Tea|    Iced Tea (Small)|   16 fl oz cup|       0|      0.0|

## Array columns

array_contains, array_except, array_intersect, array_union

In [12]:
from pyspark.sql.functions import array_contains

# Small in-memory dataset: each row is an ID plus a list of fruit names,
# which Spark stores as an array column.
columns = ["ID", "Fruits"]
data = [
    (1, ["apple", "banana"]),
    (2, ["grape", "peach"]),
    (3, ["banana", "melon"]),
    (4, ["apple", "cherry", "kiwi"]),
]

df = spark.createDataFrame(data, schema=columns)


In [13]:
# Display the rows; with truncation on (the default) long cell values are cut off with "...".
df.show(truncate=True)


+---+--------------------+
| ID|              Fruits|
+---+--------------------+
|  1|     [apple, banana]|
|  2|      [grape, peach]|
|  3|     [banana, melon]|
|  4|[apple, cherry, k...|
+---+--------------------+



In [14]:
# Keep rows whose Fruits array contains the value "banana".
has_banana = array_contains(col("Fruits"), "banana")
filtered_df = df.filter(has_banana)

filtered_df.show()

+---+---------------+
| ID|         Fruits|
+---+---------------+
|  1|[apple, banana]|
|  3|[banana, melon]|
+---+---------------+



## Nested object

In [15]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Inner struct: the fields of the nested `person` column.
person_type = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

# Outer schema: an integer id plus the nested person struct.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("person", person_type, True),
])

# Each row is (id, (name, age)); the inner tuple maps onto the person struct.
data = [
    (1, ("John", 28)),
    (2, ("Anna", 22)),
    (3, ("Mike", 35)),
]

df = spark.createDataFrame(data, schema=schema)
df.show()

+---+----------+
| id|    person|
+---+----------+
|  1|{John, 28}|
|  2|{Anna, 22}|
|  3|{Mike, 35}|
+---+----------+



In [16]:
# Dot notation inside col() reaches into the struct: person.age is the
# `age` field of the `person` column.
older_than_25 = col("person.age") > 25

filtered_df = df.filter(older_than_25)
filtered_df.show()

+---+----------+
| id|    person|
+---+----------+
|  1|{John, 28}|
|  3|{Mike, 35}|
+---+----------+



Deprecated: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.getField.html


In [17]:
# NOTE: `getField` is a Column method; it cannot be imported from pyspark.sql.functions.

# filtered_df = df.filter(col("person").getField("age") > 25)
# filtered_df.show()

## Common error

In [18]:
# Re-read the same CSV with schema inference turned off: every column,
# including the numeric ones, is then loaded as a string.
menu_df2 = (
    spark.read
    .options(header=True, inferSchema=False)
    .csv(local_file)
)

menu_df2.printSchema()
menu_df2.show(n=5)

root
 |-- Category: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Serving Size: string (nullable = true)
 |-- Calories: string (nullable = true)
 |-- Calories from Fat: string (nullable = true)
 |-- Total Fat: string (nullable = true)
 |-- Total Fat (% Daily Value): string (nullable = true)
 |-- Saturated Fat: string (nullable = true)
 |-- Saturated Fat (% Daily Value): string (nullable = true)
 |-- Trans Fat: string (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- Cholesterol (% Daily Value): string (nullable = true)
 |-- Sodium: string (nullable = true)
 |-- Sodium (% Daily Value): string (nullable = true)
 |-- Carbohydrates: string (nullable = true)
 |-- Carbohydrates (% Daily Value): string (nullable = true)
 |-- Dietary Fiber: string (nullable = true)
 |-- Dietary Fiber (% Daily Value): string (nullable = true)
 |-- Sugars: string (nullable = true)
 |-- Protein: string (nullable = true)
 |-- Vitamin A (% Daily Value): string (nullable = true)
 

In [19]:
# Sugars is a string column in menu_df2, so this descending sort compares
# text rather than numbers (e.g. "9" sorts ahead of "89").
(
    menu_df2
    .orderBy(menu_df2["Sugars"].desc())
    .select("Category", "Item", "Sugars")
    .limit(10)
    .show()
)

+------------------+--------------------+------+
|          Category|                Item|Sugars|
+------------------+--------------------+------+
|      Coffee & Tea|Frappé Chocolate ...|    99|
|Smoothies & Shakes|Chocolate Shake (...|    97|
|Smoothies & Shakes|Shamrock Shake (M...|    93|
|       Beef & Pork|             Big Mac|     9|
|       Beef & Pork|Quarter Pounder D...|     9|
|    Chicken & Fish|Premium Grilled C...|     9|
|    Chicken & Fish|Premium Grilled C...|     9|
|Smoothies & Shakes|McFlurry with M&M...|    89|
|      Coffee & Tea|Frappé Mocha (Large)|    88|
|      Coffee & Tea|Frappé Caramel (L...|    88|
+------------------+--------------------+------+



In [20]:
# Same query against menu_df, where Sugars was inferred as an integer, so
# the descending sort is numeric.
(
    menu_df
    .orderBy(menu_df["Sugars"].desc())
    .select("Category", "Item", "Sugars")
    .limit(10)
    .show()
)

+------------------+--------------------+------+
|          Category|                Item|Sugars|
+------------------+--------------------+------+
|Smoothies & Shakes|McFlurry with M&M...|   128|
|Smoothies & Shakes|Strawberry Shake ...|   123|
|Smoothies & Shakes|Chocolate Shake (...|   120|
|Smoothies & Shakes|Shamrock Shake (L...|   115|
|Smoothies & Shakes|McFlurry with Ree...|   103|
|Smoothies & Shakes|Vanilla Shake (La...|   101|
|Smoothies & Shakes|Strawberry Shake ...|   100|
|      Coffee & Tea|Frappé Chocolate ...|    99|
|Smoothies & Shakes|Chocolate Shake (...|    97|
|Smoothies & Shakes|Shamrock Shake (M...|    93|
+------------------+--------------------+------+

