# Adding & removing columns

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("myApp").master("local[*]").getOrCreate()

In [2]:
import requests

# Remote CSV to fetch and the local path it is saved under; `local_file`
# is what the later spark.read cell loads.
url = "https://raw.githubusercontent.com/Enjia/Nutrition-Facts-for-McDonald-s-Menu/refs/heads/master/menu.csv"
local_file = "McDonalds_Menu.csv"

# A timeout keeps the cell from blocking forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of silently saving an error page
# as if it were the CSV (which would only surface later as a confusing
# schema/parse problem in Spark).
response.raise_for_status()

# Binary mode: write the payload bytes exactly as received.
with open(local_file, "wb") as file:
    file.write(response.content)


In [3]:
# Load the downloaded CSV. The first row supplies the column names and
# Spark infers each column's type from the data.
menu_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(local_file)
)

# Print the inferred schema, then preview the first five rows.
menu_df.printSchema()
menu_df.show(5)

root
 |-- Category: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Serving Size: string (nullable = true)
 |-- Calories: integer (nullable = true)
 |-- Calories from Fat: integer (nullable = true)
 |-- Total Fat: double (nullable = true)
 |-- Total Fat (% Daily Value): integer (nullable = true)
 |-- Saturated Fat: double (nullable = true)
 |-- Saturated Fat (% Daily Value): integer (nullable = true)
 |-- Trans Fat: double (nullable = true)
 |-- Cholesterol: integer (nullable = true)
 |-- Cholesterol (% Daily Value): integer (nullable = true)
 |-- Sodium: integer (nullable = true)
 |-- Sodium (% Daily Value): integer (nullable = true)
 |-- Carbohydrates: integer (nullable = true)
 |-- Carbohydrates (% Daily Value): integer (nullable = true)
 |-- Dietary Fiber: integer (nullable = true)
 |-- Dietary Fiber (% Daily Value): integer (nullable = true)
 |-- Sugars: integer (nullable = true)
 |-- Protein: integer (nullable = true)
 |-- Vitamin A (% Daily Value): integer (nul

new column

In [4]:
from pyspark.sql.functions import col

# Keep three columns, retain only items with more than 100 in "Sugars",
# and append "MinSugar" as half of the "Sugars" value.
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .withColumn("MinSugar", col("Sugars") / 2)
    .show()
)

+--------------------+--------+------+--------+
|                Item|Calories|Sugars|MinSugar|
+--------------------+--------+------+--------+
|Vanilla Shake (La...|     820|   101|    50.5|
|Strawberry Shake ...|     850|   123|    61.5|
|Chocolate Shake (...|     850|   120|    60.0|
|Shamrock Shake (L...|     820|   115|    57.5|
|McFlurry with M&M...|     930|   128|    64.0|
|McFlurry with Ree...|     810|   103|    51.5|
+--------------------+--------+------+--------+



drop

In [5]:
# "Sugars" is used for the filter first and removed afterwards, so the
# displayed result contains only "Item" and "Calories".
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .drop("Sugars")
    .show()
)

+--------------------+--------+
|                Item|Calories|
+--------------------+--------+
|Vanilla Shake (La...|     820|
|Strawberry Shake ...|     850|
|Chocolate Shake (...|     850|
|Shamrock Shake (L...|     820|
|McFlurry with M&M...|     930|
|McFlurry with Ree...|     810|
+--------------------+--------+



rename

In [6]:
# Derive "MinSugar" from "Sugars" while that name still exists, and only
# then rename "Sugars" to "OrigSugars".
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .withColumn("MinSugar", col("Sugars") / 2)
    .withColumnRenamed("Sugars", "OrigSugars")
    .show()
)

+--------------------+--------+----------+--------+
|                Item|Calories|OrigSugars|MinSugar|
+--------------------+--------+----------+--------+
|Vanilla Shake (La...|     820|       101|    50.5|
|Strawberry Shake ...|     850|       123|    61.5|
|Chocolate Shake (...|     850|       120|    60.0|
|Shamrock Shake (L...|     820|       115|    57.5|
|McFlurry with M&M...|     930|       128|    64.0|
|McFlurry with Ree...|     810|       103|    51.5|
+--------------------+--------+----------+--------+



multiple new columns

In [7]:
# Two derived columns added one after the other with chained withColumn
# calls: half and a third of "Sugars".
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .withColumn("MinSugar2", col("Sugars") / 2)
    .withColumn("MinSugar3", col("Sugars") / 3)
    .show()
)

+--------------------+--------+------+---------+------------------+
|                Item|Calories|Sugars|MinSugar2|         MinSugar3|
+--------------------+--------+------+---------+------------------+
|Vanilla Shake (La...|     820|   101|     50.5|33.666666666666664|
|Strawberry Shake ...|     850|   123|     61.5|              41.0|
|Chocolate Shake (...|     850|   120|     60.0|              40.0|
|Shamrock Shake (L...|     820|   115|     57.5|38.333333333333336|
|McFlurry with M&M...|     930|   128|     64.0|42.666666666666664|
|McFlurry with Ree...|     810|   103|     51.5|34.333333333333336|
+--------------------+--------+------+---------+------------------+



constant value

In [9]:
# selectExpr takes SQL expression strings: the existing columns are listed
# by name, "MinSugars" is computed, and "Authorized" is a constant False.
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .selectExpr(
        "Item",
        "Calories",
        "Sugars",
        "Sugars/2 as MinSugars",
        "False as Authorized",
    )
    .show()
)

+--------------------+--------+------+---------+----------+
|                Item|Calories|Sugars|MinSugars|Authorized|
+--------------------+--------+------+---------+----------+
|Vanilla Shake (La...|     820|   101|     50.5|     false|
|Strawberry Shake ...|     850|   123|     61.5|     false|
|Chocolate Shake (...|     850|   120|     60.0|     false|
|Shamrock Shake (L...|     820|   115|     57.5|     false|
|McFlurry with M&M...|     930|   128|     64.0|     false|
|McFlurry with Ree...|     810|   103|     51.5|     false|
+--------------------+--------+------+---------+----------+



multiple constant columns

In [10]:
from pyspark.sql.functions import lit

# lit(...) wraps a Python value as a column, so every row gets the same
# "Authorized" (True) and "Announced" (False) value.
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .withColumn("MinSugar", col("Sugars") / 2)
    .withColumn("Authorized", lit(True))
    .withColumn("Announced", lit(False))
    .show()
)

+--------------------+--------+------+--------+----------+---------+
|                Item|Calories|Sugars|MinSugar|Authorized|Announced|
+--------------------+--------+------+--------+----------+---------+
|Vanilla Shake (La...|     820|   101|    50.5|      true|    false|
|Strawberry Shake ...|     850|   123|    61.5|      true|    false|
|Chocolate Shake (...|     850|   120|    60.0|      true|    false|
|Shamrock Shake (L...|     820|   115|    57.5|      true|    false|
|McFlurry with M&M...|     930|   128|    64.0|      true|    false|
|McFlurry with Ree...|     810|   103|    51.5|      true|    false|
+--------------------+--------+------+--------+----------+---------+



sorting with "non-selected" column

In [12]:
# "Total Fat" is not in the select list, yet it is used as the sort key;
# the displayed result still shows only the three selected columns.
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .where("Sugars > 100")
    .sort(col("Total Fat"))
    .show()
)

+--------------------+--------+------+
|                Item|Calories|Sugars|
+--------------------+--------+------+
|Vanilla Shake (La...|     820|   101|
|Chocolate Shake (...|     850|   120|
|Shamrock Shake (L...|     820|   115|
|Strawberry Shake ...|     850|   123|
|McFlurry with Ree...|     810|   103|
|McFlurry with M&M...|     930|   128|
+--------------------+--------+------+



In [14]:
# Here "Total Fat" is selected, then dropped together with "Sugars", and
# only afterwards used as the sort key. The saved output lists the rows in
# the same order as the previous cell's output.
(
    menu_df
    .select("Item", "Calories", "Sugars", "Total Fat")
    .where("Sugars > 100")
    .drop("Sugars", "Total Fat")
    .sort(col("Total Fat"))
    .show()
)

+--------------------+--------+
|                Item|Calories|
+--------------------+--------+
|Vanilla Shake (La...|     820|
|Chocolate Shake (...|     850|
|Shamrock Shake (L...|     820|
|Strawberry Shake ...|     850|
|McFlurry with Ree...|     810|
|McFlurry with M&M...|     930|
+--------------------+--------+



error is expected:

In [17]:
# This cell is left commented out. The chain keeps using column names
# after the point where they stop being part of the frame:
#   - "Sugars" is renamed to "Sweetener", then filtered on and used to
#     compute "MinSugar" under its old name;
#   - "Total Fat" is never in the select list, yet it is renamed to "Fat"
#     and then sorted on under its old name.
# NOTE(review): the heading says an error is expected, but the saved output
# under this cell is a populated table -- re-run on the target Spark version
# to confirm which of the two is accurate.
# menu_df\
#   .select("Item", "Calories", "Sugars")\
#   .withColumnRenamed("Sugars", "Sweetener")\
#   .filter(col("Sugars") > 100)\
#   .withColumn("MinSugar", col("Sugars")/2) \
#   .withColumn("Authorized", lit(True))\
#   .withColumn("Announced", lit(False)) \
#   .withColumnRenamed("Total Fat", "Fat")\
#   .sort("Total Fat")\
#   .show()

+--------------------+--------+---------+--------+----------+---------+
|                Item|Calories|Sweetener|MinSugar|Authorized|Announced|
+--------------------+--------+---------+--------+----------+---------+
|Vanilla Shake (La...|     820|      101|    50.5|      true|    false|
|Chocolate Shake (...|     850|      120|    60.0|      true|    false|
|Shamrock Shake (L...|     820|      115|    57.5|      true|    false|
|Strawberry Shake ...|     850|      123|    61.5|      true|    false|
|McFlurry with Ree...|     810|      103|    51.5|      true|    false|
|McFlurry with M&M...|     930|      128|    64.0|      true|    false|
+--------------------+--------+---------+--------+----------+---------+



Trick: add several columns in one call with `withColumns`

In [21]:
# Mapping of new column name -> column expression, meant to be passed to
# DataFrame.withColumns. Despite the variable's name, the entries add
# columns ("Name", "Cal", "Fin") next to the originals rather than
# renaming them.
rename_map = dict(
    Name=col("Item"),
    Cal=col("Calories"),
    Fin=lit(False),
)

In [19]:
# Bare expression: display the mapping to inspect the Column objects it holds.
rename_map

{'Name': Column<'Item'>, 'Cal': Column<'Calories'>, 'Fin': Column<'false'>}

In [22]:
# A single withColumns call adds every entry of rename_map at once; the
# original "Item", "Calories" and "Sugars" columns remain in the result.
(
    menu_df
    .select("Item", "Calories", "Sugars")
    .filter("Sugars > 100")
    .withColumns(rename_map)
    .show()
)

+--------------------+--------+------+--------------------+---+-----+
|                Item|Calories|Sugars|                Name|Cal|  Fin|
+--------------------+--------+------+--------------------+---+-----+
|Vanilla Shake (La...|     820|   101|Vanilla Shake (La...|820|false|
|Strawberry Shake ...|     850|   123|Strawberry Shake ...|850|false|
|Chocolate Shake (...|     850|   120|Chocolate Shake (...|850|false|
|Shamrock Shake (L...|     820|   115|Shamrock Shake (L...|820|false|
|McFlurry with M&M...|     930|   128|McFlurry with M&M...|930|false|
|McFlurry with Ree...|     810|   103|McFlurry with Ree...|810|false|
+--------------------+--------+------+--------------------+---+-----+

