In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [6]:
# Build (or reuse) the Spark session, giving the driver 8GB of RAM
spark = SparkSession.builder.appName("Dessert or Not?").config(
    "spark.driver.memory", "8g"
).getOrCreate()

### Exploratory Data Analysis

In [7]:
# Load the CSV: the first row supplies the column names and Spark infers the types
food = spark.read.csv("./data/epi_r.csv", header=True, inferSchema=True)

# Report the dataset's dimensions (row count, column count), then its schema
print(food.count(), len(food.columns))
food.printSchema()

20057 680
root
 |-- title: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- calories: string (nullable = true)
 |-- protein: double (nullable = true)
 |-- fat: double (nullable = true)
 |-- sodium: double (nullable = true)
 |-- #cakeweek: double (nullable = true)
 |-- #wasteless: double (nullable = true)
 |-- 22-minute meals: double (nullable = true)
 |-- 3-ingredient recipes: double (nullable = true)
 |-- 30 days of groceries: double (nullable = true)
 |-- advance prep required: double (nullable = true)
 |-- alabama: double (nullable = true)
 |-- alaska: double (nullable = true)
 |-- alcoholic: double (nullable = true)
 |-- almond: double (nullable = true)
 |-- amaretto: double (nullable = true)
 |-- anchovy: double (nullable = true)
 |-- anise: double (nullable = true)
 |-- anniversary: double (nullable = true)
 |-- anthony bourdain: double (nullable = true)
 |-- aperitif: double (nullable = true)
 |-- appetizer: double (nullable = true)
 |-- apple: double (nullabl

Many column names contain undesirable characters, such as # (from hashtags), spaces and hyphens. These names should be standardised.

In [8]:
def sanitize_column_name(name):
    """Return a column name with unwanted characters replaced or removed.

    Spaces, hyphens and slashes are replaced with underscores and ``&`` is
    replaced with ``and``. Any remaining character that is not a letter, a
    digit or an underscore is dropped (for example the ``#`` of a hashtag).

    Args:
        name: The raw column name.

    Returns:
        The sanitized name. It can be empty if ``name`` contained no
        letters, digits or replaceable separators.
    """
    out = name
    for old, new in ((" ", "_"), ("-", "_"), ("/", "_"), ("&", "and")):
        out = out.replace(old, new)
    # Keep only letters, numbers and underscores. Hyphens were already turned
    # into underscores above, so the underscore is the separator to allow here.
    return "".join(
        char for char in out if char.isalpha() or char.isdigit() or char == "_"
    )