## ETL Recipes


### Initial configuration

In [0]:
# Calls the Databricks `dbutils.library.restartPython()` utility as the very
# first step, ahead of the import cell below.
# NOTE(review): no library installation is visible in this notebook — confirm
# the restart is still required.
dbutils.library.restartPython()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import size,split, regexp_replace, expr, lower, array_except, col, lit, array, transform, trim, filter, udf
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, ArrayType

### Load recipes dataset in a Dataframe

In [0]:
# Location of the raw RecipeNLG CSV export.
RAW_RECIPES_CSV = "/Volumes/wine_harmonization/datasets/raw_datasets/RecipeNLG_dataset.csv"

# Read with a header row, let Spark infer column types, and treat `"` as the
# escape character inside quoted fields.
raw_recipes = (
    spark.read
    .options(header=True, inferSchema=True, escape='"')
    .csv(RAW_RECIPES_CSV)
)

In [0]:
# Show the number of rows read, then the raw frame itself.
raw_row_count = raw_recipes.count()
display(raw_row_count)
display(raw_recipes)

### Load ingredient lexicon (`ingredients_inclusion.csv`: the words kept in each ingredient)

In [0]:
# Location of the ingredient lexicon CSV.
INGREDIENT_LEXICON_CSV = "/Volumes/wine_harmonization/datasets/raw_datasets/ingredients_inclusion.csv"

# Read the lexicon (header row, inferred types, lines starting with `#` are
# skipped as comments), lower-case the alias column and keep one row per alias.
exclusion = (
    spark.read
    .options(header=True, inferSchema=True, comment="#")
    .csv(INGREDIENT_LEXICON_CSV)
    .withColumn("entity_alias_readable", lower(col("entity_alias_readable")))
    .dropDuplicates(["entity_alias_readable"])
)

In [0]:
# Render the lower-cased, de-duplicated lexicon for a visual check.
display(exclusion)

### Clean rows

In [0]:
# Columns that are dropped from the raw frame before any further processing.
UNUSED_COLUMNS = ["link", "directions", "source", "ingredients"]

# Drop those columns, then discard rows missing a title or the NER column.
recipes_clean = (
    raw_recipes
    .drop(*UNUSED_COLUMNS)
    .dropna(subset=["title", "NER"])
)

In [0]:
# Give the two remaining non-title columns their working names.
# NOTE(review): `_c0` is presumably the CSV's unnamed leading index column —
# verify against the file header.
recipes_clean = (
    recipes_clean
    .withColumnRenamed("_c0", "id")
    .withColumnRenamed("NER", "ingredients_list")
)
display(recipes_clean)

In [0]:
# Pull the distinct lexicon aliases back to the driver as a plain Python list.
lexicon_rows = exclusion.select("entity_alias_readable").distinct().collect()
exclusion_words = [lexicon_row["entity_alias_readable"] for lexicon_row in lexicon_rows]

def remove_excluded_words(ingredients_list, exclusion_set):
    """Reduce each ingredient to the words that appear in ``exclusion_set``.

    Despite the function's name, membership in ``exclusion_set`` is what makes
    a word survive: words that are NOT in the set are dropped.

    Args:
        ingredients_list: Sequence of ingredient strings. May be ``None`` or
            empty, and may contain ``None`` or empty entries (these are skipped).
        exclusion_set: Collection of words to retain. Matching is exact, on
            whitespace-separated tokens.

    Returns:
        A list holding, in input order, one space-joined string per ingredient
        that kept at least one word. Ingredients left with no words are
        omitted. Returns ``[]`` when ``ingredients_list`` is falsy.
    """
    if not ingredients_list:
        return []

    # Lazily build the filtered text for every non-empty ingredient ...
    kept_per_ingredient = (
        ' '.join(token for token in entry.split() if token in exclusion_set).strip()
        for entry in ingredients_list
        if entry
    )
    # ... and keep only those that still contain something.
    return [kept for kept in kept_per_ingredient if kept]

# Set form of the lexicon for constant-time membership checks inside the UDF.
exclusion_set = set(exclusion_words)


@udf(returnType=ArrayType(StringType()))
def remove_excluded_udf(ingredients):
    """Spark UDF: apply ``remove_excluded_words`` using the driver-side ``exclusion_set``."""
    return remove_excluded_words(ingredients, exclusion_set)


# Step 1: remove every `[`, `]` and `"` character from the raw string.
ner_unwrapped = recipes_clean.withColumn(
    "ingredients_list", regexp_replace("ingredients_list", r'[\[\]\"]', '')
)

# Step 2: split the remaining text on commas into an array.
# NOTE(review): a plain comma split also cuts entries that themselves contain
# a comma — confirm this is acceptable for this column.
ner_split = ner_unwrapped.withColumn(
    "ingredients_list", split("ingredients_list", ",")
)

# Step 3: trim and lower-case every array element.
ner_normalised = ner_split.withColumn(
    "ingredients_list", expr("transform(ingredients_list, x -> lower(trim(x)))")
)

# Step 4: keep only lexicon words inside each element (see the UDF above).
recipes_clean_test = ner_normalised.withColumn(
    "ingredients_list", remove_excluded_udf(col("ingredients_list"))
)

In [0]:
# SQL predicate: true when the number of elements starting with a lower-case
# ASCII letter equals the total number of elements in the array.
ALL_INGREDIENTS_START_WITH_LETTER = """
            size(
                filter(
                    ingredients_list,
                    x -> 
                        x rlike '^[a-z]'
                )
            ) = size(ingredients_list)
        """

# Materialise the predicate as a temporary flag, keep the rows where it holds,
# then remove the flag again.
flagged_recipes = recipes_clean_test.withColumn(
    "valid_ingredients", expr(ALL_INGREDIENTS_START_WITH_LETTER)
)
recipes_clean = (
    flagged_recipes
    .where(col("valid_ingredients"))
    .drop("valid_ingredients")
)

In [0]:
# De-duplicate on the *normalised* title. The title is stripped of double
# quotes and trimmed before `dropDuplicates`, so titles that differ only by
# stray quotes or surrounding whitespace collapse into a single row instead of
# surviving as duplicates. The same normalisation further down the notebook is
# idempotent, so applying it here as well is harmless.
recipes = (
    recipes_clean
    .withColumn("title", trim(regexp_replace(col("title"), '"', "")))
    .dropDuplicates(subset=["title"])
)

In [0]:
# Keep recipes with more than two *distinct* ingredients. The word filtering
# applied earlier can turn different entries into the same string, and the
# array is only de-duplicated (array_distinct) in a later cell, so counting the
# raw array length would let through recipes that end up with one or two
# ingredients.
recipes = recipes.filter(expr("size(array_distinct(ingredients_list)) > 2"))
display(recipes.count())
display(recipes)

In [0]:
# Collapse repeated entries inside each recipe's ingredient array.
# `F` (pyspark.sql.functions) comes from the import cell at the top of the
# notebook, keeping all imports in one place.
recipes_no_dups = recipes.withColumn(
    "ingredients_list",
    F.array_distinct(F.col("ingredients_list")),
)

display(recipes_no_dups)

In [0]:
# Strip every double quote from the title, then trim surrounding whitespace.
title_without_quotes = regexp_replace(col("title"), '"', "")
recipes = recipes_no_dups.withColumn("title", trim(title_without_quotes))
display(recipes)

### Save in a table

In [0]:
# Persist the final frame as a managed table, replacing any existing contents.
(
    recipes.write
    .mode("overwrite")
    .saveAsTable("wine_harmonization.datasets.recipes")
)