In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored on the drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import os

# Download dataset

In [3]:
# Folder used as the download cache and as the output location for the
# processed dataset.
# NOTE(review): this is a relative path, so it resolves against the kernel's
# working directory; with the drive mounted at /content/drive it assumes the
# working directory is /content -- confirm when running outside Colab.
CACHE_DIR = "./drive/Shared drives/CS 269: Recipe/tmp"

# parents=True creates any missing intermediate folders as well (without it
# mkdir raises FileNotFoundError on a fresh layout); exist_ok=True keeps the
# cell safe to re-run.
pathlib.Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

In [4]:
# Remote archive to fetch and the file name it is cached under.
data_origin = "https://storage.googleapis.com/recipe-box/recipes_raw.zip"
data_file = "recipes_raw.zip"

# Arguments for the download: cache under CACHE_DIR and unpack the zip.
download_options = dict(
    fname=data_file,
    origin=data_origin,
    cache_dir=CACHE_DIR,
    extract=True,
    archive_format='zip',
)

# From here on `data_file` holds the value returned by get_file rather than
# the bare file name.
data_file = tf.keras.utils.get_file(**download_options)

In [None]:
# Source tags that appear in the extracted JSON file names.
data_srcs = ['ar', 'epi', 'fn'] # Allrecipes, Epicurious, Food Network

# Build a real list instead of a one-shot `map` iterator: the iterator was
# consumed while loading below, leaving `data_files` empty for any later use
# (or for a re-run of only the loading line).
data_files = [f"{CACHE_DIR}/datasets/recipes_raw_nosource_{fname}.json" for fname in data_srcs]

# One DataFrame per source file, transposed after loading.
# NOTE(review): presumably each JSON is keyed by recipe id, so the transpose
# turns recipes into rows -- confirm against the files.
dfs = [pd.read_json(f).T for f in data_files]

In [None]:
# Stack the per-source frames, renumber the rows from zero and discard the
# picture_link column.
recipes = (
    pd.concat(dfs)
    .reset_index(drop=True)
    .drop(columns=['picture_link'])
)

# Process data

In [None]:
# Discard every row that has an NA value in at least one column
recipes = recipes.dropna(axis=0, how='any')

In [None]:
def format_title(title):
    """Return the recipe title unchanged.

    The function currently performs no transformation. The commented-out
    code below is a disabled experiment that stripped a trailing Roman
    numeral from the title; per the NOTE it is kept off because training on
    titles without the numerals gave worse results.
    """
    # # NOTE: Training GPT-2 on data without Roman Numerals did worse
    # # Remove Roman numerals found in titles (list created by manually checking through the recipes)
    # for numeral in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII']:
    #     if title.endswith(f' {numeral}'):
    #         return title[:title.rfind(' ')]

    return title
    
def format_ingredients(ingredients):
    """Render an iterable of ingredient strings as a bulleted text block.

    Each entry has the literal text 'ADVERTISEMENT' removed and its
    surrounding whitespace stripped. Entries that are empty afterwards are
    skipped; every remaining entry becomes one line made of a bullet, a
    space, the entry and a trailing newline.

    Returns the concatenated lines as a single string (empty when nothing
    remains).
    """
    bullet_lines = []
    for raw_entry in ingredients:
        entry = raw_entry.replace('ADVERTISEMENT', '').strip()
        if entry != "":
            bullet_lines.append(f"• {entry}\n")

    return ''.join(bullet_lines)

def format_instructions(instructions):
    """Render newline-separated instruction text as a marked list of steps.

    The text is split on newlines and each line is stripped of surrounding
    whitespace, mirroring what format_ingredients does for its entries.
    Stripping matters because whitespace-only lines (and the carriage return
    left over from CRLF line endings) would otherwise pass the emptiness
    check and be emitted as blank or dirty steps. Lines that are empty after
    stripping are skipped.

    Returns one string in which every remaining step is a marker, a space,
    the step text and a trailing newline (empty when no steps remain).
    """
    steps = [line.strip() for line in instructions.split('\n')]

    return ''.join(f"‣ {step}\n" for step in steps if step)

In [None]:
# Apply each column's formatter element-wise and store the result back in the
# same column. Run this once per freshly built `recipes`: the formatters
# expect the raw values, and applying them again to already formatted text
# would add the list markers a second time.
column_formatters = (
    ('title', format_title),
    ('ingredients', format_ingredients),
    ('instructions', format_instructions),
)
for column, formatter in column_formatters:
    recipes[column] = recipes[column].apply(formatter)

In [None]:
# Filter out long recipes
# Length of each formatted instruction text, in characters
recipe_lengths = recipes['instructions'].map(len)

# Set maximum length at 85th percentile (TODO: might hardcode this)
recipe_maxlen = int(np.percentile(recipe_lengths, 85))

# Keep only recipes whose instructions are strictly shorter than the cutoff
is_short_enough = recipe_lengths < recipe_maxlen
filtered_recipes = recipes[is_short_enough]

print(f"Removed {len(recipes) - len(filtered_recipes)} recipes. Kept {len(filtered_recipes)} recipes.")

# Save dataset to disk

In [None]:
# Write the length-filtered recipes to a pickle file inside CACHE_DIR.
dataset_path = os.path.join(CACHE_DIR, 'recipes.pkl')
filtered_recipes.to_pickle(dataset_path)