In [1]:
# Mount Google Drive at /content/drive; the cache directory used later in this notebook lives on that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# NOTE(review): the previous cell already mounts Drive; force_remount=True mounts it again.
# Presumably kept to refresh a stale mount — confirm this cell is still needed.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import os

# Download dataset

In [4]:
# Working directory on the mounted shared drive; the download and the processed
# dataset are both written below it.
CACHE_DIR = "./drive/Shared drives/CS 269: Recipe/tmp"

# parents=False is the default, spelled out here: when the parent folder is
# absent (e.g. Drive is not mounted) mkdir raises instead of creating a local tree.
pathlib.Path(CACHE_DIR).mkdir(parents=False, exist_ok=True)

In [5]:
# Download the raw recipe archive below CACHE_DIR and unpack the zip.
data_origin = "https://storage.googleapis.com/recipe-box/recipes_raw.zip"
archive_name = "recipes_raw.zip"

# The value returned by get_file is kept in data_file.
data_file = tf.keras.utils.get_file(
    fname=archive_name,
    origin=data_origin,
    cache_dir=CACHE_DIR,
    archive_format='zip',
    extract=True,
)

Downloading data from https://storage.googleapis.com/recipe-box/recipes_raw.zip


In [6]:
# Source codes that appear in the extracted file names:
# Allrecipes, Epicurious, Food Network
data_srcs = ['ar', 'epi', 'fn']

# A real list (not a one-shot map iterator) so data_files can still be
# inspected or reused after the frames have been loaded.
data_files = [
    f"{CACHE_DIR}/datasets/recipes_raw_nosource_{src}.json" for src in data_srcs
]

# Each JSON file is transposed after loading so that recipes become rows.
dfs = [pd.read_json(path).T for path in data_files]

In [7]:
# Stack the per-source frames, renumber the rows from zero and discard the
# picture link column.
recipes = (
    pd.concat(dfs)
    .reset_index(drop=True)
    .drop(columns=['picture_link'])
)

In [8]:
# Preview the first rows of the combined frame.
recipes.head()

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...


In [9]:
# Row count of the combined frame, before any rows are dropped.
len(recipes)

125164

# Process data

In [10]:
# Drop every row that has a missing value in any column, then show how many rows remain.
recipes = recipes.dropna()
len(recipes)

124473

## Create recipe strings

In [11]:
# Section markers that recipe_to_string embeds in every rendered recipe.
TITLE_START = "🍴 "  # placed directly before the title
INGREDIENT_START = "🥑\n"  # placed before the ingredient bullets
INSTRUCTION_START = "🥣\n"  # placed before the instruction bullets

In [12]:
def recipe_to_string(recipe):
    """Render one recipe as a single marker-delimited text string.

    Args:
        recipe: Mapping-like row with the keys 'title', 'ingredients'
            (an iterable of strings) and 'instructions' (one string whose
            steps are separated by newlines).

    Returns:
        The title, a "• " bulleted ingredient list and a "‣ " bulleted
        instruction list, each introduced by its section marker, with
        leading and trailing whitespace removed from the whole string.
    """
    title = recipe['title']

    # Ingredient entries can carry an 'ADVERTISEMENT' marker; remove it and
    # discard entries that are empty afterwards.
    cleaned_ingredients = (
        entry.replace('ADVERTISEMENT', '').strip() for entry in recipe['ingredients']
    )
    ingredients = ''.join(f"• {entry}\n" for entry in cleaned_ingredients if entry)

    # Strip every instruction line too, so whitespace-only lines (a stray
    # space, or '\r' left over from CRLF text) are dropped instead of
    # turning into empty "‣" bullets.
    steps = (line.strip() for line in recipe['instructions'].split('\n'))
    instructions = ''.join(f"‣ {step}\n" for step in steps if step)

    return f"{TITLE_START}{title}\n\n{INGREDIENT_START}{ingredients}\n{INSTRUCTION_START}{instructions}".strip()

In [13]:
# Render every row as one string; axis=1 hands each row to recipe_to_string.
text_recipes = recipes.apply(recipe_to_string, axis=1)

In [14]:
# Spot-check the first few rendered recipe strings.
text_recipes.head()

0    🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 ...
1    🍴 Awesome Slow Cooker Pot Roast\n\n🥑\n• 2 (10....
2    🍴 Brown Sugar Meatloaf\n\n🥑\n• 1/2 cup packed ...
3    🍴 Best Chocolate Chip Cookies\n\n🥑\n• 1 cup bu...
4    🍴 Homemade Mac and Cheese Casserole\n\n🥑\n• 8 ...
dtype: object

In [15]:
# Display the Series itself (pandas abbreviates it) to see its index range and length.
text_recipes

0         🍴 Slow Cooker Chicken and Dumplings\n\n🥑\n• 4 ...
1         🍴 Awesome Slow Cooker Pot Roast\n\n🥑\n• 2 (10....
2         🍴 Brown Sugar Meatloaf\n\n🥑\n• 1/2 cup packed ...
3         🍴 Best Chocolate Chip Cookies\n\n🥑\n• 1 cup bu...
4         🍴 Homemade Mac and Cheese Casserole\n\n🥑\n• 8 ...
                                ...                        
125159    🍴 Summer Corn Salad\n\n🥑\n• 4 ears fresh corn\...
125160    🍴 Zucchini Stuffed Tomatoes\n\n🥑\n• 4 large pl...
125161    🍴 Pepper Pasta Quick Cook\n\n🥑\n• 3 tablespoon...
125162    🍴 Chocolate Cake with Armagnac Ice Cream\n\n🥑\...
125163    🍴 Crabby Bisque\n\n🥑\n• 3 (10.5-ounce) cans re...
Length: 124473, dtype: object

In [16]:
# Filter out long recipes: keep only those strictly shorter than the cut-off
# taken at the chosen percentile of the recipe lengths.
RECIPE_LENGTH_PERCENTILE = 85
recipe_lengths = text_recipes.map(len)
recipe_maxlen = int(np.percentile(recipe_lengths, RECIPE_LENGTH_PERCENTILE))

# Reuse the lengths computed above instead of measuring every recipe a second time.
filtered_recipes = text_recipes[recipe_lengths < recipe_maxlen]

print(recipe_maxlen)
print(f"Removed {len(text_recipes) - len(filtered_recipes)} recipes. Kept {len(filtered_recipes)} recipes.")

2164
Removed 18684 recipes. Kept 105789 recipes.


# Save dataset to disk

In [17]:
# Write the length-filtered recipe strings to a pickle file inside CACHE_DIR.
dataset_path = os.path.join(CACHE_DIR, "emoji_text_recipes.pkl")
filtered_recipes.to_pickle(path=dataset_path)