In [None]:
!pip install tqdm

## Download Datasets

In [1]:
!curl https://storage.googleapis.com/recipe-box/recipes_raw.zip --output recipe_box_recipes_raw.zip
!curl http://data.csail.mit.edu/im2recipe/recipe1M_layers.tar.gz --output recipe1M_layers.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 50.8M  100 50.8M    0     0   9.7M      0  0:00:05  0:00:05 --:--:-- 11.4M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  380M  100  380M    0     0  6553k      0  0:00:59  0:00:59 --:--:-- 6792k0  6878k      0  0:00:56  0:00:26  0:00:30 6540k:48  0:00:35  0:00:13 13.0M


In [15]:
# manually download from kaggle https://www.kaggle.com/hugodarwood/epirecipes
# save as epicurious-recipes-with-rating-and-nutrition.zip

# manually download from kaggle https://www.kaggle.com/kaggle/recipe-ingredients-dataset
# save as recipe-ingredients-dataset.zip

# manually download from kaggle https://www.kaggle.com/datafiniti/food-ingredient-lists/downloads/food-ingredient-lists.zip/1
# save as food-ingredient-lists.zip

In [2]:
!unzip epicurious-recipes-with-rating-and-nutrition.zip -d epicurious/
!rm epicurious-recipes-with-rating-and-nutrition.zip
!unzip recipe_box_recipes_raw.zip -d recipe-box/
!rm recipe_box_recipes_raw.zip
!unzip recipe-ingredients-dataset.zip -d recipe-ingr/
!rm recipe-ingredients-dataset.zip
!mkdir recipe1M/
!tar -xzvf recipe1M_layers.tar.gz -C recipe1M/
!rm recipe1M_layers.tar.gz
!unzip food-ingredient-lists.zip -d food-ingredient-lists/
!rm food-ingredient-lists.zip

Archive:  epicurious-recipes-with-rating-and-nutrition.zip
  inflating: epicurious/epi_r.csv    
  inflating: epicurious/full_format_recipes.json  
  inflating: epicurious/recipe.py    
  inflating: epicurious/utils.py     
Archive:  recipe_box_recipes_raw.zip
  inflating: recipe-box/recipes_raw_nosource_ar.json  
  inflating: recipe-box/recipes_raw_nosource_epi.json  
  inflating: recipe-box/recipes_raw_nosource_fn.json  
  inflating: recipe-box/LICENSE      
Archive:  recipe-ingredients-dataset.zip
  inflating: recipe-ingr/test.json   
  inflating: recipe-ingr/train.json  
layer1.json
layer2.json


## Load Datasets

In [1]:
import json
from pprint import pprint
from string import ascii_lowercase
import random
from tqdm import tqdm

In [2]:
clean_ingredients = set()
all_recipes = set()

### recipe-ingredients-dataset

In [3]:
with open('recipe-ingr/train.json', 'rb') as f:
    recipe_ingr_train = json.loads(f.read().decode("utf-8"))
with open('recipe-ingr/test.json', 'rb') as f:
    recipe_ingr_test = json.loads(f.read().decode("utf-8"))

recipe_ingr = recipe_ingr_train + recipe_ingr_test
pprint(recipe_ingr[0])

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
                 'black olives',
                 'grape tomatoes',
                 'garlic',
                 'pepper',
                 'purple onion',
                 'seasoning',
                 'garbanzo beans',
                 'feta cheese crumbles']}


In [4]:
for r in recipe_ingr:
    clean_ingredients.update(r['ingredients'])
recipe_ingr_recipes = {tuple(sorted([x.lower() for x in r['ingredients']])) for r in recipe_ingr}

### epicurious

In [5]:
with open('epicurious/full_format_recipes.json', 'r') as f:
    epicurious = json.loads(f.read())
pprint(epicurious[2]['ingredients'])

epicurious_recipes = {tuple(sorted([x.lower() for x in r['ingredients']])) for r in epicurious if len(r)}

['1 fennel bulb (sometimes called anise), stalks discarded, bulb cut\xa0into '
 '1/2-inch dice, and feathery leaves reserved for garnish',
 '1 onion, diced',
 '2 tablespoons unsalted butter',
 '2 medium russet (baking) potatoes',
 '2 cups chicken broth',
 '1 1/2 cups milk']


### recipe-box

In [6]:
files = ['recipes_raw_nosource_epi.json', 'recipes_raw_nosource_ar.json', 'recipes_raw_nosource_fn.json']
# epicurious, allrecipes, foodnetwork
recipe_box_recipes = set()
for filename in files:
    with open('recipe-box/' + filename, 'r') as f:
        rb_data_part = json.loads(f.read())
        for r in rb_data_part.values():
            if len(r):
                recipe_box_recipes.add(tuple(sorted([x.lower().replace('advertisement', '').strip() for x in r['ingredients']])))
pprint(random.sample(recipe_box_recipes, 1))

[('',
  '1 cup chocolate-hazelnut spread (such as nutella®)',
  '1 cup graham cracker crumbs',
  '1 egg',
  '1 pinch salt',
  '1/2 teaspoon baking soda',
  '1/2 teaspoon vanilla extract')]


### recipe1M

In [7]:
ls recipe1M/

layer1.json  layer2.json


In [8]:
with open('recipe1M/layer1.json', 'r') as f:
    recipe1M_data = json.loads(f.read())

In [9]:
recipe1M_recipes = set()
for r in recipe1M_data:
    recipe1M_recipes.add(tuple(sorted([x['text'].lower() for x in r['ingredients']])))

In [10]:
recipe1M_data[0]['ingredients']

[{'text': '6 ounces penne'},
 {'text': '2 cups Beechers Flagship Cheese Sauce (recipe follows)'},
 {'text': '1 ounce Cheddar, grated (1/4 cup)'},
 {'text': '1 ounce Gruyere cheese, grated (1/4 cup)'},
 {'text': '1/4 to 1/2 teaspoon chipotle chili powder (see Note)'},
 {'text': '1/4 cup (1/2 stick) unsalted butter'},
 {'text': '1/3 cup all-purpose flour'},
 {'text': '3 cups milk'},
 {'text': '14 ounces semihard cheese (page 23), grated (about 3 1/2 cups)'},
 {'text': '2 ounces semisoft cheese (page 23), grated (1/2 cup)'},
 {'text': '1/2 teaspoon kosher salt'},
 {'text': '1/4 to 1/2 teaspoon chipotle chili powder'},
 {'text': '1/8 teaspoon garlic powder'},
 {'text': '(makes about 4 cups)'}]

### cleanup

In [11]:
def replace_units(s):
    original_string = s
    
    tokens = s.replace('to taste', '').split()
    units = ['ounce', 'ounces', 'cups', 'cup', 'teaspoon', 'tablespoon', 'tablespoons', 'teaspoons', 'c', 'g', 'v', 'tbsp', 'x', 'ml', 'lb', 'tbs', 'oz', 'pkg', 'large', 'small', 'tsp', 'inch', 'grams', 'quarts', 'lbs', 'can', 'cube', 'whole', 'or', 'pieces', 'piece', 'chopped', 'shredded', 'diced', 'fresh', 'crushed']
    tokens_new = []
    for t in tokens:
        if t.strip() not in units:
            tokens_new.append(t)
            
    #if original_string != ' '.join(tokens_new):
    #    print(original_string, '--->', ' '.join(tokens_new))
    
    return ' '.join(tokens_new)
    
def cleanup_ingredient_list(l, remove_whitespace_variants):
    l = {replace_units(''.join([char for char in x.lower().strip() if char in ascii_lowercase + ' ']).strip()) for x in l}
    if not remove_whitespace_variants:
        return l
    
    removal_mapping = dict()
    remove_ingr = set()
    for i in l:
        without_whitespace = ''.join([char for char in i if char in ascii_lowercase])
        if without_whitespace in l and i != without_whitespace:
            remove_ingr.add(without_whitespace)
            removal_mapping[without_whitespace] = i
    l -= remove_ingr
    #print('removed: ' + str(remove_ingr))
    return l, removal_mapping

In [12]:
messy_recipes = epicurious_recipes | recipe_box_recipes | recipe1M_recipes
clean_recipes = recipe_ingr_recipes

In [13]:
clean_ingredients, whitespace_mapping = cleanup_ingredient_list(clean_ingredients, remove_whitespace_variants=True)

new_messy_recipes = set()
for r in tqdm(messy_recipes):
    new_messy_recipes.add(tuple(cleanup_ingredient_list(r, remove_whitespace_variants=False)))
messy_recipes = new_messy_recipes

100%|██████████| 1088280/1088280 [01:05<00:00, 16708.17it/s]


In [14]:
whitespace_mapping

{'crabmeat': 'crab meat',
 'cornbread': 'corn bread',
 'lemongrass': 'lemon grass',
 'rosewater': 'rose water',
 'dillweed': 'dill weed',
 'shiromiso': 'shiro miso',
 'cornflakes': 'corn flakes',
 'peapods': 'pea pods',
 'poppyseeds': 'poppy seeds',
 'ladyfingers': 'lady fingers',
 'gingerroot': 'ginger root',
 'cornflour': 'corn flour',
 'piecrust': 'pie crust',
 'codfish': 'cod fish',
 'blackpepper': 'black pepper',
 'cuminseed': 'cumin seed',
 'sugarcane': 'sugar cane',
 'poundcake': 'pound cake',
 'mahimahi': 'mahi mahi',
 'tatsoi': 'tat soi',
 'arrowroot': 'arrow root',
 'wheatberries': 'wheat berries'}

In [15]:
new_messy_recipes = set()
for r in messy_recipes:
    r_new = set()
    for i in r:
        if i in whitespace_mapping:
            r_new.add(whitespace_mapping[i])
        else:
            r_new.add(i)
    new_messy_recipes.add(tuple(r_new))
messy_recipes = new_messy_recipes

new_clean_recipes = set()
for r in clean_recipes:
    r_new = set()
    for i in r:
        if i in whitespace_mapping:
            r_new.add(whitespace_mapping[i])
        else:
            r_new.add(i)
    new_clean_recipes.add(tuple(r_new))
clean_recipes = new_clean_recipes

In [16]:
print(len(clean_ingredients), len(clean_recipes), len(messy_recipes))

6872 48876 1079095


In [17]:
e, *_ = messy_recipes
e

('salt',
 'oregano',
 'jalapenos seeded finely',
 'olive oil',
 'onion',
 'cayenne',
 'boneless skinless chicken breasts',
 'sour cream',
 'chicken broth',
 'cans cannellini beans drained and rinsed other white beans',
 'garlic cloves',
 'cumin')

In [18]:
e, *_ = clean_recipes
e

('balsamic vinegar',
 'fresh basil',
 'purple onion',
 'cooking spray',
 'olive oil',
 'baby spinach leaves',
 'crushed red pepper',
 'large eggs',
 'black pepper',
 'plum tomatoes',
 'mushrooms',
 'garlic cloves',
 'large egg whites',
 'salt',
 'nonfat ricotta cheese')

## save

In [19]:
import pickle
with open('data.pickle', 'wb+') as f:
    pickle.dump({'clean_recipes': clean_recipes, 'messy_recipes': messy_recipes}, f)