# Introduction

Through a quick overview/exploration of the existing online datasets, one was selected as the unanimous best. It's well organised, and contains the most data from the most amount of sources.

## Chapter Objective

### Primary 

To organise the dataset with its ingredients as elements of the dataset. 

### Secondary

Preprocess this dataset so it can be used in the following chapters.

# Setup 

In [1]:
#| default_exp recipes.create

In [2]:
#| export
from pyprojroot import here
root = here()
import sys
sys.path.append(str(root))

In [3]:
#| export
import pandas as pd
import numpy as np

import nltk
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

import json
from itertools import groupby
import re
import string
import time

from ast import literal_eval

from tqdm import tqdm
tqdm.pandas()

from food_database.utils.utils import *

from parse_ingredients import parse_ingredient as parse_ingredient_rgx
from ingredient_parser import parse_ingredient as parse_ingredient_nlp

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import multiprocessing as mp

from food_database.utils.parallel import *

from pathlib import Path

from sacremoses import MosesTokenizer, MosesDetokenizer
mt, md = MosesTokenizer(lang='en'), MosesDetokenizer(lang='en')

In [4]:
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
#| export
with open(f'{root}/data/globals/unit_conversions.json') as f:
    unit_list = json.load(f)

# Recipes DF

In [6]:
recipes_df = pd.read_csv('../data/datasets/recipe/recipes_nlg/full_dataset.csv')
recipes_df = recipes_df.drop('Unnamed: 0', axis=1)
recipes_df

Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...,...,...,...,...
2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""tortillas"", ""bu..."
2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""eggs"", ""paprika"", ""salt"", ""choice"", ""miracle..."
2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""radish"", ""Sesame oil"", ""White sesame seeds"",..."
2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""sugar"", ""kosher salt"", ""bay l..."


In [7]:
recipes_df = recipes_df.astype({'title': 'string', 'link': 'string', 'source': 'string'})

In [8]:
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2231142 entries, 0 to 2231141
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   title        string
 1   ingredients  object
 2   directions   object
 3   link         string
 4   source       string
 5   NER          object
dtypes: object(3), string(3)
memory usage: 102.1+ MB


## Initial Exploration

A brief exploration to understand the dataset.

### Sources

In [9]:
recipes_df['link'].head()

0     www.cookbooks.com/Recipe-Details.aspx?id=44874
1    www.cookbooks.com/Recipe-Details.aspx?id=699419
2     www.cookbooks.com/Recipe-Details.aspx?id=10570
3    www.cookbooks.com/Recipe-Details.aspx?id=897570
4    www.cookbooks.com/Recipe-Details.aspx?id=659239
Name: link, dtype: string

In [10]:
def get_base_url(url):
    base_url = re.search(r'^.+?[^\/:](?=[?\/]|$)', url).group(0)
    base_url = base_url.replace('www.', '')
    return base_url

get_base_url(recipes_df['link'].iloc[0])

'cookbooks.com'

In [11]:
recipes_df['link'].apply(get_base_url).value_counts()

link
cookbooks.com          896341
food.com               499616
epicurious.com         129444
tastykitchen.com        78768
myrecipes.com           64895
allrecipes.com          64602
cookpad.com             61020
cookeatshare.com        59307
yummly.com              51963
tasteofhome.com         51594
foodnetwork.com         49443
food52.com              48501
kraftrecipes.com        42010
recipeland.com          24418
recipes-plus.com        20524
cooking.nytimes.com     16367
foodandwine.com         15436
seriouseats.com         12632
foodgeeks.com            8963
cookstr.com              8797
online-cookbook.com      5691
chowhound.com            5671
vegetariantimes.com      4578
delish.com               3880
landolakes.com           2492
foodrepublic.com         2259
lovefood.com             1930
Name: count, dtype: int64

## Sampling DF

A large dataset, we want a small sample to be able to work with quickly during exploration.

In [12]:
recipes_df_full = recipes_df.copy()
recipes_df = recipes_df_full.sample(300, ignore_index=False, random_state=777)
recipes_df.shape[0], recipes_df_full.shape[0]/recipes_df.shape[0]

(300, 7437.14)

In [13]:
# adding tracked recipes
recipes_df = pd.concat([recipes_df, recipes_df_full.loc[[2006319, 931097]]], axis=0)

In [14]:
recipes_df.to_feather('../data/local/recipe/partial/recipe/0.feather')

# Ingredients DF

Since we will be working with each individial ingredient, we want to explode each ingredient as its own row, which we could tie to its recipe through a MultiIndex.

In [15]:
# read_csv reads lists as strings -> convering them to lists beforehand
recipes_df['ingredients'] = recipes_df['ingredients'].apply(lambda x: literal_eval(x))

In [16]:
ingredients_df = recipes_df.explode('ingredients')
ingredients_df

Unnamed: 0,title,ingredients,directions,link,source,NER
1746116,Turtle Thumbprints,"2/3 cup Land O Lakes Butter, softened","[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,Turtle Thumbprints,1/2 cup sugar,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,Turtle Thumbprints,2 Land O Lakes Eggs (yolks only),"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,Turtle Thumbprints,1 teaspoon vanilla,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,Turtle Thumbprints,1 1/2 cups all-purpose flour,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
...,...,...,...,...,...,...
931097,Pineapple Spiced Chicken And Rice,1/4 cup chopped red bell pepper,"[""Drain pineapple; reserve 1 tablespoon juice....",www.allrecipes.com/recipe/232611/pineapple-spi...,Gathered,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,Pineapple Spiced Chicken And Rice,1 cup Jasmine rice,"[""Drain pineapple; reserve 1 tablespoon juice....",www.allrecipes.com/recipe/232611/pineapple-spi...,Gathered,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,Pineapple Spiced Chicken And Rice,1 1/2 cups reduced-sodium chicken broth,"[""Drain pineapple; reserve 1 tablespoon juice....",www.allrecipes.com/recipe/232611/pineapple-spi...,Gathered,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,Pineapple Spiced Chicken And Rice,1 tablespoon chopped fresh basil,"[""Drain pineapple; reserve 1 tablespoon juice....",www.allrecipes.com/recipe/232611/pineapple-spi...,Gathered,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."


In [17]:
# setting indices for the ingredients in each recipe (multiindex)
ingredients_df['ingredient'] = ingredients_df.groupby(ingredients_df.index).cumcount()
ingredients_df = ingredients_df.set_index([ingredients_df.index, 'ingredient'])
ingredients_df.index = ingredients_df.index.set_names(['recipe', 'ingredient'])
ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,ingredients,directions,link,source,NER
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1746116,0,Turtle Thumbprints,"2/3 cup Land O Lakes Butter, softened","[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,1,Turtle Thumbprints,1/2 cup sugar,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,2,Turtle Thumbprints,2 Land O Lakes Eggs (yolks only),"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,3,Turtle Thumbprints,1 teaspoon vanilla,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,4,Turtle Thumbprints,1 1/2 cups all-purpose flour,"[""Combine butter, sugar, egg yolks and vanilla...",www.landolakes.com/recipe/1086/turtle-thumbprints,Recipes1M,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."


In [18]:
# selecting columns
ingredients_df = ingredients_df.drop(columns=['title', 'directions', 'link', 'source'])
ingredients_df.rename(columns={'ingredients':'ingredient_string'}, inplace=True)
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,NER
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1
1746116,0,"2/3 cup Land O Lakes Butter, softened","[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,1,1/2 cup sugar,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,2,2 Land O Lakes Eggs (yolks only),"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,3,1 teaspoon vanilla,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
1746116,4,1 1/2 cups all-purpose flour,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""..."
...,...,...,...
931097,10,1/4 cup chopped red bell pepper,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,11,1 cup Jasmine rice,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,12,1 1/2 cups reduced-sodium chicken broth,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."
931097,13,1 tablespoon chopped fresh basil,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun..."


# Preprocessing (1)

Now that we have the ingredients dataframe, we want to preprocess it so it for use in the following chapters.

## Parsing Information

### Name, Quantity, Unit Parsing

We need to extract two things from the ingredient strings: 

1. Name
2. Quantity

These are written in the form of a string, so to parse this information we need to use NLP, specficially Named Entity Recognition (NER). 


Here we could train our own model to do precisely what we want, however this would require a bit of work. We'd need to manually create example parses from a dataset, and figure out how training a model in SpaCy works (~5hr work).

Let's first try the existing libraries to see what we can gain from it.

### Research


- https://github.com/Glorf/recipenlg (datasets thesis explains how they extracted their ingredients in the 'NER' column)
- https://deepgram.com/learn/turning-recipes-into-data-with-named-entity-recognition (good example on training SpaCy)
- https://ingredient-parser.readthedocs.io/en/latest/index.html (python SpaCy library)
- https://archive.nytimes.com/open.blogs.nytimes.com/2015/04/09/extracting-structured-data-from-recipes-using-conditional-random-fields/ (2015 article how NyTimes' recipe search functionality was created)
    - https://github.com/mtlynch/ingredient-phrase-tagger (open source code extracted from it)

These methods all use an NLP method called Named Entity Recognition (NER).

We could train our own NER model, but first lets take a look at the existing models to see if they're fit for our purposes.

In [19]:
recipe = ingredients_df.loc[ingredients_df.first_valid_index()[0]]['ingredient_string']
ingredient = recipe.loc[0]
ingredient, recipe

('2/3 cup Land O Lakes Butter, softened',
 ingredient
 0              2/3 cup Land O Lakes Butter, softened
 1                                      1/2 cup sugar
 2                   2 Land O Lakes Eggs (yolks only)
 3                                 1 teaspoon vanilla
 4                       1 1/2 cups all-purpose flour
 5                             20 caramels, unwrapped
 6    2 tablespoons Land O Lakes Heavy Whipping Cream
 7                                    48 pecan halves
 8            1/2 cup real semi-sweet chocolate chips
 9                             2 teaspoons shortening
 Name: ingredient_string, dtype: object)

### Existing Libraries

#### RegEx

RegEx would be the simplest method of doing this, however I'm expecting it not to be able to deal with more complex strings of text.

In [20]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/steph/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [21]:
parsed_ingredient_rgx = parse_ingredient_rgx(ingredient)
parsed_ingredient_rgx

Ingredient(name='Land O Lakes Butter', quantity=2.5, unit='cup', comment='softened', original_string='2/3 cup Land O Lakes Butter, softened')

In [22]:
[parse_ingredient_rgx(ingredient) for ingredient in recipe]

[Ingredient(name='Land O Lakes Butter', quantity=2.5, unit='cup', comment='softened', original_string='2/3 cup Land O Lakes Butter, softened'),
 Ingredient(name='sugar', quantity=1.5, unit='cup', comment='', original_string='1/2 cup sugar'),
 Ingredient(name='Land O Lakes Eggs', quantity=2.0, unit='', comment='(yolks only)', original_string='2 Land O Lakes Eggs (yolks only)'),
 Ingredient(name='vanilla', quantity=1.0, unit='tsp', comment='', original_string='1 teaspoon vanilla'),
 Ingredient(name='all-purpose flour', quantity=1.5, unit='cup', comment='', original_string='1 1/2 cups all-purpose flour'),
 Ingredient(name='caramels', quantity=20.0, unit='', comment='unwrapped', original_string='20 caramels'),
 Ingredient(name='Land O Lakes Heavy Whipping Cream', quantity=2.0, unit='tbsp', comment='', original_string='2 tablespoons Land O Lakes Heavy Whipping Cream'),
 Ingredient(name='pecan halves', quantity=48.0, unit='', comment='', original_string='48 pecan halves'),
 Ingredient(name='

It's pretty good, but even in these six ingredients we can spot errors. It can't convert fraction numbers (2/3 : 2.5).

#### NLP

This is a [pretrained SpaCy NER model](https://ingredient-parser.readthedocs.io/en/latest/index.html) for the exact problemset. Let's see how this performs:

In [23]:
parsed_ingredient_nlp = parse_ingredient_nlp(ingredient)
parsed_ingredient_nlp

ParsedIngredient(name=IngredientText(text='Land O Lakes Butter', confidence=0.989295), amount=[IngredientAmount(quantity='0.667', unit='cups', confidence=0.99965, APPROXIMATE=False, SINGULAR=False)], preparation=IngredientText(text='softened', confidence=0.989225), comment=None, other=None, sentence='2/3 cup Land O Lakes Butter, softened')

In [24]:
[parse_ingredient_nlp(ingredient) for ingredient in recipe]

[ParsedIngredient(name=IngredientText(text='Land O Lakes Butter', confidence=0.989295), amount=[IngredientAmount(quantity='0.667', unit='cups', confidence=0.99965, APPROXIMATE=False, SINGULAR=False)], preparation=IngredientText(text='softened', confidence=0.989225), comment=None, other=None, sentence='2/3 cup Land O Lakes Butter, softened'),
 ParsedIngredient(name=IngredientText(text='sugar', confidence=0.991945), amount=[IngredientAmount(quantity='0.5', unit='cups', confidence=0.998894, APPROXIMATE=False, SINGULAR=False)], preparation=None, comment=None, other=None, sentence='1/2 cup sugar'),
 ParsedIngredient(name=IngredientText(text='Land O Lakes Eggs', confidence=0.962857), amount=[IngredientAmount(quantity='2', unit='', confidence=0.994399, APPROXIMATE=False, SINGULAR=False)], preparation=None, comment=IngredientText(text='(yolks only)', confidence=0.984288), other=None, sentence='2 Land O Lakes Eggs (yolks only)'),
 ParsedIngredient(name=IngredientText(text='vanilla', confidence=

### Chosen Parse Method

I think using NLP sounds much better, but it's has a few flaws right off the bat. Most noteworthy is it doesn't recognise shortened unit for cups (ie. c. != cup). 

We could simply fix this in the preprocessing.

### Preprocessing

A little bit of preprocessing is required here.

#### Cup (c.) Units

In [25]:
#| export 
def preprocess_cup_units(ingredient_string):
    ingredient_string = re.sub(r'\bc\.', 'cup', ingredient_string)
    return ingredient_string

In [26]:
assert preprocess_cup_units('5 c. water') == '5 cup water'

#### OR string

This was confusing the ingredients in later steps.

Here we want to remove the second option of ingredients.

In [27]:
ingredients_df['ingredient_string'][ingredients_df['ingredient_string'].str.contains(' or ')].loc[1936429, 5]

'3 to 4 tablespoons palm sugar, thinly sliced, or dark brown sugar'

##### Tagging Ingedient String

For this we it will be helpful to be able to get a pos_tag (noun, verb, etc) of the strings. This involved tokenizing, and detokenize the text. 

Lets test how much this will mutate the data.

In [28]:
tokenized = ingredients_df['ingredient_string'].apply(lambda x: detokenize(mt.tokenize(x)))

In [29]:
equality = tokenized.eq(ingredients_df['ingredient_string'])
equality.sum(), ingredients_df.shape[0]

(2481, 2511)

In [30]:
pd.set_option('display.max_colwidth', None)

In [31]:
unequal = ingredients_df['ingredient_string'][~equality].to_frame('ingredient_string').join(tokenized, lsuffix='_a', rsuffix='_b')
unequal

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string_a,ingredient_string_b
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1
954817,2,2/3 cup low-carb sugar substitute (such as Swerve(R)),2/3 cup low-carb sugar substitute (such as Swerve (R))
1984757,1,1 1/2 teaspoons fresh sage leaves*,1 1/2 teaspoons fresh sage leaves *
1984757,6,1 1/2 teaspoons chopped fresh sage leaves*,1 1/2 teaspoons chopped fresh sage leaves *
1396331,2,"1 Large rutabaga, or several smaller ones, peeled and cut into 3/4"" cubes","1 Large rutabaga, or several smaller ones, peeled and cut into 3/4 ""cubes"
1369744,10,"1 slice rustic white bread, torn into 1"" pieces","1 slice rustic white bread, torn into 1 ""pieces"
1369744,18,"1-2 slices rustic white bread, torn into 1"" pieces","1-2 slices rustic white bread, torn into 1 ""pieces"
1613023,8,1 teaspoon golpar (angelica powder)- optional,1 teaspoon golpar (angelica powder) - optional
324733,0,1 1/2 c. warm water (110~ to 115~),1 1/2 c. warm water (110 ~ to 115 ~)
928740,4,2 tablespoons coffee-flavored liqueur (such as Kahlua(R)),2 tablespoons coffee-flavored liqueur (such as Kahlua (R))
1930042,13,"2 apples, peeled,cored and chopped","2 apples, peeled, cored and chopped"


-> This seems good, most remain unchanged, whilst the others are changed only with slight differences of the position of some punctiation.

In [32]:
pd.reset_option('display.max_colwidth')

In [33]:
#| export
tag_map = {'NN': 'noun', 'VB': 'verb', 'JJ': 'adj', 'RB': 'adv'}
def tag_ingredient_string(ingredient_string, is_item=True):
    tagged_words = nltk.pos_tag(nltk.word_tokenize(ingredient_string))
    mapped_words = [{'word': w[0], 'tag': tag_map[w[1][:2]] if w[1][:2] in tag_map else 'nan'} for w in tagged_words]
    if is_item and mapped_words:
        mapped_words[-1]['tag'] = 'noun' # making sure last word is noun, as these are items that we are labelling
    return mapped_words

In [34]:
tag_ingredient_string('green chilli pepper')

[{'word': 'green', 'tag': 'adj'},
 {'word': 'chilli', 'tag': 'noun'},
 {'word': 'pepper', 'tag': 'noun'}]

In [35]:
assert tag_ingredient_string('green chilli pepper') == [{'word': 'green', 'tag': 'adj'},
                                                        {'word': 'chilli', 'tag': 'noun'},
                                                        {'word': 'pepper', 'tag': 'noun'}]

In [36]:
#| export
def preprocess_remove_ors(ingredient_string):
    
    or_search = re.search(r'\bor\b', ingredient_string)
    if not or_search: return ingredient_string

    prev_string = ingredient_string[:or_search.span()[0]]
    following_string = ingredient_string[or_search.span()[-1]:]

    prev_tokens = tag_ingredient_string(prev_string, is_item=False)
    following_tokens = tag_ingredient_string(following_string, is_item=False)

    prev_nouns = any([token for token in prev_tokens if token['tag'] == 'noun'])

    if re.search(r'\(\s?or.*\)', ingredient_string):
        ingredient_string = re.sub(r'\(\s?or.*\)', '', ingredient_string)
    elif re.search(r'(\bor\b).*\d', ingredient_string) and prev_nouns:
        ingredient_string = re.sub(r'(\bor\b).*', '', ingredient_string)
    else:
        if not following_tokens: 
            ingredient_string = prev_string
        else:
            i = 0
            while i < len(following_tokens) and not following_tokens[i]['tag'] == 'noun': i += 1
            if i < len(following_tokens) and following_tokens[i]['tag'] == 'noun':
                if i < len(following_tokens)-1: pass
                if i == 0:
                    following_tokens = []
                else:
                    following_tokens = following_tokens[i:]
            else:
                following_tokens = []
            ingredient_words = prev_string.split(' ') + [word['word'] for word in following_tokens]
            ingredient_string = detokenize(sorted(set(ingredient_words), key=ingredient_words.index))
    
    # postprocess
    ingredient_string = ingredient_string.strip()
    ingredient_string = re.sub(r'\s\s+', ' ', ingredient_string) # double spaces
    ingredient_string = re.sub(r'[,\(.]$', '', ingredient_string) # trailing punctuation

    return ingredient_string

In [37]:
tests = {
    '4 egg whites or 1/2 c. Egg Beaters': '4 egg whites',
    'olive or sunflower oil': 'olive oil',
    'olive oil or sunflower oil': 'olive oil',
    '1/4 teaspoon salt or to taste': '1/4 teaspoon salt',
    '3 sticks butter or margarine': '3 sticks butter',
    '1 or 2 onions, chopped': '1 onions, chopped',
    '4 whole Medium (or 6 Small) Peaches, Peeled And Sliced': '4 whole Medium Peaches, Peeled And Sliced',
}

for ingredient_string, desired_string in tests.items():
    transformed = preprocess_remove_ors(ingredient_string)
    assert transformed == desired_string

In [38]:
pd.set_option('display.max_colwidth', None)

In [39]:
ingredients_df['ingredient_string_or_removed'] = ingredients_df['ingredient_string'].apply(preprocess_remove_ors)
ingredients_df[['ingredient_string', 'ingredient_string']][ingredients_df['ingredient_string'] != ingredients_df['ingredient_string']]

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1


In [40]:
pd.reset_option('display.max_colwidth')

#### Applying Preprocessing

In [41]:
#| export 
def ner_preprocess_ingredient_string(ingredient_string):
    ingredient_string = preprocess_cup_units(ingredient_string)
    ingredient_string = preprocess_remove_ors(ingredient_string)
    return ingredient_string

In [42]:
ingredients_df['ingredient_string_processed'] = ingredients_df['ingredient_string'].apply(ner_preprocess_ingredient_string)

In [43]:
ingredients_df[['ingredient_string', 'ingredient_string_processed']][ingredients_df['ingredient_string'] != ingredients_df['ingredient_string_processed']]

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,ingredient_string_processed
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1
1828339,0,"2 (225 g) cans grape leaves, drained (canned o...","2 (225 g) cans grape leaves, drained (canned"
1703,0,"1 pkg. applesauce, spice or carrot cake mix","1 pkg. applesauce, spice"
1703,2,1/4 c. water,1/4 cup water
1703,4,1 c. nuts (optional),1 cup nuts (optional)
894410,0,2 c. water,2 cup water
...,...,...,...
599284,2,1 c. sour cream,1 cup sour cream
599284,3,1 c. grated Cheddar cheese,1 cup grated Cheddar cheese
1357213,0,2 c. drained sweet potatoes,2 cup drained sweet potatoes
1357213,1,1 c. milk,1 cup milk


In [44]:
ingredients_df['ingredient_string'] = ingredients_df['ingredient_string_processed']
ingredients_df = ingredients_df.drop('ingredient_string_processed', axis=1)

### Applying to Full Series 

In [45]:
#| export
def parse_ingredient_string(ingredient_string):
    nlp_parse = parse_ingredient_nlp(ingredient_string)
    parsed_ingredient = {
        'name': nlp_parse.name.text if nlp_parse.name else None,
        'quantity': nlp_parse.amount[0].quantity if nlp_parse.amount else None,
        'unit': nlp_parse.amount[0].unit if nlp_parse.amount else None,
        'comment': nlp_parse.comment.text if nlp_parse.comment else None,
        'preparation': nlp_parse.preparation.text if nlp_parse.preparation else None
    }
    return parsed_ingredient

In [46]:
parse_ingredient_string(ingredient)

{'name': 'Land O Lakes Butter',
 'quantity': '0.667',
 'unit': 'cups',
 'comment': None,
 'preparation': 'softened'}

In [47]:
ingredients_df['parsed'] = ingredients_df['ingredient_string'].apply(parse_ingredient_string)
ingredients_df['parsed']

recipe   ingredient
1746116  0             {'name': 'Land O Lakes Butter', 'quantity': '0...
         1             {'name': 'sugar', 'quantity': '0.5', 'unit': '...
         2             {'name': 'Land O Lakes Eggs', 'quantity': '2',...
         3             {'name': 'vanilla', 'quantity': '1', 'unit': '...
         4             {'name': 'all-purpose flour', 'quantity': '1.5...
                                             ...                        
931097   10            {'name': 'red bell pepper', 'quantity': '0.25'...
         11            {'name': 'Jasmine rice', 'quantity': '1', 'uni...
         12            {'name': 'reduced-sodium chicken broth', 'quan...
         13            {'name': 'fresh basil', 'quantity': '1', 'unit...
         14            {'name': 'Salt and ground pepper', 'quantity':...
Name: parsed, Length: 2511, dtype: object

### Expanding Dict into Columns

In [48]:
expanded = pd.json_normalize(ingredients_df['parsed'])
expanded.index = ingredients_df.index
expanded

Unnamed: 0_level_0,Unnamed: 1_level_0,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1746116,0,Land O Lakes Butter,0.667,cups,,softened
1746116,1,sugar,0.5,cups,,
1746116,2,Land O Lakes Eggs,2,,(yolks only),
1746116,3,vanilla,1,teaspoon,,
1746116,4,all-purpose flour,1.5,cups,,
...,...,...,...,...,...,...
931097,10,red bell pepper,0.25,cups,,chopped
931097,11,Jasmine rice,1,cup,,
931097,12,reduced-sodium chicken broth,1.5,cups,,
931097,13,fresh basil,1,tablespoon,,chopped


In [49]:
ingredients_df = pd.concat([ingredients_df, expanded], axis=1)
ingredients_df.drop(['parsed'], axis=1, inplace=True, errors='ignore')
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,NER,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1746116,0,"2/3 cup Land O Lakes Butter, softened","[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...","2/3 cup Land O Lakes Butter, softened",Land O Lakes Butter,0.667,cups,,softened
1746116,1,1/2 cup sugar,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1/2 cup sugar,sugar,0.5,cups,,
1746116,2,2 Land O Lakes Eggs (yolks only),"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",2 Land O Lakes Eggs (yolks only),Land O Lakes Eggs,2,,(yolks only),
1746116,3,1 teaspoon vanilla,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1 teaspoon vanilla,vanilla,1,teaspoon,,
1746116,4,1 1/2 cups all-purpose flour,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1 1/2 cups all-purpose flour,all-purpose flour,1.5,cups,,
...,...,...,...,...,...,...,...,...,...
931097,10,1/4 cup chopped red bell pepper,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1/4 cup chopped red bell pepper,red bell pepper,0.25,cups,,chopped
931097,11,1 cup Jasmine rice,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 cup Jasmine rice,Jasmine rice,1,cup,,
931097,12,1 1/2 cups reduced-sodium chicken broth,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 1/2 cups reduced-sodium chicken broth,reduced-sodium chicken broth,1.5,cups,,
931097,13,1 tablespoon chopped fresh basil,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 tablespoon chopped fresh basil,fresh basil,1,tablespoon,,chopped


In [50]:
ingredients_df = ingredients_df.fillna(pd.NA)

In [51]:
#| export
def pipeline_parse_ingredient_string_parsing(ingredients_df):

    ingredients_df['parsed'] = parallel_apply(
        ingredients_df['full_string'], 
        parse_ingredient_string, 
        meta=pd.Series(dtype='object'), 
        npartitions=500
    )
    
    ingredients_df['parsed'] = ingredients_df['parsed'].apply(literal_eval)

    expanded = pd.json_normalize(ingredients_df['parsed'])
    expanded = expanded.convert_dtypes()
    expanded.set_index(ingredients_df.index, inplace=True)
    ingredients_df = pd.concat([ingredients_df, expanded], axis=1)

    ingredients_df.drop(['parsed'], axis=1, inplace=True, errors='ignore')

    return ingredients_df

### Evaluating Results

In [52]:
ingredients_df[ingredients_df['unit'].isnull()].drop('NER', axis=1).head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1828339,1,salt,salt,salt,,,,
824709,0,bread,bread,bread,,,,
824709,1,ground cinnamon,ground cinnamon,ground cinnamon,,,,
824709,2,sugar,sugar,sugar,,,,
824709,3,squeeze margarine,squeeze margarine,squeeze margarine,,,,
855127,1,"tomato, sliced","tomato, sliced",tomato,,,,sliced
855127,2,Parmesan cheese,Parmesan cheese,Parmesan cheese,,,,
855127,3,olive oil,olive oil,olive oil,,,,
855127,4,"Italian spices (rosemary, thyme, oregano)","Italian spices (rosemary, thyme, oregano)",Italian spices,,,"(rosemary, thyme, oregano)",
767911,8,parsley to garnish,parsley to garnish,parsley,,,to garnish,


In [53]:
ingredients_df[ingredients_df['unit'].isnull() & ingredients_df['quantity'].notnull()].drop('NER', axis=1).head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [54]:
ingredients_df['quantity'] = ingredients_df['quantity'].astype('string')
ingredients_df['quantity']

recipe   ingredient
1746116  0             0.667
         1               0.5
         2                 2
         3                 1
         4               1.5
                       ...  
931097   10             0.25
         11                1
         12              1.5
         13                1
         14             <NA>
Name: quantity, Length: 2511, dtype: string

In [55]:
ingredients_df.drop('NER', axis=1).sample(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
504033,0,6 large firm green tomatoes,6 large firm green tomatoes,firm green tomatoes,6.0,large,,
1169604,9,4 chicken breast halves,4 chicken breast halves,chicken breast halves,4.0,,,
2180004,7,14 teaspoon ground cumin,14 teaspoon ground cumin,ground cumin,14.0,teaspoons,,
324733,2,4 cup sifted all-purpose flour,4 c. sifted all-purpose flour,all-purpose flour,4.0,cups,,sifted
281215,0,"1 lb. carrots, diagonally sliced","1 lb. carrots, diagonally sliced",carrots,1.0,lb,,diagonally sliced
1984757,5,1 1/4 cups all-purpose flour,1 1/4 cups all-purpose flour,all-purpose flour,1.25,cups,,
689193,2,1 (30 oz.) jar spaghetti sauce,1 (30 oz.) jar spaghetti sauce,spaghetti sauce,1.0,jar,,
1984239,6,1 Masago (capelin roe),1 Masago (capelin roe),Masago,1.0,,(capelin roe),
1032907,5,1/2 teaspoon cumin seed,1/2 teaspoon cumin seed,cumin seed,0.5,teaspoons,,
1948006,3,1/2 cup skim milk,1/2 cup skim milk,skim milk,0.5,cups,,


#### NAs

In [56]:
ingredients_df[ingredients_df['name'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,NER,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1956519,0,1 (1 1/4 ounce) envelope Lipton Recipe Secrets...,"[""Recipe Secrets savory"", ""sour cream"", ""ricot...",1 (1 1/4 ounce) envelope Lipton Recipe Secrets...,,1.0,envelope,Lipton Recipe Secrets savory herb with garlic ...,
1546037,0,FOR THE PUMPKIN PUREE:,"[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ...",FOR THE PUMPKIN PUREE:,,,,FOR THE PUMPKIN PUREE :,
1546037,2,FOR THE PIE CRUST:,"[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ...",FOR THE PIE CRUST:,,,,FOR THE PIE CRUST :,
1546037,9,FOR THE FILLING:,"[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ...",FOR THE FILLING:,,,,FOR THE FILLING :,
1546037,16,"15 ounces, weight Pumpkin Puree","[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ...","15 ounces, weight Pumpkin Puree",,15.0,ounces,weight Pumpkin Puree,
1806301,5,1 tbsp. sliced pimiento (optional),"[""biscuit mix"", ""poultry seasoning"", ""sour cre...",1 tbsp. sliced pimiento (optional),,1.0,tbsp,(optional),sliced pimiento
1396644,0,For the Vanilla Bean and Green Grape Chicken:,"[""Vanilla"", ""olive oil"", ""butter"", ""chicken br...",For the Vanilla Bean and Green Grape Chicken:,,,,For the Vanilla Bean and Green Grape Chicken :,
1396644,11,For the Vanilla-Scented Rice Pilaf:,"[""Vanilla"", ""olive oil"", ""butter"", ""chicken br...",For the Vanilla-Scented Rice Pilaf:,,,,For the Vanilla-Scented Rice Pilaf :,
1378004,8,"2 tsp, plus 1 TBL kosher salt - divided","[""bacon"", ""bone"", ""sausage"", ""yellow onion"", ""...","2 tsp, plus 1 TBL kosher salt - divided",,2.0,tsps,plus 1 TBL kosher salt -,divided
879616,7,1 Tbsp. gluten (optional),"[""water"", ""bread flour"", ""sugar"", ""salt"", ""bak...",1 Tbsp. gluten (optional),,1.0,Tbsp,(optional),gluten


The parsing does a good job of exclusing strings which aren't ingredients. It does miss out a few obscure ingredients (glutne, pimiento, swordfish steaks). 

We can simply remove these ingredients.

In [57]:
ingredients_df = ingredients_df[~ingredients_df['name'].isna()]

In [58]:
ingredients_df[ingredients_df['quantity'] == 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,NER,ingredient_string_or_removed,name,quantity,unit,comment,preparation
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


### Postprocessing

#### Non-Numeric Quantities

In [59]:
#| export
def is_number(s):
    for split in s.split('.'):
        if not split.isdigit():
            return False
    return True

def clean_quantity(quantity):
    
    if pd.isnull(quantity) or quantity == '': return quantity

    quantity = quantity.lower()
    quantity = quantity.replace('x', '')
    quantity = re.sub(r'\.$', '', quantity)

    if "-" in quantity:
        splits = quantity.split("-")
        if all([is_number(n) for n in splits]):
            quantity = sum([float(n) for n in splits])/len(splits)
    if quantity in ['several', 'few']: 
        quantity = 3.0
    if quantity == 'half': 
        quantity = 0.5

    try:
        quantity = float(quantity)
    except ValueError:
        quantity = pd.NA

    return quantity

In [60]:
clean_quantity('0.5')

0.5

In [61]:
_ = ingredients_df[(ingredients_df['quantity'].notnull())]
_[~(_['quantity'].apply(is_number))]['quantity']

recipe   ingredient
2195357  0                   4x
         4                   1x
1570340  5               1-0.25
606258   9                     
1857320  7                 8-12
2034999  4                2-0.5
1442064  3                  3-6
1916172  1                1-0.5
1396331  0                  3-4
1806301  10                 Few
1369744  9                0.5-1
         18                 1-2
1600752  14                    
1342486  20                 4-5
1613023  1                  3-4
         3                  1-2
123994   4                     
686704   0                 8-10
1330524  0                  3-4
118024   0                1-1.5
286364   4                     
1320064  7                0.5-1
305911   0                  6-8
808085   4                  5-6
         5                  3-5
402239   7                     
1307725  3                     
1408069  3                  4-5
480244   0                  4-5
         1                 9-12
         3          

In [62]:
_ = ingredients_df[(ingredients_df['quantity'].notnull())]
_[~(_['quantity'].apply(is_number))]['quantity'].apply(clean_quantity)

recipe   ingredient
2195357  0                4.0
         4                1.0
1570340  5              0.625
606258   9                   
1857320  7               10.0
2034999  4               1.25
1442064  3                4.5
1916172  1               0.75
1396331  0                3.5
1806301  10               3.0
1369744  9               0.75
         18               1.5
1600752  14                  
1342486  20               4.5
1613023  1                3.5
         3                1.5
123994   4                   
686704   0                9.0
1330524  0                3.5
118024   0               1.25
286364   4                   
1320064  7               0.75
305911   0                7.0
808085   4                5.5
         5                4.0
402239   7                   
1307725  3                   
1408069  3                4.5
480244   0                4.5
         1               10.5
         3                3.0
1505154  0               0.75
         5          

In [63]:
ingredients_df[ingredients_df['quantity'] == 0].empty

True

In [64]:
cleaned_quantity = ingredients_df['quantity'].apply(clean_quantity)

In [65]:
assert ingredients_df[cleaned_quantity == 0].empty 

In [66]:
ingredients_df['quantity'] = cleaned_quantity

#### RecipesDF NER Field

The author's of the dataset actually already performed a form of parsing in the ingredients using an NLP method Named Entity Recognition (NER). They trained their own model using their own manual labels ([info](https://www.researchgate.net/publication/345308878_Cooking_recipes_generator_utilizing_a_deep_learning-based_language_model)). 

Let's compare this to our own NLP's results.

In [67]:
#| export
def find_ner_match(ingredient):

    ner_match = pd.NA

    ner_ingredients = ingredient['NER'] if isinstance(ingredient['NER'], np.ndarray) else literal_eval(ingredient['NER'])
    ingredient_index = ingredient.name[1] if isinstance(ingredient.name, tuple) else ingredient['ingredient'] # parallel non-multi index

    if ingredient_index in ner_ingredients:
        search_ingredient = str(ner_ingredients[ingredient_index])
        if search_ingredient in ingredient['name']:
            ner_match = search_ingredient
        elif ingredient['name'] in search_ingredient:
            ner_match = ingredient['name']
    if not pd.notnull(ner_match): # the dataset's NER cuts out some of the ingredients, making the indices not match. In this case a search through the whole NER array is required.
        for search_ingredient in ner_ingredients:
            search_ingredient = str(search_ingredient)
            if search_ingredient in ingredient['name']:
                ner_match = search_ingredient
            elif ingredient['name'] in search_ingredient:
                ner_match = ingredient['name']
            if pd.notnull(ner_match): break
        
    return ner_match

In [68]:
ingredients_df.iloc[0]

ingredient_string                           2/3 cup Land O Lakes Butter, softened
NER                             ["Butter", "sugar", "Eggs", "vanilla", "flour"...
ingredient_string_or_removed                2/3 cup Land O Lakes Butter, softened
name                                                          Land O Lakes Butter
quantity                                                                    0.667
unit                                                                         cups
comment                                                                      <NA>
preparation                                                              softened
Name: (1746116, 0), dtype: object

In [69]:
find_ner_match(ingredients_df.iloc[0])

'Butter'

In [70]:
ingredients_df['name.ner'] = ingredients_df[['NER', 'name']].apply(find_ner_match, axis=1)

In [71]:
ingredients_df[ingredients_df['name.ner'].isnull()].shape[0] / ingredients_df.shape[0]

0.03648757016840417

5% of the ingredients don't match the dataset's NER extraction. Taking a look to see why:

In [72]:
pd.set_option('display.max_colwidth', None)
ingredients_df[ingredients_df['name.ner'].isnull()].join(recipes_df[['directions', 'ingredients']], on='recipe')[['name', 'directions', 'ingredients', 'NER']].head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,name,directions,ingredients,NER
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1828339,6,dried apricots,"[""Unroll the vine leaves carefully and put them into a bowl of water to remove the preserving liquid."", ""Pat them dry on absorbent kitchen paper towel."", ""You will need about 30 leaves; reserve the remainder."", ""Bring a large saucepan of salted water to boil and stir in the turmeric."", ""Add the rice and simmer for 30 minutes or until the rice is just tender."", ""Drain thoroughly and turn into a bowl."", ""Heat the oil in a small frying pan."", ""Add the onion to the rice with the apricots, sultanas, spices, mint and lemon juice."", ""Mix well together."", ""Take one vine leaf at a time."", ""Place it flat on the work surface and put a heaped teaspoon of the rice mixture in the centre."", ""Fold the base of the leaf over the filling, then fold over the first one side, then the other."", ""Fold over the top to make a neat parcel."", ""Continue making parcels until you have used up all the filling."", ""Line a large frying pan with leftover vine leaves."", ""Arrange the parcels, seam sides down, in a single layer in the pan."", ""Cover the layer with more leaves, then make a second layer of parcels and cover them with leaves."", ""Pour on the orange juice and water and cover the pan."", ""Cook gently over low heat for 1 hour, adding a little boiling water from time to time, if necessary."", ""To serve warm, allow the parcels to cool slightly in the pan, then arrange uncooked vine leaves on a flat serving dish and carefully transfer the parcels to the dish."", ""Serve immediately."", ""To serve cold, allow the parcels to cool completely in the pan, then arrange on top of uncooked vine leaves on a serving dish as above."", ""Cover and chill.""]","[2 (225 g) cans grape leaves, drained (canned or bottled), salt, 12 teaspoon ground turmeric, 175 g long grain brown rice, 1 tablespoon vegetable oil, 1 small onion, peeled and finely chopped, 100 g dried apricots, finely chopped, 50 g sultanas, 1 pinch ground cinnamon, 1 pinch ground allspice, 1 tablespoon chopped of fresh mint, 1 teaspoon lemon juice, 300 ml unsweetened orange juice, 150 ml water]","[""grape leaves"", ""salt"", ""ground turmeric"", ""long grain brown rice"", ""vegetable oil"", ""onion"", ""sultanas"", ""ground cinnamon"", ""ground allspice"", ""mint"", ""lemon juice"", ""orange juice"", ""water""]"
1703,0,pkg. applesauce,"[""Combine all ingredients."", ""Mix well."", ""Bake at 350\u00b0 for 60 to 65 minutes in a tube pan that has been greased and floured.""]","[1 pkg. applesauce, spice or carrot cake mix, 4 eggs, 1/4 c. water, 1 jar mincemeat, 1 c. nuts (optional)]","[""carrot cake"", ""eggs"", ""water"", ""mincemeat"", ""nuts""]"
317211,5,container Cool Whip,"[""With fork, mix together flour, 1/2 cup pecans and butter. Press into 9 x 13-inch pan."", ""Bake for 15 minutes at 350\u00b0."", ""Cool completely.""]","[1 1/4 c. flour, 1/2 c. pecans, finely chopped, 1/2 c. softened butter, 1 (8 oz.) pkg. cream cheese, softened, 1/2 c. powdered sugar, 1 (8 oz.) container Cool Whip, 2 pkg. pistachio instant pudding mix, 3 c. milk, 3/4 c. chopped pecans]","[""flour"", ""pecans"", ""butter"", ""cream cheese"", ""powdered sugar"", ""pistachio instant pudding"", ""milk"", ""pecans""]"
2195357,0,blue-eyed cod,"[""Put 4 to 6 blue-eyed cod or possibly snapper cutlets into a shallow dish and add in extra virgin olive oil, lemon juice, dry oregano or possibly rigini and a smashed clove garlic."", ""Add in a few thin yellow strips of lemon peel then cover and chill for at least one hour, turning once or possibly twice when you think of it."", ""Make sure the fire on the barbecue is well burned down then cook the steaks for about 3 min each side."", ""Baste often during cooking with the marinade."", ""Finish with a good squeeze of fresh lemon juice and a tiny sprinkle of oregano or possibly some minced fresh parsley."", ""Serve immediately."", ""If it rains, cook inside in a frying pan or possibly on a ribbed griddle."", ""Serves 4to 6."", ""Note: Rigini is a green herb similar to oregano but the flavour is sweeter."", ""Available at gourmet delicatessens.""]","[4 x Cutlets blue-eyed cod or possibly snapper cutlets, up to 6, 3 Tbsp. Extra virgin olive oil, 1 1/2 Tbsp. Lemon juice, strips of peel, 1/2 tsp Dry oregano or possibly rigini, 1 x Clove garlic]","[""snapper cutlets"", ""Extra virgin olive oil"", ""Lemon juice"", ""oregano"", ""Clove garlic""]"
895702,5,container Cool Whip,"[""Mix margarine, flour and pecans."", ""Pat into a 9 x 13-inch pan. Bake at 325\u00b0 for 20 minutes."", ""Let cool.""]","[1/2 c. margarine, melted, 1 c. flour, 1/2 c. pecans, chopped, 8 oz. cream cheese, 1 1/2 c. powdered sugar, 1 (12 oz.) container Cool Whip, 2 (4 1/4 oz.) pkg. instant chocolate pudding, 3 c. milk]","[""margarine"", ""flour"", ""pecans"", ""cream cheese"", ""powdered sugar"", ""instant chocolate pudding"", ""milk""]"
1653851,10,mixed baby greens,"[""For Shrimp Use the First four ingredients."", ""Bring 2 quart water to a boil in a Dutch oven."", ""Add shrimp, and cook 30 seconds."", ""Drain, rinse under cool running water, and place in a small bowl."", ""Fill a medium bowl with ice."", ""Place bowl containing shrimp in ice."", ""Cover and chill 1 to 24 hours."", ""Squeeze juice from lemons through a small strainer over shrimp in a large bowl, drizzle with hot sauce, and toss to coat."", ""Sprinkle with Creole seasoning, and toss to coat."", ""For the Avocado Sauce use the next six ingredients"", ""Process buttermilk, avocado, parsley, chopped green onions, and fresh lemon juice in a food processor 30 seconds or until smooth."", ""Season with kosher salt and freshly ground pepper to taste."", ""Store in refrigerator covered with plastic wrap directly on surface (to prevent discoloration) up to 2 days."", ""Arrange shrimp on greens and serve with dipping sauce.""]","[2 lbs large shrimp, cooked, peeled and deveined with tails on, 2 lemons, 4 teaspoons hot sauce, 2 teaspoons creole seasoning, 1 cup buttermilk, 1 ripe avocado, chopped, 2 tablespoons chopped fresh parsley, 2 finely chopped green onions, 2 tablespoons fresh lemon juice, kosher salt & freshly ground black pepper, 1 (5 ounce) package mixed baby greens, thoroughly washed]","[""shrimp"", ""lemons"", ""hot sauce"", ""creole seasoning"", ""buttermilk"", ""avocado"", ""parsley"", ""green onions"", ""lemon juice"", ""kosher salt"", ""thoroughly washed""]"
1331923,1,frozen copped sinach,"[""1. Heat oven to 425 degrees"", """", ""2. Lay steak on work surface. Holding sharp knife parallel to work surface and starting at a long side, slice flank steak in half to opposite long side, without cutting all the way through; open up the steak like a book. Flatten slightly to an even thickness."", """", ""3. Squeeze liquid from spinach; discard liquid. In medium-sized bowl, combine spinach, cheese, peppers, breadcrumbs, egg yolk, 1/4 teaspoon each of the garlic salt and the pepper."", """", ""4. Season steak with an additional 1/4 teaspoon each of the garlic salt and pepper. Press filling onto steak, leaving a 1 inch border on all sides. Roll up steak to enclose filling, beginning on a short side; the grain of the meat will be running from left to right. Tuck any loose filling back into the ends."", """", ""5. Tie steak with cotton twine at 2 inch intervals to secure. Rub outside with oil, then sprinkle with remaining 1/4 teaspoon each of garlic salt and pepper."", """", ""Roast at 425 degrees for 35 minutes, then increase heat to broil for 10 minutes, turning once. Let meat rest 15 minutes. Remove twine, slice and serve."", """", ""Per Serving: 305 calories, 15 g fat (6 g sat.); 36 g protein; 7 g carbohydrate; 2 g fiber; 588 mg sodiuim; 92 mg cholesterol.""]","[2 lbs Flank steak, 1 package frozen copped sinach, thawed, 1/2 cup crumbled blue cheese, 1 jar 7 oz. roasted red peppers, 2 tablespoons seasoned dry breadcrumbs, 1 egg yolk, 3/4 teaspoons of garlic salt, 3/4 teaspoons ground black pepper, 1 tablespoon olive oil]","[""Flank steak"", ""blue cheese"", ""red peppers"", ""breadcrumbs"", ""egg yolk"", ""garlic salt"", ""ground black pepper"", ""olive oil""]"
661227,4,Beau Monde,"[""Mix and serve with vegetables for a dip or serve as a dressing on mixed garden salad.""]","[2/3 c. mayonnaise, 2/3 c. sour cream, 1 Tbsp. chopped green onion, 1 Tbsp. chopped parsley, 1 Tbsp. Beau Monde (Spice Islands), 1 Tbsp. dill weed]","[""mayonnaise"", ""sour cream"", ""green onion"", ""parsley"", ""dill weed""]"
1546037,14,Ground Cloves,"[""For the pumpkin puree:"", ""Preheat oven to 375\u00b0F. Wash the pumpkin to remove any dirt. Cut the pumpkin in half and remove stem. Remove all of the seeds and stringiness from the center of the pumpkin."", ""Place the pumpkin halves cut-side down onto a baking sheet lined with parchment paper. Roast for 45 minutes to 1 hour, or until skin darkens and is shriveled. The pumpkin should be very soft."", ""With a spoon, scoop the pumpkin out of the skin and transfer to a food processor. Puree until completely smooth."", ""For the pie crust:"", ""Combine flour and salt in a large bowl."", ""n a food processor, add flour and cold butter. Pulse a few times until lightly combined. With the motor running, add egg, cold water, and vinegar into dough. Pulse until just combined (dough might be slightly crumbly)."", ""Remove dough from bowl and gather it into a ball with your hands. Divide the dough in half and slightly flatten it into a disc. Wrap the dough discs with plastic wrap and refrigerate for 30 minutes. (Dough recipe is enough for 2 pies or 1 pie with leaf decorations.)"", ""To assemble the pie:"", ""Preheat oven to 425\u00b0F."", ""In a small bowl, combine sugar, cinnamon, salt, ginger, and cloves."", ""In a large bowl, whisk eggs. Add pumpkin puree and sugar mixture. Whisk in half-and-half, milk and vanilla until well combined."", ""On a floured surface, use a rolling pin to roll out chilled dough into a round shape. The dough should be at least 1 inch larger than your pie dish. Carefully place the dough into the pie dish. Tuck excess dough under itself. Crimp the edges with your fingers (optional)."", ""Pour filling into unbaked pie shell. Bake 15 minutes at 425\u00b0F. Then reduce oven temperature to 350\u00b0F and continue baking for 45-60 minutes or until edges seem dry and center is set but slightly jiggly. If you are going to decorate pie with leaves, roll out remaining dough and cut out leaf shapes. When the pie has been baking for 40 minutes, remove from oven and quickly lay leaf cut-outs over crust. Continue baking until pie is done."", ""Remove pie from oven and place on a cooling rack to cool completely. Serve immediately or refrigerate.""]","[FOR THE PUMPKIN PUREE:, 1 whole (about 3 Lb. Size) Pumpkin, FOR THE PIE CRUST:, 3 cups Flour, 1 teaspoon Salt, 3 sticks Cold Butter, Cut Into Cubes, 1 Egg, Lightly Beaten, 5 Tablespoons Cold Water, 1 Tablespoon White Wine Vinegar, FOR THE FILLING:, 3/4 cups Sugar, 1 teaspoon Cinnamon, 1/2 teaspoons Salt, 1/2 teaspoons Ground Ginger, 1/4 teaspoons Ground Cloves, 2 Eggs, 15 ounces, weight Pumpkin Puree, 1 cup Half-and-half, 1 cup Milk, 1 teaspoon Vanilla]","[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ""Water"", ""White Wine Vinegar"", ""FILLING"", ""Sugar"", ""Cinnamon"", ""Salt"", ""Ground Ginger"", ""\u00bc"", ""Eggs"", ""Milk"", ""Vanilla""]"
1546037,17,Half-and-half,"[""For the pumpkin puree:"", ""Preheat oven to 375\u00b0F. Wash the pumpkin to remove any dirt. Cut the pumpkin in half and remove stem. Remove all of the seeds and stringiness from the center of the pumpkin."", ""Place the pumpkin halves cut-side down onto a baking sheet lined with parchment paper. Roast for 45 minutes to 1 hour, or until skin darkens and is shriveled. The pumpkin should be very soft."", ""With a spoon, scoop the pumpkin out of the skin and transfer to a food processor. Puree until completely smooth."", ""For the pie crust:"", ""Combine flour and salt in a large bowl."", ""n a food processor, add flour and cold butter. Pulse a few times until lightly combined. With the motor running, add egg, cold water, and vinegar into dough. Pulse until just combined (dough might be slightly crumbly)."", ""Remove dough from bowl and gather it into a ball with your hands. Divide the dough in half and slightly flatten it into a disc. Wrap the dough discs with plastic wrap and refrigerate for 30 minutes. (Dough recipe is enough for 2 pies or 1 pie with leaf decorations.)"", ""To assemble the pie:"", ""Preheat oven to 425\u00b0F."", ""In a small bowl, combine sugar, cinnamon, salt, ginger, and cloves."", ""In a large bowl, whisk eggs. Add pumpkin puree and sugar mixture. Whisk in half-and-half, milk and vanilla until well combined."", ""On a floured surface, use a rolling pin to roll out chilled dough into a round shape. The dough should be at least 1 inch larger than your pie dish. Carefully place the dough into the pie dish. Tuck excess dough under itself. Crimp the edges with your fingers (optional)."", ""Pour filling into unbaked pie shell. Bake 15 minutes at 425\u00b0F. Then reduce oven temperature to 350\u00b0F and continue baking for 45-60 minutes or until edges seem dry and center is set but slightly jiggly. If you are going to decorate pie with leaves, roll out remaining dough and cut out leaf shapes. When the pie has been baking for 40 minutes, remove from oven and quickly lay leaf cut-outs over crust. Continue baking until pie is done."", ""Remove pie from oven and place on a cooling rack to cool completely. Serve immediately or refrigerate.""]","[FOR THE PUMPKIN PUREE:, 1 whole (about 3 Lb. Size) Pumpkin, FOR THE PIE CRUST:, 3 cups Flour, 1 teaspoon Salt, 3 sticks Cold Butter, Cut Into Cubes, 1 Egg, Lightly Beaten, 5 Tablespoons Cold Water, 1 Tablespoon White Wine Vinegar, FOR THE FILLING:, 3/4 cups Sugar, 1 teaspoon Cinnamon, 1/2 teaspoons Salt, 1/2 teaspoons Ground Ginger, 1/4 teaspoons Ground Cloves, 2 Eggs, 15 ounces, weight Pumpkin Puree, 1 cup Half-and-half, 1 cup Milk, 1 teaspoon Vanilla]","[""Pumpkin"", ""Flour"", ""Salt"", ""Butter"", ""Egg"", ""Water"", ""White Wine Vinegar"", ""FILLING"", ""Sugar"", ""Cinnamon"", ""Salt"", ""Ground Ginger"", ""\u00bc"", ""Eggs"", ""Milk"", ""Vanilla""]"


In [73]:
pd.reset_option('display.max_colwidth')

We could use this NER as a sort of third check/filter as we need to simplify the ingredients in order to match them with their density. Why don't we just cut out the recipes where NER's don't match up and see what we're left with. 

In [74]:
len(ingredients_df.index.unique(level=0))

302

In [75]:
(len(ingredients_df.index.unique(level=0)) - len(ingredients_df[ingredients_df['name.ner'].isnull()].index.unique(level=0)) )/ len(ingredients_df.index.unique(level=0))

0.7649006622516556

This cuts out 1/4 of the dataset. That's a lot, but we do have a lot of data to work with. If we can gain something from this parsing it might be worth it. 

What this might help with is filtering obscure ingredients that woudln't be matched with the food databases in the future chapters. Let's see how many unique ingredients we have with those cut out.

In [76]:
ingredients_df_filtered = ingredients_df[~ingredients_df.index.isin(ingredients_df[ingredients_df['name.ner'].isnull()].index.unique(level=0), level=0)]
len(ingredients_df_filtered.index.unique(level=0))

231

In [77]:
ingredients_df_filtered['name'].nunique() / ingredients_df['name'].nunique()

0.733567046450482

-> This proportionally removes the number of unique ingredients, so it's not doing anything significant in reducing obscurity.

What it can serve as is a way of filtering out the main ingredient from the name. It seems to be more harsh at eliminating unnecessary words than our method. 

In [78]:
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredient_string,NER,ingredient_string_or_removed,name,quantity,unit,comment,preparation,name.ner
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1746116,0,"2/3 cup Land O Lakes Butter, softened","[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...","2/3 cup Land O Lakes Butter, softened",Land O Lakes Butter,0.667,cups,,softened,Butter
1746116,1,1/2 cup sugar,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1/2 cup sugar,sugar,0.5,cups,,,sugar
1746116,2,2 Land O Lakes Eggs (yolks only),"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",2 Land O Lakes Eggs (yolks only),Land O Lakes Eggs,2.0,,(yolks only),,Eggs
1746116,3,1 teaspoon vanilla,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1 teaspoon vanilla,vanilla,1.0,teaspoon,,,vanilla
1746116,4,1 1/2 cups all-purpose flour,"[""Butter"", ""sugar"", ""Eggs"", ""vanilla"", ""flour""...",1 1/2 cups all-purpose flour,all-purpose flour,1.5,cups,,,flour
...,...,...,...,...,...,...,...,...,...,...
931097,10,1/4 cup chopped red bell pepper,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1/4 cup chopped red bell pepper,red bell pepper,0.25,cups,,chopped,red bell pepper
931097,11,1 cup Jasmine rice,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 cup Jasmine rice,Jasmine rice,1.0,cup,,,Jasmine rice
931097,12,1 1/2 cups reduced-sodium chicken broth,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 1/2 cups reduced-sodium chicken broth,reduced-sodium chicken broth,1.5,cups,,,chicken
931097,13,1 tablespoon chopped fresh basil,"[""Pineapple"", ""garlic"", ""ground thyme"", ""groun...",1 tablespoon chopped fresh basil,fresh basil,1.0,tablespoon,,chopped,fresh basil


In [79]:
ingredients_df = ingredients_df.rename({'name.ner': 'name.name', 'name': 'name.description', 'ingredient_string': 'ingredient_string'}, axis=1)
ingredients_df = ingredients_df[['name.name', 'name.description', 'quantity', 'unit', 'comment', 'preparation', 'ingredient_string']]
ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1746116,0,Butter,Land O Lakes Butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened"
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar
1746116,2,Eggs,Land O Lakes Eggs,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only)
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour


Decision: Opting to keep the ingredients not identified by NER instead of fitlering them.

In [80]:
ingredients_df[ingredients_df['name.name'].isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1828339,6,,dried apricots,100.0,g,,finely chopped,"100 g dried apricots, finely chopped"
1703,0,,pkg. applesauce,1.0,,spice,,"1 pkg. applesauce, spice"
317211,5,,container Cool Whip,1.0,,,,1 (8 oz.) container Cool Whip
2195357,0,,blue-eyed cod,4.0,Cutlets,,,4 x Cutlets blue-eyed cod
895702,5,,container Cool Whip,1.0,,,,1 (12 oz.) container Cool Whip
...,...,...,...,...,...,...,...,...
1777734,3,,"tub, COOL WHIP Whipped Topping",1.0,,,thawed,"1 tub (12 oz.) COOL WHIP Whipped Topping, thawed"
2177242,1,,5 spice powder,1.0,teaspoon,,,1 teaspoon five-spice powder
2177242,3,,scallion,3.0,heads,white part only,finely chopped,"3 scallion heads white part only, finely chopped"
1097688,0,,BATTER,,,,,BATTER


In [81]:
ingredients_df['name.name'] = ingredients_df.apply(lambda ingredient: ingredient['name.name'] if pd.notnull(ingredient['name.name']) else ingredient['name.description'], axis=1)

## Cleaning Null Values

Making null values consistent.

In [82]:
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1746116,0,Butter,Land O Lakes Butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened"
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar
1746116,2,Eggs,Land O Lakes Eggs,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only)
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour
...,...,...,...,...,...,...,...,...
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper
931097,11,Jasmine rice,Jasmine rice,1.0,cup,,,1 cup Jasmine rice
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth
931097,13,fresh basil,fresh basil,1.0,tablespoon,,chopped,1 tablespoon chopped fresh basil


In [83]:
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1746116,0,Butter,Land O Lakes Butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened"
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar
1746116,2,Eggs,Land O Lakes Eggs,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only)
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour
...,...,...,...,...,...,...,...,...
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper
931097,11,Jasmine rice,Jasmine rice,1.0,cup,,,1 cup Jasmine rice
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth
931097,13,fresh basil,fresh basil,1.0,tablespoon,,chopped,1 tablespoon chopped fresh basil


In [84]:
ingredients_df = ingredients_df.replace(r'^\s*$', pd.NA, regex=True)
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1746116,0,Butter,Land O Lakes Butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened"
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar
1746116,2,Eggs,Land O Lakes Eggs,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only)
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour
...,...,...,...,...,...,...,...,...
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper
931097,11,Jasmine rice,Jasmine rice,1.0,cup,,,1 cup Jasmine rice
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth
931097,13,fresh basil,fresh basil,1.0,tablespoon,,chopped,1 tablespoon chopped fresh basil


In [85]:
ingredients_df.iloc[2]['unit']

<NA>

## Data Types

In [86]:
ingredients_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2494 entries, (1746116, 0) to (931097, 14)
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name.name          2494 non-null   object
 1   name.description   2494 non-null   object
 2   quantity           2330 non-null   object
 3   unit               1966 non-null   object
 4   comment            280 non-null    object
 5   preparation        672 non-null    object
 6   ingredient_string  2494 non-null   object
dtypes: object(7)
memory usage: 155.6+ KB


In [87]:
ingredients_df = ingredients_df.astype({
    'name.name': 'string',
    'name.description': 'string',
    'quantity': 'Float64',
    'unit': 'string',
    'comment': 'string',
    'preparation': 'string',
    'ingredient_string': 'string'
})

In [88]:
ingredients_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2494 entries, (1746116, 0) to (931097, 14)
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name.name          2494 non-null   string 
 1   name.description   2494 non-null   string 
 2   quantity           2330 non-null   Float64
 3   unit               1966 non-null   string 
 4   comment            280 non-null    string 
 5   preparation        672 non-null    string 
 6   ingredient_string  2494 non-null   string 
dtypes: Float64(1), string(6)
memory usage: 158.0 KB


## Unit Tagging

The above will cover all that is required to find ingredients by their name in the `food_df`. From this finding thier density requires another step: to find the appropriate portion measure / size in the `food_portion_df`. To best do this we can homogenise the units, which is easily done as they are not so expansive, they are only a list of ~30. Again we can do this with NLP.

In [89]:
ingredients_df['unit'].value_counts().head(20)

unit
cups           545
cup            166
teaspoons      146
tablespoons    106
tsps            94
can             81
teaspoon        71
Tbsps           61
tablespoon      61
tsp             53
lbs             40
ounces          40
large           34
cloves          30
lb              27
medium          27
Tbsp            27
oz              22
cans            20
tbsps           19
Name: count, dtype: Int64

In [90]:
ingredients_df['unit_tags'], ingredients_df['unit_remainders'], ingredients_df['unit_type'] = zip(*ingredients_df['unit'].apply(clean_ingredient_string).progress_apply(tag_units))
ingredients_df[['unit', 'unit_tags', 'unit_remainders', 'unit_type']]

  0%|                                                                                                                                                         | 0/2494 [00:00<?, ?it/s]

  3%|████▊                                                                                                                                          | 85/2494 [00:00<00:02, 846.08it/s]

  7%|█████████▋                                                                                                                                    | 170/2494 [00:00<00:02, 780.76it/s]

 10%|██████████████▋                                                                                                                               | 257/2494 [00:00<00:02, 816.19it/s]

 14%|███████████████████▎                                                                                                                          | 339/2494 [00:00<00:02, 789.81it/s]

 17%|████████████████████████                                                                                                                      | 423/2494 [00:00<00:02, 806.98it/s]

 21%|█████████████████████████████▉                                                                                                                | 526/2494 [00:00<00:02, 878.98it/s]

 25%|███████████████████████████████████                                                                                                           | 615/2494 [00:00<00:02, 858.52it/s]

 28%|███████████████████████████████████████▉                                                                                                      | 702/2494 [00:00<00:02, 851.22it/s]

 32%|████████████████████████████████████████████▊                                                                                                 | 788/2494 [00:00<00:02, 848.15it/s]

 35%|██████████████████████████████████████████████████                                                                                            | 880/2494 [00:01<00:01, 868.53it/s]

 39%|███████████████████████████████████████████████████████                                                                                       | 968/2494 [00:01<00:01, 870.63it/s]

 42%|███████████████████████████████████████████████████████████▊                                                                                 | 1057/2494 [00:01<00:01, 874.55it/s]

 46%|████████████████████████████████████████████████████████████████▋                                                                            | 1145/2494 [00:01<00:01, 840.71it/s]

 50%|█████████████████████████████████████████████████████████████████████▉                                                                       | 1236/2494 [00:01<00:01, 859.33it/s]

 53%|██████████████████████████████████████████████████████████████████████████▊                                                                  | 1323/2494 [00:01<00:01, 860.81it/s]

 57%|███████████████████████████████████████████████████████████████████████████████▋                                                             | 1410/2494 [00:01<00:01, 836.53it/s]

 60%|████████████████████████████████████████████████████████████████████████████████████▋                                                        | 1498/2494 [00:01<00:01, 847.33it/s]

 64%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 1588/2494 [00:01<00:01, 860.99it/s]

 67%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 1675/2494 [00:01<00:00, 851.30it/s]

 71%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1761/2494 [00:02<00:00, 830.83it/s]

 74%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                    | 1850/2494 [00:02<00:00, 846.46it/s]

 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 1936/2494 [00:02<00:00, 849.15it/s]

 81%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 2022/2494 [00:02<00:00, 835.79it/s]

 85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 2113/2494 [00:02<00:00, 855.94it/s]

 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 2200/2494 [00:02<00:00, 857.60it/s]

 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 2300/2494 [00:02<00:00, 897.48it/s]

 96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 2390/2494 [00:02<00:00, 865.63it/s]

 99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 2477/2494 [00:02<00:00, 857.35it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2494/2494 [00:02<00:00, 849.42it/s]




Unnamed: 0_level_0,Unnamed: 1_level_0,unit,unit_tags,unit_remainders,unit_type
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1746116,0,cups,[cup],[],volume
1746116,1,cups,[cup],[],volume
1746116,2,,[],[],portion
1746116,3,teaspoon,[teaspoon],[],volume
1746116,4,cups,[cup],[],volume
...,...,...,...,...,...
931097,10,cups,[cup],[],volume
931097,11,cup,[cup],[],volume
931097,12,cups,[cup],[],volume
931097,13,tablespoon,[tablespoon],[],volume


In [91]:
ingredients_df[(ingredients_df['unit_tags'].str.len() == 0) & (ingredients_df['unit'] != '')]

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1828339,8,ground cinnamon,ground cinnamon,1.0,pinch,,,1 pinch ground cinnamon,[],[pinch],portion
1828339,9,ground allspice,ground allspice,1.0,pinch,,,1 pinch ground allspice,[],[pinch],portion
1703,3,mincemeat,mincemeat,1.0,jar,,,1 jar mincemeat,[],[jar],portion
382666,1,butter,butter,1.5,sticks,,,1 1/2 sticks butter,[],[stick],portion
2195357,0,blue-eyed cod,blue-eyed cod,4.0,Cutlets,,,4 x Cutlets blue-eyed cod,[],[cutlet],portion
...,...,...,...,...,...,...,...,...,...,...,...
2177242,3,scallion,scallion,3.0,heads,white part only,finely chopped,"3 scallion heads white part only, finely chopped",[],[head],portion
64662,0,margarine,margarine,1.0,stick,,softened,"1 stick margarine, softened",[],[stick],portion
1936429,1,garlic,garlic,2.0,cloves,,coarsely chopped,"2 cloves garlic, coarsely chopped",[],[clove],portion
1401651,4,ras el hanout,ras el hanout,,generous pinch,,,generous pinches ras el hanout,[],"[generous, pinch]",portion


In [92]:
ingredients_df['unit_type'].value_counts()

unit_type
volume     1497
portion     815
weight      182
Name: count, dtype: int64

In [93]:
ingredients_df['unit_type'] = ingredients_df['unit_type'].astype('category')

# Creating Expanded Ingredients DF

What we want here is to set the dataframe up for use in other sections/modules. All of these steps should go here.

This includes:
- removing unnecessary columns
- cleaning data
- homogenising data as above

Here we do any dataframe operations required for the joining. This is done to the full dataframe at the beginning, to have them all ready for the join operation, preventing the computations being done numerous times.

## Extracting Search Terms

The next chapters will involve joining various dataframes with this one ie. matching these ingredients with those of other dataframes through searching through them in a methodical manner. To do this we have devised a function that takes in a list of **orderd** search terms, using these to refine the search towards the best match. This can be arranged as a set of columns for each ingredient.

Emphasis is put onto the ordering of these search terms, as we want to carefully lead our search to the match, avoiding getting stuck in incorrect searches. The rest of this section will do this by ordering the columns, and extracting their information in a methodical way.

### Selecting Columns

#decision

Here decisions need to be made in selecting which part of the ingredient info we need for searching. We want to get a specific match on an ingredient, but again we must make sure to avoid 'over-searching' and matching with an obscure ingredient.

We definitely don't want to include information about how an ingredient should be prepared, do we want comments? Comments seem to cause more harm than good in the search, so these are removed.

In [94]:
ingredients_df.shape

(2494, 10)

#### Nouns Separation

When it comes to matching the ingredient string, nouns are the most important search term. Adjectives matches are more optional, and sometimes hinder results by matching with oddly specific matches that are too detailed and stem away from the base ingredient. ie. chopped onion matching with "chicken breast with chopped onion and chopped pepper." instead of simply "onion".

In [95]:
nltk.pos_tag(mt.tokenize('red bell pepper'))

[('red', 'JJ'), ('bell', 'NN'), ('pepper', 'IN')]

One more point of note, the tag of a word depends on its position relative to other words. This can cause issues when stripping words away.

In [96]:
assert tag_ingredient_string('chopped onions') != [*tag_ingredient_string('chopped'), *tag_ingredient_string('onion')]
tag_ingredient_string('chopped onions'), [*tag_ingredient_string('chopped'), *tag_ingredient_string('onion')]

([{'word': 'chopped', 'tag': 'adj'}, {'word': 'onions', 'tag': 'noun'}],
 [{'word': 'chopped', 'tag': 'noun'}, {'word': 'onion', 'tag': 'noun'}])

It would be best if we instead tagged the list before this was split.

Here we want to perform it on `name.full`, then find the indices at which the `name.name` and `name.description` was split, use these to find the tags of the string at that index, and match that to the original tags.

In [97]:
#| export
def tokenize_with_spans(txt):
    tokens=mt.tokenize(txt)
    offset = 0
    for token in tokens:
        offset = txt.find(token, offset)
        yield token, offset, offset+len(token)
        offset += len(token)

In [98]:
list(tokenize_with_spans('red bell pepper'))

[('red', 0, 3), ('bell', 4, 8), ('pepper', 9, 15)]

In [99]:
ingredient = ingredients_df.loc[58195, 4]
ingredient

name.name                      butter
name.description           buttermilk
quantity                          1.0
unit                              cup
comment                          <NA>
preparation                      <NA>
ingredient_string    1 cup buttermilk
unit_tags                       [cup]
unit_remainders                    []
unit_type                      volume
Name: (58195, 4), dtype: object

In [100]:
#| export
def get_match_idxs(ingredient_string, search_string):
    idx = ingredient_string.find(search_string)
    if idx == -1: 
        return None
    return set(range(idx, idx+len(search_string)))

In [101]:
assert get_match_idxs('1/2 cup sugar', 'sugar') == set(range(8,13))
assert get_match_idxs('8 ounces blueberries', 'blueberry') == None

In [102]:
#| export
def split_nouns(ingredient_string, cut_string):

    if not cut_string: return ([], [])

    # getting tags of full string
    tokens_spanned = list(tokenize_with_spans(ingredient_string))
    tokens_spanned_tags = nltk.pos_tag(list(zip(*tokens_spanned))[0])
    if tokens_spanned_tags: tokens_spanned_tags[-1] = (tokens_spanned_tags[-1][0], 'NN') # setting last word as noun (as this is describing item)
    tokens_spanned_tagged = [word[0] + word[1] for word in list(zip(tokens_spanned, tokens_spanned_tags))] # joining the two together
    tokens_spanned_tagged = list(zip(*[t for i, t in enumerate(list(zip(*tokens_spanned_tagged))) if i != 3])) # removing 3rd index of each tuple in list
    
    # using full string tags to tag cut string
    match_idxs = get_match_idxs(ingredient_string, cut_string)
    if not match_idxs: return pd.NA, pd.NA
    tags = [t for t in tokens_spanned_tagged if match_idxs.intersection(range(*t[1:3]))]
    
    # splitting nouns
    tags_split = [t[0] for t in tags if t[3].startswith('NN')], [t[0] for t in tags if not t[3].startswith('NN')]

    # limit to 6 words for each split
    tags_split = tuple([tags[0:6] for tags in tags_split])

    return tags_split

In [103]:
assert split_nouns('1/2 cup granulated sugar', 'granulated sugar') == (['sugar'], ['granulated'])
assert split_nouns('8 ounces blueberries', 'blueberry') == (pd.NA, pd.NA)
assert split_nouns('8 ounces blueberries', '') == ([], [])
assert split_nouns('red bell pepper', 'red bell pepper') == (['bell','pepper'], ['red']) # final word should be noun always

In [104]:
#| export
def split_ingredient_fields_by_noun(ingredient, debug=False):
    for col in ingredient.index:
        if col == 'ingredient_string': continue
        ingredient[col + '.nouns'], ingredient[col + '.others'] = split_nouns(ingredient['name.description'], ingredient[col])
        if debug:
            if ingredient[col] != '' and (len(ingredient[col + '.nouns']) + len(ingredient[col + '.others']) == 0):
                print('WARN: Missing ingredient tags', ingredient.name, col, ingredient[col], ingredient['ingredient_string'], sep=' | ')
    return ingredient

In [105]:
expanded_ingredients_df = ingredients_df[['name.name', 'name.description', 'ingredient_string']]
expanded_ingredients_df = expanded_ingredients_df.apply(split_ingredient_fields_by_noun, axis=1)

expanded_ingredients_df.drop([col for col in expanded_ingredients_df.columns if not any(s in col for s in ['nouns', 'others'])], axis=1, inplace=True)
expanded_ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name.nouns,name.name.others,name.description.nouns,name.description.others
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1746116,0,[Butter],[],"[Land, O, Lakes, Butter]",[]
1746116,1,[sugar],[],[sugar],[]
1746116,2,[Eggs],[],"[Land, O, Lakes, Eggs]",[]
1746116,3,[vanilla],[],[vanilla],[]
1746116,4,[flour],[],[flour],[all-purpose]


In [106]:
ingredients_df.loc[767911, 1]

name.name                          onion
name.description                   onion
quantity                             1.0
unit                                 cup
comment                             <NA>
preparation                      chopped
ingredient_string    1 cup chopped onion
unit_tags                          [cup]
unit_remainders                       []
unit_type                         volume
Name: (767911, 1), dtype: object

In [107]:
# split expand lists into individual columns
original_cols = expanded_ingredients_df.columns
for expand_col in original_cols:
    expanded = pd.DataFrame(expanded_ingredients_df[expand_col].tolist(), index=expanded_ingredients_df.index)
    expanded.columns = [expand_col + '.' + str(c) for c in expanded.columns]
    expanded_ingredients_df = expanded_ingredients_df.join(expanded)
expanded_ingredients_df.drop(columns=original_cols, inplace=True)
expanded_ingredients_df.columns

Index(['name.name.nouns.0', 'name.name.nouns.1', 'name.name.nouns.2',
       'name.name.nouns.3', 'name.name.nouns.4', 'name.name.others.0',
       'name.name.others.1', 'name.name.others.2', 'name.name.others.3',
       'name.description.nouns.0', 'name.description.nouns.1',
       'name.description.nouns.2', 'name.description.nouns.3',
       'name.description.nouns.4', 'name.description.nouns.5',
       'name.description.others.0', 'name.description.others.1',
       'name.description.others.2', 'name.description.others.3',
       'name.description.others.4', 'name.description.others.5'],
      dtype='object')

In [108]:
expanded_ingredients_df = expanded_ingredients_df.convert_dtypes()
print(list(expanded_ingredients_df.dtypes))

[string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python], string[python]]


## Reversing Column Order

When dealing with nouns, most of the time gramatically the most important and core noun comes last. eg. 'apple cider vinegar' where 'vinegar' is the core ingredient.

In [109]:
split_types = ['nouns', 'others']
original_cols = list(dict.fromkeys([re.sub(r'\.(nouns|others).*', '', c) for c in expanded_ingredients_df.columns]))
reordered_cols = []
for col in original_cols:
    for word_type in split_types:
        col_expanded = [c for c in expanded_ingredients_df.columns if col in c and word_type in c]
        if word_type =='nouns': col_expanded.reverse()
        reordered_cols.extend(col_expanded)

reordered_cols
    
expanded_ingredients_df = expanded_ingredients_df[reordered_cols]
expanded_ingredients_df.columns

Index(['name.name.nouns.4', 'name.name.nouns.3', 'name.name.nouns.2',
       'name.name.nouns.1', 'name.name.nouns.0', 'name.name.others.0',
       'name.name.others.1', 'name.name.others.2', 'name.name.others.3',
       'name.description.nouns.5', 'name.description.nouns.4',
       'name.description.nouns.3', 'name.description.nouns.2',
       'name.description.nouns.1', 'name.description.nouns.0',
       'name.description.others.0', 'name.description.others.1',
       'name.description.others.2', 'name.description.others.3',
       'name.description.others.4', 'name.description.others.5'],
      dtype='object')

In [110]:
assert int(expanded_ingredients_df.columns[0].split('.')[-1]) > 0

For more accurate grammar tagging, we preferred the description to contain the full string. To make sure we're not searching for the same search term twice we can now remove them here ready for exporting.

In [111]:
#| export
def remove_name_from_description(ingredient):
    ingredient_cols = ingredient[ingredient.notnull()].index
    ingredient_name_cols = ingredient_cols[ingredient_cols.str.startswith('name.name')]
    ingredient_desc_cols = ingredient_cols[ingredient_cols.str.startswith('name.description')]
    matching_cols = ingredient[ingredient_desc_cols][ingredient[ingredient_desc_cols].apply(lambda x: x in ingredient[ingredient_name_cols].values)].index
    ingredient[matching_cols] = pd.NA
    return ingredient

In [112]:
expanded_ingredient = expanded_ingredients_df.iloc[0]
expanded_ingredient

name.name.nouns.4              <NA>
name.name.nouns.3              <NA>
name.name.nouns.2              <NA>
name.name.nouns.1              <NA>
name.name.nouns.0            Butter
name.name.others.0             <NA>
name.name.others.1             <NA>
name.name.others.2             <NA>
name.name.others.3             <NA>
name.description.nouns.5       <NA>
name.description.nouns.4       <NA>
name.description.nouns.3     Butter
name.description.nouns.2      Lakes
name.description.nouns.1          O
name.description.nouns.0       Land
name.description.others.0      <NA>
name.description.others.1      <NA>
name.description.others.2      <NA>
name.description.others.3      <NA>
name.description.others.4      <NA>
name.description.others.5      <NA>
Name: (1746116, 0), dtype: string

In [113]:
assert pd.isna(remove_name_from_description(expanded_ingredient)['name.description.nouns.3'])

In [114]:
expanded_ingredients_df = expanded_ingredients_df.apply(remove_name_from_description, axis=1)

In [115]:
assert pd.isna(expanded_ingredients_df.iloc[0])['name.description.nouns.3']

# Preprocessing (2)

Preprocessing stages to be applied after the `expanded_ingredients_df` has been created.

## Cleaning Columns

Note this was saved until after separating nouns/others in case it altered phrases.

In [116]:
ingredients_df[['name.name', 'name.description']] = ingredients_df[['name.name', 'name.description']].map(clean_ingredient_string)
ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1746116,0,butter,land lake butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened",[cup],[],volume
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar,[cup],[],volume
1746116,2,egg,land lake egg,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only),[],[],portion
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla,[teaspoon],[],volume
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour,[cup],[],volume


In [117]:
expanded_ingredients_df = expanded_ingredients_df.map(clean_ingredient_string)
expanded_ingredients_df = expanded_ingredients_df.replace(r'^\s*$', pd.NA, regex=True)
expanded_ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name.nouns.4,name.name.nouns.3,name.name.nouns.2,name.name.nouns.1,name.name.nouns.0,name.name.others.0,name.name.others.1,name.name.others.2,name.name.others.3,name.description.nouns.5,...,name.description.nouns.3,name.description.nouns.2,name.description.nouns.1,name.description.nouns.0,name.description.others.0,name.description.others.1,name.description.others.2,name.description.others.3,name.description.others.4,name.description.others.5
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1746116,0,,,,,butter,,,,,,...,,lake,,land,,,,,,
1746116,1,,,,,sugar,,,,,,...,,,,,,,,,,
1746116,2,,,,,egg,,,,,,...,,lake,,land,,,,,,
1746116,3,,,,,vanilla,,,,,,...,,,,,,,,,,
1746116,4,,,,,flour,,,,,,...,,,,,all-purpose,,,,,


In [118]:
expanded_ingredients_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name.nouns.4,name.name.nouns.3,name.name.nouns.2,name.name.nouns.1,name.name.nouns.0,name.name.others.0,name.name.others.1,name.name.others.2,name.name.others.3,name.description.nouns.5,...,name.description.nouns.3,name.description.nouns.2,name.description.nouns.1,name.description.nouns.0,name.description.others.0,name.description.others.1,name.description.others.2,name.description.others.3,name.description.others.4,name.description.others.5
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1746116,0,,,,,butter,,,,,,...,,lake,,land,,,,,,
1746116,1,,,,,sugar,,,,,,...,,,,,,,,,,
1746116,2,,,,,egg,,,,,,...,,lake,,land,,,,,,
1746116,3,,,,,vanilla,,,,,,...,,,,,,,,,,
1746116,4,,,,,flour,,,,,,...,,,,,all-purpose,,,,,


## Filtering Strings

We want to do a bit of filtering here, mainly with 'seasoning' in mind. This is an unnecessary addition in most recipes, with an exception being in baking where this matters. If it does matter it will have a specified quantity and unit. 

In [119]:
ingredient = expanded_ingredients_df.loc[2058895, 6]
ingredient[ingredient.notnull()]

name.name.nouns.0             salt
name.description.nouns.1    pepper
Name: (2058895, 6), dtype: object

In [120]:
#| export
def filter_patterns(ingredient, filters):

    cols = ingredient.index[ingredient.notnull()]

    for filter_words in filters:
        if all([any([filter_word == ingredient[col] for col in cols]) for filter_word in filter_words]):
            return True
        
    return False

filters = [['salt'],['ground', 'pepper'],['black', 'pepper'], ['pepper']]

In [121]:
assert filter_patterns(expanded_ingredients_df.loc[2058895, 6], filters) == True

In [122]:
# only filtering the ingredients which do not specify their unit & quantity - otherwise we are able to assess.
quantity_filter = (ingredients_df['unit'] == '') & (ingredients_df['quantity'] == '')

word_filter = expanded_ingredients_df.apply(filter_patterns, args=(filters,), axis=1)

print((quantity_filter & word_filter).sum(), expanded_ingredients_df.shape[0])

expanded_ingredients_df = expanded_ingredients_df[~(quantity_filter & word_filter)]

ingredients_df = ingredients_df.loc[expanded_ingredients_df.index]

0 2494


# Saving

In [123]:
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1746116,0,butter,land lake butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened",[cup],[],volume
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar,[cup],[],volume
1746116,2,egg,land lake egg,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only),[],[],portion
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla,[teaspoon],[],volume
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour,[cup],[],volume
...,...,...,...,...,...,...,...,...,...,...,...
931097,9,red onion,red onion,0.25,cups,,finely chopped,"1/4 cup red onion, finely chopped",[cup],[],volume
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper,[cup],[],volume
931097,11,jasmine rice,jasmine rice,1.0,cup,,,1 cup Jasmine rice,[cup],[],volume
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth,[cup],[],volume


In [124]:
ingredients_df.loc[2006319]

Unnamed: 0_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,watermelon,watermelon,1.0,,,,1 watermelon,[],[],portion
1,cantaloupe,cantaloupe,1.0,,,,1 cantaloupe,[],[],portion
2,raspberry,raspberry,8.0,ounces,,,8 ounces raspberries,[ounce],[],weight
3,blackberry,blackberry,8.0,ounces,,,8 ounces blackberries,[ounce],[],weight
4,blueberry,blueberry,8.0,ounces,,,8 ounces blueberries,[ounce],[],weight


In [125]:
ingredients_df.to_feather('../data/local/recipe/partial/ingredients/0.feather')
expanded_ingredients_df.to_feather('../data/local/recipe/partial/expanded_ingredients/0.feather')

In [126]:
from nbdev import nbdev_export; nbdev_export()