## Extract `RecipeNLG_bg_only.csv` from zip file before running this code

Place it in the same directory as this notebook.

In [58]:
import pandas as pd
import re
import numpy as np
DATA_FILE = 'RecipeNLG_bg_only.csv'

# ingredients that will be quantified
QUANTIFIED_INGREDIENTS = {'egg': 0,
                          'flour': 0, 
                          'sugar': 0, 
                          'butter': 0, 
                          'vanilla extract': 0, 
                          'milk': 0, 
                          'evaporated milk': 0, 
                          'condensed milk': 0, 
                          'shortening': 0,
                          'margarine': 0,
                          'oleo': 0,
                          'crisco': 0,
                          'powdered sugar': 0,
                          'cornmeal': 0,
                          'soda': 0, # aka baking soda
                          'baking powder': 0}

ALIASES = {'oleo': 'butter', 'margarine': 'butter', 'crisco': 'shortening'}

number_capturer = '([0-9]*(?: [1-9](?:\/|\\\)[1-9])?) '

# regex objects
re_cup = re.compile(number_capturer+'c(?:ups?)?\.? ', flags=re.IGNORECASE)
re_pint = re.compile(number_capturer+'p(?:in?)?ts?\.? ', flags=re.IGNORECASE)
re_tbsp = re.compile(number_capturer+'tbsps?\.? ', flags=re.IGNORECASE)
re_tsp = re.compile(number_capturer+'tsps?\.? ', flags=re.IGNORECASE)
re_lb = re.compile(number_capturer+'(?:pound|lb)s?\.? ', flags=re.IGNORECASE)
re_g = re.compile(number_capturer+'g(?:ram)?s?\.? ', flags=re.IGNORECASE)
re_oz = re.compile(number_capturer+'o(?:z|unces?)\.? ', flags=re.IGNORECASE)
re_stick = re.compile(number_capturer+'sticks? ', flags=re.IGNORECASE)

re_egg = re.compile(number_capturer+'(?:[abd-z\.]* )?egg', flags=re.IGNORECASE)
re_egg_beaters = re.compile(number_capturer+'c(?:up)?\.? egg beaters', flags=re.IGNORECASE)

# unit conversions
re_vol_units = [re_cup, re_pint, re_tbsp, re_tsp]
vol_multipliers = [48, 24, 3, 1] # the number of tsps in a cup, pint, tbsp, and tsp (respectively)

re_mass_units = [re_lb, re_oz, re_g, re_stick]
mass_multipliers = [453.592, # grams in a lb
                    28.35, # g in an ounce
                    1, # g in a g
                    113] # g in a stick of butter/margarine

grams_in_a_tsp = {'flour': 2.5, 
                  'sugar': 4.167, 
                  'butter': 4.71, 
                  'vanilla extract': 4.92,
                  'milk': 5.5,                   
                  'evaporated milk': 5.25,
                  'condensed milk': 6.38,
                  'shortening': 4.1,
                  'powdered sugar': 2.504,
                  'cornmeal': 3.27,
                  'soda': 4.8,
                  'baking powder': 4.8}

In [25]:
df = pd.read_csv(filepath_or_buffer=DATA_FILE, header=0)
print(df.shape)

(235762, 4)


In [59]:
# convert string with fraction to float
# e.g. "3 1/2" becomes 3.5

def unsafe_convert_to_float(s):
    if ' ' in s:
        p1, p2 = s.split(' ')
        f = float(p1)
        if '/' in p2:
            numerator, denominator = p2.split('/')
        elif '\\' in p2:
            numerator, denominator = p2.split('\\')
        else:
            return float(p1) * float(p2)
        f += (float(numerator) / float(denominator))
        return f
    elif '/' in s:
        i = s.find('/')
    elif '\\' in s:
        i = s.find('\\')
    else:
        return float(s)
    
    return float(s[:i]) / float(s[i+1:])


def convert_to_float(s):
    try:
        return unsafe_convert_to_float(s)
    except ValueError:
        return np.nan

In [60]:
# Some helper functions
# qi = Quantified Ingredient
# ing_string = string from recipe that we want to quantify

# search for unit of volume, convert to grams
def vol_convert(ing_string, qi):
    for i in range(len(re_vol_units)):
        search = re_vol_units[i].match(ing_string)
        if search is not None:
            return vol_multipliers[i] * grams_in_a_tsp[qi] * convert_to_float(search.group(1))
    return 0

# search for unit of mass, convert to grams
def mass_convert(ing_string):
    for i in range(len(re_mass_units)):
        search = re_mass_units[i].match(ing_string)
        if search is not None:
            return mass_multipliers[i] * convert_to_float(search.group(1))
    return 0

# helper function for eggs
def egg_convert(ing_string):
    search = re_egg.match(ing_string)
    if search is not None:
        return convert_to_float(search.group(1))
    else:
        search = re_egg_beaters.match(ing_string)
        if search is not None:
            # Egg Beaters are almost always measured in cups, and 1 cup = 4 eggs
            return 4 * convert_to_float(search.group(1))
    return 0

# extract quantities from an ingredient string
def extract_quantities(ingredients):
    
    # initialize quantity dict, which will be returned
    quantities = QUANTIFIED_INGREDIENTS.copy()
    
    # convert ingredient string into list
    ing_list = ingredients.lower().split('"')[1::2]
    
    # iterate through strings of recipe's ingredient list
    for ing in ing_list:
        # iterate through ingredients we are searching for
        for k, v in quantities.items():
            if v == 0 and k in ing:
                
                # Eggs are special for two reasons
                # 1. They are discrete, unlike other ingredients
                # 2. They sometimes come in the form of "Egg Beaters"
                if k == 'egg':
                    quantities['egg'] = egg_convert(ing)
                else:
                    if k == 'butter' and 'buttermilk' in ing:
                        continue
                    
                    if k == 'milk' and ('evaporated' in ing or 'condensed' in ing):
                        continue
                        
                    if k in ALIASES:
                        k = ALIASES[k]
                    
                    # quantify ingredient, in grams
                    grams = vol_convert(ing, k)
                    # if volume unit wasn't found, try mass instead
                    if grams == 0:
                        grams = mass_convert(ing)
                    quantities[k] = grams                        
                
                # an ingredient has been found in this string and quantified,
                # so we don't need to check for other ingredients in this string
                break
    
    # remove aliases from dict
    for alias in ALIASES.keys():
        del quantities[alias]
        
    return quantities

In [63]:
quant = []
for row in df.itertuples(index=False):
    quant.append(extract_quantities(row.ingredients))

In [65]:
quant_df = pd.DataFrame(quant)
quant_df.head()

Unnamed: 0,egg,flour,sugar,butter,vanilla extract,milk,evaporated milk,condensed milk,shortening,powdered sugar,cornmeal,soda,baking powder
0,0.0,0.0,200.016,28.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,240.0,300.024,0.0,0.0,264.0,0.0,0.0,0.0,0.0,0.0,4.8,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,120.0,0.0,28.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [90]:
max_amounts = {'egg': 15,
               'flour': 1500, 
               'sugar': 1500, 
               'butter': 1500, 
               'vanilla extract': 100,
               'milk': 1000,                   
               'evaporated milk': 500,
               'condensed milk': 500,
               'shortening': 1000,
               'powdered sugar': 1000,
               'cornmeal': 1500,
               'soda': 200,
               'baking powder': 200}

def has_valid_quantities(row):
    for k in max_amounts.keys():
        if np.isnan(row[k]) or row[k] > max_amounts[k]:
            return False
    return True

In [91]:
valid_list = []
for rowtuple in quant_df.iterrows():
    valid_list.append(has_valid_quantities(rowtuple[1]))

In [98]:
quant_df = quant_df.join(pd.Series(valid_list, name='valid'))

In [99]:
quant_df.groupby(by=[lambda i: quant_df['flour'][i] > 0, 'valid']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,egg,flour,sugar,butter,vanilla extract,milk,evaporated milk,condensed milk,shortening,powdered sugar,cornmeal,soda,baking powder
Unnamed: 0_level_1,valid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
False,False,5960,5958,5944,5953,5960,5957,5959,5960,5960,5960,5960,5958,5959
False,True,86707,86707,86707,86707,86707,86707,86707,86707,86707,86707,86707,86707,86707
True,False,5906,5906,5906,5906,5906,5906,5906,5906,5906,5906,5906,5906,5906
True,True,137189,137189,137189,137189,137189,137189,137189,137189,137189,137189,137189,137189,137189
