## Extract `RecipeNLG_bg_only.csv` from the zip file before running this notebook

Place it in the same directory as this notebook.

In [33]:
import pandas as pd
import re
DATA_FILE = 'RecipeNLG_bg_only.csv'

# ingredients that will be quantified (each starts at 0 = "not found yet")
QUANTIFIED_INGREDIENTS = {'egg': 0,
                          'flour': 0,
                          'sugar': 0,
                          'butter': 0,
                          'vanilla extract': 0,
                          'milk': 0,
                          'evaporated milk': 0,
                          'condensed milk': 0,
                          'shortening': 0}

SYNONYMS = {'margarine', 'oleo', 'crisco'}

# regex objects
# Every unit pattern captures the leading quantity (digits, '/' and spaces,
# ending in a digit) in group 1 and then requires the unit followed by a space.
# Raw strings are used so that `\/` and `\.` reach the regex engine unchanged
# instead of relying on Python's handling of unrecognized string escapes.
re_cup = re.compile(r'^([0-9\/ ]*[0-9]) c(?:ups?)?\.? ', flags=re.IGNORECASE)
re_tbsp = re.compile(r'^([0-9\/ ]*[0-9]) tbsps?\.? ', flags=re.IGNORECASE)
re_tsp = re.compile(r'^([0-9\/ ]*[0-9]) tsps?\.? ', flags=re.IGNORECASE)
re_lb = re.compile(r'^([0-9\/ ]*[0-9]) (?:pound|lb)s?\.? ', flags=re.IGNORECASE)
re_g = re.compile(r'^([0-9\/ ]*[0-9]) g(?:ram)?s?\.? ', flags=re.IGNORECASE)
re_oz = re.compile(r'^([0-9\/ ]*[0-9]) o(?:z|unces?)\.? ', flags=re.IGNORECASE)
re_stick = re.compile(r'^([0-9\/ ]*[0-9]) sticks? ', flags=re.IGNORECASE)

# A count, an optional single descriptor made of letters/dots other than 'c',
# then "egg".  NOTE(review): excluding 'c' presumably keeps "1 c. egg beaters"
# from being read as one egg -- confirm.
re_egg = re.compile(r'([0-9\/ ]*[0-9]) (?:[abd-z\.]* )?egg', flags=re.IGNORECASE)
re_egg_beaters = re.compile(r'^([0-9\/ ]*[0-9]) c(?:up)?\.? egg beaters', flags=re.IGNORECASE)

# unit conversions
# re_vol_units[i] pairs with vol_multipliers[i]; keep both lists in the same order.
re_vol_units = [re_cup, re_tbsp, re_tsp]
vol_multipliers = [48, 3, 1]  # the number of tsps in a cup, tbsp, and tsp (respectively)

# re_mass_units[i] pairs with mass_multipliers[i]; keep both lists in the same order.
re_mass_units = [re_lb, re_oz, re_g, re_stick]
mass_multipliers = [453.592,  # grams in a lb
                    28.35,  # g in an ounce
                    1,  # g in a g
                    113]  # g in a stick of butter/margarine

# grams per teaspoon of each volume-measured ingredient ('egg' is counted, not weighed)
grams_in_a_tsp = {'flour': 2.5,
                  'sugar': 4.167,
                  'butter': 4.71,
                  'vanilla extract': 4.92,
                  'milk': 5.5,
                  'evaporated milk': 5.25,
                  'condensed milk': 6.38,
                  'shortening': 4.1}

In [2]:
# Read the recipe CSV into a DataFrame (row 0 supplies the column names)
# and report its dimensions as (rows, columns).
df = pd.read_csv(DATA_FILE, header=0)
print(df.shape)

(235762, 4)


In [3]:
# convert string with fraction to float
# e.g. "3 1/2" becomes 3.5

def convert_to_float(s):
    """Convert a quantity string to a float.

    Accepted forms (surrounding or repeated whitespace is ignored):
      * a plain number        -- "3"      -> 3.0
      * a simple fraction     -- "1/2"    -> 0.5  ('\\' also works as separator)
      * a mixed number        -- "3 1/2"  -> 3.5

    Raises:
        ValueError: if the string is empty or is not one of the forms above
            (for example "1 2" or "1/2 3").
        ZeroDivisionError: if a fraction has a zero denominator.
    """
    # split() without arguments collapses runs of whitespace and drops
    # leading/trailing blanks, so "1  1/2" and " 1/2" parse like their
    # single-spaced forms instead of failing to unpack.
    parts = s.split()

    if len(parts) == 1:
        return _parse_number_token(parts[0])

    if len(parts) == 2:
        whole, fraction = parts
        # The second token must be a fraction; two bare numbers ("1 2") are
        # ambiguous and are rejected.
        if '/' in fraction or '\\' in fraction:
            return float(whole) + _parse_number_token(fraction)

    raise ValueError('cannot interpret %r as a quantity' % (s,))


def _parse_number_token(token):
    """Parse one whitespace-free token: either "a/b" (or "a\\b") or a plain number."""
    for separator in ('/', '\\'):
        if separator in token:
            numerator, denominator = token.split(separator, 1)
            return float(numerator) / float(denominator)
    return float(token)

In [29]:
# qi = Quantified Ingredient
# ing_string = string from recipe that we want to quantify
def vol_convert(ing_string, qi):
    """Convert a leading volume measurement in ``ing_string`` to grams of ``qi``.

    Each pattern in ``re_vol_units`` is tried in order; the first one that
    matches at the start of the string decides the unit.  The captured amount
    is scaled to teaspoons and then to grams using ``grams_in_a_tsp[qi]``.

    Returns 0 when no volume unit pattern matches.
    """
    for unit_pattern, tsp_per_unit in zip(re_vol_units, vol_multipliers):
        found = unit_pattern.match(ing_string)
        if found is None:
            continue
        return tsp_per_unit * grams_in_a_tsp[qi] * convert_to_float(found.group(1))
    return 0

def mass_convert(ing_string):
    """Convert a leading mass measurement in ``ing_string`` to grams.

    Each pattern in ``re_mass_units`` is tried in order; the first one that
    matches at the start of the string decides the unit, and the captured
    amount is multiplied by the matching entry of ``mass_multipliers``.

    Returns 0 when no mass unit pattern matches.
    """
    for unit_pattern, grams_per_unit in zip(re_mass_units, mass_multipliers):
        found = unit_pattern.match(ing_string)
        if found is None:
            continue
        return grams_per_unit * convert_to_float(found.group(1))
    return 0

def egg_convert(ing_string):
    """Return the number of eggs described at the start of ``ing_string``.

    A direct egg count (``re_egg``) is preferred.  Failing that, a cup
    measurement of egg beaters (``re_egg_beaters``) is counted as 4 eggs
    per cup.  Returns 0 when neither pattern matches.
    """
    whole_eggs = re_egg.match(ing_string)
    if whole_eggs is not None:
        return convert_to_float(whole_eggs.group(1))

    substitute = re_egg_beaters.match(ing_string)
    if substitute is not None:
        # one cup of egg beaters is treated as 4 eggs
        return 4 * convert_to_float(substitute.group(1))

    return 0

# extract quantities from an ingredient string
def extract_quantities(ingredients):
    """Quantify the tracked ingredients found in one recipe.

    Parameters
    ----------
    ingredients : str
        The recipe's ingredient list as text in which every item is enclosed
        in double quotes; the items are recovered by splitting on '"'.
        Any non-string value (e.g. the NaN pandas produces for an empty CSV
        cell, or None) is treated as a recipe with no ingredients.

    Returns
    -------
    dict
        A copy of ``QUANTIFIED_INGREDIENTS`` where 'egg' holds a count and
        every other key holds grams.  A value of 0 means the ingredient was
        not found or its amount could not be parsed.

    Notes
    -----
    * Matching is by substring (``name in item``), so an item such as
      "1 eggplant" also matches 'egg'.
    * Only the first item that yields a non-zero amount is recorded for each
      ingredient; later items for the same ingredient are not added to it.
    * An item containing "buttermilk" is skipped for 'butter' but can still
      be picked up by the later 'milk' key.
    """
    # initialize quantity dict, which will be returned
    quantities = QUANTIFIED_INGREDIENTS.copy()

    # A missing cell arrives as a float NaN (or None); it has no .lower(),
    # so report "nothing found" instead of raising AttributeError.
    if not isinstance(ingredients, str):
        return quantities

    # Quoted items sit at the odd positions after splitting on '"'.
    ing_list = ingredients.lower().split('"')[1::2]

    # iterate through strings of recipe's ingredient list
    for ing in ing_list:
        # Iterate through the ingredients we are searching for.  Assigning to
        # an existing key while iterating is safe: the dict's size is fixed.
        for name, current in quantities.items():
            # skip ingredients already quantified or not mentioned in this item
            if current != 0 or name not in ing:
                continue

            # Eggs are special for two reasons
            # 1. They are discrete, unlike other ingredients
            # 2. They sometimes come in the form of "Egg Beaters"
            if name == 'egg':
                quantities['egg'] = egg_convert(ing)
            else:
                # 'butter' is a substring of 'buttermilk'; not the same thing
                if name == 'butter' and 'buttermilk' in ing:
                    continue

                # leave these for the more specific 'evaporated milk' /
                # 'condensed milk' keys that come later in the dict
                if name == 'milk' and ('evaporated' in ing or 'condensed' in ing):
                    continue

                # quantify ingredient, in grams
                grams = vol_convert(ing, name)
                # if volume unit wasn't found, try mass instead
                if grams == 0:
                    grams = mass_convert(ing)
                quantities[name] = grams

            # an ingredient has been found in this string and quantified,
            # so we don't need to check for other ingredients in this string
            break

    return quantities

In [39]:
# Spot-check: quantify the ingredients of the recipe at row position 8.
extract_quantities(df['ingredients'].iloc[8])

{'egg': 2.0,
 'flour': 0,
 'sugar': 453.592,
 'butter': 0,
 'vanilla extract': 0,
 'milk': 0,
 'evaporated milk': 0,
 'condensed milk': 0,
 'shortening': 0}

In [11]:
# Inspect the individual ingredient strings of the recipe at row position 6.
df['ingredients'].iloc[6].split('"')[1::2]

['1 Tbsp. Worcestershire sauce',
 '2 or 3 shakes Old Bay',
 '1 tsp. dry mustard',
 '2 Tbsp. mayonnaise',
 '1 egg']