## Extract `RecipeNLG_bg_only.csv` from the zip file before running this code

Place it in the same directory as this notebook.

In [77]:
import pandas as pd
import re
# CSV file read by this notebook; expected in the same directory as the notebook
DATA_FILE = 'RecipeNLG_bg_only.csv'

# ingredients that will be quantified, mapped to their initial amount
# (0 means "no quantity found yet"; found quantities are stored in grams)
QUANTIFIED_INGREDIENTS = {'flour': 0, 'sugar': 0, 'butter': 0}

# regex objects
# Each pattern is anchored at the start of an ingredient string and captures the
# leading quantity (digits, '/' and spaces, ending in a digit) in group 1,
# followed by the unit abbreviation and a space.
# Raw strings are used so that `\/` and `\.` reach the regex engine untouched
# instead of being (invalid) string-literal escape sequences.
re_cup = re.compile(r'^([0-9\/ ]*[0-9]) c\.? ', flags=re.IGNORECASE)
re_tbsp = re.compile(r'^([0-9\/ ]*[0-9]) tbsps?\.? ', flags=re.IGNORECASE)
re_tsp = re.compile(r'^([0-9\/ ]*[0-9]) tsps?\.? ', flags=re.IGNORECASE)
re_lb = re.compile(r'^([0-9\/ ]*[0-9]) lbs?\.? ', flags=re.IGNORECASE)
re_g = re.compile(r'^([0-9\/ ]*[0-9]) g(?:ram)?s?\.? ', flags=re.IGNORECASE)

# unit patterns grouped by kind; order matters because the multiplier lists
# below are indexed in parallel with these lists
re_vol_units = [re_cup, re_tbsp, re_tsp]
re_mass_units = [re_lb, re_g]

# unit conversions
# teaspoons per unit of volume, parallel to re_vol_units (cup, tbsp, tsp)
vol_multipliers = [48, 3, 1]
# grams in one teaspoon of each quantified ingredient
grams_in_a_tsp = {'flour': 2.5, 'sugar': 4.167, 'butter': 4.72917}

In [5]:
# Load the recipe CSV; the first row of the file supplies the column names.
df = pd.read_csv(DATA_FILE, header=0)
# Report (rows, columns) of the loaded frame.
print(df.shape)

(2231142, 7)


In [24]:
# convert string with fraction to float
# e.g. "3 1/2" becomes 3.5
def convert_to_float(s):
    """Convert a quantity string to a float.

    Accepted forms are a plain number ("2", "0.5"), a simple fraction
    ("1/2") and a mixed number ("3 1/2"). A backslash is accepted as an
    alternative fraction bar ("1\\2"). Leading/trailing whitespace and
    repeated spaces between the whole part and the fraction are ignored.

    Args:
        s: the quantity string.

    Returns:
        The numeric value as a float, e.g. "3 1/2" -> 3.5.

    Raises:
        ValueError: if the string is empty or is not one of the accepted forms.
        ZeroDivisionError: if a fraction has a zero denominator.
    """
    # treat '\' as a fraction bar, then split on any run of whitespace
    parts = s.replace('\\', '/').split()

    def parse_fraction(text):
        # exactly one fraction bar is expected; anything else is malformed
        pieces = text.split('/')
        if len(pieces) != 2:
            raise ValueError(f'not a valid fraction: {text!r}')
        return float(pieces[0]) / float(pieces[1])

    if len(parts) == 1:
        only = parts[0]
        return parse_fraction(only) if '/' in only else float(only)

    if len(parts) == 2:
        # mixed number: whole part followed by a fraction
        whole, fraction = parts
        return float(whole) + parse_fraction(fraction)

    raise ValueError(f'not a valid quantity: {s!r}')

In [75]:
# extract quantities from an ingredient string

def extract_quantities(ingredients):
    """Extract the amount in grams of each quantified ingredient in a recipe.

    The ingredient entries are taken to be the double-quoted substrings of
    `ingredients`. For every key of QUANTIFIED_INGREDIENTS, the first entry
    that contains the key (case-sensitive substring test) and starts with a
    recognised quantity and unit sets that ingredient's amount:

    * volume units (re_vol_units) are converted to teaspoons via
      vol_multipliers and then to grams via grams_in_a_tsp;
    * mass units (re_mass_units: pounds, grams) are converted directly to grams.

    NOTE(review): the substring test also matches longer words, e.g. 'butter'
    matches 'buttermilk' -- confirm whether that is intended.

    Args:
        ingredients: string holding the recipe's ingredient entries, each
            wrapped in double quotes.

    Returns:
        A new dict with the same keys as QUANTIFIED_INGREDIENTS; an ingredient
        for which no quantity could be extracted keeps its initial value (0).
    """
    # grams per unit of mass, parallel to re_mass_units (lb, g)
    mass_multipliers = [453.592, 1]

    # initialize quantity dict
    quantities = QUANTIFIED_INGREDIENTS.copy()

    # convert ingredient string into list: the odd-indexed pieces of a split on
    # '"' are exactly the quoted entries
    ing_list = ingredients.split('"')[1::2]

    # iterate through ingredient list
    for ing in ing_list:
        # iterate through ingredients we are searching for
        for k in list(quantities):
            # only an ingredient that has no quantity yet is looked up
            if quantities[k] != 0 or k not in ing:
                continue

            # (pattern, grams per one unit of ingredient k): volume units first,
            # then mass units
            unit_table = [
                (regex, tsp_per_unit * grams_in_a_tsp[k])
                for regex, tsp_per_unit in zip(re_vol_units, vol_multipliers)
            ] + list(zip(re_mass_units, mass_multipliers))

            for regex, grams_per_unit in unit_table:
                found = regex.match(ing)
                if found is None:
                    continue
                try:
                    quantities[k] = grams_per_unit * convert_to_float(found.group(1))
                except (ValueError, ZeroDivisionError):
                    # malformed quantity such as "1/0": leave the ingredient
                    # unquantified rather than abort the whole recipe
                    pass
                # the unit was identified; no other unit pattern can apply
                break

    return quantities

In [78]:
# Spot-check: quantities extracted from the ingredient string of the row at position 6.
extract_quantities(df['ingredients'].iloc[6])

{'flour': 240.0, 'sugar': 300.024, 'butter': 113.50008}

In [36]:
# The same row's ingredient string split into its double-quoted entries, for comparison.
df['ingredients'].iloc[6].split('"')[1::2]

['1 1/2 c. sugar',
 '1/2 c. butter',
 '1 egg',
 '1 c. buttermilk',
 '2 c. flour',
 '1/2 tsp. salt',
 '1 tsp. soda',
 '1 c. buttermilk',
 '2 c. rhubarb, finely cut',
 '1 tsp. vanilla']