MIS 285N Cognitive Computing<br>
Final Project<br>
Jerry Che - Jose Guerrero - Riley Moynihan - Noah Placke - Sarah Teng - Palmer Wenzel

# Data Preprocessing

#### Read data from JSON file.

In [5]:
import json


# Load raw JSON data for recipes
with open('raw/recipes_raw_nosource_ar.json', 'r') as infile:
    recipes_json_raw = json.load(infile)


recipes_json_raw[list(recipes_json_raw.keys())[0]]

{'title': 'Slow Cooker Chicken and Dumplings',
 'ingredients': ['4 skinless, boneless chicken breast halves ADVERTISEMENT',
  '2 tablespoons butter ADVERTISEMENT',
  '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT',
  '1 onion, finely diced ADVERTISEMENT',
  '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT',
  'ADVERTISEMENT'],
 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n',
 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S'}

#### Process ingredient list text.

In [6]:
import re
import string
import itertools
import fractions


# Read in "unit tokens" (e.g. 'cup', 'tablespoons', 'pinch')
with open('unit_tokens.txt', 'r') as infile:
    unit_tokens = infile.read().splitlines()

# Will need to convert any written numbers to numerical representation
word_to_digit = {
    'one': 1, 'two': 2, 'three': 3,
    'four': 4, 'five': 5, 'six': 6,
    'seven': 7, 'eight': 8, 'nine': 9,
}

# Method to process a single ingredient line, will return tuple of (int(quanity), ingredient)
def process_ingredient(ingredient):
    # Str lower
    ingredient = ingredient.lower()
    
    # Remove unit tokens
    ingredient = ' '.join([token for token in ingredient.split() if token not in unit_tokens])
    
    # Convert any written numbers to digit representation
    for word in word_to_digit.keys():
        if ' ' + word + ' ' in ingredient:
            ingredient = ingredient.replace(word, str(word_to_digit[word]))
    
    # Remove any range options (e.g. '4 to 6')
    ingredient = re.sub(r' to (\d|\d/)', '', ingredient)
    
    # Extract quantity and convert fraction to float
    quantity_str = ''.join(itertools.takewhile(lambda c: c == ' ' or (ord(c) >= 47 and ord(c) <= 57), ingredient))  # extract fraction string from front of ingredient
    quantity_int = sum([float(fractions.Fraction(frac_str)) for frac_str in quantity_str.split()])  # convert fraction string(s) to floats, summing if mixed number
    
    # Remove quantity string from ingredient
    ingredient = ingredient.replace(quantity_str, '')
    
    # Return tuple
    return (ingredient, quantity_int)


# Iterate through recipes, applying processing to ingredient lists
recipes_json_pcd = recipes_json_raw.copy()
bad_keys = []
for key, recipe in recipes_json_pcd.items():
    try:
        # Perform processing on ingredients list
        ingredients = [process_ingredient(ingredient) for ingredient in recipe['ingredients']]
        
        # Corrects an artifact on the AllRecipes data source where an blank ingredient is present
        ingredients = [ingredient for ingredient in ingredients if ingredient[1] > 0.0]
        
        # Amend original ingredients list with processed one
        recipes_json_pcd[key]['ingredients'] = dict(ingredients)
        
        import random
        if random.random() < 0.75:
            bad_keys.append(key)
        
    except KeyError:
        bad_keys.append(key)
        
# Delete erroneous recipes
for key in bad_keys:
    recipes_json_pcd.pop(key, None)
    
    
recipes_json_pcd[list(recipes_json_pcd.keys())[0]]

{'title': 'Awesome Slow Cooker Pot Roast',
 'ingredients': {'(10.75 ounce) cans condensed cream of mushroom soup': 2.0,
  '(ounce) package dry onion soup mix': 1.0,
  'water': 1.25,
  'pot roast': 5.5},
 'instructions': 'In a slow cooker, mix cream of mushroom soup, dry onion soup mix and water. Place pot roast in slow cooker and coat with soup mixture.\nCook on High setting for 3 to 4 hours, or on Low setting for 8 to 9 hours.\n',
 'picture_link': 'QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe'}

#### Build Pandas dataframe.

In [7]:
import pandas as pd

# NEW METHOD #
# Get set of all ingredients
# Create list of dictionaries with key/val pairs for all ingredients, fill in zeros for majority
# Make that into a dataframe

# Get set of all unique ingredients
all_ingredients = set()
for key, recipe in recipes_json_pcd.items():
    for ingredient in recipe['ingredients'].keys():
        all_ingredients.add(ingredient)
        
# For each recipe, add all unique ingredients to ingredients list and replace the non-zero entries with the correct values
recipes_json_for_df = []
for key, recipe in recipes_json_pcd.items():
    this_recipe = {}
    
    this_recipe['title'] = recipe['title']
    this_recipe['instructions'] = recipe['instructions']
    this_recipe['picture_link'] = recipe['picture_link']
    
    # Add an entry for every unique ingredient
    for ingredient in all_ingredients:
        if ingredient in recipe['ingredients']:
            this_recipe[ingredient] = recipe['ingredients'][ingredient]
        else:
            this_recipe[ingredient] = 0.0

    recipes_json_for_df.append(this_recipe)


# Create dataframe from list of dictionaries
recipes_df = pd.DataFrame(recipes_json_for_df)

print(recipes_df.shape)
recipes_df.head()

Unnamed: 0,title,instructions,picture_link,Unnamed: 4,packets artificial sweetener,liquid smoke flavoring,"jarred roasted red pepper, drained and chopped","tomatoes, peeled and sliced","fresh lump crabmeat, or more to taste",sheets sushi nori (dry seaweed),...,finely chopped fresh mint,buttery crackers such as keebler club® crackers,havarti or gouda cheese,greek-style vinaigrette (such as renee's® gourmet),(1.5 fluid ounce) jigger swiss chocolate almond liqueur,(12 ounce) package refrigerated buttermilk biscuit dough,(24 ounce) jar marinara sauce,fresh spinach leaves,"brown sugar, or to taste",frozen unsweetened raspberries
0,Awesome Slow Cooker Pot Roast,"In a slow cooker, mix cream of mushroom soup, ...",QyrvGdGNMBA2lDdciY0FjKu.77MM0Oe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Chef John's Fisherman's Pie,Bring a large saucepan of salted water and to ...,aUca10AaD8T2yYvcLOgH/UJlR5/OhOe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Best Big, Fat, Chewy Chocolate Chip Cookie",Preheat the oven to 325 degrees F (165 degrees...,EmmrsVyOmMlydE188t/S5gtqhqeENfO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bailey's Irish Cream Brownies,Preheat oven to 350 degrees F (175 degrees C)....,j9kandxcyu2SK5yNmK8yWjefb2xYdCG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Chicken Cordon Bleu II,Pound chicken breasts if they are too thick. P...,ihSzemMJFm3PAc5Gjz7ClL3vrC17c1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
all_ingredients = set()
for key, recipe in recipes_json_pcd.items():
    for ingredient in recipe['ingredients'].keys():
        all_ingredients.add(ingredient)
        
len(all_ingredients)