### Parsing Steps
    - Use parser (stanford or bllip) to parse each sentence
    - For each noun phrase, travel up the tree to find verb.
    - Write wordnet ancestor search to resolve references from noun phrase to ingredient
    - Keep dict of ingredients -> counter of actions

### Search Steps
    - For given set of ingredients
        - Sort recipes by most shared ingredients
        - For any missing ingredients, see how similar the dict entries are to determine if can substitute

In [135]:
import os
from nltk.parse import stanford
from nltk.tree import Tree

os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2015-01-30/'
os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2015-01-30/'
parser = stanford.StanfordParser(model_path="stanford-parser-full-2015-01-30/englishPCFG.ser.gz")

def parse_sentences(sentences):
    """ returns list of Tree's for input list of strings """
    return parser.parse_sents([sentence.split() for sentence in sentences])

def draw_sentences(tree_list):
    """ GUI debugging """
    for sentence in tree_list:
        sentence.draw()

In [136]:
import json
with open('sample.json') as f:
    sample_recipe = json.load(f)



In [137]:
with open('bigoven_cleaned.json') as f:
    test_recipes = json.load(f)[0:100]

In [138]:
def process_recipe(recipe):
    print 'Ingredients:'
    for ingredient in recipe['ingredients']:
        print ingredient
    
    print 'Instructions:'
    instr_trees = parse_sentences(recipe['instructions'])
    for instruction, tree in zip(recipe['instructions'], instr_trees):
        print instruction
        print tree
    
    return instr_trees
    
    #draw_sentences(instr_trees)
    
trees = process_recipe(sample_recipe)

Ingredients:
{u'name': u'Lasagna noodles ', u'unit': u'package (~12 oz.)', u'quantity': u'1'}
{u'name': u'Olive oil', u'unit': u'ml', u'quantity': u'30'}
{u'name': u'Tomato basil pasta sauce', u'unit': u'jars', u'quantity': u'1 1/2'}
{u'name': u'Ricotta Cheese part skim', u'unit': u'16 oz container', u'quantity': u'1'}
{u'name': u'Roasted garlic', u'unit': u'cloves', u'quantity': u'10'}
{u'name': u'Fresh baby spinach', u'unit': u'handfuls', u'quantity': u'4'}
{u'name': u'Artichoke hearts (in water)', u'unit': u'15 oz can/jar', u'quantity': u'1'}
{u'name': u'Dried parsley', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Red pepper flakes', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Ground black pepper ', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Parmesan cheese', u'unit': u'ml', u'quantity': u'177'}
{u'name': u'Monterey jack cheese, reduced fat', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Part Skim Mozzarella Cheese', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Fresh b

In [139]:
from collections import Counter

"""
Search subtrees, calling match_noun_to_verb() on every verb phrase.
"""
def crawl_tree(tree, ingredients, knowledge_base):
    for child in tree.subtrees(lambda t: t.label() in ['VP']):
        #print 'found verb phrase:', child.flatten(), 'Looking for noun in children.'
        
        for ing in match_noun_to_verb_phrase(child):
            
            knowledge_base[ing].update(find_verb(child, ing))
        
"""
Returns the verb phrase's VB's and NN's.
"""
def find_verb(vp, ingredient):
    return map(lambda w: (w[0].lower(), w[1]), filter(lambda w: w[1] in ['VB'] and w[0] not in [ingredient], vp.pos()))
        
        
"""
Search subtrees for ingredient(s) affected by the verb in a verb phrase.
"""
def match_noun_to_verb_phrase(tree):
    matches = []
    for child in tree.subtrees(lambda t: t.label() in ['NP']):
        #print 'found NP:', child.flatten()
        (ingredient_match, confidence) = resolve_np(child, ingredients)
        if confidence > 0:
            matches.append(ingredient_match)
            #print 'confident match! saving:', ingredient_match, child.flatten()
                
    return matches


"""
For Tree np and list ingredients, tries to find match between np and ingredients.
Returns (matched_ingredient_name, confidence)
"""
def resolve_np(np, ingredients):
    np_flat = ' '.join(np.flatten())
    
    np_flat = norm_noun(np_flat)
    
    #print 'Trying to resolve NP:', np_flat
    
    matches = [(ing, wordnet_search(np_flat, norm_noun(ing))) for ing in ingredients]
    
    if matches:
        match = max(matches, key=lambda e:e[1])

        min_confidence = 0.9
        if match[1] > min_confidence:
            print 'WordNet match: ', np_flat, match
            return match

    #print 'Found no match.'
    
    return ('', 0)


"""
Searches through wordnet for common ancestors between two strings.
"""
from nltk.corpus import wordnet as wn
known_matches = {}
def wordnet_search(ing1, ing2):
    # Try to hit cache first
    if (ing1, ing2) in known_matches:
        return known_matches[(ing1, ing2)]
    if (ing2, ing1) in known_matches:
        return known_matches[(ing2, ing1)]
    
    # Exact match is exact
    if ing1.lower() == ing2.lower():
        return 1
    
    full_syns1 = wn.synsets(ing1)
    full_syns2 = wn.synsets(ing2)
    
    
    if full_syns1 == []:
        full_syns1 = wn.synsets(normed_noun_truncate(ing1))
    
    if full_syns2 == []:
        full_syns2 = wn.synsets(normed_noun_truncate(ing2))

    
    possible_matches = [ syn1.wup_similarity(syn2) for syn2 in full_syns2 for syn1 in full_syns1]
    if len(possible_matches) > 0:
        best = max(possible_matches)
        known_matches[(ing1, ing2)] = best
        return best
    
    return 0

"""
Gets singular form of noun.
Runs lemmatizer on last word, prepends rest of name
"""
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def norm_noun(noun):
    # TODO handle 'artichoke hearts', ex
    
    # strip ( ,
    noun = noun.split('(')[0].split(',')[0].strip(' .?!')
    
    # remove leading "the"
    leading_strip = ['the ', 'rest of the ']
    for banned in leading_strip:
        if noun.startswith(banned):
            noun = noun[len(banned):]
    
    pieces = noun.split()
    
    if len(pieces) > 0:
        return '_'.join(pieces[:-1] + [lemmatizer.lemmatize(pieces[-1], pos='n')])
    return noun

"""
Drops first word from multi-word noun phrase.
"""
def normed_noun_truncate(noun):
    return '_'.join(noun.split('_')[1:])

def print_knowledge_base(knowledge_base):
    for ingredient,matches in knowledge_base.iteritems():
        print ingredient
        print '\t', matches


In [140]:
# Runs above algorithm on ten sample recipes
total_kb = {}
for r in test_recipes:
    ingredients = [ing['name'].lower() for ing in r['ingredients']]
    # ingredients.append('meat')
    # ingredients.append('noodles')
    
    knowledge_base = {ing : Counter() for ing in ingredients}
    
    for tree in process_recipe(r):
        crawl_tree(tree, ingredients, knowledge_base)
#     print_knowledge_base(knowledge_base)
    
    # merge knowledge_base into total_kb
    for ingredient,matches in knowledge_base.iteritems():
        if ingredient in total_kb:
            knowledge_base[ingredient] = matches.update(total_kb[ingredient])
    total_kb.update(knowledge_base)

Ingredients:
{u'name': u'frozen young turkey', u'unit': u'kg', u'quantity': u'7'}
{u'name': u'Kosher salt', u'unit': u'ml', u'quantity': u'237'}
{u'name': u'Light brown sugar', u'unit': u'ml', u'quantity': u'118'}
{u'name': u'Water', u'unit': u'l', u'quantity': u'4'}
{u'name': u'black peppercorns', u'unit': u'ml', u'quantity': u'15'}
{u'name': u'allspice berry', u'unit': u'ml', u'quantity': u'7'}
{u'name': u'Water', u'unit': u'l', u'quantity': u'4'}
{u'name': u'red apple', u'unit': u'', u'quantity': u'1'}
{u'name': u'onion', u'unit': u'', u'quantity': u'1/2'}
{u'name': u'Cinnamon', u'unit': u'', u'quantity': u'1'}
{u'name': u'Water', u'unit': u'ml', u'quantity': u'237'}
{u'name': u'rosemary', u'unit': u'sprigs', u'quantity': u'4'}
{u'name': u'leaves sage', u'unit': u'', u'quantity': u'6'}
{u'name': u'Canola Oil', u'unit': u'ml', u'quantity': u'118'}
Instructions:
Combine all brine ingredients, except ice water, in a stock pot and bring to a boil. 
(ROOT
  (S
    (VP
      (VP
        (

In [141]:
total_kb = {key:value for key,value in total_kb.iteritems() if value and len(value) > 0}
print_knowledge_base(total_kb)

icing sugar
	Counter({(u'add', u'VB'): 1})
deli ham
	Counter({(u'place', u'VB'): 2, (u'take', u'VB'): 1})
chuck roast 
	Counter({(u'pour', u'VB'): 2, (u'add', u'VB'): 2})
bunch kale
	Counter({(u'saut\xe9', u'VB'): 1})
chicken
	Counter({(u'add', u'VB'): 1})
medium zucchini
	Counter({(u'add', u'VB'): 6, (u'cook', u'VB'): 4})
blueberries
	Counter({(u'serve', u'VB'): 2})
raw honey
	Counter({(u'add', u'VB'): 2, (u'stir', u'VB'): 1})
rosemary sprigs
	Counter({(u'discard', u'VB'): 2, (u'brown', u'VB'): 1, (u'remove', u'VB'): 1, (u'tuck', u'VB'): 1})
gherkin pickles
	Counter({(u'pour', u'VB'): 6, (u'add', u'VB'): 4, (u'pllace', u'VB'): 2})
oranges
	Counter({(u'add', u'VB'): 1})
red wine
	Counter({(u'deglaze', u'VB'): 3, (u'add', u'VB'): 2, (u'sweat', u'VB'): 1})
crushed tomatoes
	Counter({(u'add', u'VB'): 1})
garlic, or 1/4 teaspoon garlic powder
	Counter({(u'chop', u'VB'): 2, (u'peel', u'VB'): 1})
caramel ice cream topping
	Counter({(u'drizzle', u'VB'): 2, (u'place', u'VB'): 1})
potatoes
	Cou

In [153]:
# Change counters to percentages

kb = {}
for ingredient, counts in total_kb.iteritems():
    banned = [('add', 'VB')]
    for ban in banned:
        if ban in counts:
            del(counts[ban])
            
    total = sum([count for verb,count in counts.iteritems()])
    kb[ingredient] = {verb : count/float(total) for verb,count in counts.iteritems()}
    
print_knowledge_base(kb)

icing sugar
	{}
deli ham
	{(u'place', u'VB'): 0.6666666666666666, (u'take', u'VB'): 0.3333333333333333}
chuck roast 
	{(u'pour', u'VB'): 1.0}
bunch kale
	{(u'saut\xe9', u'VB'): 1.0}
chicken
	{}
medium zucchini
	{(u'cook', u'VB'): 1.0}
blueberries
	{(u'serve', u'VB'): 1.0}
raw honey
	{(u'stir', u'VB'): 1.0}
rosemary sprigs
	{(u'brown', u'VB'): 0.2, (u'discard', u'VB'): 0.4, (u'remove', u'VB'): 0.2, (u'tuck', u'VB'): 0.2}
gherkin pickles
	{(u'pllace', u'VB'): 0.25, (u'pour', u'VB'): 0.75}
oranges
	{}
red wine
	{(u'deglaze', u'VB'): 0.75, (u'sweat', u'VB'): 0.25}
crushed tomatoes
	{}
garlic, or 1/4 teaspoon garlic powder
	{(u'peel', u'VB'): 0.3333333333333333, (u'chop', u'VB'): 0.6666666666666666}
caramel ice cream topping
	{(u'place', u'VB'): 0.3333333333333333, (u'drizzle', u'VB'): 0.6666666666666666}
 milk
	{(u'thicken', u'VB'): 0.6666666666666666, (u'smooth', u'VB'): 0.3333333333333333}
sliced mushrooms
	{(u'cook', u'VB'): 1.0}
pork tenderloins
	{(u'place', u'VB'): 0.3333333333333333,

In [157]:
def get_intersection(kb, ing1, ing2):
    n = 0.0
    for verb1, count1 in kb[ing1].iteritems():
        for verb2, count2 in kb[ing2].iteritems():
            if verb1 == verb2:
                n += min(count1, count2)
    return n

MINIMUM_OVERLAP = 0.7
def are_equivalent(kb, ing1, ing2):
    return get_intersection(kb, ing1, ing2) > MINIMUM_OVERLAP
    
equivalencies = {}
for ingredient in kb:
    equivs = []
    for other in kb:
        if ingredient != other and are_equivalent(kb, ingredient, other):
            equivs.append(other)
    equivalencies[ingredient] = equivs

In [158]:
# print equivalencies
import pprint

equivalencies = {key:value for key,value in equivalencies.iteritems() if value and len(value) > 0}
pprint.pprint(equivalencies)

{u'basmati rice': [u'medium zucchini',
                   u'sliced mushrooms',
                   u'yellow pepper',
                   u'leeks'],
 u'blueberries': [u'wheat english muffin', u'tomato salsa'],
 u'broth': [u'soy flakes'],
 u'chicken stock': [u'raw honey', u'onions'],
 u'chuck roast ': [u'gherkin pickles', u'white wine ', u'spicy tomato sauce'],
 u'cooked corn': [u'monterey jack'],
 u'egg ': [u'instant yeast '],
 u'gherkin pickles': [u'chuck roast ', u'white wine ', u'spicy tomato sauce'],
 u'instant yeast ': [u'egg '],
 u'leeks': [u'medium zucchini',
            u'sliced mushrooms',
            u'yellow pepper',
            u'basmati rice'],
 u'medium zucchini': [u'sliced mushrooms',
                      u'yellow pepper',
                      u'leeks',
                      u'basmati rice'],
 u'monterey jack': [u'cooked corn'],
 u'onions': [u'raw honey', u'chicken stock'],
 u'parsley': [u'yellow onion'],
 u'raw honey': [u'chicken stock', u'onions'],
 u'red wine': [u'shal

In [160]:
# print total_kb
# print total_kb['rum (preferably light']
print kb['broth']
print kb['soy flakes']

{(u'bring', u'VB'): 0.3333333333333333, (u'reduce', u'VB'): 0.3333333333333333, (u'simmer,', u'VB'): 0.3333333333333333}
{(u'bring', u'VB'): 0.3333333333333333, (u'reduce', u'VB'): 0.3333333333333333, (u'simmer,', u'VB'): 0.3333333333333333}
