### Parsing Steps
    - Use parser (stanford or bllip) to parse each sentence
    - For each noun phrase, travel up the tree to find verb.
    - Write wordnet ancestor search to resolve references from noun phrase to ingredient
    - Keep dict of ingredients -> counter of actions

### Search Steps
    - For given set of ingredients
        - Sort recipes by most shared ingredients
        - For any missing ingredients, see how similar the dict entries are to determine if can substitute

In [31]:
import os
from nltk.parse import stanford
from nltk.tree import Tree

os.environ['STANFORD_PARSER'] = 'stanford-parser-full-2015-01-30/'
os.environ['STANFORD_MODELS'] = 'stanford-parser-full-2015-01-30/'
parser = stanford.StanfordParser(model_path="stanford-parser-full-2015-01-30/englishPCFG.ser.gz")

def parse_sentences(sentences):
    """ returns list of Tree's for input list of strings """
    return parser.parse_sents([sentence.split() for sentence in sentences])

def draw_sentences(tree_list):
    """ GUI debugging """
    for sentence in tree_list:
        sentence.draw()

In [18]:
import json
with open('sample.json') as f:
    sample_recipe = json.load(f)



In [127]:
with open('bigoven_cleaned.json') as f:
    test_recipes = json.load(f)[0:10]

In [135]:
def process_recipe(recipe):
    print 'Ingredients:'
    for ingredient in recipe['ingredients']:
        print ingredient
    
    print 'Instructions:'
    instr_trees = parse_sentences(recipe['instructions'])
    for instruction, tree in zip(recipe['instructions'], instr_trees):
        print instruction
        print tree
    
    return instr_trees
    
    #draw_sentences(instr_trees)
    
trees = process_recipe(sample_recipe)

Ingredients:
{u'name': u'Lasagna noodles ', u'unit': u'package (~12 oz.)', u'quantity': u'1'}
{u'name': u'Olive oil', u'unit': u'ml', u'quantity': u'30'}
{u'name': u'Tomato basil pasta sauce', u'unit': u'jars', u'quantity': u'1 1/2'}
{u'name': u'Ricotta Cheese part skim', u'unit': u'16 oz container', u'quantity': u'1'}
{u'name': u'Roasted garlic', u'unit': u'cloves', u'quantity': u'10'}
{u'name': u'Fresh baby spinach', u'unit': u'handfuls', u'quantity': u'4'}
{u'name': u'Artichoke hearts (in water)', u'unit': u'15 oz can/jar', u'quantity': u'1'}
{u'name': u'Dried parsley', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Red pepper flakes', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Ground black pepper ', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Parmesan cheese', u'unit': u'ml', u'quantity': u'177'}
{u'name': u'Monterey jack cheese, reduced fat', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Part Skim Mozzarella Cheese', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Fresh b

In [130]:
tree = trees[0]

for sub in tree.subtrees():
    print sub
    if sub == tree:
        print 'same'

for sub in tree.subtrees(lambda t: t.label() in ['VP']):
    print sub.label()
    for sub2 in tree.subtrees(lambda t: t.label() in ['VP']):
        print sub2.label()
        print sub2
        if sub2 == sub:
            print 'same'

print ''



(ROOT
  (S
    (VP
      (VB Bring)
      (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
      (PP (TO to) (NP (DT a) (NN boil.))))))
same
(S
  (VP
    (VB Bring)
    (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
    (PP (TO to) (NP (DT a) (NN boil.)))))
(VP
  (VB Bring)
  (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
  (PP (TO to) (NP (DT a) (NN boil.))))
(VB Bring)
(NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
(NP (JJ large) (NN pot))
(JJ large)
(NN pot)
(PP (IN of) (NP (NN water)))
(IN of)
(NP (NN water))
(NN water)
(PP (TO to) (NP (DT a) (NN boil.)))
(TO to)
(NP (DT a) (NN boil.))
(DT a)
(NN boil.)
VP
VP
(VP
  (VB Bring)
  (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
  (PP (TO to) (NP (DT a) (NN boil.))))
same



In [170]:

conf_threshold = 0

ingredients = [ing['name'].lower() for ing in sample_recipe['ingredients']]
#ingredients.append('water')

knowledge_base = {ing : [] for ing in ingredients}


"""
Search subtrees, calling match_noun_to_verb() on every verb phrase.
"""
def crawl_tree(tree):
    for child in tree.subtrees(lambda t: t.label() in ['VP']):
        #print 'found verb phrase:', child.flatten(), 'Looking for noun in children.'
        
        for ing in match_noun_to_verb_phrase(child):
            
            knowledge_base[ing].append(find_verb(child, ing))
        
"""
Returns the verb phrase's VB's and NN's.
"""
def find_verb(vp, ingredient):
    return filter(lambda w: w[1] in ['VB', 'NN'] and w[0] not in [ingredient], vp.pos())
        
        
"""
Search subtrees for ingredient(s) affected by the verb in a verb phrase.
"""
def match_noun_to_verb_phrase(tree):
    matches = []
    for child in tree.subtrees(lambda t: t.label() in ['NP']):
        #print 'found NP:', child.flatten()
        (ingredient_match, confidence) = resolve_np(child, ingredients)
        if confidence > conf_threshold:
            matches.append(ingredient_match)
            #print 'confident match! saving:', ingredient_match, verb.flatten()
                
    return matches


"""
For Tree np and list ingredients, tries to find match between np and ingredients.
Returns (matched_ingredient_name, confidence)
"""
def resolve_np(np, ingredients):
    np_flat = ' '.join(np.flatten())
    
    max_depth = 3
    #print 'Trying to resolve NP:', np_flat
    for ing in ingredients:
        depth = wordnet_search(np_flat, ing, max_depth)
        if depth >= 0:
            #print 'Found match with', ing, 'at depth', depth
            return (ing, (max_depth - depth)/float(max_depth))
    #print 'Found no match. Trying children.'
    
    return ('', 0)


"""
Searches through wordnet for common ancestors between two strings.
"""
def wordnet_search(ing1, ing2, max_depth):
    #TODO crawl up wordnet for `ing1` and `ing2` looking for match
    if ing1.lower() == ing2.lower():
        return 0
    return -1

for tree in trees:
    crawl_tree(tree)

def print_knowledge_base(knowledge_base):
    for ingredient,matches in knowledge_base.iteritems():
        print ingredient
        for match in matches:
            print '\t', match
print_knowledge_base(knowledge_base)

olive oil
	[(u'Pour', u'VB'), (u'tablespoon', u'NN'), (u'oil', u'NN'), (u'water', u'NN'), (u'pasta.', u'NN')]
	[(u'Drizzle', u'VB'), (u'tablespoon', u'NN'), (u'oil', u'NN'), (u'glass', u'NN'), (u'pan.', u'NN')]
ricotta cheese part skim
tomato basil pasta sauce
ground black pepper 
lasagna noodles 
fresh basil
roasted garlic
monterey jack cheese, reduced fat
fresh baby spinach
dried parsley
parmesan cheese
artichoke hearts (in water)
part skim mozzarella cheese
red pepper flakes
