### Parsing Steps
    - Use a parser (Stanford or BLLIP) to parse each sentence.
    - For each noun phrase, travel up the tree to find the verb that governs it.
    - Write a WordNet ancestor search to resolve each noun phrase to the ingredient it refers to.
    - Keep a dict mapping each ingredient to a counter of the actions applied to it.

### Search Steps
    - For a given set of ingredients:
        - Sort recipes by the number of ingredients they share with the set.
        - For any missing ingredient, compare its dict entry (action counter) with those of the available ingredients to determine whether one of them can substitute for it.

In [31]:
import os
from nltk.parse import stanford
from nltk.tree import Tree

# Both variables point at the same unpacked Stanford parser directory.
# NOTE(review): presumably NLTK's Stanford wrapper reads these to locate its
# jar files -- confirm against the installed NLTK version.
os.environ.update({
    'STANFORD_PARSER': 'stanford-parser-full-2015-01-30/',
    'STANFORD_MODELS': 'stanford-parser-full-2015-01-30/',
})
# Shared parser instance used by parse_sentences().
parser = stanford.StanfordParser(
    model_path="stanford-parser-full-2015-01-30/englishPCFG.ser.gz"
)

def parse_sentences(sentences):
    """Parse a list of sentence strings with the module-level parser.

    sentences -- list of strings; each one is split on whitespace into
                 tokens before being handed to ``parser.parse_sents``.

    Returns the result of ``parser.parse_sents`` for those token lists
    (the rest of the notebook treats it as one Tree per sentence).
    """
    tokenized = [text.split() for text in sentences]
    return parser.parse_sents(tokenized)

def draw_sentences(tree_list):
    """GUI debugging aid: call ``draw()`` on each tree in tree_list, in order."""
    for parsed in tree_list:
        parsed.draw()

In [18]:
import json
# Load the single sample recipe; process_recipe() later reads its
# 'ingredients' and 'instructions' entries.
with open('sample.json') as f:
    sample_recipe = json.load(f)



In [127]:
# Keep only the first ten entries of the cleaned recipe dump as the test set.
with open('bigoven_cleaned.json') as f:
    test_recipes = json.load(f)[:10]

In [223]:
def process_recipe(recipe):
    """Echo a recipe to stdout and return the parse of its instructions.

    recipe -- dict with an 'ingredients' sequence and an 'instructions'
              list of sentence strings.

    Prints every ingredient entry, then every instruction that was paired
    with a parse result, and returns what parse_sentences() produced.
    """
    print('Ingredients:')
    for item in recipe['ingredients']:
        print(item)

    print('Instructions:')
    steps = recipe['instructions']
    parsed = parse_sentences(steps)
    # zip() stops at the shorter input, so only instructions that have a
    # corresponding parse result are echoed.
    for step, _tree in zip(steps, parsed):
        print(step)

    # Debug hint: draw_sentences(parsed) shows the trees in a GUI window.
    return parsed
    
# Parse the sample recipe once; the following cells inspect `trees`.
trees = process_recipe(sample_recipe)

Ingredients:
{u'name': u'Lasagna noodles ', u'unit': u'package (~12 oz.)', u'quantity': u'1'}
{u'name': u'Olive oil', u'unit': u'ml', u'quantity': u'30'}
{u'name': u'Tomato basil pasta sauce', u'unit': u'jars', u'quantity': u'1 1/2'}
{u'name': u'Ricotta Cheese part skim', u'unit': u'16 oz container', u'quantity': u'1'}
{u'name': u'Roasted garlic', u'unit': u'cloves', u'quantity': u'10'}
{u'name': u'Fresh baby spinach', u'unit': u'handfuls', u'quantity': u'4'}
{u'name': u'Artichoke hearts (in water)', u'unit': u'15 oz can/jar', u'quantity': u'1'}
{u'name': u'Dried parsley', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Red pepper flakes', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Ground black pepper ', u'unit': u'ml', u'quantity': u'2'}
{u'name': u'Parmesan cheese', u'unit': u'ml', u'quantity': u'177'}
{u'name': u'Monterey jack cheese, reduced fat', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Part Skim Mozzarella Cheese', u'unit': u'g', u'quantity': u'170'}
{u'name': u'Fresh b

In [130]:
# Exploratory cell: inspect how Tree.subtrees() walks the first parsed
# instruction and how subtree equality behaves.
tree = trees[0]

# Print every subtree and flag those that compare equal to the whole tree.
for sub in tree.subtrees():
    print sub
    if sub == tree:
        print 'same'

# For each VP subtree, enumerate the VP subtrees again and flag equal pairs.
for sub in tree.subtrees(lambda t: t.label() in ['VP']):
    print sub.label()
    for sub2 in tree.subtrees(lambda t: t.label() in ['VP']):
        print sub2.label()
        print sub2
        if sub2 == sub:
            print 'same'

# Trailing blank line to separate the output.
print ''



(ROOT
  (S
    (VP
      (VB Bring)
      (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
      (PP (TO to) (NP (DT a) (NN boil.))))))
same
(S
  (VP
    (VB Bring)
    (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
    (PP (TO to) (NP (DT a) (NN boil.)))))
(VP
  (VB Bring)
  (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
  (PP (TO to) (NP (DT a) (NN boil.))))
(VB Bring)
(NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
(NP (JJ large) (NN pot))
(JJ large)
(NN pot)
(PP (IN of) (NP (NN water)))
(IN of)
(NP (NN water))
(NN water)
(PP (TO to) (NP (DT a) (NN boil.)))
(TO to)
(NP (DT a) (NN boil.))
(DT a)
(NN boil.)
VP
VP
(VP
  (VB Bring)
  (NP (NP (JJ large) (NN pot)) (PP (IN of) (NP (NN water))))
  (PP (TO to) (NP (DT a) (NN boil.))))
same



In [227]:
from collections import Counter

"""
Search subtrees, calling match_noun_to_verb_phrase() on every verb phrase.
"""
def crawl_tree(tree, ingredients, knowledge_base):
    """Record, for every verb phrase in tree, the words acting on each ingredient.

    tree           -- parse tree of one instruction sentence.
    ingredients    -- list of ingredient names to look for.
    knowledge_base -- dict mapping ingredient name -> Counter; updated in
                      place with the (word, tag) pairs from find_verb().
    """
    for verb_phrase in tree.subtrees(lambda t: t.label() in ['VP']):
        # Hand the caller's ingredient list down explicitly rather than
        # relying on a module-level ``ingredients`` happening to exist.
        for ing in match_noun_to_verb_phrase(verb_phrase, ingredients):
            knowledge_base[ing].update(find_verb(verb_phrase, ing))


def find_verb(vp, ingredient):
    """Return the verb phrase's VB and NN (word, tag) pairs as a list.

    Pairs whose word equals ``ingredient`` are left out.
    """
    return [pair for pair in vp.pos()
            if pair[1] in ('VB', 'NN') and pair[0] != ingredient]


def match_noun_to_verb_phrase(tree, candidates=None):
    """Search the NP subtrees of a verb phrase for ingredients it affects.

    tree       -- verb-phrase subtree to search.
    candidates -- ingredient names to match against.  When omitted, the
                  module-level ``ingredients`` list is used, which keeps
                  existing one-argument calls working.

    Returns a list with one ingredient name per NP that resolve_np()
    matched with a confidence above zero.
    """
    if candidates is None:
        # Legacy behaviour: fall back to the notebook-global ingredient list.
        candidates = ingredients

    matches = []
    for noun_phrase in tree.subtrees(lambda t: t.label() in ['NP']):
        (ingredient_match, confidence) = resolve_np(noun_phrase, candidates)
        if confidence > 0:
            matches.append(ingredient_match)

    return matches


"""
For Tree np and list ingredients, tries to find match between np and ingredients.
Returns (matched_ingredient_name, confidence)
"""
def resolve_np(np, ingredients, min_confidence=0.8):
    """Try to match the noun phrase np against the known ingredient names.

    np             -- Tree for a noun phrase.
    ingredients    -- list of ingredient name strings.
    min_confidence -- score a candidate must exceed to count as a match
                      (default 0.8, the value that used to be hard-coded).

    Returns (matched_ingredient_name, confidence), or ('', 0) when there
    are no ingredients or no candidate scores above min_confidence.
    """
    # max() raises ValueError on an empty sequence; with nothing to match
    # against, report "no match" instead of crashing.
    if not ingredients:
        return ('', 0)

    # Flatten the phrase to its words and normalise it the same way the
    # ingredient names are normalised below.
    np_flat = norm_noun(' '.join(np.flatten()))

    scored = [(ing, wordnet_search(np_flat, norm_noun(ing))) for ing in ingredients]
    match = max(scored, key=lambda e: e[1])

    if match[1] > min_confidence:
        # One pre-formatted argument: prints the same text as before and is
        # valid both as a Python 2 print statement and a Python 3 call.
        print('WordNet match:  %s %s' % (np_flat, match))
        return match

    return ('', 0)


"""
Searches through wordnet for common ancestors between two strings.
"""
from nltk.corpus import wordnet as wn

def wordnet_search(ing1, ing2):
    """Score how closely two normalised noun strings are related in WordNet.

    Returns 1 for a case-insensitive exact match.  Otherwise returns the
    highest wup_similarity() over all synset pairs of the two names, or 0
    when no pair yields a score.  A name without any synsets is retried
    with its first underscore-separated word dropped.
    """
    # Exact match is exact
    if ing1.lower() == ing2.lower():
        return 1

    # Fall back to the truncated name only when the full name is unknown.
    syns1 = wn.synsets(ing1) or wn.synsets(normed_noun_truncate(ing1))
    syns2 = wn.synsets(ing2) or wn.synsets(normed_noun_truncate(ing2))

    scores = [syn1.wup_similarity(syn2) for syn2 in syns2 for syn1 in syns1]
    # wup_similarity() gives None for pairs it cannot relate; drop those so
    # the result is always numeric and max() never compares None to a float.
    scores = [score for score in scores if score is not None]
    if scores:
        return max(scores)

    return 0

"""
Gets singular form of noun.
Runs lemmatizer on last word, prepends rest of name
"""
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def norm_noun(noun):
    """Normalise a noun phrase into an underscore-joined, singular form.

    Text from the first '(' or ',' onwards is discarded, surrounding
    spaces and '.', '?', '!' are stripped, and a leading "the " /
    "rest of the " is removed regardless of letter case.  The last word
    is lemmatized as a noun and the words are joined with '_'.
    Returns the stripped string unchanged when no words remain.
    """
    # TODO handle 'artichoke hearts', ex

    # strip ( ,
    noun = noun.split('(')[0].split(',')[0].strip(' .?!')

    # Remove leading "the" / "rest of the".  The comparison is
    # case-insensitive so sentence-initial "The ..." is handled as well.
    for prefix in ('the ', 'rest of the '):
        if noun[:len(prefix)].lower() == prefix:
            noun = noun[len(prefix):]

    pieces = noun.split()
    if not pieces:
        return noun

    # Only the final word is singularised; the rest is kept as written.
    pieces[-1] = lemmatizer.lemmatize(pieces[-1], pos='n')
    return '_'.join(pieces)

"""
Drops first word from multi-word noun phrase.
"""
def normed_noun_truncate(noun):
    """Return noun without its first underscore-separated word.

    Yields '' when noun contains no underscore.
    """
    _first, _sep, remainder = noun.partition('_')
    return remainder

def print_knowledge_base(knowledge_base):
    """Print every ingredient followed by a tab-indented line with its counter.

    knowledge_base -- dict mapping ingredient name -> Counter of actions.
    """
    # items() and single-argument print() behave the same on Python 2 and 3,
    # unlike iteritems() and the multi-argument print statement.
    for name, actions in knowledge_base.items():
        print(name)
        # The old statement emitted the tab directly followed by the value
        # (no separating space); build that exact line explicitly.
        print('\t' + str(actions))


In [229]:
# Runs above algorithm on ten sample recipes
for r in test_recipes:
    # Lower-cased ingredient names.  This is bound at module level, so
    # functions defined in earlier cells can see it as a global too.
    ingredients = [ing['name'].lower() for ing in r['ingredients']]
    # ingredients.append('meat')
    # ingredients.append('noodles')
    
    # One Counter per ingredient name; duplicate names share a single key.
    knowledge_base = {ing : Counter() for ing in ingredients}
    
    # process_recipe() prints the recipe and returns its parse trees; each
    # tree is crawled to fill the counters in knowledge_base.
    for tree in process_recipe(r):
        crawl_tree(tree, ingredients, knowledge_base)
    print_knowledge_base(knowledge_base)

Ingredients:
{u'name': u'frozen young turkey', u'unit': u'kg', u'quantity': u'7'}
{u'name': u'Kosher salt', u'unit': u'ml', u'quantity': u'237'}
{u'name': u'Light brown sugar', u'unit': u'ml', u'quantity': u'118'}
{u'name': u'Water', u'unit': u'l', u'quantity': u'4'}
{u'name': u'black peppercorns', u'unit': u'ml', u'quantity': u'15'}
{u'name': u'allspice berry', u'unit': u'ml', u'quantity': u'7'}
{u'name': u'Water', u'unit': u'l', u'quantity': u'4'}
{u'name': u'red apple', u'unit': u'', u'quantity': u'1'}
{u'name': u'onion', u'unit': u'', u'quantity': u'1/2'}
{u'name': u'Cinnamon', u'unit': u'', u'quantity': u'1'}
{u'name': u'Water', u'unit': u'ml', u'quantity': u'237'}
{u'name': u'rosemary', u'unit': u'sprigs', u'quantity': u'4'}
{u'name': u'leaves sage', u'unit': u'', u'quantity': u'6'}
{u'name': u'Canola Oil', u'unit': u'ml', u'quantity': u'118'}
Instructions:
Combine all brine ingredients, except ice water, in a stock pot and bring to a boil. 
Stir to dissolve solids, then remove f