In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import time
from tqdm import tqdm as ProgressBar
import six # needed for Google Cloud client
import operator
import sys
import en # NodeBox https://www.nodebox.net/code/index.php/Linguistics#verb_conjugation
from nltk.corpus import stopwords
import parsing_util
import pickle
from nltk.stem.wordnet import WordNetLemmatizer

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

In [21]:
# Load the cleaned recipes, keeping only the first row seen for each URL.
df = pd.read_pickle('CleanedIngredients.pkl').drop_duplicates(subset='url', keep='first')
df2 = df.set_index("url")
print(df.shape)
# Preview the first recipe: raw steps, raw ingredient lines, cleaned ingredients.
for column in ('instructionSteps', 'ingredients', 'cleanedIngredients'):
    print(df[[column]].values[0])


(48417, 11)
[ [u'In a bowl, mix the tomatoes, onion, red bell pepper, yellow bell pepper, cilantro, lime juice, and salt. Cover and refrigerate until ready to serve.']]
[ [u'6 roma (plum) tomatoes, diced', u'1 sweet onion, diced', u'1 medium red bell pepper, diced', u'1 medium yellow bell pepper, diced', u'1 bunch cilantro, finely minced', u'1 lime, juiced', u'1 teaspoon salt, or to taste', u'Add all ingredients to list', u'Add all ingredients to list']]
[ ['roma plum tomatoes', 'sweet onion', 'red bell pepper', 'yellow bell pepper', 'cilantro', 'lime', 'salt']]


In [29]:
# Word sets consulted by get_verbs when filtering candidate verbs.
# English stop words from NLTK, held in a set for membership tests.
stopWords = set(stopwords.words('english'))
# Verbs considered uninformative; get_verbs drops them.
stopVerbs = set(['bring', 'make'])
# Verbs the tagger often fails to recognize; get_verbs keeps them even if
# they also appear in one of the stop lists.
goVerbs = set(['preheat'])


def get_verbs(parsedInstructions, Ingredients, ingredientsDict, debug=0):
    """Return the lemmatized verbs found in a parsed set of instructions.

    The part-of-speech tag that comes with each token is overridden to VERB
    in two situations, both of which require ``en.is_verb`` to accept the
    lower-cased word:

    * the word contains an upper-case character (e.g. an imperative that
      opens a sentence, such as "Preheat");
    * the word is tagged NOUN, is directly followed by another NOUN, follows
      a punctuation token (or is the very first token) and is not found in
      ``ingredientsDict``.

    Verb tokens are lower-cased and lemmatized with WordNet, then kept unless
    they are in the module-level ``stopWords`` / ``stopVerbs`` sets; members
    of ``goVerbs`` are always kept.

    Args:
        parsedInstructions: sequence of tokens exposing ``text.content`` and
            ``part_of_speech.tag`` (an integer index into the tag table).
        Ingredients: not used by this function; kept so existing calls work.
        ingredientsDict: container tested with ``in`` against the token text
            to stop known ingredient words from being re-tagged as verbs.
        debug: values above 1 print every verb token; values above 5 also
            print the final tag and word for every token.

    Returns:
        list of verb lemmas in order of appearance (duplicates preserved).
    """
    tokens = parsedInstructions

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')
    verbs = []
    for idx, token in enumerate(tokens):
        word = token.text.content
        lowered = word.lower()
        tag = pos_tag[token.part_of_speech.tag]

        if tag != 'VERB':
            has_upper = lowered != word
            # Parser says NOUN, and another NOUN follows immediately.
            noun_before_noun = (
                tag == 'NOUN'
                and idx + 1 < len(tokens)
                and pos_tag[tokens[idx + 1].part_of_speech.tag] == 'NOUN')
            # The dictionary lookup runs last so tokens that cannot be
            # re-tagged anyway never reach it.
            if (has_upper or noun_before_noun) and en.is_verb(lowered):
                if has_upper:
                    tag = 'VERB'
                else:
                    # The first token has no predecessor; treat it as the
                    # start of a clause instead of wrapping to tokens[-1].
                    starts_clause = (
                        idx == 0
                        or pos_tag[tokens[idx - 1].part_of_speech.tag] == 'PUNCT')
                    if starts_clause and word not in ingredientsDict:
                        tag = 'VERB'

        shown = lowered  # what the debug trace reports for this token
        if tag == 'VERB':
            if debug > 1:
                print(word)
            verb = lowered
            # Lemmatizing is best effort: on failure keep the surface form.
            try:
                lemma = WordNetLemmatizer().lemmatize(verb, 'v')
            except Exception:
                lemma = ''
            if len(lemma) > 1:
                verb = lemma
            if verb in goVerbs or (verb not in stopWords and verb not in stopVerbs):
                verbs.append(verb)
            shown = verb

        if debug > 5:
            print(u'{}: {}'.format(tag, shown))
    return verbs

In [31]:
# Extract the verb list of every recipe and pickle the url -> verbs mapping.
CHECKPOINT_EVERY = 10000  # recipes processed between intermediate saves

# Values stored below are lists of verbs, one list per recipe URL.
verbsDict = defaultdict(int)
# On a fresh kernel insDict does not exist yet; treat that the same as None.
if globals().get('insDict') is None:
    insDict = parsing_util.Instructions()
cleanedIngredientsDict = pd.read_pickle('cleanerIngredients.pkl')

verbs = []  # stays empty (and printable) when df has no rows
recipes = ProgressBar(df.iterrows(), total=len(df), desc="Processing recipes")
for added, (idx, row) in enumerate(recipes, 1):
    url = row["url"]
    parsedInstructions = insDict.parse(url, row["instructionSteps"])
    Ingredients = cleanedIngredientsDict.get(url, [])
    # NOTE(review): verbsDict is passed as get_verbs' ingredientsDict; it is
    # keyed by URL, so the "known ingredient" test inside get_verbs will
    # practically never match a token -- confirm which lookup was intended.
    verbs = get_verbs(parsedInstructions, Ingredients, verbsDict, debug=0)
    verbsDict[url] = verbs
    # Save periodically so an interrupted run keeps most of its results.
    if added % CHECKPOINT_EVERY == 0:
        with open('verbs.pkl', 'wb') as f:
            pickle.dump(verbsDict, f)

with open('verbs.pkl', 'wb') as f:
    pickle.dump(verbsDict, f)
insDict.close()
# Show the verbs of the last recipe processed as a quick check.
print(verbs)


Processing recipes: 48417it [04:46, 169.13it/s]


[u'preheat', u'grease', u'combine', u'add', u'beat', u'batter', u'pour', u'bake', u'insert', u'come', u'cool', u'remove', u'finish', u'cool', u'fill', u'desire']


In [65]:
# Sanity check: list the first 300 (url, verbs) pairs, ordered by verb list.
ordered_by_verbs = sorted(verbsDict.items(), key=operator.itemgetter(1))
for entry in ordered_by_verbs[:300]:
    print(entry)

(u'http://allrecipes.com/recipe/7199/prosciutto-filling-for-calzones/', [])
(u'http://allrecipes.com/recipe/10931/buckeye-cookies-iii/', [u"'ll", u'need', u'mix', u'shape', u'cover', u'place', u'melt', u'pour', u'dip', u'refrigerate'])
(u'http://allrecipes.com/recipe/12761/stephans-broiled-salmon-pesto/', [u'accommodate', u'place', u'run', u'remove', u'use', u'pull', u'remain', u'squeeze', u'marinate', u'preheat', u'coat', u'cover', u'place', u'broil', u'form', u'brown', u'remove', u'set', u'squeeze', u'slice', u'remain', u'place', u'arrange', u'serve'])
(u'http://allrecipes.com/recipe/12080/chocolate-pudding-pie/', [u'accord', u'dump', u'spread', u'whip', u'top', u'refrigerate', u'garnish'])
(u'http://allrecipes.com/recipe/13740/cornbread-and-sausage-stuffing/', [u'accord', u'let', u'sit', u'get', u'place', u'cook', u'crumble', u'set', u'cook', u'remove', u'allow', u'cool', u'combine', u'crumble', u'add', u'mix', u'add', u'stuff', u'toss', u'bake'])
(u'http://allrecipes.com/recipe/669

In [37]:
# Number of recipe URLs that have a verb list.
print(len(verbsDict))

1


In [32]:
# Quick look at how WordNet lemmatizes a few inflected verb forms.
from nltk.stem.wordnet import WordNetLemmatizer
words = ['gave', 'went', 'using', 'dating']
for word in words:
    lemma = WordNetLemmatizer().lemmatize(word, 'v')
    print("-->".join([word, lemma]))

gave-->give
went-->go
using-->use
dating-->date


## Debug

In [30]:
# For debugging: run get_verbs on a single recipe and show its inputs.
df2 = df.set_index("url")
# Reload first so that a parser created below picks up edits to parsing_util.
reload(parsing_util)
# On a fresh kernel insDict does not exist yet; treat that the same as None.
if globals().get('insDict') is None:
    insDict = parsing_util.Instructions()
url = 'http://allrecipes.com/recipe/7199/prosciutto-filling-for-calzones/'
Instructions = str(df2.loc[url, 'instructionSteps'])
ingredients = df2.loc[url, 'ingredients']

# Alternative hand-written input, kept for experiments without the DataFrame:
#Instructions="[u'Whisk the paprika, garlic, Italian seasoning, lemon juice, olive oil, pepper, basil, and brown sugar together in a bowl until thoroughly blended. Stir in the shrimp, and toss to evenly coat with the marinade. Cover and refrigerate at least 2 hours, turning once.', u'Preheat an outdoor grill for medium-high heat. Lightly oil grill grate, and place about 4 inches from heat source.', u'Remove shrimp from marinade, drain excess, and discard marinade.', u'Place shrimp on preheated grill and cook, turning once, until opaque in the center, 5 to 6 minutes. Serve immediately.']"
#ingredients=[u'2 teaspoons ground paprika', u'2 tablespoons fresh minced garlic', u'2 teaspoons Italian seasoning blend', u'2 tablespoons fresh lemon juice', u'1/4 cup olive oil', u'1/2 teaspoon ground black pepper', u'2 teaspoons dried basil leaves', u'2 tablespoons brown sugar, packed', u'2 pounds large shrimp (21-25 per pound), peeled and deveined', u'Add all ingredients to list', u'Add all ingredients to list']
parsedInstructions = insDict.parse(url, Instructions)

print(Instructions)
print(ingredients)
print(get_verbs(parsedInstructions, ingredients, verbsDict, debug=0))

[u'Cook pasta according to package directions. Drain.', u'In large skillet heat oil and butter. Gently brown garlic, add broccoli and saute gently for 2 to 3 minutes. Add broth; cover and simmer until broccoli is tender.', u'Toss the broccoli mixture with the basil and cooked pasta. Serve with grated Parmesan cheese on top.']
[u'8 tablespoons olive oil', u'2 tablespoons butter', u'4 cloves garlic, minced', u'1 pound fresh broccoli florets', u'1 cup vegetable broth', u'1 cup chopped fresh basil', u'1 pound rigatoni pasta', u'2 tablespoons grated Parmesan cheese', u'Add all ingredients to list', u'Add all ingredients to list']
[u'cook', u'accord', u'drain', u'add', u'add', u'cover', u'simmer', u'toss', u'cook', u'serve']
