In [58]:
import os
from ast import literal_eval

import nltk
import numpy
import numpy as np
import pandas as pd
import treetaggerwrapper as ttw
from nltk.stem.porter import *

In [59]:
# Load the raw recipe table. The "steps", "ingredients" and "tags" columns
# arrive as text and are parsed back into Python objects with literal_eval.
recipes = pd.read_csv("data/RAW_recipes.csv")
for list_column in ("steps", "ingredients", "tags"):
    recipes[list_column] = recipes[list_column].apply(literal_eval)

In [60]:
# Module-level strings, both initialised to the empty string.
# NOTE(review): neither name is referenced by the cells shown here.
preprocessed_sentence = lemma_sentence = ''
def tree_tag(text, tagger):
    """Tag ``text`` with ``tagger`` and return the parsed result.

    The raw output of ``tagger.tag_text(text)`` is handed to
    ``ttw.make_tags`` and whatever that returns is passed back unchanged.
    """
    return ttw.make_tags(tagger.tag_text(text))

In [61]:
def tag_ingredients(ingredients):
    """Return the stemmed nouns found in a recipe's ingredient strings.

    Every ingredient string is tagged via ``tree_tag`` with the module-level
    ``tagger``. Each token whose part-of-speech tag is ``NN0``, ``NN1`` or
    ``NN2`` contributes one stem, produced by the module-level ``stemmer``.

    Parameters
    ----------
    ingredients : iterable of str
        Ingredient descriptions of a single recipe.

    Returns
    -------
    list of str
        One stem per noun token, in order of appearance (duplicates kept).
    """
    noun_tags = ('NN0', 'NN1', 'NN2')

    tagged_ingredients = []
    for ingredient in ingredients:
        for tag in tree_tag(ingredient, tagger):
            # Skip anything that is not a ttw.Tag (the original comment
            # describes this filter as removing special characters).
            if not isinstance(tag, ttw.Tag):
                continue
            if tag.pos not in noun_tags:
                continue

            # A lemma may list alternatives as "a|b". Pick the first one and
            # stem *that*: stemming the joined string only affects the end of
            # the last alternative and would leave the first one unstemmed.
            lemma = tag.lemma.split('|')[0]
            tagged_ingredients.append(stemmer.stem(lemma))
    return tagged_ingredients


In [62]:
def tag_sentence(original_sentence, ingredients):
    """Extract the ingredients and activities mentioned in one recipe step.

    The sentence is tagged via ``tree_tag`` with the module-level ``tagger``.
    For every token the first lemma alternative is taken (lemmas may look
    like ``"a|b"``):

    * if its stem (module-level ``stemmer``) is contained in ``ingredients``,
      the lemma is recorded as an ingredient of this step;
    * if the token's part-of-speech tag is one of the verb tags below, the
      lemma is recorded as an activity.

    Parameters
    ----------
    original_sentence : str
        Text of a single recipe step.
    ingredients : container of str
        Ingredient stems to match against, e.g. the result of
        ``tag_ingredients``.

    Returns
    -------
    tuple (list of str, list of str)
        ``(step_ingredients, step_activities)``. ``step_activities`` is
        ``['use']`` when the sentence contains no verb.
    """
    # NOTE: the original author was unsure whether 'VVG' belongs in this list.
    verb_tags = ('VVB', 'VVZ', 'VVI', 'VVD', 'VVN', 'VVP', 'VVG')

    step_ingredients = []
    step_activities = []

    for tag in tree_tag(original_sentence, tagger):
        # Skip anything that is not a ttw.Tag (the original comment
        # describes this filter as removing special characters).
        if not isinstance(tag, ttw.Tag):
            continue

        # Split BEFORE stemming: the ingredient stems never contain '|', so
        # the stem of an unsplit "a|b" lemma could never match any of them.
        lemma = tag.lemma.split('|')[0]

        if stemmer.stem(lemma) in ingredients:
            step_ingredients.append(lemma)
        if tag.pos in verb_tags:
            step_activities.append(lemma)

    if not step_activities:
        # default activity if no verb is present
        step_activities.append('use')

    return step_ingredients, step_activities

            

In [80]:
# --- configuration -----------------------------------------------------------
selectByName = True          # True: select on `name`; False: select on `categories`

categories = ['cheesecake']  # category mode: a recipe must carry ALL of these tags
name = "mojito"              # name mode: case-insensitive substring of the recipe name
max_recipes = 1000           # cap on the number of selected recipes that are considered
max_minutes = 1440           # recipes that take this long or longer are skipped
tagger = ttw.TreeTagger(TAGLANG='en')
stemmer = PorterStemmer()

# --- event log columns (filled in parallel, one entry per event) -------------
case_id_col = []
event_id_col = []
order_col = []
activity_col = []
ingredient_col = []

event_cnt = 0
recipe_cnt = 0   # number of recipes matching the selection, including skipped ones

required_tags = set(categories)
name_query = name.lower()

# Walk the needed columns in lockstep instead of doing a label lookup per cell.
recipe_fields = zip(recipes["name"], recipes["tags"], recipes["minutes"],
                    recipes["steps"], recipes["ingredients"])

for i, (recipe_name, recipe_tags, minutes, steps, raw_ingredients) in enumerate(recipe_fields):

    if selectByName:
        selected = name_query in str(recipe_name).lower()
    else:
        selected = required_tags.issubset(recipe_tags)
    if not selected:
        continue

    recipe_cnt += 1
    # recipe_cnt was already incremented for this recipe, so the cap is
    # inclusive: exactly `max_recipes` selected recipes are admitted.
    # `not (a < b)` rather than `a >= b` keeps NaN durations excluded.
    if recipe_cnt > max_recipes or not (minutes < max_minutes):
        continue
    if not steps:
        # nothing to log, and it would divide by zero below
        continue

    ingredient_stems = tag_ingredients(raw_ingredients)

    # The recipe duration is spread evenly over its steps to order the events.
    minutes_per_step = minutes / len(steps)
    elapsed = 0

    for original_sentence in steps:
        step_ingredients, step_activities = tag_sentence(original_sentence, ingredient_stems)

        # synchronous events: every (ingredient, activity) pair of a step
        # gets the same `order` value
        for ingredient in step_ingredients:
            for activity in step_activities:
                case_id_col.append(i)
                event_id_col.append(event_cnt)
                event_cnt += 1
                order_col.append(elapsed)

                activity_col.append(activity)
                ingredient_col.append(ingredient)

        elapsed += minutes_per_step
            

In [81]:
# Assemble the event log from the parallel column lists; each position across
# the five lists describes one event.
recipe_log = pd.DataFrame(dict(
    case_id=case_id_col,
    event_id=event_id_col,
    order=order_col,
    activity=activity_col,
    ingredient=ingredient_col,
))
recipe_log

Unnamed: 0,case_id,event_id,order,activity,ingredient
0,2034,0,0.000000,make,rum
1,2034,1,0.000000,stir,rum
2,2034,2,0.000000,make,lime
3,2034,3,0.000000,stir,lime
4,2034,4,0.000000,make,juice
...,...,...,...,...,...
2012,228724,2012,23.333333,blend,lime
2013,228724,2013,23.333333,make,zest
2014,228724,2014,23.333333,mix,zest
2015,228724,2015,23.333333,concentrate,zest


In [82]:
# Write the event log to ./logs. The file name encodes the selection criterion
# (recipe name or joined categories) and the recipe cap `max_recipes`, instead
# of a hard-coded "1000" that would go stale when the cap is changed.
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)  # to_csv does not create missing directories

selection_label = name if selectByName else '_'.join(categories)
recipe_log.to_csv(log_dir + '/log_' + selection_label + '_' + str(max_recipes) + '.csv',
                  index=False)