In [1]:
import pandas as pd

In [2]:
# Load the full RecipeNLG dataset; the path is relative to the notebook's working directory.
nlg = pd.read_csv("../data/recipeNLG/full_dataset.csv")

In [3]:
# Preview the first three rows to see which columns are available.
nlg.head(3)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."


In [5]:
# List the distinct values of the `source` column (used later to select the Recipes1M rows).
nlg.source.unique()

array(['Gathered', 'Recipes1M'], dtype=object)

## Preprocessing

In [16]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [50]:
# Inspect the raw NER value of the first recipe: it is one string that encodes a list,
# so it has to be parsed (e.g. with ast.literal_eval) before iterating over items.
nlg["NER"][0]

'["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'

In [9]:
# Inspect the raw directions value of the first recipe: like NER, it is a string that encodes a list.
nlg["directions"][0]

'["In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."]'

In [89]:
# Create the lemmatizer next to the function that needs it so this cell works on
# a fresh kernel (it was previously only defined in a later cell).
lemmatizer = WordNetLemmatizer()


def preprocess_item(item):
    """Normalize an ingredient string by tokenizing, lowercasing and lemmatizing it.

    Uses the notebook-level ``lemmatizer`` (a ``WordNetLemmatizer``).

    Args:
        item: Raw ingredient text, e.g. ``"bite size shredded rice biscuits"``.

    Returns:
        The lemmatized lowercase tokens joined by single spaces,
        e.g. ``"bite size shredded rice biscuit"``.
    """
    tokens = word_tokenize(item)
    # Lowercase each token *before* lemmatizing: the WordNet lookup is
    # case-sensitive, so a capitalized plural such as "Nuts" would otherwise be
    # returned unchanged and only lowercased to "nuts" instead of "nut".
    return ' '.join(lemmatizer.lemmatize(token.lower()) for token in tokens)

In [90]:
import ast

# Sanity check: run preprocess_item over one raw NER value (a stringified list)
# and print one normalized ingredient per line.
example = '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'

item_list = ast.literal_eval(example)
print(*map(preprocess_item, item_list), sep="\n")

brown sugar
milk
vanilla
nut
butter
bite size shredded rice biscuit


In [91]:
lemmatizer = WordNetLemmatizer()

# Collect the distinct preprocessed ingredient names, restricted to the rows
# whose source is Recipes1M. Each NER cell is a stringified list.
recipes1m_ner = nlg.loc[nlg['source'] == "Recipes1M", 'NER']
ingredients = {
    preprocess_item(item)
    for items in recipes1m_ner
    for item in ast.literal_eval(items)
}

Number of distinct lowercase ingredients in RecipeNLG (Recipes1M source) before preprocessing: 94k

After preprocessing: 91k

Source of the Kaggle ingredient/nutrient table loaded below: https://www.kaggle.com/datasets/thedevastator/now-with-more-nutrients/

In [9]:
# Load the cleaned ingredient table; the path is relative to the notebook's working directory.
kaggle = pd.read_csv("../data/cleaned_ingredients.csv")

In [94]:
# Preview the first three rows; the ingredient text lives in the `Descrip` column.
kaggle.head(3)

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,VitC_mg,Thiamin_mg,Riboflavin_mg,Niacin_mg,VitB6_mg,Folate_mcg,VitB12_mcg,VitA_mcg,VitE_mg,VitD2_mcg
0,1001,butter with salt,717.0,0.85,51.368,81.11,0.06,0.0,0.06,24.0,...,0.0,0.005,0.034,0.042,0.003,3.0,0.17,684.0,2.32,0.0
1,1002,butter whipped w salt,718.0,0.49,45.39,78.3,2.87,0.0,0.06,23.0,...,0.0,0.007,0.064,0.022,0.008,4.0,0.07,683.0,1.37,0.0
2,1003,butter oil anhydrous,876.0,0.28,61.924,99.48,0.0,0.0,0.0,4.0,...,0.0,0.001,0.005,0.003,0.001,0.0,0.01,840.0,2.8,0.0


In [95]:
# Distinct ingredient descriptions from the Kaggle table, normalized the same
# way as the RecipeNLG ingredients so the two sets are comparable.
kaggle_ingrs = {preprocess_item(record) for record in kaggle['Descrip']}

In [97]:
# Number of distinct preprocessed ingredient descriptions in the Kaggle table.
len(kaggle_ingrs)

9306

## GISMO preprocessing

In [26]:
from ingredient import *
from instruction import *


class Args:
    """Simple attribute bag holding the preprocessing thresholds.

    Attributes read by ``ingredients_dict`` in this notebook:
        minnuminstrs: recipes with fewer instructions than this are skipped.
        maxnuminstrs: recipes with this many instructions or more are skipped.
        minnumwords: recipes whose accumulated instruction length (as reported
            by the instruction parser) is below this are skipped.
        threshold_ingrs: minimum count an ingredient needs to be kept.

    Attributes stored but not read by any code visible in this notebook:
        minnumingrs, maxnumingrs: presumably bounds on the number of
            ingredients per recipe — TODO confirm against the consumer.
        majority_lvl: meaning not determinable from this notebook.
    """

    def __init__(
        self,
        minnumingrs,
        minnuminstrs,
        maxnuminstrs,
        maxnumingrs,
        minnumwords,
        majority_lvl,
        threshold_ingrs,
    ):
        # Store every argument under an attribute of the same name.
        self.minnumingrs, self.minnuminstrs = minnumingrs, minnuminstrs
        self.maxnuminstrs, self.maxnumingrs = maxnuminstrs, maxnumingrs
        self.minnumwords = minnumwords
        self.majority_lvl = majority_lvl
        self.threshold_ingrs = threshold_ingrs
        
        
# Ingredient names that ingredients_dict seeds into its counter (with count 1)
# when they were not seen in the data, "for better clustering".
# Each entry appears exactly once, in order of first appearance; the former
# misspelling "chucken_thighs" was a duplicate of "chicken_thighs" and is gone.
BASE_WORDS = [
    "peppers",
    "tomato",
    "spinach_leaves",
    "turkey_breast",
    "lettuce_leaf",
    "chicken_thighs",
    "milk_powder",
    "bread_crumbs",
    "onion_flakes",
    "red_pepper",
    "pepper_flakes",
    "juice_concentrate",
    "cracker_crumbs",
    "hot_chili",
    "seasoning_mix",
    "dill_weed",
    "pepper_sauce",
    "sprouts",
    "cooking_spray",
    "cheese_blend",
    "basil_leaves",
    "pineapple_chunks",
    "marshmallow",
    "chile_powder",
    "corn_kernels",
    "tomato_sauce",
    "chickens",
    "cracker_crust",
    "lemonade_concentrate",
    "red_chili",
    "mushroom_caps",
    "mushroom_cap",
    "breaded_chicken",
    "frozen_pineapple",
    "seaweed",
    "bouillon_granules",
    "stuffing_mix",
    "parsley_flakes",
    "chicken_breast",
    "baguettes",
    "green_tea",
    "peanut_butter",
    "green_onion",
    "fresh_cilantro",
    "hot_pepper",
    "dried_lavender",
    "white_chocolate",
    "cake_mix",
    "cheese_spread",
    "mandarin_orange",
    "laurel",
    "cabbage_head",
    "pistachio",
    "cheese_dip",
    "thyme_leave",  # NOTE(review): singular spelling kept as-is — confirm it is intended
    "boneless_pork",
    "onion_dip",
    "skinless_chicken",
    "dark_chocolate",
    "canned_corn",
    "muffin",
    "frozen_broccoli",
    "philadelphia",
]


In [None]:
def ingredients_dict(nlg, args, instruction_parser: InstructionParser, ingredient_parser: IngredientParser):
    """Count, cluster and threshold the ingredients of the Recipes1M recipes.

    Only rows whose ``source`` is ``"Recipes1M"`` are considered, and a recipe
    contributes its ingredients only if its instructions pass the length
    filters in ``args``.

    Args:
        nlg: DataFrame with ``NER``, ``source`` and ``directions`` columns;
            ``NER`` and ``directions`` hold stringified lists.
        args: Object exposing ``minnuminstrs``, ``maxnuminstrs``,
            ``minnumwords`` and ``threshold_ingrs``.
        instruction_parser: Its ``parse_entry`` is called with the list of
            instruction strings and must return ``(acc_len, instructions)``.
        ingredient_parser: Its ``parse_entry`` is called with one ingredient
            string and must return a hashable normalized ingredient.

    Returns:
        A tuple ``(ingrs, cluster_ingrs)``: ``ingrs`` maps each ingredient that
        reaches ``args.threshold_ingrs`` to its count, and ``cluster_ingrs`` is
        the cluster structure returned by ``remove_plurals``.
    """
    ingr_counted = {}
    for ner, source, instrs in zip(nlg['NER'], nlg['source'], nlg["directions"]):
        if source != "Recipes1M":
            continue

        ingr_list = ast.literal_eval(ner)
        instrs_list = ast.literal_eval(instrs)
        # Fixed: the original passed (and bound) the undefined name
        # `instr_list` here, which raised NameError on the first recipe.
        # NOTE(review): assumes parse_entry accepts a list of plain strings — confirm.
        acc_len, instrs_list = instruction_parser.parse_entry(instrs_list)
        if (
            len(instrs_list) < args.minnuminstrs
            or len(instrs_list) >= args.maxnuminstrs
            or acc_len < args.minnumwords
        ):
            continue

        # Normalize every ingredient of the accepted recipe and count it.
        for ingr in ingr_list:
            ready_ingr = ingredient_parser.parse_entry(ingr)
            ingr_counted[ready_ingr] = ingr_counted.get(ready_ingr, 0) + 1

    # manually add missing entries for better clustering
    for base_word in BASE_WORDS:
        ingr_counted.setdefault(base_word, 1)

    counter_ingrs, cluster_ingrs = cluster_ingredients(ingr_counted)
    counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)

    # Drop ingredients below the threshold. Fixed: iterate key/count pairs with
    # .items() (iterating the dict itself yields keys only), and filter the
    # clustered counter so the result matches the returned clusters.
    # NOTE(review): assumes counter_ingrs is a mapping of ingredient -> count — confirm.
    ingrs = {
        word: cnt
        for word, cnt in counter_ingrs.items()
        if cnt >= args.threshold_ingrs
    }

    # Fixed: the original returned the undefined name `cluster_ingre`.
    return ingrs, cluster_ingrs

In [4]:
# Thresholds used to filter recipes and ingredients.
args = Args(
    minnumingrs=2,
    maxnumingrs=10,
    minnuminstrs=3,
    maxnuminstrs=12,
    minnumwords=50,
    majority_lvl=0.5,
    threshold_ingrs=10,
)

# Each replace_dict maps a replacement string to the list of substrings it stands for.
instruction_parser = InstructionParser(
    replace_dict={
        "and": ["&", "'n"],
        "": ["#", "[", "]"],
    }
)

ingredient_parser = IngredientParser(
    replace_dict={
        "and": ["&", "'n"],
        "": ["%", ",", ".", "#", "[", "]", "!", "?"],
    }
)

# Leave the call as the last expression so the cell displays its result.
ingredients_dict(nlg, args, instruction_parser, ingredient_parser)

# coverage(nlg, kaggle_ingrs, args, instruction_parser, ingredient_parser)


IndentationError: unexpected indent (1584243564.py, line 3)