In [1]:
import pandas as pd

In [3]:
# Load the RecipeNLG dataset from the CSV stored under ../data/recipeNLG.
nlg = pd.read_csv("../data/recipeNLG/full_dataset.csv")

In [4]:
nlg.head(3)

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."


In [5]:
nlg.source.unique()

array(['Gathered', 'Recipes1M'], dtype=object)

## Preprocessing

In [16]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# One-time setup: uncomment and run to download the NLTK resources
# (presumably required by word_tokenize / WordNetLemmatizer used below — confirm).
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [50]:
nlg["NER"][0]

'["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'

In [89]:
def preprocess_item(item, lemmatizer=None):
    """Normalize an ingredient string into a lowercase, lemmatized form.

    The text is split with ``word_tokenize``; every token is lowercased and
    lemmatized, and the results are joined back together with single spaces.

    Args:
        item: Raw ingredient text, e.g. ``"bite size shredded rice biscuits"``.
        lemmatizer: Optional object exposing ``lemmatize(token)``. When omitted,
            a ``WordNetLemmatizer`` is created locally, so the function does
            not rely on a global defined in a later notebook cell.

    Returns:
        The normalized ingredient string (lowercase, space-separated lemmas).
    """
    if lemmatizer is None:
        # Build the default here instead of reading a module-level name: the
        # notebook only creates that global in a cell further down, which
        # breaks a fresh "Restart & Run All".
        lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(item)
    # Lowercase *before* lemmatizing: WordNet lookups are case-sensitive, so a
    # capitalised token such as "Nuts" would otherwise be returned unchanged.
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    return ' '.join(lemmatized_tokens).lower()

In [90]:
import ast

# Sanity check: run the normalizer over one NER cell, which is stored as the
# string representation of a Python list and therefore parsed first.
example = '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'

for raw_item in ast.literal_eval(example):
    print(preprocess_item(raw_item))

brown sugar
milk
vanilla
nut
butter
bite size shredded rice biscuit


In [91]:
lemmatizer = WordNetLemmatizer()

# NER cells (stringified lists) of the rows whose source is Recipes1M.
recipes1m_ner = (
    ner_cell
    for ner_cell, origin in zip(nlg['NER'], nlg['source'])
    if origin == "Recipes1M"
)

# Set of distinct normalized ingredient names across those rows.
ingredients = {
    preprocess_item(name)
    for ner_cell in recipes1m_ner
    for name in ast.literal_eval(ner_cell)
}

### Coverage

Unique lowercased ingredients from RecipeNLG (Recipes1M subset) before preprocessing: 94k

After preprocessing (tokenization + lemmatization): 91k

Nutrient data source (Kaggle): https://www.kaggle.com/datasets/thedevastator/now-with-more-nutrients/

In [9]:
# Load the ingredient nutrient table (cleaned_ingredients.csv) from ../data.
kaggle = pd.read_csv("../data/cleaned_ingredients.csv")

In [94]:
kaggle.head(3)

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,VitC_mg,Thiamin_mg,Riboflavin_mg,Niacin_mg,VitB6_mg,Folate_mcg,VitB12_mcg,VitA_mcg,VitE_mg,VitD2_mcg
0,1001,butter with salt,717.0,0.85,51.368,81.11,0.06,0.0,0.06,24.0,...,0.0,0.005,0.034,0.042,0.003,3.0,0.17,684.0,2.32,0.0
1,1002,butter whipped w salt,718.0,0.49,45.39,78.3,2.87,0.0,0.06,23.0,...,0.0,0.007,0.064,0.022,0.008,4.0,0.07,683.0,1.37,0.0
2,1003,butter oil anhydrous,876.0,0.28,61.924,99.48,0.0,0.0,0.0,4.0,...,0.0,0.001,0.005,0.003,0.001,0.0,0.01,840.0,2.8,0.0


In [95]:
# Distinct normalized descriptions from the nutrient table, used for lookups.
kaggle_ingrs = {
    preprocess_item(description)
    for description in kaggle['Descrip']
}

In [97]:
len(kaggle_ingrs)

9306

## Coverage

In [101]:
def coverage(nlg, kaggle_set, majority_lvl, source="Recipes1M", preprocess=None):
    """Fraction of recipes whose ingredients are sufficiently covered.

    A recipe counts as covered when the share of its NER items found in
    ``kaggle_set`` (after normalization) is at least ``majority_lvl``.
    Recipes from other sources, and recipes with an empty NER list, are
    ignored entirely.

    Args:
        nlg: Table-like object where ``nlg['NER']`` yields stringified Python
            lists of ingredient names and ``nlg['source']`` yields the
            matching source labels (e.g. a pandas DataFrame).
        kaggle_set: Set of normalized ingredient names to match against.
        majority_lvl: Minimum covered share (0..1) for a recipe to count.
        source: Source label of the recipes to evaluate.
        preprocess: Callable normalizing one ingredient string before the
            lookup. Defaults to ``preprocess_item``.

    Returns:
        ``covered_recipes / evaluated_recipes`` as a float, or ``0.0`` when
        no recipe qualifies for evaluation (instead of dividing by zero).
    """
    if preprocess is None:
        preprocess = preprocess_item

    # The same ingredient strings recur across many recipes; remember each
    # lookup so the normalization step runs once per distinct string.
    is_covered = {}

    covered_recipes = 0
    evaluated_recipes = 0  # named to avoid shadowing the builtin ``all``
    for record, record_source in zip(nlg['NER'], nlg['source']):
        if record_source != source:
            continue

        item_list = ast.literal_eval(record)
        if not item_list:
            # No ingredients means the share is undefined; skip the recipe.
            continue

        covered_items = 0
        for item in item_list:
            if item not in is_covered:
                is_covered[item] = preprocess(item) in kaggle_set
            if is_covered[item]:
                covered_items += 1

        evaluated_recipes += 1
        if covered_items / len(item_list) >= majority_lvl:
            covered_recipes += 1

    if evaluated_recipes == 0:
        return 0.0
    return covered_recipes / evaluated_recipes

In [102]:
coverage(nlg, kaggle_ingrs, 0.5)

0.004361931430574009

In [103]:
coverage(nlg, kaggle_ingrs, 0.7)

0.00027900029431128607