# Automated Metrics

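- Percentage of ingredients correctly used: the average fraction of a recipe's listed ingredients that are mentioned in its instructions
- BLEU-4: n-gram overlap (up to 4-grams) between a generated recipe and the ground-truth recipes with the same title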



In [175]:
# Mount Google Drive at /content/drive so the datasets stored there can be read (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [176]:
import os
import pathlib
import numpy as np

In [177]:
# Directory holding the pickled datasets and the generated-recipe folders read by the cells below.
# NOTE(review): this path is relative while the drive is mounted at the absolute path /content/drive,
# so it presumably relies on the working directory being /content — confirm before running elsewhere.
CACHE_DIR = "./drive/Shared drives/CS 269: Recipe/tmp"
# Create the directory if it is missing. Parent directories are not created (no parents=True),
# so this raises FileNotFoundError when the parent path does not exist.
pathlib.Path(CACHE_DIR).mkdir(exist_ok=True)

## Calculate average percentage of ingredients correctly used 

In [178]:
import re

# Helper functions for evaluation
def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit, else False."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def filter_words(f, ing):
    """Keep the whitespace-separated words of ``ing`` accepted by predicate ``f``.

    The surviving words are returned re-joined with single spaces, so any
    original runs of whitespace (including newlines) are normalised.
    """
    tokens = ing.split()
    surviving = filter(f, tokens)
    return ' '.join(surviving)

def remove_symbols(ing):
    """Return ``ing`` with parentheses, commas, hyphens and single/double quotes deleted."""
    deletion_table = str.maketrans('', '', '(),-\'\"')
    return ing.translate(deletion_table)

In [179]:
# Unit / container words that are stripped from ingredient lines before matching.
# NOTE: the ingredient filter compares one whitespace-separated word at a time, so the
# multi-word entries ('fluid ounce', 'fluid ounces') can never match; they are kept so the
# list stays a superset of what it contained before.
measurements = ['cup', 'can', 'jar', 'inch', 'inches', 'fluid ounce', 'fluid ounces', 'ounce', 'pinch', 'pinches', 'pound', 'teaspoon', 'tablespoon', 'liter', 'quart', 'package', 'clove']

# Add plurals. Entries that are already plural are skipped (previously they were pluralised
# again, yielding junk such as 'incheses' / 'fluid ounceses'), '-ch' words take 'es', and
# plurals already listed are not appended twice.
_measurement_plurals = [m + ('es' if m.endswith('ch') else 's') for m in measurements if not m.endswith('s')]
measurements += [plural for plural in _measurement_plurals if plural not in measurements]

In [180]:
# Descriptor words stripped from ingredient lines so that only the core ingredient name is matched.
adjectives = {
    # Preparation / state / size descriptors
    'canned', 'softened', 'diced', 'chopped', 'semisweet', 'thawed', 'frozen', 'minced', 'peeled', 'seeded',
    'prepared', 'melted', 'pitted', 'uncooked', 'cooked', 'squeezed', 'lean', 'boneless', 'ground', 'divided',
    'refrigerated', 'skinless', 'crushed', 'grated', 'trimmed', 'sifted', 'all-purpose', 'allpurpose', 'drained',
    'mashed', 'rinsed', 'shredded', 'hulled', 'dry', 'dried', 'white wine', 'red wine', 'packed', 'fresh',
    'freshly', 'sliced', 'washed', 'sweetened', 'unsweetened', 'extract', 'vegetable', 'large', 'small', 'finely',
    'to', 'taste', 'beaten',
    # Colors
    'white', 'red', 'orange', 'yellow', 'green', 'blue', 'brown', 'black',
}

In [197]:
from tqdm import tqdm

def _split_recipe_sections(recipe, model_name):
    """Return ``(ingredients_text, instructions_text)`` for one recipe, or None if it cannot be parsed."""
    if model_name == 'rnn':
        ingredients, separator, instructions = recipe.partition("🥣")
        if not separator:
            return None

        # Check if ingredients list was generated
        ing_start = ingredients.find('•')
        if ing_start == -1:
            return None

        # Remove characters preceding ingredients list (i.e., title)
        return ingredients[ing_start:], instructions

    # GPT-2 and Ground Truth: "... <ING> ingredients <INS> instructions"
    _, separator, tagged = recipe.partition("<ING>")
    if not separator:
        return None

    ingredients, separator, instructions = tagged.partition("<INS>")
    if not separator:
        return None

    return ingredients, instructions


def calc_used_ing_percentage(recipes, model_name):
    """Average, over recipes, of the fraction of listed ingredients mentioned in the instructions.

    Args:
        recipes: sequence of recipe strings.
        model_name: 'rnn' selects the layout where "🥣" separates the ingredient list from the
            instructions; any other value selects the "<ING>" / "<INS>" tagged layout.

    Returns:
        The mean per-recipe fraction, a value in [0, 1] (it is not multiplied by 100).
        Recipes that cannot be parsed or list no ingredients are skipped; 0.0 is returned
        when no recipe could be scored.
    """
    def is_ingredient_word(word):
        # Drop quantities, measurement units, common adjectives and conjunctions.
        return (
            not has_numbers(word)
            and word not in measurements
            and word not in adjectives
            and word != "and" and word != "or"
        )

    sum_correct = 0
    recipe_count = 0

    for recipe in tqdm(recipes):
        sections = _split_recipe_sections(recipe, model_name)
        if sections is None:
            continue
        ingredients, instructions = sections

        # Create list from ingredients, dropping empty strings. The comparison is by value:
        # the former `is not ''` identity test only worked through CPython string interning.
        ingredients = [ing for ing in ingredients.split("• ") if ing != '']
        if not ingredients:
            # Nothing to score; previously this raised ZeroDivisionError.
            continue

        # Extract raw ingredients from ingredients list (not perfect, uses basic string replacement)
        ingredients = [filter_words(is_ingredient_word, remove_symbols(ing)) for ing in ingredients]

        # Filter symbols from instructions
        instructions = remove_symbols(instructions)

        # Count number of ingredients found in instructions. An ingredient matches if any of its
        # remaining words occurs in the instructions; this is a substring test, not a whole-word test.
        num_ings_found = sum(
            any(word in instructions for word in ing.split())
            for ing in ingredients
        )
        sum_correct += num_ings_found / len(ingredients)
        recipe_count += 1

    if recipe_count == 0:
        # No recipe could be scored; previously this raised ZeroDivisionError.
        return 0.0

    return sum_correct / recipe_count

In [182]:
def load_file_recipes(file_dataset_path):
    """Read every recipe file under ``file_dataset_path`` and return their contents as a list.

    Files are visited recursively and ordered by the integer after the last "_" in the file
    stem (e.g. "recipe_12" -> 12). Names without a numeric suffix sort first (key 0), with
    ties kept in glob order. Directories and the "titles.txt" index file are skipped.

    Args:
        file_dataset_path: directory containing the generated recipe files.

    Returns:
        List of file contents (str), in natural sort order.
    """
    def numeric_suffix(path):
        _, separator, suffix = path.stem.rpartition("_")
        if not separator:
            return 0
        try:
            return int(suffix)
        except ValueError:
            # Suffix is not a number (e.g. "notes_final"); treat it like a name with no suffix.
            return 0

    recipes = []
    print("Reading recipe files")

    # Iterate through files in natural sort order
    for filepath in sorted(pathlib.Path(file_dataset_path).glob('**/*'), key=numeric_suffix):
        # '**/*' also yields directories, which cannot be opened; titles.txt is not a recipe
        if not filepath.is_file() or filepath.name == "titles.txt":
            continue

        # Recipes contain non-ASCII markers ("•", "🥣"), so do not depend on the locale encoding
        with open(filepath, encoding="utf-8") as f:
            recipes.append(f.read())

    return recipes

Ground truth recipes

In [183]:
import pandas as pd

# Score the first 500 ground-truth recipes to get a reference value for the generated ones.
truth_dataset_path = os.path.join(CACHE_DIR, 'text_recipes.pkl')
orig_recipes = pd.read_pickle(truth_dataset_path)
# Any model_name other than 'rnn' selects the <ING>/<INS> tagged parsing path.
truth_percentage = calc_used_ing_percentage(orig_recipes[:500].to_list(), 'truth')
# The printed value is a fraction in [0, 1], not a number out of 100.
print(f" Ground Truth Avg % of Ingredients Correctly Used {truth_percentage}")

100%|██████████| 500/500 [00:00<00:00, 5624.08it/s]

 Ground Truth Avg % of Ingredients Correctly Used 0.9769391030624002





Generated recipes where the input prompt is a single random word from a recipe title

In [184]:
# Load the fine-tuned GPT-2 outputs from disk and score them.
gpt2_dataset_path = os.path.join(CACHE_DIR, 'gpt2_finetuned_output_recipes')
gpt2_recipes = load_file_recipes(gpt2_dataset_path)

# 'gpt2' is not 'rnn', so the <ING>/<INS> tagged parsing path is used.
gpt2_percentage = calc_used_ing_percentage(gpt2_recipes, 'gpt2')
# The printed value is a fraction in [0, 1], not a number out of 100.
print(f" GPT-2 Avg % of Ingredients Correctly Used {gpt2_percentage}")

Reading recipe files


100%|██████████| 500/500 [00:00<00:00, 5430.28it/s]

 GPT-2 Avg % of Ingredients Correctly Used 0.9153300226904766





In [186]:
# Load the RNN outputs from disk and score them.
rnn_dataset_path = os.path.join(CACHE_DIR, 'rnn_output_recipes')
rnn_recipes = load_file_recipes(rnn_dataset_path)

# 'rnn' selects the parsing path that splits ingredients from instructions on "🥣".
rnn_percentage = calc_used_ing_percentage(rnn_recipes, 'rnn')
# The printed value is a fraction in [0, 1], not a number out of 100.
print(f" RNN Avg % of Ingredients Correctly Used {rnn_percentage}")

Reading recipe files


100%|██████████| 500/500 [00:00<00:00, 6793.08it/s]

 RNN Avg % of Ingredients Correctly Used 0.40686166354440284





Generated recipes where the input prompt is a random full recipe title

In [198]:
# Load the GPT-2 outputs generated from full-title prompts and score them.
# gpt2_title_dataset_path is reused by the BLEU-4 section further down.
gpt2_title_dataset_path = os.path.join(CACHE_DIR, 'gpt2_title_prompt_output_recipes')
gpt2_title_recipes = load_file_recipes(gpt2_title_dataset_path)

# 'gpt2' is not 'rnn', so the <ING>/<INS> tagged parsing path is used.
gpt2_title_percentage = calc_used_ing_percentage(gpt2_title_recipes, 'gpt2')
# The printed value is a fraction in [0, 1], not a number out of 100.
print(f" GPT-2 Avg % of Ingredients Correctly Used {gpt2_title_percentage}")

Reading recipe files


100%|██████████| 500/500 [00:00<00:00, 5093.12it/s]

 GPT-2 Avg % of Ingredients Correctly Used 0.9372337200484742





In [187]:
# Load the RNN outputs generated from full-title prompts and score them.
# rnn_title_dataset_path is reused by the BLEU-4 section further down.
rnn_title_dataset_path = os.path.join(CACHE_DIR, 'rnn_title_prompt_output_recipes')
rnn_title_recipes = load_file_recipes(rnn_title_dataset_path)

# 'rnn' selects the parsing path that splits ingredients from instructions on "🥣".
rnn_title_percentage = calc_used_ing_percentage(rnn_title_recipes, 'rnn')
# The printed value is a fraction in [0, 1], not a number out of 100.
print(f" RNN Avg % of Ingredients Correctly Used {rnn_title_percentage}")

Reading recipe files


100%|██████████| 500/500 [00:00<00:00, 7493.58it/s]

 RNN Avg % of Ingredients Correctly Used 0.4068141240118841





## BLEU-4

Measure the overall quality of the generated recipe texts with respect to the ground-truth recipes for the same title.

In [188]:
def recipe_to_ref_list(recipe):
    """Flatten one recipe record into the whitespace-separated token list used as a BLEU reference.

    ``recipe`` is indexed by the keys 'title', 'ingredients' and 'instructions'; the three
    fields are laid out as title, blank line, ingredients, instructions before tokenising.
    """
    sections = (recipe['title'], recipe['ingredients'], recipe['instructions'])
    reference_text = "{}\n\n{}\n{}".format(*sections)
    return reference_text.split()

In [219]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(candidate_dataset_path):
    """Average smoothed BLEU-4 of generated recipes against ground-truth recipes with the same title.

    Line i of ``titles.txt`` in ``candidate_dataset_path`` is taken as the title of the i-th
    recipe returned by ``load_file_recipes``. Every ground-truth recipe whose title equals
    that line is used as a reference for that candidate.

    Args:
        candidate_dataset_path: directory holding the generated recipe files and ``titles.txt``.

    Returns:
        Mean BLEU-4 over the candidates that have at least one reference; -1 when
        ``titles.txt`` does not exist; 0.0 when no title matched any reference.
    """
    titles_file = os.path.join(candidate_dataset_path, "titles.txt")

    # Check this first so the datasets are not loaded only to be thrown away.
    if not os.path.exists(titles_file):
        return -1

    with open(titles_file, encoding="utf-8") as f:
        titles = f.read().split('\n')

    reference_recipes = pd.read_pickle(os.path.join(CACHE_DIR, 'recipes.pkl'))
    candidates = load_file_recipes(candidate_dataset_path)

    smoothing_function = SmoothingFunction().method4

    cumulative_score = 0
    count = 0

    # zip() stops at the shorter sequence, so a trailing empty line in titles.txt (or any
    # surplus title) can no longer index past the end of `candidates`.
    for title, candidate in zip(titles, candidates):
        matching_recipes = reference_recipes.loc[reference_recipes['title'] == title]
        if len(matching_recipes) == 0:
            continue

        # Plain list of token lists: one reference per ground-truth recipe with this title.
        references = [recipe_to_ref_list(row) for _, row in matching_recipes.iterrows()]
        cumulative_score += corpus_bleu([references], [candidate.split()], smoothing_function=smoothing_function) # BLEU-4
        count += 1

    if count == 0:
        # No candidate had a reference; previously this raised ZeroDivisionError.
        return 0.0

    return cumulative_score / count

In [220]:
# Average BLEU-4 of the GPT-2 recipes generated from full-title prompts.
calculate_bleu(gpt2_title_dataset_path)

Reading recipe files


0.08187925854405792

In [216]:
# Average BLEU-4 of the RNN recipes generated from full-title prompts.
calculate_bleu(rnn_title_dataset_path)

Reading recipe files


0.05972477100652108