In [1]:
#|default_exp utils.join_utils

In [3]:
#|export
from pyprojroot import here
root = here()
import sys
sys.path.append(str(root))

In [4]:
#|export
from nltk.corpus import wordnet
import warnings
from food_database.utils.utils import *

In [6]:
import pandas as pd
ingredients_df = pd.read_feather(f'{root}/data/local/recipe/partial/ingredients/0.feather')
ingredients = list(ingredients_df.iloc[:10]['name.name'])

In [7]:
#| export 
def get_synset(ingredient):

    synsets = wordnet.synsets(ingredient)
    if not synsets: return None

    filtered = [w for w in synsets if 'food' in w.lexname()]
    if filtered: synsets = filtered
    filtered = [w for w in synsets if ingredient in w.name()]
    if filtered: synsets = filtered

    return synsets[0]

We return indexes here to select the right synonyms. The synonyms method seems to use the synset, but uses a simplified API without going through the 

Some words are causing issues.

In [8]:
#| export
excluded_words = [
    'cut'
]

In [9]:
def clean_word(word):
    tokens = mt.tokenize(word)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return detokenize(tokens)

In [14]:
#| export
def get_food_hypernyms(synset):

    with warnings.catch_warnings(): # closure throws warning if it exceeds depth limit
        warnings.simplefilter("ignore")
        hypernyms = list(synset.closure(lambda x: x.hypernyms(), depth=5))

    hypernyms = [ word.name().split('.')[0] for word in hypernyms ]
    
    try:
        hypernyms = hypernyms[:(hypernyms.index('food'))] 
    except ValueError:
        pass
    
    hypernyms = hypernyms[:7]
    
    return hypernyms

def flatten_list(l):
    return [x for xs in l for x in xs]

def clean_word(word):
    tokens = mt.tokenize(word)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return detokenize(tokens)

def clean_alt_words(alt_words):
    alt_words = [reversed(w.split('_')) for w in alt_words]
    alt_words = flatten_list(alt_words)
    alt_words = [clean_word(w) for w in alt_words]
    alt_words = [w for w in alt_words if w not in excluded_words]
    alt_words = list(filter(None, alt_words))
    return alt_words

def find_alt_words(word):

    if not isinstance(word, str) or word == '': return 

    synset = get_synset(word)
    if not synset: return
       
    synonyms = list(set(synset.lemma_names()) - {word})
    hypernyms = get_food_hypernyms(synset)

    alt_words = [ *synonyms, *hypernyms ]
    
    alt_words = clean_alt_words(alt_words)

    alt_words = alt_words[:10]

    return alt_words

In [11]:
assert 'eggplant' in find_alt_words('aubergine')
assert 'bread' in find_alt_words('baguette')

In [12]:
find_alt_words('bacon')

['pork', 'meat']

In [16]:
find_alt_words('zucchini')

['courgette'] ['summer_squash', 'squash', 'vegetable', 'produce']


['courgette', 'squash', 'summer', 'squash', 'vegetable', 'produce']

We have a particular problem here, in that sometimes phrases are returned ('cut of pork'). What can we do about this? We could separate the words, however they only hold their meaning when they are together eg. baked_goods. We don't necessarily want the 'goods' doesn't mean anything really, and we don't necessarily want to word baked in there without it. This actually only stands if there is only one search term here. Likely these words will be refining terms rather than a search solely on this, in which case it should work. 

Lets do what we have done with our standard ingredient strings: separate them and reorder them so the last noun comes first.

In [11]:
from nbdev import nbdev_export; nbdev_export()

Note nbdev2 no longer supports nbdev1 syntax. Run `nbdev_migrate` to upgrade.
See https://nbdev.fast.ai/getting_started.html for more information.
  warn(f"Notebook '{nbname}' uses `#|export` without `#|default_exp` cell.\n"
Note nbdev2 no longer supports nbdev1 syntax. Run `nbdev_migrate` to upgrade.
See https://nbdev.fast.ai/getting_started.html for more information.
  warn(f"Notebook '{nbname}' uses `#|export` without `#|default_exp` cell.\n"
