In [1]:
import csv
from functools import lru_cache

import floret
import nltk.stem.porter as nsp
import numpy as np
import spacy

In [2]:
# Only tokenisation and per-token flags are needed from spaCy in this notebook,
# so the listed pipeline components are disabled to keep each NLP(...) call fast.
NLP = spacy.load(
    "en_core_web_md",
    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
)
# Single shared Porter stemmer instance, used by stem().
STEMMER = nsp.PorterStemmer()

In [3]:
@lru_cache(maxsize=512)
def stem(token: str, to_lowecase: bool) -> str:
    """Return the Porter stem of ``token``, memoising recent results.

    The stemmer gives the same stem for the same input every time, so the
    result is stored in an LRU cache (up to 512 entries) on first use and
    served from there on later calls instead of being recomputed.

    Parameters
    ----------
    token : str
        Token to stem.
    to_lowecase : bool
        Forwarded to ``PorterStemmer.stem`` as its ``to_lowercase`` argument.
        The misspelt parameter name is kept because callers pass it by keyword.

    Returns
    -------
    str
        Stem of token.
    """
    stemmed = STEMMER.stem(token, to_lowercase=to_lowecase)
    return stemmed

In [4]:
def prepare(document: str) -> list[str]:
    """Tokenise a piece of text and reduce it to a list of stems.

    The text is run through the spaCy pipeline ``NLP``; punctuation, currency
    symbols, digits, whitespace, stop words and number-like tokens are
    discarded, and every remaining token is stemmed with
    ``stem(..., to_lowecase=True)``.

    Parameters
    ----------
    document : str
        Raw text to prepare.

    Returns
    -------
    list[str]
        Stems of the retained tokens, in document order.
    """
    stems = []
    for token in NLP(document):
        # Skip anything that is not a content word.
        if (
            token.is_punct
            or token.is_currency
            or token.is_digit
            or token.is_space
            or token.is_stop
            or token.like_num
        ):
            continue
        stems.append(stem(str(token), to_lowecase=True))
    return stems

In [25]:
def word_similarity(word1: str, word2: str, model) -> float:
    """Calculate similarity between two word embeddings.

    The score is the sigmoid of the reciprocal euclidean distance between the
    two embedding vectors: ``1 / (1 + exp(-1 / distance))``.

    * A distance of exactly 0 (e.g. the same word) gives 1.0.
    * An infinite distance gives 0.0.
    * Any other distance gives a value between 0.5 and 1, because the
      reciprocal distance is positive and the sigmoid of a positive number
      is above 0.5. Scores below 0.5 are therefore only produced by the
      infinite-distance special case.

    Parameters
    ----------
    word1 : str
        First word.
    word2 : str
        Second word.
    model : floret
        Embeddings model; ``model[word]`` must return the word's vector.

    Returns
    -------
    float
        Similarity score, where 1.0 indicates an exact match.
    """
    euclidean_dist = np.linalg.norm(model[word1] - model[word2])

    # Always hand back a Python float so the return type matches the
    # annotation in every branch (the special cases used to return ints).
    if euclidean_dist == 0:
        return 1.0
    if np.isinf(euclidean_dist):
        return 0.0

    # The arithmetic is left in the dtype of the distance so scores are
    # numerically unchanged; only the final value is converted.
    return float(1 / (1 + np.exp(-1 / euclidean_dist)))

In [26]:
def doc_similarity(word: str, document: list[str], model) -> float:
    """Calculate the similarity of word to document.

    The score is the highest ``word_similarity`` between ``word`` and any
    token of ``document``, i.e. the similarity to the closest token.

    Parameters
    ----------
    word : str
        Word to calculate membership of.
    document : list[str]
        Tokens of the document to calculate word membership to.
    model : floret
        Embeddings model, passed through to ``word_similarity``.

    Returns
    -------
    float
        Membership score, where 1 indicates an exact match. An empty
        document yields 0.0 (no token to match against) instead of raising.
    """
    # default= covers the empty-document case, where max() would otherwise
    # raise ValueError on an empty sequence.
    return max((word_similarity(word, d, model) for d in document), default=0.0)

In [39]:
def fuzzy_document_distance(
    document1: list[str], document2: list[str], model
) -> float:
    """Calculate fuzzy document distance between two documents.

    Implementation of https://doi.org/10.1109/ACCESS.2021.3058559.

    For every distinct token of the two documents, its membership in each
    document is taken from ``doc_similarity``. The per-token product of the
    two memberships is summed as the fuzzy intersection, and the union is
    ``sum(memberships1) + sum(memberships2) - intersection``. The distance is
    ``1 - intersection / union``.

    Parameters
    ----------
    document1 : list[str]
        Tokens for first document.
    document2 : list[str]
        Tokens for second document.
    model : floret
        Embeddings model; must support ``token in model`` and is passed
        through to ``doc_similarity``.

    Returns
    -------
    float
        Distance between the documents (smaller means more similar), or
        ``float("inf")`` when either document has no in-vocabulary tokens.
    """
    # Remove out-of-vocabulary words.
    # ! Is this necessary?
    document1 = [token for token in document1 if token in model]
    document2 = [token for token in document2 if token in model]

    # If either document only contains out-of-vocab words, return infinite distance.
    if not document1 or not document2:
        return float("inf")

    intersection_membership = 0.0
    cj1_membership = 0.0
    cj2_membership = 0.0

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point sums are accumulated in the same order on every run
    # (set iteration order for strings changes with hash randomisation).
    tokens = dict.fromkeys(document1 + document2)
    for token in tokens:
        # Each membership is computed once and reused; it is the expensive part.
        membership1 = doc_similarity(token, document1, model)
        membership2 = doc_similarity(token, document2, model)
        intersection_membership += membership1 * membership2
        cj1_membership += membership1
        cj2_membership += membership2

    # NOTE(review): the union is non-zero as long as doc_similarity gives a
    # positive score for a token that is present in the document — confirm.
    union_membership = cj1_membership + cj2_membership - intersection_membership
    return 1 - intersection_membership / union_membership

In [40]:
# Load the floret embeddings model from its binary file; the path is relative
# to the current working directory.
model = floret.load_model("test_embeddings.floret.bin")

In [44]:
# Spot check on two hand-written token lists; they are passed as-is, without
# going through prepare().
fuzzy_document_distance(["red", "pepper"], ["pepper", "bell", "green", "raw"], model)

0.3849511742591858

In [45]:
# Collect the "description" column of data/fdc_ingredients.csv, in file order.
# newline="" hands line-ending handling to the csv module, which is what its
# documentation requires for quoted fields containing embedded newlines to be
# parsed correctly.
with open("data/fdc_ingredients.csv", "r", newline="") as f:
    reader = csv.DictReader(f)
    fdc_ingredients = [row["description"] for row in reader]

In [46]:
# Score every FDC ingredient description against the query "red onion".
search = prepare("red onion")
scores = []
for i, ing in enumerate(fdc_ingredients):
    prepared_fdc = prepare(ing)
    distance = fuzzy_document_distance(search, prepared_fdc, model)
    scores.append((distance, i))

# Print the five ingredients with the smallest distance, closest first.
closest = sorted(scores, key=lambda pair: pair[0])[:5]
for score, idx in closest:
    print(f"{fdc_ingredients[idx]}:\t {score}")

Onions, red, raw:	 0.16311589876810706
Cabbage, red, raw:	 0.3611718416213989
Onions, yellow, raw:	 0.3620023876428604
Onions, white, raw:	 0.3627890944480896
Peppers, bell, red, raw:	 0.38058264255523677
