In [1]:
#| default_exp utils.utils

In [2]:
#| export
import sys

from pyprojroot import here
root = here()

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sacremoses import MosesTokenizer, MosesDetokenizer
mt, md = MosesTokenizer(lang='en'), MosesDetokenizer(lang='en')
from nltk.stem import WordNetLemmatizer

import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

import json
from itertools import groupby
import re
import string

from tqdm import tqdm
tqdm.pandas()

from fractions import Fraction
from word2number import w2n
import logging

This notebook collects the project's shared utility functions; cells marked `#| export` are written to the `utils.utils` module.

In [3]:
#| export
# Unit definitions, read once when the module is loaded. The code below indexes
# the result as unit_list[unit_type][unit]['matches'].
with open(root / 'config' / 'unit_conversions.json') as f:
    unit_list = json.load(f)

# Ingredient Cleaner

In [4]:
#| export
# WordNet lemmatizer used to reduce ingredient tokens to their dictionary form
lemmatizer = WordNetLemmatizer()

# NLTK's English stop words with 'can' taken out, so that 'can' survives cleaning
# (presumably because it is a meaningful word in ingredient text - TODO confirm)
stop_words = {word for word in stopwords.words('english')}
stop_words.remove('can')

In [5]:
#| export
def detokenize(tokens):
    """Join Moses tokens back into one string and close up spaced slashes.

    After detokenizing, every slash that has one whitespace character on each
    side is rewritten as a bare '/'.
    """
    return re.sub(r'\s/\s', '/', md.detokenize(tokens))

In [6]:
#| export
def clean_ingredient_string(ingredient):
    """Normalise an ingredient description for matching.

    Null or otherwise falsy values are returned untouched. Anything else is
    lower-cased, stripped of leading non-word characters and of the characters
    ( ) , . and then tokenized; stop words are dropped and each remaining
    token is lemmatized before the tokens are joined back into a string.
    """
    if pd.isnull(ingredient) or not ingredient:
        return ingredient

    text = str(ingredient).lower()
    text = re.sub(r'^\W+', '', text)    # punctuation left at the start by poor parsing
    text = re.sub(r'[(),.]', '', text)  # only the bracket/comma/full-stop characters go; the words inside brackets stay

    kept_tokens = []
    for token in mt.tokenize(text.strip()):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return detokenize(kept_tokens)

In [7]:
# cleaning works token by token: a phrase cleans to the same text as its words cleaned separately
assert clean_ingredient_string('pecan halves') == clean_ingredient_string('pecan') + ' ' + clean_ingredient_string('halves')
# plurals are lemmatized to the singular
assert clean_ingredient_string('raspberries') == 'raspberry'
# an empty string is handed back unchanged
assert clean_ingredient_string('') == ''

# DF Text Matching

In [8]:
#| export
def contains_whole_word(string, search_word):
    """Tell whether `search_word` occurs in `string` as a whole word.

    The word is matched literally (regex metacharacters are escaped), must be
    delimited by word boundaries on both sides, and the match is case-sensitive.
    Returns True or False.
    """
    whole_word_pattern = r"\b" + re.escape(search_word) + r"\b"
    return re.search(whole_word_pattern, string) is not None
    
def count_list_matches(list, search_strings):
    """Count the (item, search string) pairs where the string is a whole word of the item.

    Every item is checked against every search string, so one item can add
    more than one to the total.
    """
    return sum(
        1
        for item in list
        for search_string in search_strings
        if contains_whole_word(item, search_string)
    )

# Unit Tagger

In [9]:
#| export
def train_unit_tagger(matcher):
    """Register the unit patterns and a generic 'numeric' pattern on `matcher`.

    For every unit in unit_list, one pattern is added per spelling: the unit's
    own name (underscores read as spaces) plus each entry of its 'matches'
    list. A spelling is split on single spaces and each word is matched against
    the token's LOWER attribute. Tokens whose POS is NUM are tagged 'numeric'.

    Returns the matcher that was passed in.
    """
    for units in unit_list.values():
        for unit, unit_config in units.items():
            spellings = [unit.replace('_',' ')] + unit_config['matches']
            patterns = []
            for spelling in spellings:
                patterns.append([{'LOWER': word} for word in spelling.split(" ")])
            matcher.add(unit, patterns)

    matcher.add("numeric", [[{"POS": "NUM"}]])

    return matcher

# Shared pipeline and matcher used by tag_units. train_unit_tagger returns the
# matcher it was given, so the freshly built Matcher can be passed straight in.
nlp = spacy.load("en_core_web_sm")
unit_tagger = train_unit_tagger(Matcher(nlp.vocab))

In [10]:
# Demo: run the tagger over a sample sentence and list what it finds.
doc = nlp("I need 2 1/2 tbsp of oz sugar and 500 ml of milk and a pint what about a fl oz and 1 g of salt")
matches = unit_tagger(doc)

# where tags overlap, keep only the matches whose span survives filter_spans
spans = [doc[start:end] for _, start, end in matches]
filtered_idxs = {(span.start, span.end) for span in filter_spans(spans)}
matches = [(match_id, start, end) for match_id, start, end in matches if (start, end) in filtered_idxs]

# one line per match: hash id, tag name, token offsets and the matched text
for match_id, start, end in matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)

15619426672489268670 numeric 2 3 2
15619426672489268670 numeric 3 4 1/2
4504748935369724861 tablespoon 4 5 tbsp
15212922390252438877 ounce 6 7 oz
15619426672489268670 numeric 9 10 500
5673684224853602733 milliliter 10 11 ml
8861518500936608369 pint 15 16 pint
3579143477707580839 fluid_ounce 19 21 fl oz
15619426672489268670 numeric 22 23 1
3550475366760073739 gram 23 24 g


To deal with various spellings for units, we could tag them with our NLP model to keep consistency.

What we want here is a function which tags a string with the 'base' spellings of the units. What form should its output take?

- The string with the tagged 'base' spelling replacing the words
- Lists: One for the tags and another for the remainder

How we want to output this depends on what we are going to do with the outputs. What are we going to do? 
The unit doesn't actually matter that much. What matters more is the type of food that we want, which is found mostly in the description/comment of the food. The unit search matters most when it's not actually a standard unit.

In [11]:
from itertools import zip_longest

1. check for match indices
2. go through and correct each match index
3. remove values with previous matches

In [12]:
#| export
# numeric tags which come consecutively should be read as a single tag eg. 1 1/2.
def join_repeated_numeric_tags(matches):
    """Merge runs of touching 'numeric' matches into a single match each.

    `matches` holds (match_id, start, end, tag) tuples. A match continues the
    run of the match before it when both are tagged 'numeric' and the previous
    match ends exactly where this one starts. A merged run keeps the id, start
    and tag of its first match and the end of its last one. All other matches
    are copied through as 4-tuples.
    """
    merged = []
    previous = None
    for current in matches:
        continues_run = (
            previous is not None
            and previous[3] == 'numeric'
            and current[3] == previous[3]
            and previous[2] == current[1]
        )
        if continues_run:
            # stretch the entry already emitted for this run up to the current end
            match_id, start, _, tag = merged[-1]
            merged[-1] = (match_id, start, current[2], tag)
        else:
            merged.append((current[0], current[1], current[2], current[3]))
        previous = current

    return merged

In [13]:
'one'.isnumeric()

False

In [14]:
#| export
# Common fraction spellings. No other cell visible in this notebook reads this list.
fractions = ['1/2', '1/3', '2/3', '1/4', '3/4', '1/5', '2/5', '3/5', '4/5']

In [15]:
6/8

0.75

In [16]:
#| export
def convert_fractions_to_decimal(numeric_string, max_fraction=0.75):
    """Add up the space-separated number tokens of a string and return the total as a string.

    Each token is read as follows:
      * a range such as '4-6' keeps only the part before the first '-';
      * 'a/b' is a fraction when a/b <= max_fraction (eg. '1/2' -> 0.5, '3/4' -> 0.75);
        a larger ratio is read as an either/or measure and keeps only 'a' (eg. '7/8' -> 7);
      * whatever float() rejects is handed to w2n.word_to_num (eg. 'one' -> 1);
      * tokens that still cannot be parsed add nothing to the total.

    NOTE(review): the range rule also applies to hyphenated number words, so
    'twenty-five' is read as 'twenty'.

    Args:
        numeric_string: text such as '1 1/2', '4-6' or 'one'.
        max_fraction: largest ratio (inclusive) that is still treated as a fraction.

    Returns:
        The total as a float rendered with str(), eg. '1.5'.
    """
    final_number = 0.0
    for string_num in numeric_string.split(" "):
        if '-' in string_num:
            string_num = string_num.split('-')[0] # range of numbers (eg. 4-6 onions)
        if '/' in string_num:
            try:
                split = string_num.split('/')
                # inclusive comparison: 3/4 itself is a fraction, only larger ratios
                # are either/or measures (eg. 7/8 onions)
                if (float(split[0])/float(split[1])) <= max_fraction:
                    string_num = str(float(Fraction(string_num)))
                else:
                    string_num = split[0]
            except (ValueError, ZeroDivisionError):
                # malformed fraction (eg. 'a/b' or '1/0'): leave the token as it is
                # and let the generic parsing below deal with it
                pass
        try:
            final_number += float(string_num)
        except ValueError:
            try:
                final_number += w2n.word_to_num(string_num) # number word (eg. one onion)
            except ValueError:
                pass
    return str(final_number)

In [17]:
# number words are converted
assert convert_fractions_to_decimal('one') == '1.0'
# whole number and fraction tokens are added together
assert convert_fractions_to_decimal('1 1/2') == '1.5'
# a range keeps its first number
assert convert_fractions_to_decimal('4-6') == '4.0'
# a ratio too large to be a fraction is read as either/or and keeps its first number
assert convert_fractions_to_decimal('7/8') == '7.0'
# text with no number in it adds nothing
assert convert_fractions_to_decimal('pie') == '0.0'

In [18]:
#| export 
# we want to tag the units as their actual values
def tag_numerics_with_float_value(matches):
    """Return a copy of `matches` with each tag replaced by its decimal-string value.

    Every match is a 4-tuple; the first three fields are copied unchanged and
    the fourth is passed through convert_fractions_to_decimal.

    NOTE(review): the conversion is applied to every match, not only numeric
    ones, and text without a number converts to '0.0' - confirm that callers
    only pass matches whose fourth field is numeric text.

    Returns:
        A new list of 4-tuples (an empty list for empty input).
    """
    # the original built this list but never returned it, so callers always got None
    return [ (match[0], match[1], match[2], convert_fractions_to_decimal(match[3])) for match in matches ]

In [19]:
#| export
def get_unit_type(unit_tags):
    """Return the first unit type, in unit_list order, that contains any of the tags.

    Falls back to 'portion' when none of the tags is a unit of any type.
    """
    for unit_type, units in unit_list.items():
        if any(tag in units for tag in unit_tags):
            return unit_type
    return 'portion'

In [20]:
#| export
def tag_units(phrase):
    """Split a phrase into unit/number tags and the leftover words.

    Args:
        phrase: free text such as 'serving 1/2 cup'; null values are accepted.

    Returns:
        A tuple (unit_tags, remainders, unit_type):
          * unit_tags  - tag names in match order; a run of adjacent numeric
                         tokens becomes one decimal string (eg. '1 1/2' -> '1.5').
          * remainders - the untagged tokens with punctuation removed; tokens
                         left empty by that are dropped.
          * unit_type  - result of get_unit_type(unit_tags).
        A null phrase gives ([], [], 'portion').
    """
    if pd.isnull(phrase):
        return ([], [], 'portion')

    doc = nlp(phrase)
    raw_matches = unit_tagger(doc)

    # where tags overlap, keep only the matches whose span survives filter_spans
    surviving_spans = filter_spans([doc[start:end] for _, start, end in raw_matches])
    surviving_bounds = [(span.start, span.end) for span in surviving_spans]

    # attach the tag name (string form of the match id) as a fourth field
    tagged = [
        (match_id, start, end, nlp.vocab.strings[match_id])
        for match_id, start, end in raw_matches
        if (start, end) in surviving_bounds
    ]
    tagged = join_repeated_numeric_tags(tagged)

    # numeric tags take the decimal value of the text they cover
    valued = []
    for match_id, start, end, tag in tagged:
        if tag == 'numeric':
            tag = convert_fractions_to_decimal(str(doc[start:end]))
        valued.append((match_id, start, end, tag))

    # remaining non-tagged tokens, with punctuation stripped and empties dropped
    tagged_positions = {position for _, start, end, _ in valued for position in range(start, end)}
    punctuation_remover = str.maketrans('', '', string.punctuation)
    remainders = []
    for position in range(len(doc)):
        if position in tagged_positions:
            continue
        word = str(doc[position]).translate(punctuation_remover).strip()
        if word:
            remainders.append(word)

    unit_tags = [tag for _, _, _, tag in valued]
    unit_type = get_unit_type(unit_tags)

    return unit_tags, remainders, unit_type

In [21]:
tag_units('one')

(['1.0'], [], 'portion')

In [22]:
# adjacent numeric tokens are merged and summed: '1 1/2' -> '1.5', '3 1/2 5' -> '8.5'
assert tag_units('portion of 1 1/2 or 3 1/2 5')[0] == (['portion', '1.5', '8.5'])
# 'serving' is tagged with the base unit name 'portion'
assert tag_units('serving 1/2 cup')[0] == ['portion', '0.5', 'cup']
# 'fl oz' is tagged as a single two-token unit; the untagged words come back as remainders
assert tag_units('cup fl oz chicken stock') == (['cup', 'fluid_ounce'], ['chicken', 'stock'], 'volume')

In [23]:
tag_units('serving 1/2 cup')

(['portion', '0.5', 'cup'], [], 'volume')

# Memory Cleaning

In [24]:
#| export
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified

    Format a size with binary prefixes and one decimal place, eg. 1536 -> '1.5 KiB'.
    The value is divided by 1024 until its magnitude drops below 1024; anything
    still larger after the 'Zi' step is reported in 'Yi'.
    '''
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    for prefix in binary_prefixes:
        if not abs(value) < 1024.0:
            value = value / 1024.0
            continue
        return "%3.1f %s%s" % (value, prefix, suffix)
    return "%.1f %s%s" % (value, 'Yi', suffix)

def show_var_sizes(namespace=None, limit=10):
    """Print the largest variables of a namespace, biggest first.

    Sizes come from sys.getsizeof, i.e. they cover the object itself and not
    the objects it refers to.

    Args:
        namespace: mapping of variable name to value, eg. a notebook's
            globals(). Defaults to the local variables of the caller, which at
            the top level of a notebook cell or module are its globals.
        limit: maximum number of variables to print.
    """
    if namespace is None:
        # locals() evaluated in here only ever sees this function's own (empty)
        # scope, so look at the frame that called us instead
        namespace = sys._getframe(1).f_locals
    largest = sorted(((name, sys.getsizeof(value)) for name, value in list(
                      namespace.items())), key= lambda x: -x[1])[:limit]
    for name, size in largest:
        print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [25]:
#| export
def clear_variable_cache(namespace=None):
    """Set cached-output style variables to None and return how many were reset.

    A name is reset when it consists solely of underscores ('_', '__', ...) or
    starts with an underscore, an optional lowercase letter and then digits
    ('_12', '_i12', ...) - presumably the history variables IPython creates.

    Args:
        namespace: dict of variables to clean, eg. a notebook's globals().
            Defaults to the global namespace of the caller.

    Returns:
        The number of names that were set to None.
    """
    if namespace is None:
        # a bare dir() in here lists only this function's locals, and globals()
        # is the namespace of the defining module, so neither reaches the
        # variables of whoever called us
        namespace = sys._getframe(1).f_globals
    count = 0
    for name in list(namespace): # snapshot of the names: the dict is written to inside the loop
        if re.search(r'^_[a-z]?[0-9]+', name) or re.search(r'^_+$', name):
            count += 1
            namespace[name] = None
    return count

In [26]:
from nbdev import nbdev_export; nbdev_export()