In [None]:
# imports

#standard
from collections import Counter, defaultdict, namedtuple
import re

# extra
from funcy import memoize
import pandas as pd

# local
from data_helpers import *

In [None]:
# read in dirty data, keyed by recipe id
train = pd.read_json('data/train.json', orient='records', encoding='utf-8').set_index('id', drop=True)
test = pd.read_json('data/test.json', orient='records', encoding='utf-8').set_index('id', drop=True)

# the test frame has no 'cuisine' column; add a placeholder one up front
test.insert(0, 'cuisine', 'test')

# stack train on top of test so both are cleaned in one pass
data = pd.concat([train, test], axis=0)

In [None]:
# record for one parsed ingredient; the last two fields carry the recipe's cuisine and id
Ing = namedtuple(
    'Ing',
    'string head mods states brands langs cuisine rcpid',
)

In [None]:
# transform a dirty ingredient string into the per-ingredient fields of an Ing
@memoize
def make_ing(orig_phrase):
    """Parse one raw ingredient phrase into normalized components.

    The phrase is lowercased, cleaned up with the module-level regex/lookup
    tables, stripped of a brand name and trailing prep text, then split into
    words. The last surviving word is the head; the remaining words are
    modifiers.

    Results are cached per input string by ``memoize``, which is why lists
    are rebound (``x = x + [...]``, ``x = x[:-1]``) instead of mutated.

    Args:
        orig_phrase: the raw ingredient string.

    Returns:
        A 6-tuple ``(string, head, mods, states, brands, langs)``:
          * string -- sorted modifier words followed by the head, space-joined
          * head   -- the last word left after cleanup
          * mods   -- ``'<head>-<mod>'`` per modifier, or ``[head]`` if none
          * states -- ``'_state-<first 5 chars>'`` per word in ``state_words``
          * brands -- at most one ``'_brand-<first 4 chars>'`` tag
          * langs  -- ``'_lang-<lang_trans[word]>'`` per word in ``lang_trans``
        If no words survive, ``orig_phrase`` is printed and a tuple of six
        empty lists is returned (so ``head`` is falsy).
    """
    # WARNING: these are done in this order for reasons
    
    phrase = orig_phrase.lower()

    # standardize 'n' and '&' to 'and'
    phrase = phrase.replace('&', ' and ')
    phrase = phrase.replace(' n ', ' and ')
    
    # strip extra whitespace
    phrase = ' '.join(phrase.split())
    
    # correct special spelling words
    # WARNING: strip extra whitespace first
    # compiled_spelling yields (pattern, replacement) pairs, applied in sequence
    for k, v in compiled_spelling:
        phrase = k.sub(v, phrase)    
    
    # handle exceptions
    # WARNING: correct special spelling first
    # compiled_exceptions yields (pattern, replacement) pairs, applied in sequence
    for k, v in compiled_exceptions:
        phrase = k.sub(v, phrase)

    # remove parentheticals
    phrase = parenthetical_pattern.sub('', phrase)

    # remove useless chars
    # WARNING: remove parentheticals before removing individual '()' chars
    # NOTE(review): the literal below looks like mis-decoded UTF-8 punctuation -- confirm
    phrase = phrase.replace('â€', '')
    phrase = char_pattern.sub('', phrase)

    # sub and move brands
    # only the first brand match is handled; every occurrence of that matched
    # text is replaced by its brands_to_sub entry, or removed if it has none
    brands = []
    match = compiled_brands.search(phrase)
    if match:
        brand = match.group(0)
        sub = brands_to_sub[brand] if brand in brands_to_sub else ''
        phrase = phrase.replace(brand, sub)
        # tag is the first 4 characters of the brand with spaces removed
        brand = '_brand-{}'.format(brand.replace(' ', '')[:4])
        brands = brands + [brand] # avoid mutations when memoized
    
    # remove chars kept for brand recognition
    # WARNING: handle brands first
    phrase = brand_char_pattern.sub('', phrase)
    
    # remove trailing prep instructions
    # keep only the text before the first ',' and then before the first ' for '
    phrases = phrase.split(',')
    phrase = phrases[0]
    phrases = phrase.split(' for ')
    phrase = phrases[0]
    
    # move either 'with x' or 'in x' phrases to front
    # WARNING: handle exceptions first
    # only the first two pieces are used; anything after a second ' with ' / ' in '
    # is discarded -- NOTE(review): confirm that is intended
    phrases = phrase.split(' with ')
    if len(phrases) > 1:
        phrase = ' '.join([phrases[1], phrases[0]])
    else:
        phrases = phrase.split(' in ')
        if len(phrases) > 1:
            phrase = ' '.join([phrases[1], phrases[0]])
    
    # sub ' of (the)? ' with 'of'
    phrase = of_pattern.sub(r'\1of', phrase)
    # merge modwords
    phrase = low_pattern.sub(' low', phrase)
    phrase = free_pattern.sub('free ', phrase)
    phrase = high_pattern.sub(' high', phrase)
    
    # make ing
    # drop stopwords, then spell-correct only words longer than 4 characters
    words = [word for word in phrase.split() if word not in stopwords]
    words = [correct_spelling(word) if len(word) > 4 else word for word in words]
    # expand words listed in words_to_segment into their space-separated parts
    phrases = [words_to_segment[word].split() if word in words_to_segment else [word] for word in words]
    words = [get_lemma(word) for words in phrases for word in words] # get lemma and flatten
    # NOTE(review): remove_first_dupes is defined elsewhere -- presumably drops
    # earlier repeats of a word; confirm
    words = remove_first_dupes(words)
    langs = ['_lang-{}'.format(lang_trans[word]) for word in words if word in lang_trans]
    # state words become tags (first 5 characters) and are removed from the word list
    states = ['_state-{}'.format(word[:5]) for word in words if word in state_words]
    words = [word for word in words if word not in state_words]
    # strip trailing words listed in heads_to_drop; the loop runs at most once
    # per initial word, so words[-1] is never read from an empty list
    for _ in range(len(words)):
        head = words[-1]
        if head in heads_to_drop:
            words = words[:-1] # avoid mutations when memoized
            # if dropping emptied the list, fall back to the substitute head if one exists
            if not words and head in heads_to_sub:
                words = [heads_to_sub[head]]
        else:
            break
    if not words:
        # nothing usable left: report the raw phrase and return an all-empty result
        print(orig_phrase)
        return ([], [], [], [], [], [])
    # append the mapped word when the last word is a key of supertype_appends
    if words[-1] in supertype_appends:
        words = words + [supertype_appends[words[-1]]] # avoid mutations when memoized
    words = remove_first_dupes(words)
    # last word is the head; everything before it is a modifier, sorted alphabetically
    head = words[-1]
    mod_words = sorted(words[:-1])
    string = ' '.join(mod_words + [head])
    if not mod_words:
        mods = [head]
    else:
        mods = ['{}-{}'.format(head, mod) for mod in mod_words]
    return (string, head, mods, states, brands, langs)

In [None]:
def make_ings(recipe):
    """Build the list of Ing tuples for one recipe row.

    Every entry of ``recipe.ingredients`` is run through ``make_ing``;
    phrases whose parsed head is empty are skipped. Each kept ingredient is
    tagged with ``recipe.cuisine`` and ``recipe.name``.
    """
    parsed_ings = []
    for raw_phrase in recipe.ingredients:
        string, head, mods, states, brands, langs = make_ing(raw_phrase)
        if head:
            parsed_ings.append(Ing(
                string=string,
                head=head,
                mods=mods,
                states=states,
                brands=brands,
                langs=langs,
                cuisine=recipe.cuisine,
                rcpid=recipe.name,
            ))
    return parsed_ings

In [None]:
# empty both memo caches before the full run
get_lemma.memory.clear()
make_ing.memory.clear()

# parse every recipe row into its list of Ing tuples
ings_series = data.apply(make_ings, axis=1)
ings_series.shape[0] # 49718

In [None]:
# apply `flatten` to each recipe's list of Ing tuples
# NOTE(review): `flatten` is not defined in this notebook -- presumably it comes
# from data_helpers; confirm how it treats the nested list fields of each Ing
flattened_ings = ings_series.map(flatten)

In [None]:
# per recipe, the normalized string of every parsed ingredient, as a Series named 'strings'
strings = ings_series.map(
    lambda recipe_ings: [parsed.string for parsed in recipe_ings]
).rename('strings')

In [None]:
# save
# Build the cleaned frame in a single `assign`: it replaces the existing
# 'ingredients' column and adds 'strings' -- or overwrites it if this cell is
# re-run -- so a second run cannot append a duplicate 'strings' column the way
# a repeated concat would. It also avoids attribute-style column assignment,
# which only targets a column when that column already exists.
data = data.assign(ingredients=flattened_ings, strings=strings)
data.to_csv('data/cleaned_data.csv', header=True, encoding='utf-8')