In [None]:
# imports

#standard
from collections import Counter, defaultdict, namedtuple
from itertools import chain
import re

# extra
from funcy import memoize
import pandas as pd

# local
from data_helpers import *

In [None]:
# Load the raw train/test recipe records and index each frame by recipe id.
train = pd.read_json('data/train.json', orient='records', encoding='utf-8').set_index('id', drop=True)
test = pd.read_json('data/test.json', orient='records', encoding='utf-8').set_index('id', drop=True)

# Give the test rows a placeholder 'cuisine' value of 'test' as their first
# column, then stack both frames row-wise into one frame.
test.insert(0, 'cuisine', 'test')
data = pd.concat([train, test], axis=0)

In [None]:
@memoize
def clean_phrase(orig_phrase):
    """Normalise one raw ingredient phrase into a cleaned, lower-case phrase.

    The steps run in this order:

    1. lower-case, then delete everything ``char_pattern`` matches;
    2. rewrite '&' and ' n ' as ' and ', and '-' as a space;
    3. keep only the text before the first ',' and before the first ' for ';
    4. for ' with ' and then ' in ': 'a with b' becomes 'b a' (only the first
       two pieces of the split are kept);
    5. collapse whitespace and apply each (pattern, replacement) pair from
       ``spellcheck_compiled``;
    6. a phrase that is exactly 'pepper' is returned as 'bpepper' right away;
    7. replace whole words via ``words_to_sub`` and substrings via
       ``phrases_to_sub``, apply ``of_pattern``, and collapse whitespace again.

    ``char_pattern``, ``spellcheck_compiled``, ``words_to_sub``,
    ``phrases_to_sub`` and ``of_pattern`` are not defined in this cell
    (presumably they come from ``data_helpers``).

    Results are cached by ``memoize``; the cache is reachable as
    ``clean_phrase.memory``.
    """
    text = char_pattern.sub('', orig_phrase.lower())

    # standardise connectors and hyphens
    for old, new in (('&', ' and '), (' n ', ' and '), ('-', ' ')):
        text = text.replace(old, new)

    # drop prep instructions: anything after the first ',' or ' for '
    text = text.split(',')[0].split(' for ')[0]

    # move the part after ' with ' / ' in ' to the front: 'a with b' -> 'b a'
    for connector in (' with ', ' in '):
        pieces = text.split(connector)
        if len(pieces) > 1:
            text = ' '.join([pieces[1], pieces[0]])

    # squeeze whitespace before the regex-based spelling fixes
    text = ' '.join(text.split())
    for pattern, replacement in spellcheck_compiled:
        text = pattern.sub(replacement, text)

    # a bare 'pepper' gets its own token
    if text.strip() == 'pepper':
        return 'bpepper'

    # whole-word substitutions: done token by token rather than with
    # str.replace so that only complete words are swapped
    text = ' '.join(
        words_to_sub[token] if token in words_to_sub else token
        for token in text.split()
    )

    # substring substitutions, each replacement padded with spaces
    for old, new in phrases_to_sub.items():
        text = (' ' + new + ' ').join(text.split(old))

    # rewrite each of_pattern match as its first group followed by 'of'
    text = of_pattern.sub(r'\1of', text)

    # final whitespace squeeze
    return ' '.join(text.split())

In [None]:
def clean_words(phrases):
    """Turn a recipe's cleaned phrases into a de-duplicated list of word features.

    For every non-empty phrase:

    * tokens containing '-' are added to the result as-is and replaced by the
      placeholder 'below' (marked as a stopword in the original code);
    * other tokens longer than 4 characters go through ``correct_spelling``,
      and tokens listed in ``words_to_segment`` are replaced by
      ``segment_word(token)`` (this overrides the spelling correction);
    * the tokens go through ``remove_first_dupes`` and ``lemmatize``;
    * for the lemmatized words the result gains: a '<lang>-l' tag for every
      word found in ``lang_trans``, the last two words joined by '-' (when
      there are at least two), and every word itself.

    Returns the collected features as a list (unordered, no duplicates).
    """
    features = set()
    for phrase in phrases:
        if not phrase:
            continue

        tokens = phrase.split()
        for idx, token in enumerate(tokens):
            if '-' in token:
                # keep hyphenated tokens verbatim and blank them out below
                features.add(token)
                tokens[idx] = 'below'  # stopword
            else:
                if len(token) > 4:
                    tokens[idx] = correct_spelling(token)
                if token in words_to_segment:
                    tokens[idx] = segment_word(token)

        lemmas = lemmatize(' '.join(remove_first_dupes(tokens)))
        if not lemmas:
            continue

        features.update(
            '{}-l'.format(lang_trans[lemma]) for lemma in lemmas if lemma in lang_trans
        )
        if len(lemmas) > 1:
            features.add('-'.join(lemmas[-2:]))
        features.update(lemmas)
    return list(features)

In [None]:
# Reset the memoization caches so the cleaning below starts from scratch.
clean_phrase.memory.clear()
lemmatize.memory.clear()

# Clean every ingredient phrase of every recipe (one list of phrases per row).
cleaned_phrases = data['ingredients'].map(
    lambda recipe_phrases: list(map(clean_phrase, recipe_phrases))
)
print('phrases cleaned')
# word-level cleaning is currently disabled:
#cleaned_words = cleaned_phrases.map(clean_words)

In [None]:
# Overwrite the raw ingredient lists with their cleaned counterparts.
data['ingredients'] = cleaned_phrases

In [None]:
# One ingredient occurrence: its words, language tags, the recipe's cuisine
# and the recipe id.
Ing = namedtuple('Ing', ('words', 'langs', 'cuisine', 'rcpid'))

In [None]:
def make_ings(recipe):
    """Build one ``Ing`` record per usable ingredient phrase of a recipe row.

    ``recipe`` must expose ``ingredients`` (an iterable of phrases),
    ``cuisine`` and ``name`` (used as the recipe id).

    For each non-empty phrase the tokens longer than 4 characters are passed
    through ``correct_spelling``, tokens listed in ``words_to_segment`` are
    replaced by ``segment_word(token)``, and the result goes through
    ``remove_first_dupes`` and ``lemmatize``. The lemmatized words are stored
    in reverse order, so the last word of the phrase comes first. Phrases
    that lemmatize to nothing are skipped. Every word found in ``lang_trans``
    contributes a 'lang<value>' tag.

    Returns the list of ``Ing`` records (possibly empty).
    """
    records = []
    for phrase in recipe.ingredients:
        if not phrase:
            continue

        tokens = phrase.split()
        for idx, token in enumerate(tokens):
            if len(token) > 4:
                tokens[idx] = correct_spelling(token)
            if token in words_to_segment:
                # segmentation overrides the spelling correction above
                tokens[idx] = segment_word(token)

        lemmas = lemmatize(' '.join(remove_first_dupes(tokens)))
        head_first = lemmas[::-1]
        if not head_first:
            continue

        lang_tags = [
            'lang{}'.format(lang_trans[word])
            for word in head_first
            if word in lang_trans
        ]
        records.append(Ing(head_first, lang_tags, recipe.cuisine, recipe.name))
    return records

In [None]:
# Run make_ings on every recipe row.
ings_df = data.apply(make_ings, axis=1)

In [None]:
# Flatten the per-recipe lists into one list of Ing records.
ings = list(chain.from_iterable(ings_df))
len(ings)

In [None]:
# Group the ingredients by head word, i.e. the first entry of their
# (reversed) word list.
heads = defaultdict(list)
for ing in ings:
    head_word = ing.words[0]
    heads[head_word].append(ing)
len(heads)

In [None]:
# For every head word collect the distinct word sequences (as tuples) and the
# distinct cuisines of the ingredients grouped under it.
head_info = {}
for head, ings_list in heads.items():
    words = {tuple(ing.words) for ing in ings_list}
    cuisines = {ing.cuisine for ing in ings_list}
    head_info[head] = [words, cuisines]

In [None]:
# Display the total number of ingredient records.
len(ings)

In [None]:
# Bucket the head words by how many distinct word sequences each one has.
head_counts = defaultdict(list)
for head, (phrase_variants, _cuisines) in head_info.items():
    head_counts[len(phrase_variants)].append(head)

In [None]:
# (number of distinct word sequences, number of head words with that many),
# in ascending order.
counts = sorted(
    (variant_count, len(bucket)) for variant_count, bucket in head_counts.items()
)

In [None]:
# Head-word buckets whose heads have more than 50 distinct word sequences.
todo = [head_counts[variants] for variants, _n_heads in counts if variants > 50]

In [None]:
def get_dist(words):
    """Show how the labelled recipes using an exact word sequence split by cuisine.

    Scans the module-level ``ings`` list and counts, per cuisine, the records
    whose ``words`` compare equal to ``words`` (same words, same order).
    Records carrying the placeholder cuisine ``'test'`` are ignored.

    Args:
        words: the word sequence to look up, compared with ``==`` against
            ``ing.words``.

    Returns:
        A ``(total, distribution)`` tuple. ``total`` is the number of matching
        labelled records. ``distribution`` is a list of
        ``(cuisine, percent)`` pairs sorted by ascending percent, where
        percent is ``round(100 * count / total)``. When nothing matches the
        result is ``(0, [])``.
    """
    cuisine_counts = Counter(
        ing.cuisine
        for ing in ings
        if ing.words == words and ing.cuisine != 'test'
    )
    total = sum(cuisine_counts.values())

    # Build the percentages in a fresh dict instead of rewriting the Counter
    # while iterating over it.
    percentages = {
        cuisine: round(100 * count / total)
        for cuisine, count in cuisine_counts.items()
    }

    # Sort with a plain key function: `itemgetter` is never imported by this
    # notebook, so relying on it fails unless a star-import happens to
    # provide it.
    return (total, sorted(percentages.items(), key=lambda pair: pair[1]))

In [None]:
# Cuisine distribution for the exact word sequence ['sauce', 'hot']; make_ings
# stores words reversed, so this presumably corresponds to "hot sauce".
get_dist(['sauce', 'hot'])

In [None]:
# Same lookup for the longer word sequence ['sauce', 'hot', 'cholula'].
get_dist(['sauce', 'hot', 'cholula'])

In [None]:
# Display the total number of ingredient records.
len(ings)

In [None]:
# Ingredient records made of more than five words.
[ing for ing in ings if len(ing.words) > 5]

In [None]:
# Inspect the ingredients of the recipe whose index (id) is 47431.
data.loc[47431].ingredients

In [None]:
# Spot-check what lemmatize returns for a single phrase.
lemmatize('italian dressing')

In [None]:
# Inspect the [word tuples, cuisines] entry collected for the head 'dressing'.
head_info['dressing']

In [None]:
# save
# `cleaned_words` is not assigned by any earlier cell, so on a fresh kernel
# this cell used to stop with a NameError. Build it here from the cleaned
# phrases (one list of phrases per recipe) before writing the file.
cleaned_words = cleaned_phrases.map(clean_words)

# Replace the ingredient column with the word features and write the frame
# (train and test rows together) to disk.
data.ingredients = cleaned_words
data.to_csv('data/cleaned_data.csv', header=True, encoding='utf-8')