In [None]:
# imports

# standard
from collections import Counter, defaultdict
from operator import itemgetter

# extra
import numpy as np
import pandas as pd

# local
from feature_helpers import *

# display settings
# max_colwidth: show up to 400 characters of a cell before truncating it
pd.set_option('display.max_colwidth', 400)
# precision: number of decimal places shown for floats
pd.set_option('display.precision', 4)
# width: line width, in characters, available to the text repr
pd.set_option('display.width', 1600)

In [None]:
# NOTE(review): load_clean_data is not defined in the visible cells; presumably
# it arrives through the star import from feature_helpers -- confirm.
recipes = load_clean_data()

In [None]:
# TODO: use recipe length
# TODO: replace rare words

In [None]:
# Flatten every recipe's ingredient list into one long list, then tally it.
ings = [ing for recipe_ings in recipes.ingredients for ing in recipe_ings]
ing_counts = Counter(ings)

In [None]:
# Sorted ingredients that occur at most twice and contain no hyphen.
rare_ings = sorted(
    ing
    for ing, count in ing_counts.items()
    if count <= 2 and '-' not in ing
)

In [None]:
# How many ingredients ended up in rare_ings.
len(rare_ings)

In [None]:
    # NOTE(review): this cell is indented and ends in `return res`, so it reads
    # as the body of a function whose `def` line is missing from the notebook;
    # a `return` outside a function is a SyntaxError, so the cell cannot run as
    # written. `data` and `tfidfs` are not defined in this cell -- presumably
    # they were parameters of the missing header (confirm before restoring it).
    def get_class_tfidf(obs):
        """Sum the tfidfs entries for obs.cuisine over the recipe's ingredients,
        divided by the number of ingredients in the recipe."""
        # assumes tfidfs is a DataFrame with one row per cuisine label and one
        # column per ingredient -- TODO confirm
        recipe = obs.ingredients
        return tfidfs[recipe].loc[obs.cuisine].sum() / len(recipe)
    def refine_class(obs):
        """Append '-a' to obs.cuisine when obs.class_tfidf is below .1; return obs."""
        if obs.class_tfidf < .1:
            obs.cuisine = '{}-a'.format(obs.cuisine)
        return obs
    # Keep only the rows whose cuisine is not the literal "test".
    train = data.query('cuisine != "test"')
    # Score every remaining row and attach the scores as a `class_tfidf` column.
    class_tfidfs = train.apply(get_class_tfidf, axis=1)
    class_tfidfs.name = 'class_tfidf'
    train = pd.concat([train, class_tfidfs], axis=1)
    # Relabel the low-scoring rows, one row at a time.
    res = train.apply(refine_class, axis=1)
    return res

In [None]:
def make_docs(recipes):
    """Build one space-joined ingredient "document" per cuisine.

    Same algorithm as make_cleaner_docs; this version previously read the
    global `data` instead of its argument and referenced an undefined `words`.

    Args:
        recipes: DataFrame with a `cuisine` column and an `ingredients` column
            whose cells are iterables of ingredient strings.

    Returns:
        A tuple `(docs, dropped, all_counts)`:
        - docs: dict mapping each non-"test" cuisine to its kept ingredients,
          each repeated once per occurrence and joined with spaces.
        - dropped: defaultdict(int) mapping ingredient -> number of occurrences
          left out of the documents (rare words, plus every "test" word).
        - all_counts: list of `(cuisine, distinct_words, cutoff, distinct_kept)`
          tuples sorted by `distinct_kept`.
    """
    cuisine_words = {}
    for cuisine, group in recipes.groupby('cuisine'):
        cuisine_words[cuisine] = [ing for recipe in group.ingredients for ing in recipe]
    docs = {}
    dropped = defaultdict(int)
    all_counts = []
    for cuisine, words in cuisine_words.items():
        counts = Counter(words)
        if cuisine == 'test':
            # "test" rows never become a document; all their words count as dropped.
            for word, count in counts.items():
                dropped[word] += count
            continue
        # The rarity threshold grows with the vocabulary: it is 0 (nothing
        # dropped) until a cuisine has at least 1000 distinct ingredients.
        cutoff = len(counts) // 1000
        keep = []
        for word, count in counts.items():
            if count <= cutoff:
                dropped[word] += count
                continue
            keep.extend([word] * count)
        all_counts.append((cuisine, len(counts), cutoff, len(set(keep))))
        docs[cuisine] = ' '.join(keep)
    return (docs, dropped, sorted(all_counts, key=itemgetter(3)))

In [None]:
def clean_ingredients(ingredients, total_counts, dropped_counts):
    """Strip ingredients whose every occurrence was dropped.

    An ingredient is deleted only when its dropped count equals its total
    count; partially dropped ingredients are kept.

    Args:
        ingredients: pandas Series whose cells are lists of ingredient strings.
        total_counts: mapping ingredient -> total number of occurrences. It is
            indexed with every key of `dropped_counts`, so a plain dict that
            lacks one of them raises KeyError.
        dropped_counts: mapping ingredient -> number of dropped occurrences.

    Returns:
        A new Series of lists with the deleted ingredients filtered out.
    """
    to_delete = {
        word
        for word, count in dropped_counts.items()
        if count == total_counts[word]
    }
    cleaned = ingredients.map(lambda lst: [word for word in lst if word not in to_delete])
    # NOTE(review): len(cleaned) is the number of ingredient lists (rows), not
    # the number of distinct ingredients the label suggests -- confirm intent.
    print('{} ingredients ({} deleted)'.format(len(cleaned), len(to_delete)))
    return cleaned

In [None]:
def make_cleaner_docs(data):
    """Collapse each cuisine's recipes into one space-joined ingredient string.

    Args:
        data: DataFrame with a `cuisine` column and an `ingredients` column
            whose cells are iterables of ingredient strings.

    Returns:
        `(docs, dropped, summary)` where `docs` maps every cuisine except
        "test" to its kept ingredients (repeated per occurrence), `dropped`
        is a defaultdict(int) of occurrences left out per ingredient, and
        `summary` lists `(cuisine, distinct_words, cutoff, distinct_kept)`
        tuples ordered by `distinct_kept`.
    """
    docs = {}
    dropped = defaultdict(int)
    summary = []
    for cuisine, frame in data.groupby('cuisine'):
        tally = Counter(ing for recipe in frame.ingredients for ing in recipe)
        is_test = cuisine == 'test'
        # Words seen no more than this many times are treated as rare.
        threshold = len(tally) // 1000
        kept = []
        for word, n_seen in tally.items():
            if is_test or n_seen <= threshold:
                # Every "test" word and every rare word is only counted as dropped.
                dropped[word] += n_seen
            else:
                kept += [word] * n_seen
        if is_test:
            continue
        summary.append((cuisine, len(tally), threshold, len(set(kept))))
        docs[cuisine] = ' '.join(kept)
    summary.sort(key=lambda row: row[3])
    return (docs, dropped, summary)

In [None]:
def get_winners(recipe, tfidfs):
    """Pick the winning class label for a recipe by leave-one-out voting.

    For each ingredient, that ingredient's column is subtracted from the
    per-class totals; the top class of the remainder earns 14 points and the
    runner-up 10. A single-ingredient recipe skips the voting and takes the
    class with the largest total. Labels carrying the '-a' suffix are folded
    back into their base label on every path.

    Args:
        recipe: list of ingredient names, each a column of `tfidfs`.
        tfidfs: DataFrame indexed by class label with one column per
            ingredient (needs at least two rows for the voting path).

    Returns:
        The base class label with the most points; ties go to the label that
        scored first.

    Raises:
        ValueError: if `recipe` is empty (there are no rounds to pick from).
    """
    def base_label(label):
        # Strip only a trailing '-a'; str.replace would also eat a '-a' in the
        # middle of a label.
        return label[:-2] if label.endswith('-a') else label

    tfidf_sums = tfidfs[recipe].sum(axis=1)
    if len(recipe) == 1:
        # Previously returned the raw label, leaking '-a' variants to callers.
        return base_label(tfidf_sums.idxmax())
    rounds = defaultdict(int)
    for ing in recipe:
        minus_one = tfidf_sums - tfidfs[ing]
        first, second = minus_one.nlargest(2).index
        rounds[base_label(first)] += 14
        rounds[base_label(second)] += 10
    return max(rounds.items(), key=itemgetter(1))[0]

In [None]:
### Make output for ingredient features
# Column position of every ingredient, in the iteration order of word_counts.
indices = {k: i for i, k in enumerate(word_counts)}
col_names = sorted(indices, key=indices.get)
# One uint8 indicator column per ingredient, one row per recipe.
zeros = np.zeros((len(data), len(col_names)), dtype=np.uint8)
for ri, ings in enumerate(data.ingredients):
    zeros[ri, [indices[ing] for ing in ings]] = 1
zeros_df = pd.DataFrame(zeros, index=data.index, columns=col_names)
# NOTE(review): `data` is rebound here, so re-running this cell appends the
# indicator columns a second time.
data = pd.concat([data, zeros_df], axis=1)

In [None]:
def save_output(output):
    """Write the cuisine column of the non-"test" rows to data/cuisine.csv.

    The file is written without a header row, UTF-8 encoded. The
    `ingredients` column is dropped first, so `output` must contain it.

    NOTE(review): the two cells that follow this function use `train` and
    `output` at notebook level and look like the rest of this body -- confirm.
    """
    labelled = output.query('cuisine != "test"').drop(columns=['ingredients'])
    labelled.cuisine.to_csv('data/cuisine.csv', header=False, encoding='utf-8')

In [None]:
# NOTE(review): in the visible cells `train` is only ever assigned inside
# function bodies, never at notebook level, so unless another cell defines it
# this raises NameError on a fresh kernel. These two lines look like the
# continuation of save_output's body -- confirm and move them back if so.
train = train.drop(columns=['cuisine'])
train.to_csv('data/temp_train.csv', header=True, encoding='utf-8')

In [None]:
# Rows labelled "test", minus the label and raw ingredient columns, are written
# to temp_test.csv.
# NOTE(review): `output` is not assigned at notebook level in the visible cells.
test = output.query('cuisine == "test"').drop(columns=['cuisine', 'ingredients'])
test.to_csv('data/temp_test.csv', header=True, encoding='utf-8')