In [None]:
# imports

#standard
from collections import deque
from math import log

# extra
import pandas as pd
from sklearn.metrics import confusion_matrix

# local
from feature_helpers import *

# display settings: widen pandas' console/HTML rendering so wide score tables
# and long ingredient lists are shown without truncation.
for option_name, option_value in (
    ('display.max_columns', 21),
    ('display.max_colwidth', 400),
    ('display.precision', 4),
    ('display.width', 1600),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the recipe table. load_clean_data is not defined in this notebook, so it
# presumably comes from the feature_helpers star import — confirm.
# Later cells read recipes.ingredients and recipes.cuisine from the result.
recipes = load_clean_data()

In [None]:
# Clean the ingredient column in two passes, writing each result back onto the
# same column. This overwrites recipes.ingredients, so re-running the cell feeds
# already-cleaned data through the helpers again.
# NOTE(review): remove_states / remove_dupes are opaque here; judging by their
# names they strip preparation-state words and duplicate ingredients — confirm.
recipes.ingredients = remove_states(recipes.ingredients)
recipes.ingredients = remove_dupes(recipes.ingredients)

In [None]:
# Build the per-cuisine ingredient table that make_points consumes.
# All helpers used here are opaque from this notebook; descriptions of what
# they compute are inferred from names and should be confirmed.
counts = make_counts(recipes)
# Scale the counts using the number of recipes per cuisine.
rates = scale_counts(counts, recipes.cuisine.value_counts())
# .001 is the rarity threshold and 'raretype' the label handed to
# merge_rare_features; `renamed` is reused below to rewrite recipe ingredients.
rare_merged, renamed = merge_rare_features(rates, .001, 'raretype')
# Drop the 'test' column (presumably the pseudo-cuisine of unlabelled rows,
# which the evaluation cell also filters out with cuisine != "test").
rare_merged = rare_merged.drop(columns=['test'])
# NOTE(review): this repeats the remove_dupes call from the cleaning cell and
# nothing visible has changed recipes.ingredients since then. It may have been
# meant to run after the update_names cell instead — confirm before removing or moving.
recipes.ingredients = remove_dupes(recipes.ingredients)
props = get_proportions(rare_merged)

In [None]:
# Rewrite every recipe's ingredients through update_names, using the `renamed`
# value returned by merge_rare_features (presumably a mapping from rare
# ingredient names to the merged label — confirm). Overwrites the column in place.
recipes.ingredients = recipes.ingredients.map(lambda ings: update_names(ings, renamed))

In [None]:
# Rolling history of the six most recent error counts, newest first (the
# evaluation cell pushes with appendleft). Re-running this cell resets it.
last_e = deque([0] * 6, maxlen=6)

In [None]:
def make_points(props, adj=True, smoothing=.1, adjust=None):
    """Turn per-cuisine proportions into log-smoothed point values.

    Every truthy cell ``x`` of ``props`` becomes
    ``log(1.01 + x / (smoothing + x))``; cells equal to zero stay ``0``.
    When ``adj`` is true, each listed cuisine column is then multiplied by
    its hand-tuned factor.

    Parameters
    ----------
    props : pandas.DataFrame
        Proportion table whose columns are addressed by cuisine name.
    adj : bool, default True
        Apply the per-cuisine multipliers after smoothing.
    smoothing : float, default 0.1
        Constant added to the denominator of the saturation term.
    adjust : dict, optional
        ``{column: multiplier}``. Defaults to the tuned table below.
        Entries whose column is absent from ``props`` are skipped instead
        of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same labels as ``props``; ``props`` itself is
        not modified.
    """
    def _smooth(x):
        # Zero cells are passed through as 0 rather than log(1.01).
        return log(1.01 + (x / (smoothing + x))) if x else 0

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1. Check the
    # class (not the instance) so a column named "map" on an older pandas
    # cannot be mistaken for the method.
    elementwise = props.map if hasattr(pd.DataFrame, 'map') else props.applymap
    points = elementwise(_smooth)

    if adjust is None:
        adjust = {
            # drop
            'british': .925,
            'cajun_creole': .875,
            #'chinese': .99,
            #'greek': .99,
            'indian': .95,
            'irish': .97,
            'jamaican': .91,
            'korean': .985,
            'moroccan': .89,
            'russian': .92,
            'spanish': .984,
            'thai': .95,
            'vietnamese': .93,
            # boost
            'brazilian': 1.01,
            'filipino': 1.01,
            'french': 1.03,
            'italian': 1.12,
            'japanese': 1.11,
            'mexican': 1.04,
            'southern_us': 1.03
        }
    if adj:
        for cuisine, val in adjust.items():
            # Tolerate tables that do not contain every tuned cuisine.
            if cuisine in points.columns:
                points[cuisine] = val * points[cuisine]
    return points

In [None]:
# Point table with the default per-cuisine adjustments applied (adj=True).
points = make_points(props)

In [None]:
# Score every recipe row against the point table; axis='columns' makes apply
# call the lambda once per row. make_scores and its group flag are defined
# outside this notebook — see feature_helpers for their exact semantics.
scores = recipes.apply(lambda recipe: make_scores(recipe, points, group=False), axis='columns')

In [None]:
# Extend the score table with derived columns from add_score_features.
# NOTE(review): get_preds below skips the last 7 columns of the combined frame,
# which presumably are the ones added here — confirm the count stays in sync.
scores_plus = add_score_features(scores)

In [None]:
# Place the recipe columns and the score columns side by side. get_preds
# depends on the resulting column order (it slices positions 2:-7), so changing
# what goes into either frame requires updating that slice.
output = pd.concat([recipes, scores_plus], axis='columns')
output.head()

In [None]:
def get_preds(recipe):
    """Return ``(predicted, actual)`` cuisine labels for one row of ``output``.

    The prediction is the label of the largest value among the columns at
    positions ``2:-7`` of the row; the actual label is the row's ``cuisine``
    field.
    """
    # Positional window of candidate score columns: skips the first two and
    # the last seven entries of the row.
    score_window = recipe.iloc[2:-7]
    best_position = score_window.values.argmax()
    predicted = score_window.index[best_position]
    return (predicted, recipe.cuisine)
# Evaluate only rows whose cuisine is not "test" (presumably the unlabelled
# test set). The filter is computed once and reused below.
labeled = output.query('cuisine != "test"')
preds = labeled.apply(get_preds, axis='columns', result_type='expand')
# Column 0 is the predicted cuisine, column 1 the true one. On a misclassified
# row the prediction is a false positive for that cuisine and the truth a
# false negative, hence the names.
preds.columns = ['falpos', 'falneg']
e = preds.query('falpos != falneg')
errs = len(e)
# Newest error count goes to the front of the rolling history.
last_e.appendleft(errs)
print(errs, errs / len(labeled), last_e) # 11020, 8561
# Confusion matrix over misclassified rows only, restricted to the true
# cuisines that occur among the errors.
labels = sorted(e.falneg.unique())
cnf = confusion_matrix(e.falneg, e.falpos, labels=labels)
plot_cnf(cnf, labels)
# Per cuisine: false-positive count, false-negative count, total labelled recipes.
pd.concat([e.falpos.value_counts(), e.falneg.value_counts(), labeled.cuisine.value_counts()], axis='columns', sort=False)

In [None]:
# Persist the combined recipe + score frame. save_output is defined outside
# this notebook; its destination and format are not visible here.
save_output(output)