In [None]:
# imports

#standard
from collections import deque
from math import log

# extra
import pandas as pd
from sklearn.metrics import confusion_matrix

# local
from feature_helpers import *

# pandas display options used throughout this notebook
DISPLAY_OPTIONS = {
    'display.max_columns': 21,
    'display.max_colwidth': 400,
    'display.precision': 4,
    'display.width': 1600,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the recipe table through the project helper (presumably provided by the
# star import of feature_helpers). Later cells read its `cuisine` and
# `ingredients` columns and apply functions to it row by row.
recipes = load_clean_data()

In [None]:
# Run the `ingredients` column through three project helpers in sequence, each
# pass overwriting the column on `recipes`.
# NOTE(review): the helpers are defined outside this notebook; judging only by
# their names they strip "state" words, drop duplicate entries and append
# ingredient combinations — confirm against feature_helpers.
# Re-running this cell applies all three helpers again to the already
# processed column.
recipes.ingredients = remove_states(recipes.ingredients)
recipes.ingredients = remove_dupes(recipes.ingredients)
recipes.ingredients = add_combos(recipes.ingredients)

In [None]:
# Build the table that make_points() consumes (`props`).
# NOTE(review): make_counts / merge_rare_features / scale_counts /
# get_proportions are opaque project helpers; the descriptions below only state
# what this cell passes in and gets back.
counts = make_counts(recipes)
# Returns two objects: the merged table and `renamed`, which a later cell hands
# to update_names(). The meaning of the positional 1 and 3 is defined by the
# helper — TODO confirm (presumably rarity thresholds).
rare_merged, renamed = merge_rare_features(counts, 1, 3, 'raretype')
# Remove the 'test' column; rows whose cuisine is "test" are also excluded from
# evaluation further down, so presumably this is the unlabelled split.
rare_merged = rare_merged.drop(columns=['test'])
# The second argument is the number of recipes per cuisine value.
rates = scale_counts(rare_merged, recipes.cuisine.value_counts())
# Another de-duplication pass over the ingredients column (add_combos has run
# since the previous remove_dupes call).
recipes.ingredients = remove_dupes(recipes.ingredients)
props = get_proportions(rates)
# Force every column of the 'rarecombotype' row to 0.0 (the row is created if
# it does not exist yet).
props.loc['rarecombotype'] = 0.0

In [None]:
# Pass every recipe's ingredients through update_names() together with the
# `renamed` object returned by merge_rare_features, then de-duplicate again
# (presumably several names can collapse onto the same merged name — confirm).
recipes.ingredients = recipes.ingredients.map(lambda ings: update_names(ings, renamed))
recipes.ingredients = remove_dupes(recipes.ingredients)

In [None]:
# Rolling history of error counts: starts as six zeros and is capped at six
# entries (maxlen=6). The evaluation cell pushes each new count on the left,
# so the oldest entry falls off the right. Re-running this cell resets the
# history.
last_e = deque(6*[0], 6)

In [None]:
def make_points(props, adj=True, weights=None): # combo_cutoff = 5
    """Turn per-cuisine proportions into scoring points.

    Every column named in ``weights`` is replaced by
    ``scale * log(1.01 + x / (shift + x))`` applied element-wise; falsy
    entries (0 / 0.0) map to 0. Columns not named in ``weights`` are returned
    untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Table with one column per cuisine. It is copied, never modified.
    adj : bool, default True
        When False, skip the adjustment and return a plain copy of ``props``.
    weights : mapping of str -> (scale, shift), optional
        Per-column parameters. Defaults to the table hard-wired in this cell,
        so existing calls behave exactly as before. Passing a mapping lets a
        caller try other weights without copy-pasting the function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``props`` with the adjusted columns.

    Raises
    ------
    KeyError
        If ``adj`` is true and a key of ``weights`` is not a column of ``props``.
    """
    points = props.copy()
    if not adj:
        return points

    if weights is None:
        # Default table. The author's note on the def line reads
        # "combo_cutoff = 5"; presumably the setting these values were tuned
        # against — confirm.
        weights = {
            # drop
            # NOTE(review): 'british' is listed under "drop" but its scale is > 1.
            'brazilian': [0.99, .4],
            'british': [1.03, .5],
            'cajun_creole': [0.82, .5],
            'chinese': [0.97, .4],
            'greek': [0.93, .4],
            'indian': [0.91, .5],
            'irish': [0.96, .4],
            'jamaican': [0.97, .5],
            'korean': [0.96, .4],
            'moroccan': [0.88, .5],
            'russian': [0.97, .5],
            'spanish': [0.98, .4],
            'thai': [0.91, .4],
            'vietnamese': [0.97, .5],
            # boost
            'filipino': [1.04, .4],
            'french': [1.08, .4],
            'italian': [1.04, .3],
            'japanese': [1.08, .3],
            'mexican': [1.04, .4],
            'southern_us': [1.07, .4],
        }

    def smooth(column, scale, shift):
        # Saturating transform: x / (shift + x) grows towards 1 as x grows,
        # and the 1.01 offset keeps the logarithm's argument above 1 for x > 0.
        return column.map(lambda x: scale * log(1.01 + (x / (shift + x))) if x else 0)

    for cuisine, (scale, shift) in weights.items():
        points[cuisine] = smooth(points[cuisine], scale, shift)
    return points

In [None]:
# Convert proportions to points with the make_points() defined above, then
# score every recipe row by row (axis='columns' hands each row to the lambda).
# make_scores is a project helper; what it returns per row is defined there.
points = make_points(props)
scores = recipes.apply(lambda recipe: make_scores(recipe, points), axis='columns')

In [None]:
# Divide each recipe's scores by that row's own maximum. There is no guard for
# a row whose maximum is 0 (that division yields NaN/inf).
ranks = scores.apply(lambda score: score / score.max(), axis='columns')
# Extra features derived from the normalised scores (project helper).
scores_plus = add_score_features(ranks)
# Number of ingredients per recipe, passed through the project's
# min_max_scale helper (presumably rescaled to [0, 1] — confirm).
length = min_max_scale(recipes.ingredients.map(len))
length.name = 'length'
# Prefix the raw score columns with 'raw_' (presumably to keep them distinct
# from the other score columns in the concat below).
# NOTE: this rebinds `scores`; re-running this cell without re-running the
# previous one adds the prefix a second time.
scores = scores.add_prefix('raw_')

In [None]:
def mark_leaders(rank):
    """Replace one row of values by their ascending rank fraction.

    The smallest value receives ``1 / n`` and the largest ``1.0``, where
    ``n = len(rank)``. The size used to be hard-coded to 20, so any row with a
    different number of entries raised a length-mismatch ``ValueError``; for
    20-wide rows the result is unchanged.

    Parameters
    ----------
    rank : pandas.Series
        Values to rank, one entry per label.

    Returns
    -------
    pandas.Series
        Rank fractions keyed by the labels of ``rank``, ordered from the
        smallest original value to the largest (``sort_values`` ordering).
    """
    size = len(rank)
    ordered_labels = rank.sort_values().index
    positions = pd.Series(range(1, size + 1), index=ordered_labels)
    return positions / size

In [None]:
# Rank-fraction features: apply mark_leaders to every row of `ranks`, then
# prefix the resulting columns with 'reg_'.
leaders = ranks.apply(mark_leaders, axis='columns')
leaders = leaders.add_prefix('reg_')

In [None]:
# Assemble the wide table side by side: the recipe columns, the derived score
# features, the 'raw_'-prefixed scores, the 'reg_'-prefixed rank fractions and
# the 'length' column. get_preds() below addresses columns of this frame by
# position, so the order of the pieces matters.
output = pd.concat([recipes, scores_plus, scores, leaders, length], axis='columns')
# Preview the first rows.
output.head()

In [None]:
def get_preds(recipe):
    """Return ``(predicted, actual)`` labels for one row of ``output``.

    The prediction is the index label holding the largest value among row
    positions 2 through 21 (the first such label on ties); the actual label is
    the row's ``cuisine`` entry. The positions are fixed, so this relies on the
    column order of the frame the row comes from.
    """
    candidates = recipe.iloc[2:22]
    winner = candidates.values.argmax()
    return candidates.index[winner], recipe.cuisine
# Evaluate on the rows whose cuisine is not "test". The filtered frame is built
# once and reused (it used to be recomputed three times).
labeled = output.query('cuisine != "test"')
# get_preds returns (predicted, actual); an error is a false positive for the
# predicted cuisine and a false negative for the actual one.
preds = labeled.apply(get_preds, axis='columns', result_type='expand')
preds.columns = ['falpos', 'falneg']
e = preds.query('falpos != falneg')
errs = len(e)
last_e.appendleft(errs)
# The trailing numbers are the author's note (presumably error counts from earlier runs).
print(errs, errs / len(labeled), last_e) # 8289, 7503, 6883
# Use every cuisine that occurs on either side of an error: with labels taken
# from `falneg` only, errors predicted as a cuisine missing from `falneg`
# would be silently left out of the matrix.
labels = sorted(set(e.falneg) | set(e.falpos))
cnf = confusion_matrix(e.falneg, e.falpos, labels=labels)
plot_cnf(cnf, labels)
falpos_counts, falneg_counts = e.falpos.value_counts(), e.falneg.value_counts()
# `+` would give NaN for a cuisine present in only one of the two counts;
# fill_value=0 treats the missing side as zero errors.
total_counts = falpos_counts.add(falneg_counts, fill_value=0)
# Explicit keys give the four summary columns stable, distinct names.
pd.concat(
    [falpos_counts, falneg_counts, total_counts, labeled.cuisine.value_counts()],
    axis='columns',
    sort=False,
    keys=['falpos', 'falneg', 'total_errs', 'recipes'],
)

In [None]:
def make_points(props, adj=True, weights=None): # combo_cutoff = 3
    """Turn per-cuisine proportions into scoring points.

    Every column named in ``weights`` is replaced by
    ``scale * log(1.01 + x / (shift + x))`` applied element-wise; falsy
    entries (0 / 0.0) map to 0. Columns not named in ``weights`` are returned
    untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Table with one column per cuisine. It is copied, never modified.
    adj : bool, default True
        When False, skip the adjustment and return a plain copy of ``props``.
    weights : mapping of str -> (scale, shift), optional
        Per-column parameters. Defaults to the table hard-wired in this cell,
        so existing calls behave exactly as before. Passing a mapping lets a
        caller try other weights without redefining the function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``props`` with the adjusted columns.

    Raises
    ------
    KeyError
        If ``adj`` is true and a key of ``weights`` is not a column of ``props``.
    """
    points = props.copy()
    if not adj:
        return points

    if weights is None:
        # Default table. The author's note on the def line reads
        # "combo_cutoff = 3"; presumably the setting these values were tuned
        # against — confirm.
        weights = {
            # drop
            # NOTE(review): 'british' is listed under "drop" but its scale is > 1.
            'brazilian': [0.99, .4],
            'british': [1.03, .5],
            'cajun_creole': [0.83, .5],
            'chinese': [0.97, .4],
            'greek': [0.93, .4],
            'indian': [0.91, .5],
            'irish': [0.96, .4],
            'jamaican': [0.97, .5],
            'korean': [0.96, .4],
            'moroccan': [0.87, .5],
            'russian': [0.97, .5],
            'spanish': [0.99, .4],
            'thai': [0.91, .4],
            'vietnamese': [0.97, .5],
            # boost
            'filipino': [1.04, .4],
            'french': [1.08, .4],
            'italian': [1.04, .3],
            'japanese': [1.08, .3],
            'mexican': [1.04, .4],
            'southern_us': [1.06, .4],
        }

    def smooth(column, scale, shift):
        # Saturating transform: x / (shift + x) grows towards 1 as x grows,
        # and the 1.01 offset keeps the logarithm's argument above 1 for x > 0.
        return column.map(lambda x: scale * log(1.01 + (x / (shift + x))) if x else 0)

    for cuisine, (scale, shift) in weights.items():
        points[cuisine] = smooth(points[cuisine], scale, shift)
    return points

In [None]:
# Recompute points and scores with whichever make_points() is currently bound
# in the kernel — on a top-to-bottom run that is the definition in the cell
# just above, which shadows the earlier one.
points = make_points(props)
scores = recipes.apply(lambda recipe: make_scores(recipe, points), axis='columns')
# Rebind `output` to just the recipe columns plus these scores; this replaces
# the wider frame built earlier in the notebook.
output = pd.concat([recipes, scores], axis='columns')

In [None]:
# Hand the final frame to the project's save_output helper; destination and
# file format are decided inside that helper (not visible in this notebook).
save_output(output)