In [None]:
# imports

#standard
from collections import deque
from math import log

# extra
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

# local
from feature_helpers import *

# display settings: pandas accepts several (option, value) pairs in one call
pd.set_option(
    'display.max_columns', 21,
    'display.max_colwidth', 400,
    'display.precision', 4,
    'display.width', 1600,
)

In [None]:
# Load the recipe table. `load_clean_data` is not defined in this notebook;
# presumably it comes from the `feature_helpers` star import -- TODO confirm.
# Later cells read the `ingredients` and `cuisine` columns of this frame.
recipes = load_clean_data()

In [None]:
# Normalise each recipe's ingredient list in two passes, overwriting the
# `ingredients` column each time.
# NOTE(review): both helpers are opaque here; judging by their names they strip
# preparation-state words and then drop repeated ingredients -- confirm.
recipes.ingredients = remove_states(recipes.ingredients)
recipes.ingredients = remove_dupes(recipes.ingredients)

In [None]:
# Build the table that make_points() consumes:
# counts -> rare features merged -> 'test' column dropped -> scaled -> proportions.
# NOTE(review): every helper here is opaque; the stage descriptions are inferred
# from the helper names and should be confirmed against feature_helpers.
counts = make_counts(recipes)
# `renamed` is used in a later cell to rewrite ingredient names via update_names().
rare_merged, renamed = merge_rare_features(counts, 1, 'raretype')
# Remove the 'test' column so it does not take part in scaling.
rare_merged = rare_merged.drop(columns=['test'])
# Scale using the number of recipes per cuisine.
rates = scale_counts(rare_merged, recipes.cuisine.value_counts())
# NOTE(review): this repeats the remove_dupes() call from the previous cell and
# nothing visible has changed `recipes.ingredients` in between. It may have been
# intended to run after the update_names() renaming in the next cell -- confirm.
recipes.ingredients = remove_dupes(recipes.ingredients)
props = get_proportions(rates)

In [None]:
# Rewrite every recipe's ingredient list with update_names(), passing the
# `renamed` value returned by merge_rare_features().
# NOTE(review): presumably this maps rare ingredient names to their merged
# replacement so they match the index of the points table -- confirm.
recipes.ingredients = recipes.ingredients.map(lambda ings: update_names(ings, renamed))

In [None]:
# Rolling history of the six most recent error counts (newest first once the
# evaluation cell has run). Re-running this cell resets the history to zeros.
last_e = deque([0] * 6, maxlen=6)

In [None]:
def make_points(props, adj=True, smoothing=.01, weights=None):
    """Turn a table of proportions into log-smoothed points, optionally re-weighted.

    Every non-zero cell ``x`` becomes ``log(1.01 + x / (smoothing + x))``; zero
    cells stay exactly 0 (without the guard they would become ``log(1.01)``).
    For positive ``x`` the log argument is above 1, so the points are positive.

    Parameters
    ----------
    props : pandas.DataFrame
        Table whose columns are cuisines. When ``adj`` is true it must contain
        every cuisine named in ``weights``; a missing one raises ``KeyError``.
    adj : bool, default True
        Multiply each weighted cuisine column by its weight.
    smoothing : float, default 0.01
        Constant added to the denominator of the ratio.
    weights : dict[str, float] or None
        Per-cuisine multipliers. ``None`` selects the hand-tuned table below.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``props``; ``props`` itself
        is not modified.
    """
    if weights is None:
        weights = {
            # drop
            'brazilian': .99,
            'british': .98,
            'cajun_creole': .965,
            'indian': .99,
            'irish': .99,
            'jamaican': .965,
            'moroccan': .97,
            'russian': .975,
            'spanish': .985,
            'thai': .984,
            'vietnamese': .985,
            # boost
            'chinese': 1.005,
            #'filipino': 1.005,
            'french': 1.0053,
            'greek': 1.005,
            'italian': 1.03,
            'japanese': 1.028,
            'korean': 1.005,
            'mexican': 1.01,
            'southern_us': 1.003
        }

    def smooth(x):
        return log(1.01 + (x / (smoothing + x))) if x else 0

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
    # use whichever this pandas version provides.
    elementwise = props.map if hasattr(pd.DataFrame, 'map') else props.applymap
    points = elementwise(smooth)
    if adj:
        for cuisine, weight in weights.items():
            points[cuisine] = weight * points[cuisine]
    return points

In [None]:
# Build the points table with the cuisine weights applied (adj defaults to True).
# NOTE: make_points is defined a second time near the end of the notebook; on a
# fresh top-to-bottom run this call uses the definition in the cell above.
points = make_points(props)

In [None]:
def make_scores(recipe, points, rank=True):
    """Score one recipe against every cuisine.

    Parameters
    ----------
    recipe : row-like object with an ``ingredients`` attribute
        ``recipe.ingredients`` is a list of labels present in ``points.index``.
    points : pandas.DataFrame
        Points table: one row per ingredient, one column per cuisine.
    rank : bool, default True
        If false, return the mean points per cuisine over the recipe's
        ingredients. If true, return leave-one-out rank votes (see below).

    Returns
    -------
    pandas.Series
        Indexed by ``points.columns``.

    Rank votes: for each ingredient, that ingredient's points are removed from
    the recipe totals, and the best and second-best remaining cuisines receive
    1.1 and 1 votes respectively. Votes are divided by the ingredient count.
    A single-ingredient recipe has nothing left after removal, so its two
    leaders are taken from the plain totals instead.
    """
    scores = points.loc[recipe.ingredients]
    if not rank:
        return scores.mean()
    totals = scores.sum(axis='index')
    n_ings = len(recipe.ingredients)
    ranks = pd.Series(0.0, index=points.columns)
    if n_ings == 1:
        leaders = totals.nlargest(2).index
        ranks[leaders[0]] += 1.1
        ranks[leaders[1]] += 1
        return ranks / n_ings
    for ing in recipe.ingredients:
        # Select with a list so the result is always a DataFrame and .iloc[0]
        # is the ingredient's full row. With a bare label, a unique ingredient
        # yields a Series whose .iloc[0] is a single scalar; subtracting that
        # shifts every total equally and never changes the leaders.
        held_out = scores.loc[[ing]].iloc[0]
        leaders = (totals - held_out).nlargest(2).index
        ranks[leaders[0]] += 1.1
        ranks[leaders[1]] += 1
    return ranks / n_ings

In [None]:
# Score every recipe row by row. make_scores returns a Series indexed by the
# columns of `points`, so the result should have one column per cuisine.
scores = recipes.apply(lambda recipe: make_scores(recipe, points, rank=True), axis='columns')

In [None]:
# Derive extra columns from the per-cuisine scores (opaque helper).
# NOTE(review): get_preds() below skips the last 7 columns of each row, so this
# presumably appends exactly 7 feature columns after the cuisine scores -- confirm.
scores_plus = add_score_features(scores)

In [None]:
# Put the recipe columns and the score columns side by side (aligned on index).
# get_preds() below relies on this column order: recipe columns first, scores after.
output = pd.concat([recipes, scores_plus], axis='columns')
output.head()

In [None]:
def get_preds(recipe):
    """Return ``(predicted_cuisine, actual_cuisine)`` for one row of the output table.

    The prediction is the label of the largest value among the columns at
    positions ``2`` through ``-7`` (exclusive), i.e. everything except the first
    two and the last seven entries of the row.
    NOTE(review): the offsets are positional; presumably the first two columns
    are recipe fields and the last seven are derived score features -- confirm.
    """
    candidate_scores = recipe.iloc[2:-7]
    best_position = candidate_scores.values.argmax()
    predicted = candidate_scores.index[best_position]
    return (predicted, recipe.cuisine)
# Evaluate only rows whose cuisine is not "test" (presumably the unlabelled
# split -- confirm). The filter is computed once and reused below.
labeled = output.query('cuisine != "test"')
preds = labeled.apply(get_preds, axis='columns', result_type='expand')
# Column 0 is the predicted cuisine (a false positive for that cuisine when
# wrong); column 1 is the true cuisine (a false negative when missed).
preds.columns = ['falpos', 'falneg']
e = preds.query('falpos != falneg')
errs = len(e)
# Record this run's error count at the front of the rolling history.
last_e.appendleft(errs)
print(errs, errs / len(labeled), last_e) # 8465
# Confusion matrix over misclassified rows only, labelled by the true cuisines seen.
labels = sorted(e.falneg.unique())
cnf = confusion_matrix(e.falneg, e.falpos, labels=labels)
plot_cnf(cnf, labels)
falpos_counts, falneg_counts = e.falpos.value_counts(), e.falneg.value_counts()
# Per cuisine: false positives, false negatives, their sum, and labelled recipe count.
pd.concat([falpos_counts, falneg_counts, falpos_counts + falneg_counts, labeled.cuisine.value_counts()], axis='columns', sort=False)

In [None]:
def make_points(props, adj=True, smoothing=.2, weights=None):
    """Turn a table of proportions into log-smoothed points, optionally re-weighted.

    NOTE: this redefines the ``make_points`` from an earlier cell with a larger
    smoothing constant and a different weight table. Once this cell has run,
    every later call (including a re-run of the earlier ``points = ...`` cell)
    uses this version.

    Every non-zero cell ``x`` becomes ``log(1.01 + x / (smoothing + x))``; zero
    cells stay exactly 0 (without the guard they would become ``log(1.01)``).

    Parameters
    ----------
    props : pandas.DataFrame
        Table whose columns are cuisines. When ``adj`` is true it must contain
        every cuisine named in ``weights``; a missing one raises ``KeyError``.
    adj : bool, default True
        Multiply each weighted cuisine column by its weight.
    smoothing : float, default 0.2
        Constant added to the denominator of the ratio.
    weights : dict[str, float] or None
        Per-cuisine multipliers. ``None`` selects the hand-tuned table below.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``props``; ``props`` itself
        is not modified.
    """
    if weights is None:
        weights = {
            # drop
            'british': .905,
            'cajun_creole': .85,
            #'chinese': .99,
            'greek': .99,
            'indian': .94,
            'irish': .96,
            'jamaican': .915,
            'korean': .98,
            'moroccan': .87,
            'russian': .9,
            'spanish': .99,
            'thai': .96,
            'vietnamese': .94,
            # boost
            'brazilian': 1.02,
            'filipino': 1.02,
            'french': 1.04,
            'italian': 1.16,
            'japanese': 1.155,
            'mexican': 1.05,
            'southern_us': 1.045
        }

    def smooth(x):
        return log(1.01 + (x / (smoothing + x))) if x else 0

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
    # use whichever this pandas version provides.
    elementwise = props.map if hasattr(pd.DataFrame, 'map') else props.applymap
    points = elementwise(smooth)
    if adj:
        for cuisine, weight in weights.items():
            points[cuisine] = weight * points[cuisine]
    return points

In [None]:
# Persist the combined recipe + score table (opaque helper; destination and
# format are decided inside save_output -- see feature_helpers).
save_output(output)