In [None]:
# imports

#standard
from collections import deque

# extra
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# local
from feature_helpers import *

# display settings: every pandas display option used by this notebook, applied
# from one table so the values are easy to scan and tune
for _option, _value in {
    'display.max_columns': 21,
    'display.max_colwidth': 400,
    'display.precision': 4,
    'display.width': 1600,
}.items():
    pd.set_option(_option, _value)

In [None]:
# Load the recipe table. `load_clean_data` comes from the wildcard import of
# feature_helpers; later cells treat the result as a pandas DataFrame with
# `cuisine`, `ingredients` and `strings` columns.
recipes = load_clean_data()

In [None]:
# Run remove_dupes (feature_helpers) over both per-recipe columns, strings first.
# NOTE(review): presumably de-duplicates the entries of each recipe — confirm in feature_helpers.
for _field in ('strings', 'ingredients'):
    recipes[_field] = remove_dupes(recipes[_field])

In [None]:
# Run add_combos (feature_helpers) over both per-recipe columns, strings first.
# NOTE(review): presumably appends combination features to each recipe — confirm in feature_helpers.
for _field in ('strings', 'ingredients'):
    recipes[_field] = add_combos(recipes[_field])

In [None]:
# String-feature pipeline: counts -> rare-feature merge -> scaled rates -> proportions.
# All helpers come from feature_helpers; statement order is kept as-is because
# the `.loc[...] = 0.0` row assignments must precede the steps that read them.
string_counts = make_counts(recipes, field='strings')
string_merged, string_renamed = merge_rare_features(string_counts, 1, 3, 'raretype')

# The 'test' column is removed before scaling by per-cuisine recipe counts.
string_merged = string_merged.drop(columns=['test'])
_cuisine_sizes = recipes['cuisine'].value_counts()
string_rates = scale_counts(string_merged, _cuisine_sizes)
string_rates.loc['rarecombotype'] = 0.0   # set/add the 'rarecombotype' row to all zeros

recipes['strings'] = remove_dupes(recipes['strings'])

string_props = get_proportions(string_rates)
string_props.loc['rarecombotype'] = 0.0

# Apply the rename mapping returned by merge_rare_features to every recipe,
# then de-dupe again (renaming may map several names onto one — TODO confirm).
recipes['strings'] = remove_dupes(
    recipes['strings'].map(lambda names: update_names(names, string_renamed))
)

In [None]:
# Per-cuisine weight pairs handed to reweight for the string-based proportions.
# NOTE(review): the meaning of the two numbers in each pair is defined by
# reweight in feature_helpers — confirm there before tuning.
string_prop_weights = {
    cuisine: [first, second]
    for cuisine, first, second in (
        ('brazilian', 0.98, 0.4),
        ('british', 1.00, 0.5),
        ('cajun_creole', 0.89, 0.5),
        ('chinese', 0.99, 0.4),
        ('filipino', 1.03, 0.4),
        ('french', 1.07, 0.4),
        ('greek', 0.97, 0.4),
        ('indian', 0.93, 0.4),
        ('irish', 0.93, 0.4),
        ('italian', 1.05, 0.3),
        ('jamaican', 0.86, 0.4),
        ('japanese', 1.16, 0.3),
        ('korean', 0.95, 0.4),
        ('mexican', 1.16, 0.4),
        ('moroccan', 0.86, 0.5),
        ('russian', 0.92, 0.5),
        ('southern_us', 1.07, 0.4),
        ('spanish', 1.02, 0.4),
        ('thai', 0.97, 0.4),
        ('vietnamese', 0.93, 0.4),
    )
}
string_points = reweight(string_props, string_prop_weights)


def _mean_string_points(recipe):
    """Column-wise mean of the string_points rows named in one recipe's `strings`."""
    return string_points.loc[recipe.strings].mean()


# One row of scores per recipe, then the row-wise rank / ratio transforms.
string_scores = recipes.apply(_mean_string_points, axis=1)
string_ranks = string_scores.apply(rank, axis=1)
string_ratios = string_scores.apply(ratio, axis=1)

In [None]:
# Ingredient-feature pipeline, mirroring the string pipeline:
# counts -> rare-feature merge -> scaled rates -> proportions.
# Statement order is kept as-is; the `.loc[...] = 0.0` row assignments must
# precede the steps that read them.
counts = make_counts(recipes)
rare_merged, renamed = merge_rare_features(counts, 1, 3, 'raretype')

# The 'test' column is removed before scaling by per-cuisine recipe counts.
rare_merged = rare_merged.drop(columns=['test'])
_cuisine_sizes = recipes['cuisine'].value_counts()
rates = scale_counts(rare_merged, _cuisine_sizes)
rates.loc['rarecombotype'] = 0.0   # set/add the 'rarecombotype' row to all zeros

recipes['ingredients'] = remove_dupes(recipes['ingredients'])

props = get_proportions(rates)
props.loc['rarecombotype'] = 0.0

# Apply the rename mapping returned by merge_rare_features to every recipe,
# then de-dupe again (renaming may map several names onto one — TODO confirm).
recipes['ingredients'] = remove_dupes(
    recipes['ingredients'].map(lambda names: update_names(names, renamed))
)

In [None]:
# Per-cuisine weight pairs handed to reweight for the ingredient proportions.
# NOTE(review): the meaning of the two numbers in each pair is defined by
# reweight in feature_helpers — confirm there before tuning.
prop_weights = {
    cuisine: [first, second]
    for cuisine, first, second in (
        ('brazilian', 0.99, 0.4),
        ('british', 1.02, 0.5),
        ('cajun_creole', 0.83, 0.5),
        ('chinese', 0.97, 0.4),
        ('filipino', 1.04, 0.4),
        ('french', 1.08, 0.4),
        ('greek', 0.93, 0.4),
        ('indian', 0.91, 0.5),
        ('irish', 0.96, 0.4),
        ('italian', 1.04, 0.3),
        ('jamaican', 0.97, 0.5),
        ('japanese', 1.08, 0.3),
        ('korean', 0.96, 0.4),
        ('mexican', 1.04, 0.4),
        ('moroccan', 0.87, 0.5),
        ('russian', 0.97, 0.5),
        ('southern_us', 1.06, 0.4),
        ('spanish', 0.99, 0.4),
        ('thai', 0.91, 0.4),
        ('vietnamese', 0.97, 0.5),
    )
}
points = reweight(props, prop_weights)


def _score_recipe(recipe):
    """Score one recipe row against the reweighted `points` table via make_scores."""
    return make_scores(recipe, points)


scores = recipes.apply(_score_recipe, axis=1)
# Only the first 20 score columns are ranked (presumably one per cuisine — the
# weight table above has 20 entries; any further columns come from make_scores).
ranks = scores.iloc[:, :20].apply(rank, axis=1)

In [None]:
# Keep only the point values of at least 0.2; everything else becomes 0.
_is_top = points >= .2
top_points = points[_is_top].fillna(0)


def _mean_top_points(recipe):
    """Column-wise mean of the top_points rows named in one recipe's `ingredients`."""
    return top_points.loc[recipe.ingredients].mean()


top_scores = recipes.apply(_mean_top_points, axis=1)

# Element-wise `inverse` (feature_helpers) of the first 20 score columns,
# multiplied into the top scores (pandas aligns on index and column labels).
_inverse_scores = scores.iloc[:, :20].applymap(inverse)
top_rates = top_scores * _inverse_scores

In [None]:
# Per-cuisine weight pairs handed to reweight for the ingredients-only proportions.
# Entry order is preserved exactly (note 'spanish' precedes 'southern_us' here).
# NOTE(review): the meaning of the two numbers in each pair is defined by
# reweight in feature_helpers — confirm there before tuning.
ing_weights = {
    cuisine: [first, second]
    for cuisine, first, second in (
        ('brazilian', 0.99, 0.4),
        ('british', 1.0, 0.5),
        ('cajun_creole', 0.86, 0.5),
        ('chinese', 0.97, 0.4),
        ('filipino', 1.04, 0.4),
        ('french', 1.08, 0.4),
        ('greek', 0.95, 0.4),
        ('indian', 0.94, 0.5),
        ('irish', 0.94, 0.4),
        ('italian', 1.05, 0.3),
        ('jamaican', 0.96, 0.5),
        ('japanese', 1.1, 0.3),
        ('korean', 0.96, 0.4),
        ('mexican', 1.05, 0.4),
        ('moroccan', 0.85, 0.5),
        ('russian', 0.95, 0.5),
        ('spanish', 0.99, 0.4),
        ('southern_us', 1.05, 0.4),
        ('thai', 0.91, 0.4),
        ('vietnamese', 0.98, 0.5),
    )
}
# ings_only (feature_helpers) filters `props` before reweighting.
# NOTE(review): presumably drops the combo rows — confirm in feature_helpers.
ing_points = reweight(ings_only(props), ing_weights)


def _mean_ing_points(recipe):
    """Column-wise mean of the ing_points rows named in one recipe's `ingredients`."""
    return ing_points.loc[recipe.ingredients].mean()


ing_scores = recipes.apply(_mean_ing_points, axis=1)
ing_ranks = ing_scores.apply(rank, axis=1)

In [None]:
def standardize_ranks(ranks):
    """Spread one row of ranks evenly over [0, 1).

    The labels of `ranks` are ordered by ascending value and paired with
    0, 1/n, 2/n, ..., (n-1)/n, where n = len(ranks).

    Parameters
    ----------
    ranks : pd.Series
        One row of rank values, indexed by label.

    Returns
    -------
    pd.Series
        n evenly spaced floats indexed by the labels of `ranks` in ascending
        order of their values. An empty input yields an empty Series.
    """
    n = len(ranks)
    # Divide an integer range by n rather than calling np.arange(0, 1, 1/n):
    # with a float step the length is ceil(1 / (1/n)), which rounds up to n + 1
    # for some n (e.g. 49) and makes the Series constructor raise on a
    # values/index length mismatch.
    return pd.Series(np.arange(n) / n, index=ranks.sort_values().index)
# Row-wise standardized ranks, then the group / comparison feature tables that
# feature_helpers derives from the same (still unprefixed) `ranks` frame.
std_ranks = ranks.apply(standardize_ranks, axis=1)
group_scores, comp_ratios = make_group_features(ranks), make_comparison_features(ranks)

In [None]:
def _prefix_once(frame, prefix):
    """Return `frame` with `prefix` on its column labels, never stacking it.

    Each feature table below is rebound to its own prefixed version, so a plain
    `add_prefix` would produce 'rank_rank_...' labels whenever this cell is
    re-run. A frame whose columns all already start with `prefix` is returned
    unchanged, which makes the cell idempotent.
    """
    labels = [str(label) for label in frame.columns]
    if labels and all(label.startswith(prefix) for label in labels):
        return frame
    return frame.add_prefix(prefix)


# Give every feature table a distinct column prefix so the tables can be
# concatenated side by side without label collisions.
ranks = _prefix_once(ranks, 'rank_')
string_scores = _prefix_once(string_scores, 'str_')
string_ranks = _prefix_once(string_ranks, 'strrk_')
string_ratios = _prefix_once(string_ratios, 'strrt_')
std_ranks = _prefix_once(std_ranks, 'stdr_')
top_scores = _prefix_once(top_scores, 'top_')
top_rates = _prefix_once(top_rates, 'topr_')
ing_scores = _prefix_once(ing_scores, 'ing_')
ing_ranks = _prefix_once(ing_ranks, 'ingr_')

In [None]:
# All feature tables, side by side, in their final column order.
_feature_frames = [
    string_scores, string_ranks, string_ratios,
    scores, ranks, std_ranks,
    top_scores, top_rates,
    ing_scores, ing_ranks,
    group_scores, comp_ratios,
]
features = pd.concat(_feature_frames, axis=1)

In [None]:
# Full table: the recipe columns followed by every engineered feature column.
output = pd.concat((recipes, features), axis=1)
output.head(5)

In [None]:
# Rolling history of the six most recent error counts; the evaluation cell
# pushes new counts onto the left, so the oldest falls off the right.
last_e = deque([0] * 6, maxlen=6)

In [None]:
# Evaluation frame: the recipe columns followed by the string-score columns.
# NOTE: this rebinds `output`, replacing the full feature table built earlier.
output = pd.concat((recipes, string_scores), axis=1)
def get_preds(recipe, start=3, stop=23, prefix='str_'):
    """Return ``(predicted, actual)`` cuisine labels for one row.

    The prediction is the column label holding the largest value within the
    positional slice ``recipe.iloc[start:stop]``; the actual label is
    ``recipe.cuisine``.

    Parameters
    ----------
    recipe : pd.Series
        One row of the evaluation frame: recipe columns followed by score columns.
    start, stop : int
        Positional bounds of the score columns. The defaults (3, 23) are the
        original hard-coded values: three leading recipe columns and 20 scores.
    prefix : str
        Feature prefix stripped from the winning column label. The score
        columns are renamed with 'str_' by the prefix cell, so without
        stripping the prediction ('str_italian') could never equal the cuisine
        label ('italian') and every row would be counted as an error.
        Unprefixed labels pass through unchanged.

    Returns
    -------
    tuple
        ``(predicted_label, recipe.cuisine)``.
    """
    maxpos = recipe.iloc[start:stop].values.argmax()
    pred = recipe.index[start + maxpos]
    if prefix and isinstance(pred, str) and pred.startswith(prefix):
        pred = pred[len(prefix):]
    return (pred, recipe.cuisine)
# Score only labeled rows; rows whose cuisine is "test" have no ground truth.
# The filter is evaluated once and reused below.
labeled = output.query('cuisine != "test"')
preds = labeled.apply(get_preds, axis='columns', result_type='expand')
# For a misclassified row, the predicted cuisine gains a false positive and the
# true cuisine gains a false negative.
preds.columns = ['falpos', 'falneg']
e = preds.query('falpos != falneg')
errs = len(e)
last_e.appendleft(errs)
print(errs, 1 - errs / len(labeled), last_e) # 6900, strings:6700
# Take labels from BOTH columns: confusion_matrix ignores any pair whose true
# or predicted label is missing from `labels`, so using only the true labels
# would drop errors whose predicted cuisine never occurs as a true label.
labels = sorted(set(e.falneg) | set(e.falpos))
cnf = confusion_matrix(e.falneg, e.falpos, labels=labels)
plot_cnf(cnf, labels)
falpos_counts, falneg_counts = e.falpos.value_counts(), e.falneg.value_counts()
# fill_value=0 keeps the total defined for a cuisine that appears on only one
# side; plain `+` would give NaN there.
pd.concat(
    [
        falpos_counts,
        falneg_counts,
        falpos_counts.add(falneg_counts, fill_value=0),
        labeled.cuisine.value_counts(),
    ],
    axis='columns',
    sort=False,
    keys=['falpos', 'falneg', 'total', 'recipes'],
)

In [None]:
# Save the complete feature table. It is rebuilt from `recipes` and `features`
# here because the evaluation cell above rebinds `output` to recipes + string
# scores only; saving that frame would silently drop every other feature.
output = pd.concat([recipes, features], axis='columns')
save_output(output)