In [None]:
# imports

# extra
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# local
from feature_helpers import *

# display settings
# pandas: display width of 1600, 4 digits of precision, up to 400 characters per column
pd.set_option('display.width', 1600)
pd.set_option('display.precision', 4)
pd.set_option('display.max_colwidth', 400)
# seaborn: white-grid style with the 'husl' palette
sns.set(palette='husl', style='whitegrid')

In [None]:
# Load the recipe table used by every cell below.
# NOTE(review): load_clean_data is not defined in this notebook; presumably it comes from
# the `feature_helpers` star import — confirm.
recipes = load_clean_data()

In [None]:
# Summary table consumed by the bar plots below, which read its `cuisine`, `recipe_count`,
# `recipe_length`, `head_count`, `unique_count` and `rare_count` fields.
# NOTE(review): get_metrics is an external helper; how each field is computed is not visible here.
metrics = get_metrics(recipes)

In [None]:
# Display the metrics table ordered by ascending unique_count (returns a sorted copy; `metrics` itself is unchanged).
metrics.sort_values('unique_count')

In [None]:
sns.set(style='whitegrid', palette='husl')
# Horizontal bar chart of recipe_count; bars follow the index labels of `metrics`
# taken in descending recipe_count order.
recipe_counts_plot = sns.catplot(
    data=metrics,
    x='recipe_count',
    y='cuisine',
    kind='bar',
    order=metrics['recipe_count'].sort_values(ascending=False).index,
    height=7,
    aspect=1.8,
)
recipe_counts_plot.set(xticks=range(0, 10500, 500));

In [None]:
# Horizontal bar chart of recipe_length; bars follow the index labels of `metrics`
# taken in descending recipe_length order.
recipe_lengths_plot = sns.catplot(
    data=metrics,
    x='recipe_length',
    y='cuisine',
    kind='bar',
    order=metrics['recipe_length'].sort_values(ascending=False).index,
    height=7,
    aspect=1.5,
)
recipe_lengths_plot.set(xticks=range(0, 15, 2));

In [None]:
# Horizontal bar chart of head_count; bars follow the index labels of `metrics`
# taken in descending head_count order.
head_counts_plot = sns.catplot(
    data=metrics,
    x='head_count',
    y='cuisine',
    kind='bar',
    order=metrics['head_count'].sort_values(ascending=False).index,
    height=7,
    aspect=1.5,
)
head_counts_plot.set(xticks=range(0, 122000, 20000));

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))
sns.set_color_codes('pastel')
# Two bar layers drawn on the same axes. Both use the ordering by descending unique_count,
# so each rare_count bar is drawn over the unique_count bar of the same row.
unique_counts_plot = sns.barplot(
    data=metrics,
    x='unique_count',
    y='cuisine',
    order=metrics['unique_count'].sort_values(ascending=False).index,
    color='b',
    label='unique ings',
    ax=ax,
)
rare_counts_plot = sns.barplot(
    data=metrics,
    x='rare_count',
    y='cuisine',
    order=metrics['unique_count'].sort_values(ascending=False).index,
    color='r',
    label='rare ings',
    ax=ax,
)
ax.legend(ncol=2, loc='lower right', frameon=True)
# barplot returns the axes it drew on, so the ticks are set on `ax` directly
ax.set(xticks=range(0, 1100, 100));

In [None]:
# for friendlier plotting
def reshape_tfidfs(tfidfs):
    """Flatten a wide score table into long (cuisine, tfidf) rows.

    Each row label of `tfidfs` is paired with every value in that row that is
    greater than 0.0001; smaller values are dropped. Row order, and value order
    within a row, are preserved.

    Returns a DataFrame with columns ['cuisine', 'tfidf'].
    """
    long_rows = [
        [label, score]
        for label, scores in tfidfs.iterrows()
        for score in scores
        if score > 0.0001
    ]
    return pd.DataFrame(long_rows, columns=['cuisine', 'tfidf'])
# Score table built from every recipe whose cuisine is not "test".
# NOTE(review): make_tfidfs and smooth_tfidfs are external helpers (presumably from feature_helpers);
# the exact scoring and the meaning of the .6 argument are not visible here — TODO confirm.
tfidfs = make_tfidfs(recipes.query('cuisine != "test"'))
smoothed_tfidfs = smooth_tfidfs(tfidfs, .6)
# Long-format (cuisine, tfidf) versions of both tables, used by the plots below.
reshaped_tfidfs = reshape_tfidfs(tfidfs)
reshaped_smoothed_tfidfs = reshape_tfidfs(smoothed_tfidfs)

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))
# Histogram (no KDE curve) of all retained tfidf values, drawn on the axes created above.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; histplot is the suggested
# replacement but picks bins differently, so it is left unchanged here.
all_tfidfs_plot = sns.distplot(reshaped_tfidfs['tfidf'], kde=False, ax=ax)
all_tfidfs_plot.set(xticks=np.arange(0, .46, .02));

In [None]:
fig, ax = plt.subplots(figsize=(16, 8))
smoothed_scores = reshaped_smoothed_tfidfs['tfidf']
max_tfidf = smoothed_scores.max()
# Histogram (no KDE curve) of the smoothed values; x ticks every 0.1 up to just past the maximum.
all_smoothed_tfidfs_plot = sns.distplot(reshaped_smoothed_tfidfs['tfidf'], kde=False, ax=ax)
all_smoothed_tfidfs_plot.set(xticks=np.arange(0, max_tfidf + .05, .1));

In [None]:
# Strip plot of tfidf values per cuisine; categories follow the index labels of `metrics`
# taken in descending unique_count order.
cuisine_tfidfs_plot = sns.catplot(
    data=reshaped_tfidfs,
    x='cuisine',
    y='tfidf',
    kind='strip',
    order=metrics['unique_count'].sort_values(ascending=False).index,
    height=7,
    aspect=2.5,
)
cuisine_tfidfs_plot.set(yticks=np.arange(0, .45, .05));

In [None]:
# Recomputed here so the y ticks always match the smoothed data being plotted.
max_tfidf = reshaped_smoothed_tfidfs['tfidf'].max()
# Strip plot of smoothed tfidf values per cuisine; categories follow the index labels of
# `metrics` taken in descending unique_count order.
smoothed_cuisine_tfidfs_plot = sns.catplot(
    data=reshaped_smoothed_tfidfs,
    x='cuisine',
    y='tfidf',
    kind='strip',
    order=metrics['unique_count'].sort_values(ascending=False).index,
    height=7,
    aspect=2.5,
)
smoothed_cuisine_tfidfs_plot.set(yticks=np.arange(0, max_tfidf + .1, .05));

In [None]:
def normalize_dist(dist):
    """Min-max scale a pandas Series onto the [0, 1] interval.

    Parameters
    ----------
    dist : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(dist - min) / (max - min)`` with the index and name of `dist`.
        When every value is identical (zero range) the result is all zeros
        instead of the NaNs a 0/0 division would produce.
    """
    dist_min = dist.min()
    dist_range = dist.max() - dist_min
    if dist_range == 0:
        # Constant input: avoid 0/0 and map everything to the lower bound.
        return pd.Series(0.0, index=dist.index, name=dist.name)
    # Vectorised arithmetic; same values as mapping the formula over each element.
    return (dist - dist_min) / dist_range

In [None]:
def make_indicators(recipes, features, renamed):
    """Build a 0/1 indicator table of features per recipe.

    Parameters
    ----------
    recipes : pandas.DataFrame
        Needs an `ingredients` column (an iterable of feature names per row)
        and a `cuisine` column.
    features : list
        Column names of the indicator table, in output order.
    renamed : mapping
        Ingredient name -> replacement feature name; names that are not keys
        are used unchanged.

    Returns
    -------
    pandas.DataFrame
        The `cuisine` column followed by one uint8 column per feature, indexed
        like `recipes`.

    Raises
    ------
    KeyError
        If a (renamed) ingredient is not present in `features`.
    """
    column_of = dict(zip(features, range(len(features))))
    matrix = np.zeros((recipes.shape[0], len(features)), dtype=np.uint8)
    for row, ings in enumerate(recipes.ingredients):
        for ingredient in ings:
            matrix[row, column_of[renamed.get(ingredient, ingredient)]] = 1
    flags = pd.DataFrame(matrix, index=recipes.index, columns=features)
    return pd.concat([recipes.cuisine, flags], axis='columns')

In [None]:
def get_feature_importances(train):
    """Fit a tuned random forest on `train` and return its feature importances.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'cuisine' column, used as the target; every other
        column is used as a feature.

    Returns
    -------
    pandas.Series
        `feature_importances_` of the refit best estimator, indexed by the
        feature column names.

    Side effects
    ------------
    Prints the best cross-validated score, the best estimator's score on the
    full training data, and the selected parameters.
    """
    X = train.drop(columns=['cuisine'])
    y = train['cuisine']
    rfc = RandomForestClassifier(min_samples_leaf=8, random_state=1, class_weight='balanced', n_estimators=200, criterion='gini')
    rfc_grid = {
        'max_depth': [20, 24],
        'min_samples_split': [15, 25, 35]
    }
    # Only 5 of the 6 grid combinations are sampled. Seeding the search makes
    # that choice, and therefore the returned importances, repeatable across
    # runs, matching the seeded forest above.
    search = RandomizedSearchCV(rfc, param_distributions=rfc_grid, refit=True, n_iter=5, cv=3, n_jobs=-1,
                                random_state=1)
    search.fit(X, y)
    best = search.best_estimator_
    print('Best score: {0:.3f}, train score: {1:.3f}'.format(search.best_score_, best.score(X, y)))
    print(search.best_params_)
    return pd.Series(best.feature_importances_, index=X.columns)

def merge_unused_features(recipes, counts, renamed, catchall):
    """Merge features that a random forest assigns zero importance, in two passes.

    Pass 1 merges only the zero-importance features whose name contains a '-'.
    Pass 2 refits on the merged feature set and merges every feature that
    still has zero importance.

    Parameters
    ----------
    recipes : pandas.DataFrame
        Passed to make_indicators (needs `ingredients` and `cuisine`).
    counts : pandas object
        Its index supplies the feature names for the first pass.
    renamed : mapping
        Existing ingredient -> feature renames; extended after each pass via merge_arrows.
    catchall
        Forwarded to merge_features.
        NOTE(review): presumably the name the merged features collapse into — confirm.

    Returns
    -------
    tuple
        ``(merged, renamed)`` after the second pass.
    """
    def zero_importance_features(feature_counts, renames):
        # Fit on indicators for the current feature set and list the features scored exactly 0.
        indicator_frame = make_indicators(recipes, feature_counts.index.to_list(), renames)
        scores = get_feature_importances(indicator_frame)
        return scores[scores == 0].index.to_list()

    # Pass 1: hyphenated zero-importance features only.
    hyphenated = [name for name in zero_importance_features(counts, renamed) if '-' in name]
    merged, renamed_update = merge_features(counts, hyphenated, catchall)
    renamed = merge_arrows(renamed, renamed_update)

    # Pass 2: everything that is still unused after the first merge.
    still_unused = zero_importance_features(merged, renamed)
    merged, renamed_update = merge_features(merged, still_unused, catchall)
    renamed = merge_arrows(renamed, renamed_update)

    return (merged, renamed)

# Run the two-pass merge with 'raretype' as the catch-all argument.
# NOTE(review): `rare_merged` and `renamed` are not defined in the cells visible above — confirm
# they exist before this cell under Restart & Run All.
unused_merged, renamed_update = merge_unused_features(recipes, rare_merged, renamed, 'raretype')
# NOTE(review): the second value returned by merge_unused_features has already been passed through
# merge_arrows(renamed, ...) inside the function, so it is combined with `renamed` again here —
# confirm merge_arrows is safe to apply twice.
renamed = merge_arrows(renamed, renamed_update)

In [None]:
def equalize_counts(counts, smooth_type='linear', smooth_intensity=.7):
    """Turn each row of `counts` into proportions of its row total, optionally smoothed.

    Parameters
    ----------
    counts : pandas.DataFrame
        Numeric table. Every row is multiplied by the reciprocal of its own
        sum; rows whose sum is zero come out as all zeros.
    smooth_type : str or None
        'linear', 'tanh', 'sqrt_sigmoid' or 'frac_sigmoid'. Any falsy value
        skips smoothing and returns the raw proportions.
    smooth_intensity : float
        Scale factor for 'linear' and denominator offset for 'frac_sigmoid';
        not used by 'tanh' or 'sqrt_sigmoid'.

    Returns
    -------
    pandas.DataFrame
        Same row and column labels as `counts`.

    Raises
    ------
    KeyError
        If `smooth_type` is truthy but not one of the supported names.
    """
    # Each function takes the whole proportions frame. Using numpy's tanh/sqrt
    # removes the dependency on bare `tanh`/`sqrt` names being in scope.
    smoothing_funcs = {
        'linear': lambda x: smooth_intensity * x,
        'tanh': np.tanh,
        'sqrt_sigmoid': lambda x: x / np.sqrt(1 + x**2),
        'frac_sigmoid': lambda x: x / (smooth_intensity + x)
    }
    total = counts.sum(axis='columns')
    # Zero totals get a zero multiplier instead of a division by zero.
    inverse_total = total.map(lambda x: 1 / x if x else 0)
    # Transposed so the per-row multipliers align with the column labels.
    proportions = counts.T * inverse_total
    if smooth_type:
        # Applied to the frame in one vectorised step; this avoids
        # DataFrame.applymap, which newer pandas versions deprecate.
        proportions = smoothing_funcs[smooth_type](proportions)
    return proportions.T

In [None]:
def train_validate_split(data, val_size, seed):
    """Split `data` into train and validation sets, sampling within each cuisine.

    For every `cuisine` group a fraction of ``val_size`` plus a small random
    extra (0, .01, .02 or .03) is drawn without replacement for validation;
    the remaining rows form the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'cuisine' column; rows are dropped by index label, so
        the index is expected to be unique.
    val_size : float
        Base fraction of each cuisine to hold out.
    seed : int
        Seeds both the per-group sampling and the extra-fraction draw, so the
        same seed always yields the same split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` where the X frames exclude the
        'cuisine' column and the y series are that column.
    """
    # Draw the jiggle from a RandomState tied to `seed` rather than the global
    # numpy RNG, which made the split sizes differ between runs.
    rng = seed if isinstance(seed, np.random.RandomState) else np.random.RandomState(seed)
    samples = []
    for cuisine, group in data.groupby('cuisine'):
        jiggle = rng.choice([0, .01, .02, .03])
        samples.append(group.sample(frac=val_size + jiggle, replace=False, random_state=seed, axis='index'))
    val = pd.concat(samples, axis='index')
    train = data.drop(index=val.index)
    X_train = train.drop(columns=['cuisine'])
    X_val = val.drop(columns=['cuisine'])
    y_train = train['cuisine']
    y_val = val['cuisine']
    return (X_train, y_train, X_val, y_val)

In [None]:
def test_hyperparams(data, clf, grid, metric, val_size, splits=3):
    """Search `grid` on several train/validation splits and group the winners by score.

    For each of `splits` seeds (0 .. splits-1) the data is split with
    train_validate_split and ``pf.bestFit`` is run over the expanded
    parameter grid with ``greater_is_better=True``.

    Returns
    -------
    collections.defaultdict
        Maps each best score to the list of best models that achieved it.

    NOTE(review): `pf` is not imported in this notebook's visible cells —
    presumably the parfit package; confirm.
    """
    candidates = ParameterGrid(grid)
    models_by_score = defaultdict(list)
    for split_seed in range(splits):
        X_train, y_train, X_val, y_val = train_validate_split(data, val_size, seed=split_seed)
        fit_result = pf.bestFit(
            clf, candidates, X_train, y_train, X_val, y_val,
            metric=metric, greater_is_better=True,
        )
        # Only the first two of the four returned items are used.
        best_model, best_score, _, _ = fit_result
        models_by_score[best_score].append(best_model)
    return models_by_score

In [None]:
def pct(v):
    """Return `v` scaled by 100 and truncated toward zero to an int (e.g. 0.256 -> 25)."""
    return int(v * 100)

def test(X, y, title, clf, sampler=None, splits=3):
    """Cross-validate `clf` with shuffled K-fold and print per-fold results.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target labels aligned with `X`.
    title
        Currently unused; kept so existing calls keep working.
    clf
        Estimator with ``fit``/``predict``; it is refit on every fold.
    sampler : optional
        Object with ``fit_resample(X, y)``; when given it is applied to the
        training part of each fold only.
    splits : int
        Number of folds.

    Prints the accuracy and the classification report of every fold.
    """
    # Imported locally under an alias: at notebook level the name `metrics` is
    # bound to the table returned by get_metrics(recipes), so calling
    # `metrics.accuracy_score` here would look the function up on that table
    # instead of on sklearn.metrics.
    from sklearn import metrics as sk_metrics

    kfold = KFold(n_splits=splits, shuffle=True, random_state=1)
    for train_i, test_i in kfold.split(X):
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        y_train, y_test = y.iloc[train_i], y.iloc[test_i]
        if sampler:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model = clf.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(sk_metrics.accuracy_score(y_test, preds))
        print(sk_metrics.classification_report(y_test, preds))

def get_errors(X, y, model, sort_col=None):
    """Return the observations that `model` predicts incorrectly.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        True labels, positionally aligned with `X`.
    model
        Fitted object exposing ``predict``.
    sort_col : optional
        Column name(s) to sort the result by; no sorting when falsy.

    Returns
    -------
    pandas.DataFrame
        The misclassified rows of `X`, followed by their true label and a
        'pred' column holding the model's prediction.

    Prints the number of errors found.
    """
    if X.shape[0]:
        # A single batched call replaces one predict() call per row.
        predictions = np.asarray(model.predict(X))
    else:
        # Nothing to predict; keep returning an empty result without calling the model.
        predictions = np.array([], dtype=object)
    error_rows = np.flatnonzero(predictions != y.to_numpy())
    errs = pd.concat([X.iloc[error_rows], y.iloc[error_rows]], axis=1)
    pred_col = pd.Series(predictions[error_rows], index=errs.index, name='pred')
    errs = pd.concat([errs, pred_col], axis=1)
    print('Errors:', errs.shape[0])
    if sort_col:
        errs = errs.sort_values(sort_col)
    return errs