In [None]:
# imports

# standard
from collections import Counter, defaultdict
from operator import itemgetter
import re

# extra
import numpy as np
import pandas as pd

# local
from feature_helpers import *

# display settings (attribute form of pd.set_option; same options, same values)
pd.options.display.max_columns = 21
pd.options.display.max_colwidth = 400
pd.options.display.precision = 4
pd.options.display.width = 1600

In [None]:
# Load the cleaned recipe data. load_clean_data is not defined in this notebook;
# presumably it comes from the feature_helpers star import — TODO confirm.
recipes = load_clean_data()

In [None]:
def get_recipe_features(ings):
    """Flatten a recipe's ingredients into one flat list of feature values.

    For every ingredient in ``ings``, in order, the result receives:

    * the first entry of ``ing.heads`` (the "head"), as-is;
    * one ``'<head>-<mod>'`` string for each remaining entry of ``ing.heads``;
    * for each of ``ing.states``, ``ing.brands`` and ``ing.langs`` that is
      truthy: a bare marker (``'_state'``, ``'_brand'``, ``'_lang'``) followed
      by one ``'<marker>-<value>'`` string per value.

    Duplicates are kept; an ingredient whose ``heads`` is empty raises
    ``IndexError``.

    Args:
        ings: iterable of ingredient objects exposing ``heads``, ``states``,
            ``brands`` and ``langs`` attributes.

    Returns:
        list of features in emission order.
    """
    # (attribute name, bare marker, per-value template), in emission order.
    tagged_attrs = (
        ('states', '_state', '_state-{}'),
        ('brands', '_brand', '_brand-{}'),
        ('langs', '_lang', '_lang-{}'),
    )
    collected = []
    for ingredient in ings:
        heads = ingredient.heads
        head = heads[0]
        collected.append(head)
        for modifier in heads[1:]:
            collected.append('{}-{}'.format(head, modifier))
        for attr_name, marker, template in tagged_attrs:
            values = getattr(ingredient, attr_name)
            if not values:
                continue
            collected.append(marker)
            collected.extend(template.format(value) for value in values)
    return collected

In [None]:
# Apply get_recipe_features to every entry of the `ingredients` column.
# Each entry must be an iterable of objects exposing heads/states/brands/langs,
# since those are the attributes get_recipe_features reads.
recipe_features = recipes.ingredients.map(get_recipe_features)

In [None]:
# Overwrite the `ingredients` column with the derived feature lists.
# NOTE(review): this mutates `recipes` in place, so re-running the mapping cell
# above afterwards would feed these feature lists (not the original ingredient
# objects) back into get_recipe_features; reload `recipes` before re-running.
recipes.ingredients = recipe_features

In [None]:
# Vocabulary: every distinct feature across all recipes, in sorted order.
all_features = sorted({feature for features in recipes.ingredients for feature in features})
# Column position of each feature in the indicator matrix.
feature_index = dict(zip(all_features, range(len(all_features))))
# One row per recipe, one column per feature; uint8 is enough for 0/1 flags.
feature_array = np.zeros((len(recipes), len(all_features)), dtype=np.uint8)
for row_i, features in enumerate(recipes.ingredients):
    # Flag all of this recipe's features in one assignment (repeats are harmless).
    columns = [feature_index[feature] for feature in features]
    feature_array[row_i, columns] = 1

In [None]:
# Wrap the indicator matrix in a DataFrame: rows reuse the recipes' index,
# columns are labelled with the sorted feature names.
feature_df = pd.DataFrame(feature_array, index=recipes.index, columns=all_features)

In [None]:
# Features present in fewer than this many recipes are treated as rare.
RARE_FEATURE_THRESHOLD = 10

# feature_df holds 0/1 flags, so summing down each column gives the number of
# recipes that contain that feature.
feature_counts = feature_df.sum(axis='index')
rare_features = feature_counts[feature_counts < RARE_FEATURE_THRESHOLD]
# Displayed as the cell output: how many features fall below the threshold.
len(rare_features)

In [None]:
# (rows, columns) of the feature frame at this point, for comparison with the
# shape displayed after the rare-feature columns are dropped.
feature_df.shape

In [None]:
# Remove the rare-feature columns. errors='ignore' keeps this cell re-runnable:
# feature_df is reassigned from itself, so on a second execution the columns are
# already gone and drop() would otherwise raise KeyError. The first-run result
# is unchanged because every label in rare_features.index is then still present.
feature_df = feature_df.drop(columns=rare_features.index, errors='ignore')
feature_df.shape

In [None]:
# Place the feature columns next to the recipe columns. Both frames share the
# same index (feature_df was built with index=recipes.index), so rows line up.
output = pd.concat([recipes, feature_df], axis='columns')

In [None]:
# Hand the combined frame to save_output. That helper is not defined in this
# notebook (presumably from feature_helpers), so the destination and file format
# are not visible here — TODO confirm.
save_output(output)