In [None]:
# Imports
from collections import defaultdict, Counter
import numpy as np
from operator import itemgetter
import pandas as pd
import re

### Numpy Print Options ###
# Widen numpy's array repr so long/wide arrays are shown more fully.
# The number after each '#' is the value noted by the original author
# (presumably numpy's default for that option).
np.set_printoptions(
    precision=2,
    linewidth=180,   # 75
    edgeitems=10,    # 3
    threshold=2000,  # 1000
)

In [None]:
# Both files are read with the same reader options, so spell them out once.
_json_opts = dict(orient='records', encoding='utf-8')
train = pd.read_json('data/train.json', **_json_opts)
test = pd.read_json('data/test.json', **_json_opts)

In [None]:
# Re-key both frames on their 'id' column (in place; the column is dropped).
train.set_index(keys='id', drop=True, inplace=True)
test.set_index(keys='id', drop=True, inplace=True)
# Give the test rows a placeholder 'cuisine' value of 'test' so the two frames
# can be stacked and told apart again later.
test.insert(loc=0, column='cuisine', value='test')
data = pd.concat([train, test], axis=0)

In [None]:
# Regex character class deleted from every phrase by clean_phrases:
# the ® and ™ symbols, '!', apostrophes, '%', parentheses and all digits.
chars_to_remove = r'[®™\!\'%\(\)\d]' # for re.sub

In [None]:
# Lower-case phrases that clean_phrases deletes from an ingredient phrase.
# The entries appear to be brand names, packaging/marketing descriptors and
# preparation words, plus one mis-encoded character sequence ('â€').
# NOTE: this is a set, so iteration order is not defined; several entries
# overlap (e.g. 'hidden valley' / 'hidden valley original').
phrases_to_remove = {
 'â€',
 'a taste of thai',
 'all natural',
 'argo',
 'artisan blend',
 'bertolli',
 'best food',
 'big slice',
 'bone in',
 'campbells',
 'cholesterol free',
 'classic',
 'conimex woksaus specials',
 'country crock',
 'crystal farms',
 'delallo',
 'deli fresh',
 'diet',
 'duncan hines',
 'e-fu',
 'family harvest',
 'family size',
 'farmhouse originals',
 'fat free',
 'finely shredded',
 'foster farms',
 'franks redhot',
 'frenchs',
 'gluten-free',
 'gluten free',
 'good seasons',
 'gourmet garden',
 'green giant',
 'heinz',
 'hellmann',
 'hidden valley',
 'hidden valley original',
 'home originals',
 'honeysuckle white',
 'hurst',
 'jimmy dean',
 'johnsonville',
 'king arthur',
 'klondike',
 'knorr',
 'knudsen',
 'kraft',
 'land o lakes',
 'less sodium',
 'lipton',
 'lipton recip secret',
 'lipton recipe secret',
 'low fat',
 'low-fat',
 'low sodium',
 'low-sodium',
 'made with',
 'no-stick',
 'old el paso',
 'old world style',
 'original',
 'oscar mayer',
 'oz',
 'pam',
 'pasta sides',
 'pillsbury',
 'pompeian',
 'premium',
 'pure wesson',
 'ready rice',
 'recipe creations',
 'reduced fat',
 'reduced sodium',
 'refrigerated',
 'rotel',
 'ro-tel',
 'sargento',
 'shredded',
 'simply organic',
 'skin on',
 'soy vay',
 'spice islands',
 'stonefire',
 'store bought',
 'taco bell',
 'thick and chunky',
 'thick n chunky',
 'veri veri',
 'with a hint of',
 'with a touch of philadelphia',
}

In [None]:
# Mapping of lower-case phrase -> replacement text, applied by clean_phrases
# before the removal pass. The keys look like brand names (written without
# apostrophes, since clean_phrases strips those first) and the values like the
# generic ingredient each one stands for.
phrases_to_sub = {
 'i cant believ its not butter': 'butter',
 'i cant believe its not butter': 'butter',
 'i cant believ it not butter': 'butter',
 'ragu': 'pasta sauce',
 'uncle bens': 'rice',
 'wish bone': 'dressing',
 'wish-bone': 'dressing',
 'yoplait': 'yogurt'
}

In [None]:
def _guarded_pattern(text):
    """Return regex source matching ``text`` literally, as whole word(s).

    The text is escaped so that it is never interpreted as regex syntax.
    When it starts / ends with an ASCII letter or digit, a guard is added on
    that side so it cannot match inside a longer word ('oz' must not hit
    'frozen' or 'mozzarella'). A trailing plural/possessive 's' is tolerated
    but not consumed ('best food' still matches in 'best foods'). Entries that
    begin or end with other characters (e.g. mis-encoded bytes) stay plain
    substring matches on that side.
    """
    pattern = re.escape(text)
    if re.match(r'[a-z0-9]', text[:1]):
        pattern = r'(?<!\w)' + pattern
    if re.match(r'[a-z0-9]', text[-1:]):
        pattern += r'(?=s?(?!\w))'
    return pattern


# One-entry cache: (table contents) -> compiled regexes, so the tables are only
# compiled once per distinct state rather than once per phrase.
_PHRASE_TABLES = {}


def _phrase_tables():
    """Return ``(substitutions, removal_regex)`` for the current phrase tables.

    ``substitutions`` is a list of ``(compiled_pattern, replacement)`` pairs in
    ``phrases_to_sub`` order. ``removal_regex`` is a single alternation of all
    ``phrases_to_remove`` entries, longest first, or ``None`` when there are no
    entries. The result is rebuilt whenever either table's contents change.
    """
    key = (tuple(phrases_to_sub.items()), frozenset(phrases_to_remove))
    if key not in _PHRASE_TABLES:
        _PHRASE_TABLES.clear()
        substitutions = [
            (re.compile(_guarded_pattern(old)), new)
            for old, new in phrases_to_sub.items()
        ]
        # Longest first (ties broken alphabetically): deterministic regardless
        # of set iteration order, and 'hidden valley original' wins over
        # 'hidden valley'.
        ordered = sorted(
            (p for p in phrases_to_remove if p),
            key=lambda p: (-len(p), p),
        )
        removal = None
        if ordered:
            removal = re.compile('|'.join(_guarded_pattern(p) for p in ordered))
        _PHRASE_TABLES[key] = (substitutions, removal)
    return _PHRASE_TABLES[key]


def clean_phrases(orig_phrase):
    """Normalise one raw ingredient phrase.

    Steps, in order: lower-case; keep only the text before the first comma;
    delete the characters in ``chars_to_remove``; turn '&' into ' and ';
    apply ``phrases_to_sub``; delete ``phrases_to_remove``; move a trailing
    'with x' part to the front; drop stray single letters; collapse
    whitespace and strip.

    Args:
        orig_phrase: the raw ingredient string.

    Returns:
        The cleaned phrase (possibly an empty string). When the result is
        empty the original phrase is printed so it can be inspected.
    """
    substitutions, removal = _phrase_tables()
    phrase = orig_phrase.lower()
    # remove prep instructions: 'food, prep'
    phrase = phrase.split(',', 1)[0]
    phrase = re.sub(chars_to_remove, '', phrase)
    phrase = re.sub(r' ?\& ?', ' and ', phrase)
    for pattern, replacement in substitutions:
        phrase = pattern.sub(replacement, phrase)
    if removal is not None:
        phrase = removal.sub('', phrase)
    # move 'with x' phrase to front; split only on the first ' with ' so that
    # nothing after a second ' with ' is silently dropped
    head, sep, tail = phrase.partition(' with ')
    if sep:
        phrase = ' '.join([tail, head])
    # remove single letters (also cleans up the 's' left behind by plurals)
    phrase = re.sub(r'^\w | \w\b| \w ', ' ', phrase)
    phrase = re.sub(r'\s+', ' ', phrase)
    phrase = phrase.strip()
    # report phrases that were cleaned away entirely (checked on the final
    # value so whitespace-only leftovers are reported too)
    if not phrase:
        print(orig_phrase)
    return phrase

In [None]:
# Run clean_phrases over every phrase in each row's ingredient list.
data['ingredients'] = data['ingredients'].map(
    lambda phrases: [clean_phrases(item) for item in phrases]
)

In [None]:
# Tally how often each cleaned ingredient phrase occurs across all rows.
# The phrases are flattened straight from `data.ingredients` here, so this
# cell no longer depends on an `ings` variable that no earlier cell defines
# (it raised NameError on a fresh kernel).
ing_counts = Counter(
    ing for ingredient_list in data.ingredients for ing in ingredient_list
)
len(ing_counts)

In [None]:
# ing_names lists the distinct ingredients in ing_counts' iteration order;
# ing_map is the inverse lookup (name -> position in ing_names).
ing_names = list(ing_counts)
ing_map = {name: idx for idx, name in enumerate(ing_names)}

In [None]:
# Display the number of distinct ingredient names (one per key of ing_counts).
len(ing_names)

In [None]:
# Bucket the ingredient names by how many whitespace-separated words they
# contain, then show which word counts occur.
phrase_lengths = defaultdict(list)
for name in ing_names:
    word_count = len(name.split())
    phrase_lengths[word_count].append(name)
phrase_lengths.keys()

In [None]:
# Indicator matrix: one row per row of `data`, one column per distinct
# ingredient, initialised to 0 (uint8 keeps it compact).
zeros = np.zeros(shape=(len(data), len(ing_counts)), dtype=np.uint8)

In [None]:
# For each row, set a 1 in the column of every ingredient that row lists.
for ri, ings in enumerate(data.ingredients):
    cols = [ing_map[ing] for ing in ings]
    zeros[ri, cols] = 1

In [None]:
# Wrap the indicator matrix in a frame that shares `data`'s index and uses the
# ingredient names as column labels.
zeros_df = pd.DataFrame(data=zeros, index=data.index, columns=ing_names)

In [None]:
# Display (rows, columns) of the indicator frame.
zeros_df.shape

In [None]:
# Append the ingredient indicator columns to the right of the existing columns.
data = pd.concat(objs=[data, zeros_df], axis=1)

In [None]:
# Display (rows, columns) of `data` after the indicator columns were appended.
data.shape

In [None]:
# Training rows are those whose cuisine is not the 'test' placeholder; the raw
# ingredient lists are no longer needed once the indicator columns exist.
train = data.query('cuisine != "test"').drop(columns=['ingredients'])
# Write the labels to their own file: index and cuisine, without a header row.
train_cuisine = train['cuisine']
train_cuisine.to_csv('data/cuisine.csv', header=False, encoding='utf-8')

In [None]:
# Strip the label column so only the index and feature columns are exported.
train = train.drop(columns='cuisine')
train.to_csv(path_or_buf='data/phrases_cleaned_train.csv', header=True, encoding='utf-8')

In [None]:
# Test rows carry the 'test' placeholder; drop it along with the raw
# ingredient lists and export the remaining feature columns.
test = data.query('cuisine == "test"').drop(columns=['cuisine', 'ingredients'])
test.to_csv(path_or_buf='data/phrases_cleaned_test.csv', header=True, encoding='utf-8')

In [None]:
# Preview the first rows of the exported test features.
test.head()