In [None]:
# Imports
from collections import defaultdict, Counter
from csv import DictReader
#from googletrans import Translator
from helpers import *
import numpy as np
from operator import itemgetter
import pandas as pd
import re

### Numpy Print Options ###
# Widen numpy's array display for interactive inspection.
# The value in each trailing comment is numpy's default for that option.
np.set_printoptions(
    threshold=2000, # default: 1000
    edgeitems=10, # default: 3
    linewidth=180, # default: 75
    precision=2
)

In [None]:
# Load the raw train/test recipe records (same reader options for both files).
train, test = (
    pd.read_json(json_path, orient='records', encoding='utf-8')
    for json_path in ('data/train.json', 'data/test.json')
)

In [None]:
# Index both frames by recipe id, then tag every test row with the
# placeholder cuisine 'test' so the stacked frame can be split apart later.
train = train.set_index('id', drop=True)
test = test.set_index('id', drop=True)
test.insert(0, 'cuisine', 'test')
data = pd.concat([train, test], axis=0)

In [None]:
# hack instead of bigger solution that didn't work
spellcheck = {
    r'i cant believe? its? not': '',
    'sauc': 'sauce',
    'recip': 'recipe',
    'reduc': 'reduced',
    'jonshonville': 'johnsonville',
    'burgundi': 'burgundy',
    'jell o': 'gelatin',
    'jello': 'gelatin',
    'made with': 'with',
    'v': 'v_eight',
    'e fu': 'yi mein',
    'fatfree': 'fat free',
    'miracle whip': 'mayonaise'
}

In [None]:
# Replacement token -> list of phrases that should be collapsed to it.
# The '' key collects phrases that are deleted outright; the '*_and_*' keys
# protect multi-word names whose 'and' would otherwise be dropped as a
# stopword by clean_phrase.
# NOTE(review): clean_phrase applies the inverted table sequentially, so a
# short phrase listed before a longer one containing it (e.g. 'lipton' before
# 'lipton recipe secret') is presumably substituted first — confirm the
# iteration order produced by invert_dict_lists.
phrase_sub_classes = {
    # to remove
    '': [' oz ', 'bone in', 'skin on', 'head on', 'on the vine', 'i cant believe its not', 'refrigerated', 'store bought'],
    'brand': ['argo', 'artisan blends', 'best foods', 'big slice', 'bisquick', 'breakstone', 'campbells', 'country crock', 
              'crystal farms', 'duncan hines', 'egglands best', 'family harvest', 'farmhouse originals', 
              'foster farms', 'franks redhot', 'frenchs', 'good seasons', 'gourmet garden',
              'green giant', 'heinz', 'hellmann', 'hershey', 'hidden valley', 'home originals', 'honeysuckle white',
              'hurst', 'jimmy dean', 'johnsonville', 'king arthur', 'klondike', 'knorr', 'knudsen', 'kraft', 'land o lakes', 'lipton',
              'lipton recipe secret', 'nestle', 'nielsen massey', 'no stick', 'oscar mayer', 
              'pam', 'pasta sides', 'pepperidge farm', 'pillsbury', 
              'pompeian', 'pure wesson', 'ready rice', 'recipe creations', 'recipe secret', 'recipe secrets',
              'sargento', 'simply organic', 'spice islands', 'stonefire', 'a hint of', 'a touch of philadelphia', 
              'uncle bens', 'wish bone', 'wishbone', 'yoplait', 'lea and perrins', 'honey bunches of oats'],
    'easian brand': ['a taste of thai', 'conimex woksaus specials', 'soy vay', 'veri veri'],
    'italian brand': ['barilla', 'bertolli', 'delallo', 'old world style', 'ragu'],
    'mexican brand': ['old el paso', 'rotel', 'ro tel', 'taco bell'],
    'marketing': ['all natural', 'cholesterol free', 'classic', 'deli fresh', 'diet', 'family size', 'fat free',
                  'gluten free', 'less sodium', 'low fat', 'low sodium', 'non fat', 'original', 'premium', 
                  'reduced fat', 'reduced sodium', 'thick and chunky', 'traditional'],
    # multi-word names joined with underscores so their 'and' survives
    'half_and_half': ['half and half'],
    'mac_and_cheese': ['macaroni and cheese'],
    'bread_and_butter': ['bread and butter'],
    'm_and_ms': ['m and ms'],
    'pork_and_beans': ['pork and beans'],
    'sweet_and_sour': ['sweet and sour'],
}

In [None]:
# Lookup that clean_phrase iterates over as (pattern, replacement) pairs.
# Presumably {phrase: replacement token}; invert_dict_lists comes from the
# star-imported helpers module and is not visible here — verify.
phrases_to_sub = invert_dict_lists(phrase_sub_classes)

In [None]:
# Manual replacements for rare tokens, read from a two-column, header-less
# CSV: first column is the token looked up by encode_words, second column is
# the text that replaces it.
# 'utf-8-sig' drops a leading byte-order mark if the file has one, and
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open('data/rare_translations.csv', 'r', encoding='utf-8-sig', newline='') as file:
    rare_trans = {
        row['k']: row['v']
        for row in DictReader(file, fieldnames=['k', 'v'])
    }

In [None]:
def clean_phrase(orig_phrase):
    """Normalize one raw ingredient string into a space-separated phrase.

    The steps run in a fixed order and later steps rely on earlier ones:
    lowercase; strip punctuation, digits and mis-encoded characters;
    rewrite '&', ' n ' and '-'; cut off ' in ...', ' for ...' and ', ...'
    qualifiers; move a ' with x' clause to the front; apply the module-level
    ``spellcheck`` and ``phrases_to_sub`` substitution tables; join ' of '
    with underscores; drop the stopwords and/or/up; strip a word-final 's';
    drop single-character words; collapse whitespace.

    Args:
        orig_phrase: raw ingredient text (str).

    Returns:
        The cleaned phrase. It may be the empty string, in which case the
        original text is printed so it can be inspected.
    """
    phrase = orig_phrase.lower()

    # remove useless chars (trademark signs, mis-encoded bytes, punctuation, digits)
    phrase = re.sub(r'[®™’â€/\!\'%\(\)\.\d]', '', phrase)

    # standardize 'n' and '&' to 'and'; '-' to ' '
    phrase = re.sub(r' ?\& ?', ' and ', phrase)
    phrase = re.sub(r' n ', ' and ', phrase)
    phrase = re.sub(r'-', ' ', phrase)

    # remove 'in x' and 'for x'
    # 'bone in' is a descriptor that phrase_sub_classes lists for removal, not
    # the start of an 'in x' qualifier. Splitting on it would reduce
    # 'bone in chicken thighs' to just 'bone', so the ' in ' that directly
    # follows the word 'bone' is not treated as a split point.
    phrase = re.split(r'(?<!\bbone) in ', phrase)[0]
    phrase = phrase.split(' for ')[0]

    # remove prep instructions (everything after the first comma)
    phrase = phrase.split(',')[0]

    # move 'with x' phrase to front; anything after a second ' with ' is dropped
    split = phrase.split(' with ')
    if len(split) > 1:
        phrase = ' '.join([split[1], split[0]])

    # hacky spelling correction: keys are regex fragments matched on word
    # boundaries; the two (empty) boundary groups are echoed around the value
    for k, v in spellcheck.items():
        key = r'(\b)' + k + r'(\b)'
        phrase = re.sub(key, r'\1' + v + r'\2', phrase)

    # substitute phrases (applied one after another, so earlier entries can
    # change what later entries see)
    # TODO optimize?
    for k, v in phrases_to_sub.items():
        key = r'(\b)' + k + r'(\b)'
        phrase = re.sub(key, r'\1' + v + r'\2', phrase)

    # sub ' of (the)? ' with '_of_'
    phrase = re.sub(r' of (?:the )?', '_of_', phrase)

    # remove remaining stopwords
    phrase = re.sub(' and | or | up ', ' ', phrase)

    # remove final-'s' from all words
    phrase = re.sub(r's(\b)', r'\1', phrase)

    # remove single letters
    phrase = re.sub(r'\b\w\b', ' ', phrase)

    # trim whitespace
    phrase = re.sub(r'\s+', ' ', phrase.strip())

    # check for empty phrases
    if len(phrase) == 0:
        print(orig_phrase)

    return phrase

In [None]:
def encode_words(phrases):
    """Turn one recipe's ingredient strings into a list of unique word tokens.

    Each phrase is cleaned with ``clean_phrase`` and split on whitespace.
    Words longer than four characters go through ``correct_spelling``; a word
    found in ``rare_trans`` is replaced by its translation instead (the lookup
    uses the word as produced by ``clean_phrase`` and overrides the spelling
    correction). The last word of every phrase gets a '-h' suffix so it is
    kept distinct from the same word appearing elsewhere in a phrase.

    Args:
        phrases: iterable of raw ingredient strings.

    Returns:
        Sorted list of the distinct tokens. Sorting keeps the result, and
        everything derived from its order downstream, identical between
        interpreter runs; plain set iteration order for strings is not.
    """
    words = set()
    for phrase in phrases:
        phrase = clean_phrase(phrase)
        if not phrase:
            # nothing left after cleaning
            continue
        split = phrase.split()
        for i, word in enumerate(split):
            if len(word) > 4:
                split[i] = correct_spelling(word)
            if word in rare_trans:
                split[i] = rare_trans[word]
        # mark the final word of the phrase
        split[-1] = split[-1] + '-h'
        words.update(split)
    return sorted(words)

In [None]:
# Replace each recipe's raw ingredient list with its encoded token list.
data['ingredients'] = data['ingredients'].map(encode_words)

In [None]:
# Flatten every recipe's token list into one long list (duplicates kept).
words = []
for recipe_tokens in data.ingredients:
    words.extend(recipe_tokens)
len(words)

In [None]:
# Count how many recipes contain each token (tokens are unique per recipe),
# then show the number of distinct tokens.
word_counts = Counter(words)
len(word_counts)

In [None]:
# Tokens seen fewer than 5 times are collected as keys of `rare` (values are
# empty placeholders); tokens seen more than 24000 times go into `common`.
rare = {token: '' for token, count in word_counts.items() if count < 5}
common = {token for token, count in word_counts.items() if count > 24000}

In [None]:
# Number of tokens that fell below the rarity threshold.
len(rare)

In [None]:
# One feature column per distinct token, in the order the tokens were first
# counted; `indices` maps a token to its column position.
col_names = list(word_counts)
indices = {token: position for position, token in enumerate(col_names)}

In [None]:
# Indicator matrix, one row per recipe and one column per token, initially all 0.
zeros = np.zeros((len(data), len(col_names)), dtype=np.uint8)

In [None]:
# Set a 1 in every column whose token occurs in the recipe on that row.
for row_pos, recipe_tokens in enumerate(data.ingredients):
    token_cols = [indices[token] for token in recipe_tokens]
    zeros[row_pos, token_cols] = 1

In [None]:
# Wrap the indicator matrix in a DataFrame that shares data's index, with one
# column per token.
zeros_df = pd.DataFrame(zeros, columns=col_names, index=data.index)

In [None]:
# Sanity check: (number of recipes, number of distinct tokens).
zeros_df.shape

In [None]:
# Append the token indicator columns to the existing columns of data.
# NOTE(review): a token literally named 'cuisine' or 'ingredients' would
# produce a duplicate column label here — confirm none exists.
data = pd.concat((data, zeros_df), axis=1)

In [None]:
# Sanity check: column count should have grown by the number of token columns.
data.shape

In [None]:
# Training rows are the ones not carrying the 'test' placeholder; the token
# lists are no longer needed once the indicator columns exist.
train = data.query('cuisine != "test"').drop(columns=['ingredients'])
# Save the labels on their own (index + cuisine, no header row).
train_cuisine = train['cuisine']
train_cuisine.to_csv('data/cuisine.csv', encoding='utf-8', header=False)

In [None]:
# Drop the label column and write the training feature matrix.
train = train.drop(['cuisine'], axis='columns')
train.to_csv('data/rare_trans_cleaned_train.csv', encoding='utf-8', header=True)

In [None]:
# Test rows are the ones tagged with the 'test' placeholder; keep only the
# indicator columns and write them out.
test = data.query('cuisine == "test"').drop(columns=['cuisine', 'ingredients'])
test.to_csv('data/rare_trans_cleaned_test.csv', encoding='utf-8', header=True)

In [None]:
# Preview the first rows of the exported test feature matrix.
test.head()