In [None]:
# Imports
import ast
from collections import defaultdict, Counter
import csv
from functools import lru_cache
#from helpers import *
import numpy as np
from operator import itemgetter
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Alternative entry point: reload a previously saved, already-cleaned data set
# instead of re-running the cleaning cells.
def _parse_ingredients(cell):
    """Turn the CSV text of one 'ingredients' cell back into a list.

    The cell is presumably the ``str()`` of a Python list, e.g.
    ``"['salt', 'olive oil']"`` (that is the shape the original slice-and-split
    parser assumed). ``ast.literal_eval`` inverts that exactly: an empty list
    stays ``[]`` instead of becoming ``['']``, and items that ``repr`` wrapped
    in double quotes because they contain an apostrophe are recovered intact.

    Cells that are not a valid list literal fall back to the original
    slice-and-split parsing, so previously readable files still load.
    """
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list):
        return parsed
    return cell[2:-2].split("', '")


data = pd.read_csv('data/data_cleaned.csv', header=0, index_col=0, encoding='utf-8',
                   converters={'ingredients': _parse_ingredients})
data.head()

In [None]:
# Load the raw train and test recipe files; each is parsed as a list of JSON records.
train = pd.read_json('data/train.json', orient='records', encoding='utf-8')
test = pd.read_json('data/test.json', orient='records', encoding='utf-8')

In [None]:
# Index both frames by recipe id, tag the test rows with the placeholder
# cuisine 'test', and stack them so cleaning is applied to both at once.
train = train.set_index('id')
test = test.set_index('id')
test.insert(0, 'cuisine', 'test')
data = pd.concat([train, test])

In [None]:
@lru_cache(maxsize=4096)
def clean_phrase(orig_phrase):
    """Normalize one raw ingredient phrase (results are memoized).

    Relies on these names, which are not defined in the visible cells:
    ``char_pattern`` and ``of_pattern`` (compiled regexes),
    ``spellcheck_compiled`` (iterable of ``(compiled_regex, replacement)``),
    ``words_to_sub`` and ``phrases_to_sub`` (mappings of replacements).

    Steps, in order: lowercase; delete characters matched by ``char_pattern``;
    rewrite '&' and ' n ' as ' and ' and '-' as a space; keep only the text
    before the first ',' and before the first ' for '; move a ' with x' /
    ' in x' tail in front of the head; apply the spelling regexes; replace
    whole words, then phrases; finally apply ``of_pattern`` -> '-of-'.
    """
    text = char_pattern.sub('', orig_phrase.lower())

    # standardize 'n' and '&' to 'and'; '-' to ' '
    text = text.replace('&', ' and ').replace(' n ', ' and ').replace('-', ' ')

    # remove prep instructions: drop everything from the first ',' or ' for '
    text = text.partition(',')[0].partition(' for ')[0]

    # 'a with b' -> 'b a', then 'a in b' -> 'b a'. Only the first two pieces
    # are kept; anything after a second occurrence of the marker is dropped.
    for marker in (' with ', ' in '):
        pieces = text.split(marker)
        if len(pieces) > 1:
            text = pieces[1] + ' ' + pieces[0]

    # hacky spelling correction
    for pattern, replacement in spellcheck_compiled:
        text = pattern.sub(replacement, text)

    # whole-word substitutions (this also collapses runs of whitespace)
    text = ' '.join(
        words_to_sub[token] if token in words_to_sub else token
        for token in text.split()
    )

    # phrase substitutions; the replacement is padded with a space on each side
    for old, new in phrases_to_sub.items():
        text = (' ' + new + ' ').join(text.split(old))

    # sub ' of (the)? ' with '-of-'
    return of_pattern.sub('-of-', text)

In [None]:
# Clean every ingredient phrase of every recipe; repeated phrases hit clean_phrase's cache.
phrases_cleaned = data.ingredients.map(lambda phrases: [clean_phrase(phrase) for phrase in phrases])

In [None]:
def clean_words(phrases):
    """Collapse one recipe's cleaned phrases into a list of unique word features.

    Uses helpers that are not defined in the visible cells:
    ``correct_spelling``, ``segment_word``, ``remove_dupes``, ``lemmatize``,
    plus the lookups ``words_to_segment`` and ``lang_trans``.

    For each non-empty phrase:
      * a hyphenated token is kept verbatim as a feature and replaced in the
        phrase by the placeholder 'below' (labelled a stopword by the author,
        so presumably discarded by ``lemmatize``);
      * other tokens longer than 4 characters are spell-corrected, and tokens
        listed in ``words_to_segment`` are replaced by their segmentation
        (computed from the original token, overriding the spelling fix);
      * the tokens are de-duplicated, re-joined and lemmatized;
      * every lemma found in ``lang_trans`` also contributes '<value>-l'.

    Returns the collected features as a list with no duplicates; the order
    is whatever set iteration produces.
    """
    features = set()
    for phrase in phrases:
        if not phrase:
            continue
        tokens = phrase.split()
        for pos, token in enumerate(tokens):
            if '-' in token:
                features.add(token)
                tokens[pos] = 'below' # stopword
            else:
                if len(token) > 4:
                    tokens[pos] = correct_spelling(token)
                if token in words_to_segment:
                    tokens[pos] = segment_word(token)
        lemmas = lemmatize(' '.join(remove_dupes(tokens)))
        if not lemmas:
            continue
        features.update(
            '{}-l'.format(lang_trans[lemma]) for lemma in lemmas if lemma in lang_trans
        )
        #if len(split) > 1:
        #    res.add(' '.join(split[-2:]))
        features.update(lemmas)
    return list(features)

In [None]:
# Reduce each recipe's cleaned phrases to a duplicate-free list of word features.
words_cleaned = phrases_cleaned.map(clean_words)

In [None]:
# Flatten the per-recipe word lists into one list of all word occurrences.
words = [word for recipe_words in words_cleaned for word in recipe_words]
len(words)

In [None]:
# Count occurrences per word. clean_words emits each word at most once per
# recipe, so a count is the number of recipes containing that word.
word_counts = Counter(words)
len(word_counts)

In [None]:
# Words that occur in fewer than 3 recipes are treated as rare.
rare = {word for word, count in word_counts.items() if count < 3}
len(rare)

In [None]:
# Remove the rare words from the vocabulary counts. Deleting a missing key from
# a Counter does not raise, so re-running this cell is harmless.
for word in rare:
    del word_counts[word]
len(word_counts)

In [None]:
# Per recipe, keep only the words that are not in the rare set.
nonrare = words_cleaned.map(lambda recipe_words: [word for word in recipe_words if word not in rare])

In [None]:
# Overwrite the raw ingredient phrases with the cleaned, non-rare word lists.
data.ingredients = nonrare

In [None]:
#data.to_csv('data/data_cleaned.csv', header=True, encoding='utf-8')

In [None]:
# Make tfidfs

In [None]:
# Distinct cuisine labels ordered by row count; this includes the 'test'
# placeholder label carried by the unlabelled rows.
cuisines = data.cuisine.value_counts().index

In [None]:
ing_df = data[['cuisine', 'ingredients']]

In [None]:
# Pool the word lists of all recipes by cuisine: one long token list per label.
docs_list = defaultdict(list)
for cuisine, ingredients in zip(ing_df.cuisine, ing_df.ingredients):
    docs_list[cuisine].extend(ingredients)

In [None]:
len(docs_list['southern_us'])

In [None]:
# Word counts of the pooled Italian document.
it = Counter(docs_list['italian'])

In [None]:
# Word counts pooled over the French, Southern-US and Russian documents.
fr = Counter(docs_list['french'] + docs_list['southern_us'] + docs_list['russian'])

In [None]:
# Shrink the Italian count of every word that also occurs in the pooled
# French/Southern-US/Russian counts: subtract floor(it^2 / (it + pooled)),
# never going below zero.
for word, pooled_count in fr.items():
    if word not in it:
        continue
    it_count = it[word]
    reduced = it_count - it_count ** 2 // (it_count + pooled_count)
    it[word] = max(reduced, 0)

In [None]:
# Expand the adjusted counts back into a token list: each word is repeated
# count times, and words whose count dropped to zero disappear.
new_it = list(it.elements())

In [None]:
docs_list['italian'] = new_it

In [None]:
# One space-joined document per real cuisine; the 'test' placeholder is excluded.
docs = {
    cuisine: ' '.join(tokens)
    for cuisine, tokens in docs_list.items()
    if cuisine != 'test'
}

In [None]:
# Parallel lists: cuisine labels and their documents, in the same order.
doc_keys = list(docs)
doc_vals = [docs[key] for key in doc_keys]

In [None]:
# Word-level TF-IDF over unigrams. The token pattern allows '-' inside a token
# so the hyphenated features built during cleaning ('...-of-...', '<x>-l')
# stay whole instead of being split at the hyphen.
vectorizer = TfidfVectorizer(encoding='utf-8', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, 
                             strip_accents=None, token_pattern=r'[\w-]+', analyzer='word', stop_words=None)

In [None]:
# Fit TF-IDF on one document per cuisine and label the result:
# rows are the cuisine keys, columns are the vocabulary terms.
tfidf_matrix = vectorizer.fit_transform(doc_vals)

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
names = list(_feature_names())

# pd.SparseDataFrame no longer exists in pandas >= 1.0, so build an ordinary
# DataFrame (one row per cuisine document, so the dense matrix is small).
# NOTE(review): the legacy SparseDataFrame built from a scipy matrix showed
# absent entries as NaN rather than 0; that is reproduced here so the
# mean/std/sum computed in later cells keep skipping them - confirm.
dense = tfidf_matrix.toarray()
dense[dense == 0] = np.nan
tfidfs = pd.DataFrame(dense, index=doc_keys, columns=names)

In [None]:
tfidfs

In [None]:
### Make output for tfidf features

In [None]:
summary = tfidfs.describe(percentiles=[.5])

In [None]:
# Per-term weight: coefficient of variation (std / mean) of the term's tf-idf
# values, taken from the describe() summary. Only old_get_scores uses it.
tfidf_weights = summary.loc['std'] / summary.loc['mean']

In [None]:
# Column totals: each term's tf-idf summed over all cuisine rows.
tfidf_sums = tfidfs.sum(axis=0)

In [None]:
# Each cell becomes that cuisine's share of the term's total tf-idf.
tfidf_pct = tfidfs / tfidf_sums

In [None]:
def old_get_scores(recipe):
    """Earlier scoring variant: weighted mean tf-idf of a recipe's words.

    ``recipe`` is a list of column labels of the module-level ``tfidfs``
    table. Each selected column is scaled by its entry in ``tfidf_weights``;
    the scaled values are summed per row and divided by the number of words
    in the recipe. Returns one value per row of ``tfidfs``.
    """
    weights = tfidf_weights[recipe]
    weighted_tfidfs = weights * tfidfs[recipe]
    row_totals = weighted_tfidfs.sum(axis=1)
    return row_totals / len(recipe)

def get_scores(recipe):
    """Score a recipe against every row of the module-level ``tfidf_pct`` table.

    ``recipe`` is a list of column labels (words). The selected columns are
    summed per row and divided by the number of words in the recipe, giving
    one value per row of ``tfidf_pct``.
    """
    selected = tfidf_pct[recipe]
    return selected.sum(axis=1) / len(recipe)

In [None]:
"""Errors: x is misclassified as y
brazilian as mexican, italian
british as southern_us, french
filipino as chinese
french as italian, southern_us
greek as italian
irish as southern_us
russian as italian, french
spanish as italian
"""

In [None]:
# Score every recipe with get_scores. Each call returns one value per cuisine
# row of tfidf_pct, so the result is presumably a frame with one column per
# cuisine - verify with the describe() cell that follows.
tfidfs_recipes = data.ingredients.apply(get_scores)

In [None]:
tfidfs_recipes.describe()

In [None]:
# Append the per-recipe score columns to the recipe table, aligned on the index.
# Re-running this cell appends the same columns again.
data = pd.concat((data, tfidfs_recipes), axis=1)
data.shape

In [None]:
data.head()

In [None]:
### Make output for ingredient features

In [None]:
# Binary bag-of-words matrix: one uint8 column per retained vocabulary word
# (in word_counts iteration order), set to 1 where the recipe contains the word.
col_names = list(word_counts)
indices = {word: position for position, word in enumerate(col_names)}
zeros = np.zeros((len(data), len(col_names)), dtype=np.uint8)
for row, recipe_words in enumerate(data.ingredients):
    present = [indices[word] for word in recipe_words]
    zeros[row, present] = 1

In [None]:
zeros_df = pd.DataFrame(zeros, columns=col_names, index=data.index)
zeros_df.shape

In [None]:
# Append the word-indicator columns to the recipe table, aligned on the index.
# Re-running this cell appends the same columns again.
data = pd.concat((data, zeros_df), axis=1)
data.shape

In [None]:
data.head()

In [None]:
# save

In [None]:
# Labelled rows only, without the raw word lists. The first remaining column
# is taken as the label and written out separately, without a header row.
train = data.query('cuisine != "test"').drop(columns=['ingredients'])
train_cuisine = train.iloc[:, 0]
train_cuisine.to_csv('data/cuisine.csv', header=False, encoding='utf-8')

In [None]:
# Drop the label so only feature columns remain, then write the training features.
# Not re-runnable as is: a second run fails because 'cuisine' is already gone.
train = train.drop(columns=['cuisine'])
train.to_csv('data/tfidf_it_red_train.csv', header=True, encoding='utf-8')

In [None]:
# Select the unlabelled rows, i.e. those whose first column (the cuisine label)
# holds the placeholder 'test'. The comparison has to be ==: with != this cell
# selected the labelled training rows and wrote them out as the test features.
test = data[data.iloc[:, 0] == 'test']
test = test.drop(columns=['cuisine', 'ingredients'])
test.to_csv('data/tfidf_it_red_test.csv', header=True, encoding='utf-8')

In [None]:
train.head()