In [12]:
import pandas as pd
from sklearn import model_selection
from collections import Counter
import itertools
from nltk.util import ngrams
import inflect
# Shared inflect engine; singular() uses it to look up singular noun forms.
p = inflect.engine()
# Share of the labelled data held out as the test split (passed as test_size to train_test_split).
fraction_of_test_data = 0.2
# Size of the word n-grams that use_ngrams() breaks longer ingredient names into.
n_grams = 2
# True: evaluate on a hold-out split of train.json; False: load test.json and write a submission CSV.
cross_validating = True

In [13]:
# Labelled recipes; later cells read its 'cuisine' and 'ingredients' columns.
data = pd.read_json('train.json')
if not cross_validating:
    # Recipes to predict for the submission; later cells read its 'ingredients' and 'id' columns.
    test_data = pd.read_json('test.json')

In [14]:
# Function changes text from Unicode to ASCII and to lower case
def encode_ascii(item):
    """ASCII-encode and lower-case a string, or every string in a list.

    Characters that cannot be represented in ASCII are dropped (the 'ignore'
    error handler). A list input yields a new list of converted entries; any
    other input is converted directly.
    """
    def _to_ascii_lower(text):
        return text.encode('ascii', 'ignore').lower()

    if not isinstance(item, list):
        return _to_ascii_lower(item)
    return [_to_ascii_lower(entry) for entry in item]

# Function changes all plurals to singular so ingredients like tomatoes and tomato are grouped together
def singular(row):
    """Return a new list with each ingredient replaced by its singular form.

    The singular form comes from the shared inflect engine ``p``. When
    ``p.singular_noun`` returns a falsy value for an ingredient, the
    ingredient is kept unchanged.

    row: list of ingredient strings for one recipe.
    """
    singular_forms = []
    for ingredient in row:
        # Look the word up once; the lookup result is reused for both the
        # truthiness check and the replacement value.
        base_form = p.singular_noun(ingredient)
        singular_forms.append(base_form if base_form else ingredient)
    return singular_forms

# Function removes 'salt' as an ingredient from the recipes
def remove_salt(row):
    """Remove every exact 'salt' entry from an ingredient list, in place.

    Only entries equal to 'salt' are dropped; names that merely contain the
    word (e.g. 'sea salt') are kept. The same list object is returned.

    row: list of ingredient strings for one recipe (mutated).
    """
    # Slice assignment rewrites the existing list object, so callers that
    # ignore the return value still see the cleaned list. Unlike list.remove,
    # this also drops repeated 'salt' entries.
    row[:] = [ingredient for ingredient in row if ingredient != 'salt']
    return row

# Function allows ingredients to be split into ngrams
def use_ngrams(ingredient_list):
    """Expand long ingredient names into word n-grams.

    An ingredient with more than ``n_grams`` words is replaced by the
    space-joined word sequences that nltk's ``ngrams`` yields for it; an
    ingredient with ``n_grams`` words or fewer is kept as is. The result is
    one flat list in the original ingredient order.
    """
    expanded = []
    for ingredient in ingredient_list:
        words = ingredient.split()
        if len(words) <= n_grams:
            expanded.append(ingredient)
            continue
        expanded.extend(' '.join(gram) for gram in ngrams(words, n_grams))
    return expanded

# Function handles ingredients from test data that do not appear in training data
def handle_unknown_ingredient(i, ingredients):
    """Shorten an ingredient name from the left until it is a known ingredient.

    While the name has more than one word: return it if it is in
    ``ingredients``, otherwise drop its first word and try again. Once a
    single word (or nothing) is left, that remainder is returned without
    checking it against ``ingredients``.

    i: ingredient name to resolve.
    ingredients: container of known ingredient names (only ``in`` is used).
    """
    candidate = i
    words = candidate.split()
    while len(words) > 1:
        if candidate in ingredients:
            return candidate
        words = words[1:]
        candidate = ' '.join(words)
    return candidate

In [15]:
data_labels = data['cuisine'].apply(encode_ascii)
data_features = data['ingredients'].apply(encode_ascii).apply(singular)
data_features.apply(remove_salt)

if not cross_validating:
    X_test = test_data['ingredients'].apply(encode_ascii).apply(singular)
    X_test.apply(remove_salt)
    print X_test[:5]

In [16]:
# Expand every ingredient list with use_ngrams (names longer than n_grams words become word n-grams).
data_features = data_features.apply(use_ngrams)

if not cross_validating:
    # The submission recipes get the same expansion as the training features.
    X_test = X_test.apply(use_ngrams)

In [17]:
# Choose the training set (and, when cross-validating, the hold-out test set)
if cross_validating:
    # Fixed random_state keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(data_features, data_labels, test_size=fraction_of_test_data, random_state=0)
else:
    # Submission mode: train on all labelled recipes. X_test was already built
    # from test.json, and there are no labels for it, so y_test stays undefined.
    X_train, y_train = data_features, data_labels

In [18]:
# Prior probability of each cuisine: its share of the training labels
cuisine_stats = Counter(y_train)
training_recipe_count = len(y_train)
cuisine_probability = {}
for cuisine_name, recipe_count in cuisine_stats.items():
    cuisine_probability[cuisine_name.encode('ascii', 'ignore')] = float(recipe_count) / training_recipe_count

In [19]:
# Overall ingredient probability: occurrences of an ingredient divided by the
# total number of ingredient occurrences across the training recipes
encoded_training_recipes = X_train.apply(encode_ascii)
all_ingredients = list(itertools.chain.from_iterable(encoded_training_recipes))
ingredient_stats = Counter(all_ingredients)
ingredients_count = len(all_ingredients)
unique_ingredients = list(set(all_ingredients))
ingredient_probability = dict(
    (name, float(occurrences) / ingredients_count)
    for name, occurrences in ingredient_stats.items()
)

In [20]:
# Probability of an ingredient given a cuisine: occurrences of the ingredient
# in that cuisine's training recipes divided by the number of training recipes
# labelled with that cuisine
unique_cuisines = data['cuisine'].unique()

# Per-cuisine ingredient occurrence counts over the training recipes
cuisine_ingredient_dict = {}
for cuisine in unique_cuisines:
    recipes_of_cuisine = X_train[y_train[y_train == cuisine].index].apply(encode_ascii)
    cuisine_ingredient_dict[cuisine] = Counter(list(itertools.chain.from_iterable(recipes_of_cuisine)))

probability_ingredient_given_cuisine = {}
for cuisine, ingredient_counts in cuisine_ingredient_dict.items():
    recipes_in_cuisine = cuisine_stats[cuisine]
    probability_ingredient_given_cuisine[cuisine] = dict(
        (ingredient, float(occurrences) / recipes_in_cuisine)
        for ingredient, occurrences in ingredient_counts.items()
    )

In [21]:
# Naive Bayes algorithm: Prob{cuisine|ingredients} = Prob{ingredients|cuisine}*Prob{cuisine}/Prob{ingredients}
# => Prob{cuisine|ingredients} = Prob{ingredient_1|cuisine}*Prob{ingredient_2|cuisine}....Prob{ingredient_N|cuisine}*Prob{cuisine}/(Prob{ingredient_1}*Prob{ingredient_2}.....Prob{ingredient_N})
def naive_bayes(data):
    """Predict one cuisine per recipe using the precomputed probability tables.

    For every candidate cuisine the score is

        prod(P(ingredient | cuisine)) * P(cuisine) / prod(P(ingredient))

    taken over the recipe's distinct ingredients. An ingredient that has no
    entry in the probability tables for a cuisine is left out of both
    products for that cuisine. Ingredients unseen in training are first
    shortened with handle_unknown_ingredient. A cuisine with no prior
    (absent from the training labels) scores 0.

    data: iterable of recipes, each an iterable of ingredient strings.
    Returns: list with the highest-scoring cuisine for each recipe, in input
    order.
    """
    # unique_ingredients is a list; a set gives constant-time membership tests.
    known_ingredients = set(unique_ingredients)
    output = []
    for recipe in data:
        # Resolving an unseen ingredient does not depend on the cuisine, so it
        # is done once per recipe instead of once per (cuisine, ingredient).
        resolved_ingredients = [
            i if i in known_ingredients else handle_unknown_ingredient(i, known_ingredients)
            for i in set(recipe)
        ]
        probability_cuisine_given_ingredients = {k: 0 for k in unique_cuisines}
        for cuisine in unique_cuisines:
            ingredient_given_cuisine = probability_ingredient_given_cuisine.get(cuisine, {})
            probability_ingredient_given_cuisine_total = 1
            probability_ingredient_total = 1
            for ingredient in resolved_ingredients:
                try:
                    likelihood = ingredient_given_cuisine[ingredient]
                    evidence = ingredient_probability[ingredient]
                except KeyError:
                    # No recorded probability for this ingredient: skip it
                    # rather than hiding every kind of error behind a bare except.
                    continue
                probability_ingredient_given_cuisine_total *= likelihood
                probability_ingredient_total *= evidence
            # NOTE(review): for very long recipes the denominator product could
            # underflow to 0.0 and make this division fail - confirm on real data.
            probability_cuisine_given_ingredients[cuisine] = (
                probability_ingredient_given_cuisine_total
                * cuisine_probability.get(cuisine, 0.0)
                / probability_ingredient_total
            )
        output.append(max(probability_cuisine_given_ingredients, key=probability_cuisine_given_ingredients.get))
    return output
# Predict a cuisine for every recipe in X_test (the hold-out split, or the test.json recipes when not cross-validating).
final_result = naive_bayes(X_test)

In [46]:
# Uncomment if cross-validating
correct = 0
incorrectly_classified = {k:[] for k in unique_cuisines}
for i in range(len(final_result)):
    if y_test.iloc[i] == final_result[i]:
        correct += 1
    else:
        incorrectly_classified[y_test.iloc[i]].append(final_result[i])
print 'Fraction of recipes classified correctly =', float(correct)/(len(final_result))

Fraction of recipes classified correctly = 0.73488372093


In [47]:
# print "CORRECT CUISINE - Counter({'incorrect cuisine': # of times classified as}) \n"
# for key, value in incorrectly_classified.items():
#     print key.upper(), '-', Counter(encode_ascii(value)), '\n'

In [23]:
# Kaggle submission file: one predicted cuisine per test recipe, indexed by recipe id
if not cross_validating:
    output_for_submission = pd.DataFrame(
        data=final_result,
        index=test_data['id'],
        columns=['cuisine'],
    )
    output_for_submission.to_csv('naive_bayes_4.csv')