# 2. Whats Cooking

## 2.1. Load training and test data

In [4]:
## Download nltk if you don't have it installed
# import nltk
# nltk.download()

In [5]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB

train_dishes = pd.read_json('./train.json')
test_dishes = pd.read_json('./test.json')

## 2.2 Clean the data

In [6]:
def wordTokenizer(text):
    return text.split(",")

def clean_data(ingredients):
    lmtzr = WordNetLemmatizer()
    def split_word(strOfWords):
        toRemove = ['oz','g','lb','n','s']
        tempWords  = []
        for word in strOfWords.split():
            word = "".join(x for x in word if x.isalpha())
            word = "".join(lmtzr.lemmatize(word))
            if word.lower() not in toRemove:
                 tempWords.append(word)
        return " ".join(tempWords)
    return [[split_word(x) for x in y] for y in ingredients]

print("Number of samples: ", train_dishes["cuisine"].shape[0])
print("Categories of Cuisines: ", train_dishes["cuisine"].unique().size)
train_dishes["total_ingredients_no_clean"] = train_dishes["ingredients"].map(",".join)
cv_no_clean = CountVectorizer(tokenizer=wordTokenizer)
X_no_clean = cv_no_clean.fit_transform(train_dishes["total_ingredients_no_clean"].values)

# Clean the data
train_dishes["ingredients"]= clean_data(train_dishes["ingredients"])
train_dishes["total_ingredients"] = train_dishes["ingredients"].map(",".join)
train_dishes["total_ingredients"] = train_dishes["total_ingredients"].str.strip()

Number of samples:  39774
Categories of Cuisines:  20


## 2.3 Generate X 

In [7]:
#Use count vectorizer to create X = n*d where n = number of samples, d=dishes. 
cv = CountVectorizer(stop_words="english",lowercase=True,tokenizer=wordTokenizer,token_pattern="[A-Za-z]*",strip_accents='ascii',vocabulary=None)
X = cv.fit_transform(train_dishes["total_ingredients"].values)
print("Unique ingredients after data cleaning: ", X.shape[1])

Unique ingredients after data cleaning:  6691


## 2.4 Naive Bayes Classifier using Gaussian and Bernoulli

In [8]:
cuisineEncoder = LabelEncoder()
y = cuisineEncoder.fit_transform(train_dishes.cuisine)
gau = cross_val_score(GaussianNB(), X.toarray(), y,cv=3)
ber = cross_val_score(BernoulliNB(), X, y,cv=3)
print("GaussianNB average accuracy : ",gau.mean())
print("BernoulliNB average accuracy : ",ber.mean())


GaussianNB average accuracy :  0.379720131093
BernoulliNB average accuracy :  0.683939638856


## 2.6 Logistic Regression

In [9]:
log = cross_val_score(LogisticRegression(), X, y,cv=3)
print("Logisitic regression average accuracy : ",log.mean())

Logisitic regression average accuracy :  0.775607791516


## 2.7 Logistic Regression On Test Data

In [10]:
# clean the test data
test_dishes["ingredients"]= clean_data(test_dishes["ingredients"])
test_dishes["total_ingredients"] = test_dishes["ingredients"].map(";".join)
test_dishes["total_ingredients"] = test_dishes["total_ingredients"].str.strip()

# Generate X_test using the vocabulary of unique ingredients in X using CountVectorizer
newVec = CountVectorizer(stop_words="english",lowercase=True,tokenizer=wordTokenizer,token_pattern="[A-Za-z]*",strip_accents='ascii',vocabulary=cv.vocabulary_)
X_test = newVec.fit_transform(test_dishes["total_ingredients"].values)

#Perform Logistis Regression
logReg = LogisticRegression()
logReg.fit(X,y)
y_test = logReg.predict(X_test)

#Change encoding back from numbers to labels using cuisineEncoder
final_y_cuisines = list(cuisineEncoder.inverse_transform(y_test))
solution = pd.DataFrame({'cuisine':np.array(final_y_cuisines)}, index=test_dishes.id)

#Create a csv file for submission to kaggle
solution.to_csv("./cooking_submission.csv", sep=',')
print("Done")

Done
