## 1. Setup

In [363]:
#pre-processing
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

#feature engg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import feature_extraction, model_selection, pipeline

#### Import Cuisine Data

In [65]:
# Load the labelled cuisine training set and preview it.
cuisine = pd.read_json(path_or_buf='./data/train.json')
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [13]:
# Summarise the cuisine frame: column dtypes and non-null counts.
cuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


#### Import Recipe Data

In [62]:
# Read the three scraped recipe dumps (each keyed by recipe, hence orient='index')
# and stack them row-wise into a single frame.
all_recipes, epicurious, food_network = [
    pd.read_json(path, orient='index')
    for path in (
        './data/recipes_raw_nosource_allrecipes.json',
        './data/recipes_raw_nosource_epicurious.json',
        './data/recipes_raw_nosource_foodnetwork.json',
    )
]
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

In [63]:
# Replace the index inherited from the concatenated source frames with a fresh
# 0..n-1 range and discard the unused image links.
# `drop=True` avoids materialising an 'index' column only to delete it again, and
# `errors='ignore'` keeps the cell re-runnable once 'picture_link' is already gone
# (the original raised KeyError on a second run).
recipes = recipes.reset_index(drop=True).drop(columns=['picture_link'], errors='ignore')
display(recipes)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


## 2. Pre-Processing

### A. Cleaning and Tokenization

#### Add stopwords

In [301]:
# Start from NLTK's English stop words and add recipe-specific noise terms:
# the 'ADVERTISEMENT' marker that trails ingredient strings in the recipe data,
# units of measure, and the near-universal seasonings salt and pepper.
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")
# 'advertisments' was a misspelling of 'advertisements'; the correct plural is
# added and the old entry is kept so nothing previously filtered slips through.
addl_stop_words = ['advertisement', 'advertisements', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

#### String cleaning function

In [398]:
def clean_string(list, lemmatize = True, stemming = False):
    """Normalise a list of ingredient strings into one cleaned string.

    The strings are joined, lower-cased and split on whitespace. Tokens of two
    characters or fewer are discarded; digits and punctuation are stripped from
    the rest. Each surviving token is optionally lemmatized, filtered against
    the module-level ``stopword_list`` and then optionally stemmed.

    Parameters
    ----------
    list : iterable of str
        Raw ingredient strings. The name shadows the built-in ``list``; it is
        kept unchanged so existing keyword callers keep working.
    lemmatize : bool, default True
        Reduce each token with ``WordNetLemmatizer`` (default settings).
    stemming : bool, default False
        Reduce each token with ``PorterStemmer``. Applied after stop-word
        removal so that stop words are matched in their unstemmed form.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces, without leading or trailing
        whitespace; an empty string when no token survives.
    """
    # Build the normalisers once per call instead of once per token.
    # PorterStemmer must be instantiated: calling `.stem` on the class itself
    # (as the previous version did) raises a TypeError.
    lemmatizer = WordNetLemmatizer() if lemmatize else None
    stemmer = PorterStemmer() if stemming else None
    stop_words = set(stopword_list)  # constant-time membership checks

    clean_words = []
    for word in ' '.join(list).lower().split():
        # Length is checked on the raw token, before any stripping.
        if len(word) <= 2:
            continue

        token = re.sub(r'\d+', '', word)        # remove digits
        token = re.sub(r'[^\w\s]', '', token)   # remove punctuation / symbols

        # Tokens such as "1/2" or "(32" are empty after stripping; skipping them
        # prevents stray leading/double spaces in the result.
        if not token:
            continue

        # Lemmatizing and stemming are independent switches: turning
        # lemmatization off no longer discards every token.
        if lemmatizer is not None:
            token = lemmatizer.lemmatize(token)

        if token in stop_words:
            continue

        if stemmer is not None:
            token = stemmer.stem(token)

        clean_words.append(token)

    return ' '.join(clean_words)

#### Clean Ingredients

In [402]:
# Build the cleaned ingredient text for every recipe (default clean_string options).
recipes['clean_ingredients'] = recipes['ingredients'].apply(clean_string)

In [403]:
# Preview the frame, now including the `clean_ingredients` column.
display(recipes)

Unnamed: 0,title,ingredients,instructions,clean_ingredients
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",skinless boneless chicken breast half butter c...
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",condensed cream mushroom soup package dry oni...
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,packed brown sugar ketchup lean ground beef m...
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,butter softened white sugar packed brown sugar...
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,whole wheat rotini pasta fresh broccoli floret...
...,...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...,ear fresh corn head belgian endive olive oil f...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...,large plum tomato sugar zucchini shallot slice...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...,olive oil unsalted butter medium clove garlic ...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...,butter bittersweet chocolate whole egg egg yol...


#### Before and After Pre-processing

In [411]:
# Show the recipe labelled 200 before and after cleaning, separated by blank lines.
print(recipes['ingredients'][200], '\n', recipes['clean_ingredients'][200], sep='\n')

['1 (32 ounce) package frozen hash brown potatoes ADVERTISEMENT', '8 ounces cooked, diced ham ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of potato soup ADVERTISEMENT', '1 (16 ounce) container sour cream ADVERTISEMENT', '2 cups shredded sharp Cheddar cheese ADVERTISEMENT', '1 1/2 cups grated Parmesan cheese ADVERTISEMENT', 'ADVERTISEMENT']


 package frozen hash brown potato cooked diced ham condensed cream potato soup container sour cream shredded sharp cheddar cheese grated parmesan cheese


### B. Removing NA

In [406]:
# Inspect dtypes and non-null counts to find columns with missing values.

recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124647 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   title              124595 non-null  object
 1   ingredients        124647 non-null  object
 2   instructions       124473 non-null  object
 3   clean_ingredients  124647 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB


In [408]:
# Remove every row that has a missing value in any column (pandas' default
# `how='any'`). The frame is modified in place, so existing index labels are kept.

recipes.dropna(inplace=True)

In [409]:
# Re-check non-null counts after dropping the rows with missing values.

recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124473 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   title              124473 non-null  object
 1   ingredients        124473 non-null  object
 2   instructions       124473 non-null  object
 3   clean_ingredients  124473 non-null  object
dtypes: object(4)
memory usage: 4.7+ MB


## 3. Feature Engineering and Classification of Cuisines

#### Instantiate Vectorizer

In [395]:
# Five text-vectorizer variants; all lower-case the text and drop scikit-learn's
# built-in English stop words:
#   count_uni    - term counts, unigrams
#   count_bi     - term counts, unigrams + bigrams
#   count_binary - presence/absence flags, unigrams + bigrams
#   tfidf_uni    - TF-IDF weights, unigrams
#   tfidf_bi     - TF-IDF weights, unigrams + bigrams
count_uni, count_bi = (
    CountVectorizer(ngram_range=(1, max_n), stop_words='english', lowercase=True)
    for max_n in (1, 2)
)
count_binary = CountVectorizer(ngram_range=(1, 2), stop_words='english', lowercase=True, binary=True)
tfidf_uni, tfidf_bi = (
    TfidfVectorizer(ngram_range=(1, max_n), stop_words='english', lowercase=True)
    for max_n in (1, 2)
)

In [396]:
# Registry of the vectorizers keyed by a short name, so later cells can loop over them.
vect_dict = dict(
    count_uni=count_uni,
    count_bi=count_bi,
    count_binary=count_binary,
    tfidf_uni=tfidf_uni,
    tfidf_bi=tfidf_bi,
)

#### Classification Methods

In [397]:
# Logistic Regression, SVM, Random Forest, Naive Bayes,  Neural Networks, KNN
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import feature_extraction, model_selection, pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [413]:
# Candidate classifiers, using scikit-learn defaults unless noted.
nb = MultinomialNB()
logreg = LogisticRegression()
svc = SVC()
# Random forests are stochastic; pin the seed (same value the CV splitters use)
# so results are reproducible from run to run.
rf = RandomForestClassifier(random_state = 1)
# The previously disabled line misspelled the class as `KneighborsClassifier`.
knn = KNeighborsClassifier()

In [424]:
# Each `ingredients` entry is a Python list of strings, but the text vectorizers
# need one string per document (they lower-case each document). Run the lists
# through the same `clean_string` normalisation used for the recipe data so the
# training text is prepared the same way as `recipes['clean_ingredients']`.
X = cuisine['ingredients'].apply(clean_string)
y = cuisine['cuisine']

In [416]:
#Cross Validation

# Two seeded, shuffled K-fold splitters; presumably the inner one drives
# hyper-parameter search and the outer one scores the tuned model (nested CV).
# `KFold` is reached through the already-imported `model_selection` module
# because the bare name is never imported, which raised NameError on a fresh kernel.
inner_cv = model_selection.KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 1)

In [417]:
#Stating the parameters

# Candidate values for the SVC grid: kernel type and the C parameter.
kernel = [
    'rbf',
    'linear',
    'poly',
    'sigmoid',
]
C = [1e-3, 1e-2, 1e-1, 1, 10]

In [422]:
# set up parameter grid
# Grid keys use the `<step>__<parameter>` form to address the 'classify' step.
params = {'classify__kernel': kernel, 'classify__C': C}

for method in vect_dict:
    # `Pipeline` and `StandardScaler` are never imported by name in this notebook,
    # so they are reached through the `pipeline` / `preprocessing` modules that
    # the setup cell imports from sklearn.
    pipe = pipeline.Pipeline([
        ('vectorize', vect_dict[method]),
        # The vectorizers emit sparse matrices, which StandardScaler refuses to
        # centre; `with_mean=False` scales by variance only and keeps sparsity.
        ('scale', preprocessing.StandardScaler(with_mean=False)),
        ('classify', svc)
    ])