## 1. Setup

In [2]:
#pre-processing
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

#feature engg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import feature_extraction, model_selection, pipeline

#### Import Cuisine Data

In [3]:
cuisine = pd.read_json('./data/train.json')
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [4]:
cuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


#### Import Recipe Data

In [5]:
all_recipes = pd.read_json('./data/recipes_raw_nosource_allrecipes.json', orient='index')
epicurious = pd.read_json('./data/recipes_raw_nosource_epicurious.json', orient='index')
food_network = pd.read_json('./data/recipes_raw_nosource_foodnetwork.json', orient='index')
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

In [6]:
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['index', 'picture_link'])
display(recipes)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


## 2. Pre-Processing

### A. Cleaning and Tokenization

#### Add stopwords

In [7]:
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")
addl_stop_words = ['advertisement', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

#### String cleaning function

In [10]:
def clean_string(list, lemmatize = True, stemming = False):
    str = ' '.join(list) #converting the list to string
    clean_text = ''
    
    lower = str.lower().split() #lowercase and tokenize
    
    clean_words = []
    for word in lower:
        if len(word) > 2:
            digit = re.sub(r'\d+','', word) #removing digits
            text = re.sub(r'[^\w\s]', '', digit) #removing punc and characters
            
            
            if lemmatize:
                lm = WordNetLemmatizer()  #lemmatize
                lemm = lm.lemmatize(text)
                clean_words.append(lemm)
                
                if stemming:
                    stemmer = PorterStemmer #stemming
                    stemm = stemmer.stem(text)
                    clean_words.append(stemm)
         
    rem_stop = [i for i in clean_words if i not in stopword_list]  #remove stopwords
    
    clean_text = ' '.join(rem_stop) #join as a string
    space = re.sub(' +', ' ', clean_text) #remove multi-spaces
    
    return space    

#### Clean Ingredients for Recipes dataset

In [11]:
recipes['clean_ingredients_r'] = recipes['ingredients'].apply(lambda x: clean_string(x))

In [12]:
display(recipes)

Unnamed: 0,title,ingredients,instructions,clean_ingredients_r
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",skinless boneless chicken breast half butter c...
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",condensed cream mushroom soup package dry oni...
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,packed brown sugar ketchup lean ground beef m...
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,butter softened white sugar packed brown sugar...
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,whole wheat rotini pasta fresh broccoli floret...
...,...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...,ear fresh corn head belgian endive olive oil f...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...,large plum tomato sugar zucchini shallot slice...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...,olive oil unsalted butter medium clove garlic ...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...,butter bittersweet chocolate whole egg egg yol...


#### Before and After Pre-processing

In [14]:
print(recipes['ingredients'][200])

print('\n')

print(recipes['clean_ingredients_r'][200])

['1 (32 ounce) package frozen hash brown potatoes ADVERTISEMENT', '8 ounces cooked, diced ham ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of potato soup ADVERTISEMENT', '1 (16 ounce) container sour cream ADVERTISEMENT', '2 cups shredded sharp Cheddar cheese ADVERTISEMENT', '1 1/2 cups grated Parmesan cheese ADVERTISEMENT', 'ADVERTISEMENT']


 package frozen hash brown potato cooked diced ham condensed cream potato soup container sour cream shredded sharp cheddar cheese grated parmesan cheese


### B. Removing NA

In [15]:
#dataframe info

recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124647 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124595 non-null  object
 1   ingredients          124647 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124647 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB


In [16]:
#removing NA

recipes.dropna(inplace=True)

In [17]:
#after removing NA

recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124473 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124473 non-null  object
 1   ingredients          124473 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124473 non-null  object
dtypes: object(4)
memory usage: 4.7+ MB


#### Clean Ingredients for Cuisine dataset

In [18]:
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [21]:
cuisine['clean_ingredients_c'] = cuisine['ingredients'].apply(lambda x: clean_string(x))

In [22]:
print(cuisine['ingredients'][200])

print('\n')

print(cuisine['clean_ingredients_c'][200])

['turnip greens', 'vegetable oil', 'fresh lemon juice', 'black peppercorns', 'butternut squash', 'apples', 'kosher salt', 'buttermilk', 'country ham', 'shallots', 'extra-virgin olive oil']


turnip green vegetable oil fresh lemon juice black peppercorn butternut squash apple kosher buttermilk country ham shallot extravirgin olive oil


## 2. Feature Engg and Classification of Cuisines 

#### Instantiate Vectorizer

In [23]:
count_uni = CountVectorizer(lowercase = True, stop_words = 'english', ngram_range = (1,1))
count_bi = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))
count_binary = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2), binary = True)
tfidf_uni = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,1))
tfidf_bi = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))

In [24]:
vect_dict = {'count_uni': count_uni,
            'count_bi': count_bi,
            'count_binary': count_binary,
            'tfidf_uni': tfidf_uni,
            'tfidf_bi': tfidf_bi}

#### Subsetting dataset for testing

In [59]:
cuisine_subset = cuisine.head(n = 20)
display(cuisine_subset)

Unnamed: 0,id,cuisine,ingredients,clean_ingredients_c
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olive grape tomato garli...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground tomato ground black thyme e...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",egg mayonaise cooking oil green chilies grille...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black shallot cornflour cayenne onion garlic p...
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",plain flour sugar butter egg fresh ginger root...
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli...",olive oil medium shrimp garlic chopped cilantr...
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo...",sugar pistachio nut white almond bark flour va...
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por...",olive oil purple onion fresh pineapple pork po...
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-...",chopped tomato fresh basil garlic extravirgin ...


In [49]:
cuisine.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

#### Classification Methods

### The madness starts here

In [134]:
# Logistic Regression, SVM, Random Forest, Naive Bayes,  Neural Networks, KNN
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import feature_extraction, model_selection, pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [109]:
from sklearn import svm

In [60]:
nb = MultinomialNB()
logreg = LogisticRegression()
svc = SVC(multi_class = 'ovr')
rf = RandomForestClassifier()
#knn = KneighborsClassifier()

In [136]:
#we used this
svc = SVC()

In [110]:
#not working

linear = svm.SVC(kernel='linear', decision_function_shape='ovo')
rbf = svm.SVC(kernel='rbf', decision_function_shape='ovo')
poly = svm.SVC(kernel='poly',  decision_function_shape='ovo')
sig = svm.SVC(kernel='sigmoid', decision_function_shape='ovo')

In [137]:
class SparseToDense(TransformerMixin):
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X, y = None, **fit_params):
        return X.toarray()

In [138]:
pca = PCA()
ncomps = [5, 10, 20, 50, 75, 100]

In [139]:
X = cuisine_subset['clean_ingredients_c']
y = cuisine_subset['cuisine']

##X_count = tfidf_uni.fit_transform(X)

In [140]:
#Cross Validation

inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 5, shuffle = True, random_state = 1)

In [141]:
#Stating the parameters

kernel = [rbf, linear]
C = [1, 10, 100, 1000]

# set up parameter grid
params = {'classify__kernel': kernel, 'classify__C': C, 'dim_red__n_components': ncomps}

for method in vect_dict:
    pipe = Pipeline([
        ('vectorize', vect_dict[method]),
        ('densify', SparseToDense()),
        ('scale', StandardScaler()),
        ('dim_red', pca),
        ('classify', svc)
    ])
    
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

In [143]:
#This is where we are running into error. 

#scores = cross_validate(grid_SVC,
                       X = X,
                       y = y,
                       cv = outer_cv,
                       scoring = ['roc_auc', 'accuracy', 'f1', 'precision', 'recall'],
                       return_estimator = True, error_score = 'raise')

#auc = scores['test_roc_auc']
#accuracy = scores['test_accuracy']
#f1 = scores['test_f1']
#precision = scores['test_precision']
#recall = scores['test_recall']
#estimators = scores['estimator']

IndentationError: unexpected indent (Temp/ipykernel_12832/3061204600.py, line 4)

In [144]:
print(accuracy)
print(accuracy.mean())

print(precision)
print(precision.mean())

print(recall)
print(recall.mean())

print(f1)
print(f1.mean())

[nan nan nan nan nan]
nan
[nan nan nan nan nan]
nan
[nan nan nan nan nan]
nan
[nan nan nan nan nan]
nan


## Below: Code from Kaggle: https://www.kaggle.com/code/rahulsridhar2811/cuisine-classification-with-accuracy-78-88/notebook

In [126]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(cuisine_subset['cuisine'])
cuisine_subset['cuisine']=le.transform(cuisine_subset['cuisine']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cuisine_subset['cuisine']=le.transform(cuisine_subset['cuisine'])


In [127]:
cuisine_subset_map={'0':'brazilian', '1':'british', '2':'cajun_creole', '3':'chinese', '4':'filipino', '5':'french', '6':'greek', '7':'indian', '8':'irish', '9':'italian', '10':'jamaican', '11':'japanese', '12':'korean', '13':'mexican', '14':'moroccan', '15':'russian', '16':'southern_us', '17':'spanish', '18':'thai', '19':'vietnamese'}

In [128]:
X = cuisine_subset['clean_ingredients_c']
y = cuisine_subset['cuisine']

##X_count = tfidf_uni.fit_transform(X)

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_tf = vectorizer.fit_transform(cuisine_subset['clean_ingredients_c'])

print(X_tf)
#print(vectorizer.get_feature_names())

  (0, 41)	0.3115386541764817
  (0, 21)	0.22636168665179335
  (0, 53)	0.3115386541764817
  (0, 9)	0.2738472456045715
  (0, 64)	0.3115386541764817
  (0, 127)	0.2738472456045715
  (0, 100)	0.18267087709523874
  (0, 117)	0.2738472456045715
  (0, 65)	0.15306791138792583
  (0, 150)	0.20941335706395003
  (0, 68)	0.3115386541764817
  (0, 99)	0.17172194849203987
  (0, 11)	0.18267087709523874
  (0, 84)	0.2738472456045715
  (0, 121)	0.3115386541764817
  (1, 98)	0.1308507475691997
  (1, 156)	0.22670974091786258
  (1, 92)	0.22670974091786258
  (1, 90)	0.31201767674976016
  (1, 37)	0.22670974091786258
  (1, 163)	0.24748471450935447
  (1, 70)	0.22670974091786258
  (1, 49)	0.22670974091786258
  (1, 149)	0.31201767674976016
  (1, 72)	0.3659035045378975
  :	:
  (18, 123)	0.31912136386134354
  (19, 120)	0.23552212944726256
  (19, 34)	0.23552212944726256
  (19, 14)	0.23552212944726256
  (19, 136)	0.23552212944726256
  (19, 141)	0.23552212944726256
  (19, 10)	0.23552212944726256
  (19, 16)	0.20702755681649

In [133]:
from sklearn.svm import SVC

parameter_candidates = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1)

clf.fit(X_tf, y)   
print('Best score for data1:', clf.best_score_) 
print('Best C:',clf.best_estimator_.C) 
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)



Best score for data1: 0.3
Best C: 1
Best Kernel: linear
Best Gamma: scale
