## 1. Setup

In [1]:
#pre-processing
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

#feature engg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import feature_extraction, model_selection, pipeline

#### Import Cuisine Data

In [2]:
cuisine = pd.read_json('./data/train.json')
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [3]:
cuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


#### Import Recipe Data

In [4]:
all_recipes = pd.read_json('./data/recipes_raw_nosource_allrecipes.json', orient='index')
epicurious = pd.read_json('./data/recipes_raw_nosource_epicurious.json', orient='index')
food_network = pd.read_json('./data/recipes_raw_nosource_foodnetwork.json', orient='index')
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

In [5]:
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['index', 'picture_link'])
display(recipes)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


## 2. Pre-Processing

### A. Cleaning and Tokenization

#### Add stopwords

In [6]:
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")
addl_stop_words = ['advertisement', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

#### String cleaning function

In [7]:
def clean_string(list, lemmatize = True, stemming = False):
    str = ' '.join(list) #converting the list to string
    clean_text = ''
    
    lower = str.lower().split() #lowercase and tokenize
    
    clean_words = []
    for word in lower:
        if len(word) > 2:
            digit = re.sub(r'\d+','', word) #removing digits
            text = re.sub(r'[^\w\s]', '', digit) #removing punc and characters
            
            
            if lemmatize:
                lm = WordNetLemmatizer()  #lemmatize
                lemm = lm.lemmatize(text)
                clean_words.append(lemm)
                
                if stemming:
                    stemmer = PorterStemmer #stemming
                    stemm = stemmer.stem(text)
                    clean_words.append(stemm)
         
    rem_stop = [i for i in clean_words if i not in stopword_list]  #remove stopwords
    
    clean_text = ' '.join(rem_stop) #join as a string
    space = re.sub(' +', ' ', clean_text) #remove multi-spaces
    
    return space    

#### Clean Ingredients for Recipes dataset

In [8]:
recipes['clean_ingredients_r'] = recipes['ingredients'].apply(lambda x: clean_string(x))

In [9]:
display(recipes)

Unnamed: 0,title,ingredients,instructions,clean_ingredients_r
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",skinless boneless chicken breast half butter c...
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",condensed cream mushroom soup package dry oni...
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,packed brown sugar ketchup lean ground beef m...
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,butter softened white sugar packed brown sugar...
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,whole wheat rotini pasta fresh broccoli floret...
...,...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...,ear fresh corn head belgian endive olive oil f...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...,large plum tomato sugar zucchini shallot slice...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...,olive oil unsalted butter medium clove garlic ...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...,butter bittersweet chocolate whole egg egg yol...


#### Before and After Pre-processing

In [10]:
print(recipes['ingredients'][200])

print('\n')

print(recipes['clean_ingredients_r'][200])

['1 (32 ounce) package frozen hash brown potatoes ADVERTISEMENT', '8 ounces cooked, diced ham ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of potato soup ADVERTISEMENT', '1 (16 ounce) container sour cream ADVERTISEMENT', '2 cups shredded sharp Cheddar cheese ADVERTISEMENT', '1 1/2 cups grated Parmesan cheese ADVERTISEMENT', 'ADVERTISEMENT']


 package frozen hash brown potato cooked diced ham condensed cream potato soup container sour cream shredded sharp cheddar cheese grated parmesan cheese


### B. Removing NA

In [11]:
#dataframe info

recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124647 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124595 non-null  object
 1   ingredients          124647 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124647 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB


In [12]:
#removing NA

recipes.dropna(inplace=True)

In [13]:
#after removing NA

recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124473 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124473 non-null  object
 1   ingredients          124473 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124473 non-null  object
dtypes: object(4)
memory usage: 4.7+ MB


#### Clean Ingredients for Cuisine dataset

In [14]:
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [15]:
cuisine['clean_ingredients_c'] = cuisine['ingredients'].apply(lambda x: clean_string(x))

In [16]:
print(cuisine['ingredients'][200])

print('\n')

print(cuisine['clean_ingredients_c'][200])

['turnip greens', 'vegetable oil', 'fresh lemon juice', 'black peppercorns', 'butternut squash', 'apples', 'kosher salt', 'buttermilk', 'country ham', 'shallots', 'extra-virgin olive oil']


turnip green vegetable oil fresh lemon juice black peppercorn butternut squash apple kosher buttermilk country ham shallot extravirgin olive oil


## 2. Feature Engg and Classification of Cuisines 

#### Instantiate Vectorizer

In [17]:
count_uni = CountVectorizer(lowercase = True, stop_words = 'english', ngram_range = (1,1))
count_bi = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))
count_binary = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2), binary = True)
tfidf_uni = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,1))
tfidf_bi = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))

In [18]:
vect_dict = {'count_uni': count_uni,
            'count_bi': count_bi,
            'count_binary': count_binary,
            'tfidf_uni': tfidf_uni,
            'tfidf_bi': tfidf_bi}

#### Subsetting dataset for testing

In [19]:
cuisine_subset = cuisine.head(n = 20)
display(cuisine_subset)

Unnamed: 0,id,cuisine,ingredients,clean_ingredients_c
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olive grape tomato garli...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground tomato ground black thyme e...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",egg mayonaise cooking oil green chilies grille...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black shallot cornflour cayenne onion garlic p...
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",plain flour sugar butter egg fresh ginger root...
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli...",olive oil medium shrimp garlic chopped cilantr...
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo...",sugar pistachio nut white almond bark flour va...
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por...",olive oil purple onion fresh pineapple pork po...
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-...",chopped tomato fresh basil garlic extravirgin ...


In [20]:
cuisine.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

#### Classification Methods

### The madness starts here

In [21]:
# Logistic Regression, SVM, Random Forest, Naive Bayes,  Neural Networks, KNN
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import feature_extraction, model_selection, pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import svm

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [22]:
nb = MultinomialNB()
logreg = LogisticRegression()

rf = RandomForestClassifier()
#knn = KneighborsClassifier()

### SVC Classifier

In [23]:
svc = SVC()

In [24]:
class SparseToDense(TransformerMixin):
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X, y = None, **fit_params):
        return X.toarray()

In [25]:
pca = PCA()
ncomps = [5, 10, 20, 50, 75, 100]

In [26]:
X = cuisine['clean_ingredients_c']
y = cuisine['cuisine']

##X_count = tfidf_uni.fit_transform(X)

In [27]:
cuisine_subset_map={'0':'brazilian', '1':'british', '2':'cajun_creole', '3':'chinese', '4':'filipino', '5':'french', '6':'greek', '7':'indian', '8':'irish', '9':'italian', '10':'jamaican', '11':'japanese', '12':'korean', '13':'mexican', '14':'moroccan', '15':'russian', '16':'southern_us', '17':'spanish', '18':'thai', '19':'vietnamese'}

In [None]:
#Cross Validation

inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 5, shuffle = True, random_state = 1)

#Stating the parameters

kernel = ['rbf', 'linear']
C = [1, 10, 100, 1000]

# set up parameter grid
params = {'classify__kernel': kernel, 'classify__C': C}

for method in vect_dict:
    pipe = Pipeline([
        ('vectorize', vect_dict[method]),
        ('densify', SparseToDense()),
        ('scale', StandardScaler()),
        ('dim_red', pca),
        ('classify', svc)
    ])
    
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

#This is where we are running into error. 

scores = cross_validate(grid_SVC,
                       X = X,
                       y = y,
                       cv = outer_cv, 
                       scoring = {'accuracy' : make_scorer(accuracy_score),
       'precision' : make_scorer(precision_score, average = 'macro'),
       'recall' : make_scorer(recall_score, average = 'macro'), 
       'f1_score' : make_scorer(f1_score, average = 'macro')},
                       return_estimator = True)

In [173]:
print(list(scores.values())[-1])
print(list(scores.values())[-2])
print(list(scores.values())[-3])
print(list(scores.values())[-4])


[0.1        0.22222222 0.1        0.1        0.1       ]
[0.25       0.33333333 0.2        0.25       0.25      ]
[0.0625     0.16666667 0.06666667 0.0625     0.0625    ]
[0.25 0.5  0.25 0.25 0.25]


#### Function for AUC (To revisit)

In [None]:
 def custom_auc(ground_truth, predictions):
  
     fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1)    
     return auc(fpr, tpr)

# to be standart sklearn's scorer        
 my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)

### Random forest classifier

In [181]:
logreg = LogisticRegression()

In [182]:
class SparseToDense(TransformerMixin):
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X, y = None, **fit_params):
        return X.toarray()

In [183]:
pca = PCA()
ncomps = [5, 10, 20, 50, 75, 100]

In [184]:
X = cuisine_subset['clean_ingredients_c']
y = cuisine_subset['cuisine']

##X_count = tfidf_uni.fit_transform(X)

In [185]:
cuisine_subset_map={'0':'brazilian', '1':'british', '2':'cajun_creole', '3':'chinese', '4':'filipino', '5':'french', '6':'greek', '7':'indian', '8':'irish', '9':'italian', '10':'jamaican', '11':'japanese', '12':'korean', '13':'mexican', '14':'moroccan', '15':'russian', '16':'southern_us', '17':'spanish', '18':'thai', '19':'vietnamese'}

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [193]:
#Cross Validation

inner_cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 5, shuffle = True, random_state = 1)

#Stating the parameters

penalty = ['none', 'l1', 'l2', 'elasticnet']
C = [100, 10, 1.0, 0.1, 0.01]
solvers = ['liblinear']

# set up parameter grid
params = {'C': C, 'penalty': penalty, 'solver': solvers}



In [194]:
for method in vect_dict:
    pipe = Pipeline([
        ('vectorize', vect_dict[method]),
        ('densify', SparseToDense()),
        ('scale', StandardScaler()),
        #('dim_red', pca),
        ('classify', logreg)
    ])
    
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv)

#This is where we are running into error. 

scores = cross_validate(grid_SVC,
                       X = X,
                       y = y,
                       cv = outer_cv, 
                       scoring = {'accuracy' : make_scorer(accuracy_score),
       'precision' : make_scorer(precision_score, average = 'macro'),
       'recall' : make_scorer(recall_score, average = 'macro'), 
       'f1_score' : make_scorer(f1_score, average = 'macro')},
                       return_estimator = True)

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jsoba\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jsoba\anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 891, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\jsoba\anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1392, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Users\jsoba\anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 838, in e

In [192]:
print(list(scores.values())[-1])
print(list(scores.values())[-2])
print(list(scores.values())[-3])
print(list(scores.values())[-4])

[nan nan nan nan nan]
[nan nan nan nan nan]
[nan nan nan nan nan]
[nan nan nan nan nan]


####  Link from Kaggle: https://www.kaggle.com/code/rahulsridhar2811/cuisine-classification-with-accuracy-78-88/notebook