## 1. Setup

In [86]:
#pre-processing
import numpy as np
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

#feature engg
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import feature_extraction, model_selection, pipeline

#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

#### Import Cuisine Data

In [2]:
cuisine = pd.read_json('./data/train.json')
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [3]:
cuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


#### Import Recipe Data

In [4]:
all_recipes = pd.read_json('./data/recipes_raw_nosource_allrecipes.json', orient='index')
epicurious = pd.read_json('./data/recipes_raw_nosource_epicurious.json', orient='index')
food_network = pd.read_json('./data/recipes_raw_nosource_foodnetwork.json', orient='index')
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

In [5]:
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['index', 'picture_link'])
display(recipes)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


## 2. Pre-Processing

### A. Cleaning and Tokenization

#### Add stopwords

In [6]:
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")
addl_stop_words = ['advertisement', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

#### String cleaning function

In [7]:
def clean_string(list, lemmatize = True, stemming = False):
    str = ' '.join(list) #converting the list to string
    clean_text = ''
    
    lower = str.lower().split() #lowercase and tokenize
    
    clean_words = []
    for word in lower:
        if len(word) > 2:
            digit = re.sub(r'\d+','', word) #removing digits
            text = re.sub(r'[^\w\s]', '', digit) #removing punc and characters
            
            
            if lemmatize:
                lm = WordNetLemmatizer()  #lemmatize
                lemm = lm.lemmatize(text)
                clean_words.append(lemm)
                
                if stemming:
                    stemmer = PorterStemmer #stemming
                    stemm = stemmer.stem(text)
                    clean_words.append(stemm)
         
    rem_stop = [i for i in clean_words if i not in stopword_list]  #remove stopwords
    
    clean_text = ' '.join(rem_stop) #join as a string
    space = re.sub(' +', ' ', clean_text) #remove multi-spaces
    
    return space    

#### Clean Ingredients for Recipes dataset

In [8]:
recipes['clean_ingredients_r'] = recipes['ingredients'].apply(lambda x: clean_string(x))

In [9]:
display(recipes)

Unnamed: 0,title,ingredients,instructions,clean_ingredients_r
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",skinless boneless chicken breast half butter c...
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",condensed cream mushroom soup package dry oni...
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,packed brown sugar ketchup lean ground beef m...
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,butter softened white sugar packed brown sugar...
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,whole wheat rotini pasta fresh broccoli floret...
...,...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...,ear fresh corn head belgian endive olive oil f...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...,large plum tomato sugar zucchini shallot slice...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...,olive oil unsalted butter medium clove garlic ...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...,butter bittersweet chocolate whole egg egg yol...


#### Before and After Pre-processing

In [10]:
print(recipes['ingredients'][200])

print('\n')

print(recipes['clean_ingredients_r'][200])

['1 (32 ounce) package frozen hash brown potatoes ADVERTISEMENT', '8 ounces cooked, diced ham ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of potato soup ADVERTISEMENT', '1 (16 ounce) container sour cream ADVERTISEMENT', '2 cups shredded sharp Cheddar cheese ADVERTISEMENT', '1 1/2 cups grated Parmesan cheese ADVERTISEMENT', 'ADVERTISEMENT']


 package frozen hash brown potato cooked diced ham condensed cream potato soup container sour cream shredded sharp cheddar cheese grated parmesan cheese


### B. Removing NA

In [11]:
#dataframe info

recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124647 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124595 non-null  object
 1   ingredients          124647 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124647 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB


In [12]:
#removing NA

recipes.dropna(inplace=True)

In [13]:
#after removing NA

recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124473 entries, 0 to 124646
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   title                124473 non-null  object
 1   ingredients          124473 non-null  object
 2   instructions         124473 non-null  object
 3   clean_ingredients_r  124473 non-null  object
dtypes: object(4)
memory usage: 4.7+ MB


#### Clean Ingredients for Cuisine dataset

In [14]:
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [15]:
cuisine['clean_ingredients_c'] = cuisine['ingredients'].apply(lambda x: clean_string(x))

In [16]:
print(cuisine['ingredients'][200])

print('\n')

print(cuisine['clean_ingredients_c'][200])

['turnip greens', 'vegetable oil', 'fresh lemon juice', 'black peppercorns', 'butternut squash', 'apples', 'kosher salt', 'buttermilk', 'country ham', 'shallots', 'extra-virgin olive oil']


turnip green vegetable oil fresh lemon juice black peppercorn butternut squash apple kosher buttermilk country ham shallot extravirgin olive oil


## 2. Feature Engg and Classification of Cuisines 

#### Instantiate Vectorizer

In [17]:
count_uni = CountVectorizer(lowercase = True, stop_words = 'english', ngram_range = (1,1))
count_bi = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))
count_binary = CountVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2), binary = True)
tfidf_uni = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,1))
tfidf_bi = TfidfVectorizer(lowercase = True,stop_words = 'english', ngram_range = (1,2))

In [19]:
vect_dict = {'count_uni': count_uni,
            'count_bi': count_bi,
            'count_binary': count_binary,
            'tfidf_uni': tfidf_uni,
            'tfidf_bi': tfidf_bi}

#### Subsetting dataset for testing

In [20]:
cuisine_subset = cuisine.head(n = 1000)
display(cuisine_subset)

Unnamed: 0,id,cuisine,ingredients,clean_ingredients_c
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olive grape tomato garli...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground tomato ground black thyme e...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",egg mayonaise cooking oil green chilies grille...
3,22213,indian,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",black shallot cornflour cayenne onion garlic p...
...,...,...,...,...
995,5964,italian,"[tomato sauce, extra-virgin olive oil, grated ...",tomato sauce extravirgin olive oil grated parm...
996,30297,korean,"[water, medium-grain rice]",water mediumgrain rice
997,3037,mexican,"[eggs, grating cheese, jalapeno chilies, fresh...",egg grating cheese jalapeno chilies freshly gr...
998,19712,cajun_creole,"[andouille sausage, butter, garlic cloves, dri...",andouille sausage butter garlic clove dried or...


In [21]:
cuisine.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

#### Classification Methods

In [61]:
# Logistic Regression, SVM, Random Forest, Naive Bayes,  Neural Networks, KNN
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import feature_extraction, model_selection, pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import NMF, PCA, TruncatedSVD, FastICA
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

### SVC Classifier with Nested CV

In [24]:
svc = SVC()

In [25]:
class SparseToDense(TransformerMixin):
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def transform(self, X, y = None, **fit_params):
        return X.toarray()

In [26]:
pca = PCA()
ncomps = [5, 10, 20, 50, 75, 100]

In [27]:
X = cuisine['clean_ingredients_c']
y = cuisine['cuisine']

##X_count = tfidf_uni.fit_transform(X)

In [28]:
X

0        romaine lettuce black olive grape tomato garli...
1        plain flour ground tomato ground black thyme e...
2        egg mayonaise cooking oil green chilies grille...
3                                water vegetable oil wheat
4        black shallot cornflour cayenne onion garlic p...
                               ...                        
39769    light brown sugar granulated sugar butter warm...
39770    kraft zesty italian dressing purple onion broc...
39771    egg citrus fruit raisin sourdough starter flou...
39772    boneless chicken skinless thigh minced garlic ...
39773    green chile jalapeno chilies onion ground blac...
Name: clean_ingredients_c, Length: 39774, dtype: object

In [29]:
cuisine_subset_map={'0':'brazilian', '1':'british', '2':'cajun_creole', '3':'chinese', '4':'filipino', '5':'french', '6':'greek', '7':'indian', '8':'irish', '9':'italian', '10':'jamaican', '11':'japanese', '12':'korean', '13':'mexican', '14':'moroccan', '15':'russian', '16':'southern_us', '17':'spanish', '18':'thai', '19':'vietnamese'}

In [None]:
#Cross Validation
#outer loop should be smaller
#Fraction of the whole dataset (1/2 of dataset) -- startified sampling to select same amt of data from each category
#shrink the dataset to half
#print intermediate results
#reduce hyper params tuning
#reduce dataset, verboase = 10; estimate how long it takes 
#BERT for feature extraction
#https://www.tensorflow.org/text/tutorials/classify_text_with_bert


inner_cv = KFold(n_splits = 2, shuffle = True, random_state = 1)
outer_cv = KFold(n_splits = 3, shuffle = True, random_state = 1) #decreasing this number might help

#Stating the parameters

kernel = ['rbf', 'radial', 'poly'] # 
C = [1, 10, 100, 1000]

# set up parameter grid
params = {'classify__kernel': kernel, 'classify__C': C}



In [None]:
 for method in vect_dict:
        pipe = Pipeline([
        ('vectorize',  vect_dict[method]),
        ('densify', SparseToDense()),
        ('scale', StandardScaler()),
        ('dim_red', pca),
        ('classify', svc)])
        
        grid_SVC = GridSearchCV(pipe, params, cv = inner_cv, verbose = 10) #verbose = 10
        
        scores = cross_validate(grid_SVC,
                       X = X,
                       y = y,
                       cv = outer_cv, 
                       scoring = {'accuracy' : make_scorer(accuracy_score),
       'precision' : make_scorer(precision_score, average = 'macro'),
       'recall' : make_scorer(recall_score, average = 'macro'), 
       'f1_score' : make_scorer(f1_score, average = 'macro')},
                       return_estimator = True)
    

In [None]:
 pipe = Pipeline([('vectorize', tfidf_bi),
        ('densify', SparseToDense()),
        ('scale', StandardScaler()),
        ('dim_red', pca),
        ('classify', svc)])
        
grid_SVC = GridSearchCV(pipe, params, cv = inner_cv, verbose = 10) #verbose = 10
        
scores = cross_validate(grid_SVC,
                       X = X,
                       y = y,
                       cv = outer_cv, 
                       scoring = {'accuracy' : make_scorer(accuracy_score),
       'precision' : make_scorer(precision_score, average = 'macro'),
       'recall' : make_scorer(recall_score, average = 'macro'), 
       'f1_score' : make_scorer(f1_score, average = 'macro')},
                       return_estimator = True)



Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2; 1/12] START classify__C=1, classify__kernel=rbf........................


In [34]:
print(list(scores.values())[-1])
print(list(scores.values())[-2])
print(list(scores.values())[-3])
print(list(scores.values())[-4])


[0.23884531 0.23975532 0.2664474 ]
[0.23328955 0.23244513 0.25525637]
[0.39990993 0.36994563 0.42246204]
[0.50299401 0.52852853 0.56756757]


#### Function for AUC (To revisit)

In [None]:
 def custom_auc(ground_truth, predictions):
  
     fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1)    
     return auc(fpr, tpr)

# to be standart sklearn's scorer        
 my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)

####  Link from Kaggle: https://www.kaggle.com/code/rahulsridhar2811/cuisine-classification-with-accuracy-78-88/notebook
#### Additional:

* https://htmlpreview.github.io/?https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/blob/master/AWS-lambda-implementation/model_implementation/recipe_binary_classification/recipe%20binary%20classification.html#Naive-Bayes
* https://htmlpreview.github.io/?https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/blob/master/AWS-lambda-implementation/model_implementation/recipe_multiclass_classification/recipe%20multiclass%20classification.html#Logistic-Regression
* https://github.com/kulsoom-abdullah/kulsoom-abdullah.github.io/tree/master/AWS-lambda-implementation/model_implementation/recipe_multiclass_classification

### Cuisine subset

In [87]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix,roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_validate,StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [73]:
cuisine_subset = cuisine.sample(n = 17000)
#display(cuisine_subset)

In [66]:
X_s = cuisine_subset['clean_ingredients_c']
y_s = cuisine_subset['cuisine']

### Cuisine full dataset

In [88]:
X = cuisine['clean_ingredients_c']
y = cuisine['cuisine']

### Define Vectorizer and Test_train

In [89]:
#TFIDF Vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

#Using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = 123)

### Logistic Regression - Tuning

In [90]:
lr = LogisticRegression(max_iter = 300, random_state = 123, multi_class = 'multinomial')

In [92]:
#Assigning values to params 
penalty = ['l2', 'l1', 'elasticnet']
C = [0.001, 0.01, 0.1, 10]
solvers = ['newton-cg', 'lbfgs']

#Fitting into pipeline
lr_pipe = Pipeline([('vect', vectorizer), 
                    ('lr', lr)])

#Create hyperparameter dict
hyperparameters = dict(lr__C = C, lr__penalty = penalty, lr__solver = solvers)

#Grid search
clf = GridSearchCV(lr_pipe, hyperparameters, cv = 5)
best_model = clf.fit(X, y)

#Printing best params
print('Best Penalty:', best_model.best_estimator_.get_params()['lr__penalty'])
print('Best C:', best_model.best_estimator_.get_params()['lr__C'])
print('Best Solver:', best_model.best_estimator_.get_params()['lr__solver'])

Best Penalty: l2
Best C: 10
Best Solver: newton-cg


###  Logistic Regression  - Model

In [110]:
#Fitting vectorizer
matrix_train_lr = vectorizer.fit_transform(X_train)
matrix_test_lr = vectorizer.transform(X_test)

#Fitting final model with Best hyper params
lr_final = LogisticRegression(max_iter = 300, random_state = 123, multi_class ='multinomial', solver = 'newton-cg', C = 10, penalty = 'l2')
lr_final.fit(matrix_train_lr, y_train)
y_pred = lr_final.predict(matrix_test_lr)
pred_prob = lr_final.predict_proba(matrix_test_lr)

#Classification metrics
print('f1 score weighted %s' % f1_score(y_test, y_pred, average = 'weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred))
#print('AUC score %s' % roc_auc_score(y_test, pred_prob))
#print('ROC score %s' % roc_curve(y_test, y_pred))
print(classification_report(y_test, y_pred))

#cm_lr_test=confusion_matrix(y_test, y_pred)
#print(cm_lr_test)

acc_lr = accuracy_score(y_test, y_pred)
f1_lr = f1_score(y_test, y_pred, average = 'weighted')

f1 score weighted 0.7835361766450769
Accuracy score 0.7874094931617055
              precision    recall  f1-score   support

   brazilian       0.80      0.57      0.67       117
     british       0.62      0.45      0.52       201
cajun_creole       0.78      0.69      0.73       386
     chinese       0.81      0.87      0.84       668
    filipino       0.79      0.56      0.65       189
      french       0.58      0.67      0.62       662
       greek       0.77      0.69      0.73       294
      indian       0.87      0.91      0.89       751
       irish       0.69      0.49      0.58       167
     italian       0.81      0.89      0.84      1960
    jamaican       0.92      0.72      0.81       131
    japanese       0.85      0.69      0.76       356
      korean       0.86      0.75      0.80       207
     mexican       0.91      0.91      0.91      1610
    moroccan       0.81      0.77      0.79       205
     russian       0.65      0.44      0.53       122
 southern_



### Multinomial Naive Bayes - Tuning

In [100]:
nb = MultinomialNB()

In [101]:
#Assigning values to params
alpha = [0.01, 0.1, 0.5, 1.0, 10.0]

#Fitting into pipeline
nb_pipe = Pipeline([('vect', vectorizer), 
                    ('nb', nb)])

#Create hyperparameter dict
hyperparameters = dict(nb__alpha = alpha)

#Grid search
clf = GridSearchCV(nb_pipe, hyperparameters, cv = 5)
best_model = clf.fit(X, y)

#Printing best params
print('Best Alpha:', best_model.best_estimator_.get_params()['nb__alpha'])


Best Alpha: 0.01


### Multinomial Naive Bayes - Model

In [113]:
#Fitting vectorizer
matrix_train = vectorizer.fit_transform(X_train)
matrix_test = vectorizer.transform(X_test)

nb_final = MultinomialNB(alpha = 0.01)
nb_final.fit(matrix_train, y_train)
y_pred = nb_final.predict(matrix_test)

#Classification metrics
print('f1 score weighted %s' % f1_score(y_test,y_pred, average ='weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred))
#print('AUC score %s' % roc_auc_score(y_test, y_pred))
#print('ROC score %s' % roc_curve(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc_mnb = accuracy_score(y_test, y_pred)
f1_mnb = f1_score(y_test, y_pred, average = 'weighted')

f1 score weighted 0.7366113370242328
Accuracy score 0.7417538213998391
              precision    recall  f1-score   support

   brazilian       0.71      0.47      0.57       117
     british       0.53      0.41      0.46       201
cajun_creole       0.66      0.69      0.68       386
     chinese       0.76      0.88      0.81       668
    filipino       0.80      0.49      0.61       189
      french       0.52      0.60      0.56       662
       greek       0.74      0.56      0.64       294
      indian       0.84      0.88      0.86       751
       irish       0.71      0.44      0.54       167
     italian       0.78      0.85      0.81      1960
    jamaican       0.85      0.58      0.69       131
    japanese       0.85      0.64      0.73       356
      korean       0.88      0.65      0.75       207
     mexican       0.88      0.88      0.88      1610
    moroccan       0.76      0.70      0.73       205
     russian       0.66      0.34      0.45       122
 southern_

### Random Forest - Tuning

In [31]:
rf = RandomForestClassifier(random_state = 123)

In [193]:
#Assigning values to params 
n_estimators = [10, 100, 500]
max_features = ['sqrt', 'log2']
min_samples_split = [2, 10, 100]
min_samples_leaf = [5, 10] 


#Fitting into pipeline
rf_pipe = Pipeline([('vect', vectorizer), 
                    ('rf', rf)])

#Create hyperparameter dict
hyperparameters = dict(rf__n_estimators = estimators, rf__max_features = max_features, rf__min_samples_split = min_samples_split, rf__min_samples_leaf = min_samples_leaf)

#Grid search
clf = GridSearchCV(rf_pipe, hyperparameters, cv = 5)
best_model = clf.fit(X, y)

#Printing best params
print('Best Estimator:', best_model.best_estimator_.get_params()['rf__n_estimators'])
print('Best features:', best_model.best_estimator_.get_params()['rf__max_features'])
print('Best samples split:', best_model.best_estimator_.get_params()['rf__min_samples_split'])
print('Best samples leaf:', best_model.best_estimator_.get_params()['rf__min_samples_leaf'])
              

Best Estimator: 500
Best features: sqrt
Best samples split: 2
Best samples leaf: 5


### Random Forest - Model

In [115]:
#Fitting vectorizer
matrix_train_rf = vectorizer.fit_transform(X_train)
matrix_test_rf = vectorizer.transform(X_test)

#Fitting final model with Best hyper params
rf_final = RandomForestClassifier(random_state = 123, n_estimators = 500, max_features = 'sqrt', min_samples_split = 2, min_samples_leaf = 5)
rf_final.fit(matrix_train_rf, y_train)
y_pred = rf_final.predict(matrix_test_rf)

#Classification metrics
print('f1 score weighted %s' % f1_score(y_test,y_pred, average ='weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred))
#print('AUC score %s' % roc_auc_score(y_test, y_pred))
#print('ROC score %s' % roc_curve(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc_rf = accuracy_score(y_test, y_pred)
f1_rf = f1_score(y_test, y_pred, average = 'weighted')

f1 score weighted 0.5700773806452174
Accuracy score 0.6296259050683829
              precision    recall  f1-score   support

   brazilian       0.84      0.26      0.40       117
     british       0.00      0.00      0.00       201
cajun_creole       0.87      0.48      0.62       386
     chinese       0.61      0.90      0.73       668
    filipino       1.00      0.06      0.12       189
      french       0.57      0.21      0.30       662
       greek       0.93      0.22      0.35       294
      indian       0.77      0.87      0.82       751
       irish       0.80      0.02      0.05       167
     italian       0.56      0.92      0.70      1960
    jamaican       1.00      0.10      0.18       131
    japanese       0.95      0.46      0.62       356
      korean       0.98      0.20      0.34       207
     mexican       0.74      0.91      0.82      1610
    moroccan       1.00      0.18      0.31       205
     russian       0.00      0.00      0.00       122
 southern_

### SVC - Tuning


In [42]:
svc = SVC(random_state = 123)

In [44]:
#Assigning values to params 
kernel = ['rbf', 'poly', 'sigmoid']  
C = [0.001, 0.01, 0.1, 10]

#Fitting into pipeline
svc_pipe = Pipeline([('vect', vectorizer),
                      #('densify', SparseToDense()),
                      #('scale', StandardScaler()),
                      ('svc', svc)])

#Create hyperparameter dict
hyperparameters = dict(svc__kernel = kernel, svc__C = C)

#Grid search
clf = GridSearchCV(svc_pipe, hyperparameters, cv = 5)
best_model = clf.fit(X_s, y_s)

#Printing best params
print('Best Kernel:', best_model.best_estimator_.get_params()['svc__kernel'])
print('Best C:', best_model.best_estimator_.get_params()['svc__C'])

Best Kernel: sigmoid
Best C: 10


### SVC - Model

In [119]:
#Fitting vectorizer
matrix_train = vectorizer.fit_transform(X_train)
matrix_test = vectorizer.transform(X_test)

#Fitting final model with Best hyper params
svc_final = SVC(random_state = 123, kernel = 'sigmoid', C = 10)
svc_final.fit(matrix_train, y_train)
y_pred = svc_final.predict(matrix_test)

#Classification metrics
print('f1 score weighted %s' % f1_score(y_test,y_pred, average='weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred))
#print('AUC score %s' % roc_auc_score(y_test, y_pred))
#print('ROC score %s' % roc_curve(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc_svm = accuracy_score(y_test, y_pred)
f1_svm = f1_score(y_test, y_pred, average = 'weighted')

f1 score weighted 0.7517830331203548
Accuracy score 0.7522123893805309
              precision    recall  f1-score   support

   brazilian       0.68      0.65      0.66       117
     british       0.49      0.53      0.51       201
cajun_creole       0.71      0.69      0.70       386
     chinese       0.78      0.84      0.81       668
    filipino       0.69      0.60      0.64       189
      french       0.50      0.64      0.56       662
       greek       0.68      0.69      0.69       294
      indian       0.85      0.88      0.87       751
       irish       0.52      0.47      0.50       167
     italian       0.80      0.81      0.81      1960
    jamaican       0.85      0.76      0.80       131
    japanese       0.76      0.68      0.72       356
      korean       0.84      0.74      0.79       207
     mexican       0.89      0.89      0.89      1610
    moroccan       0.77      0.77      0.77       205
     russian       0.56      0.40      0.47       122
 southern_

### XGB - Tuning

In [106]:
import xgboost as xgb

In [84]:
xgb = xgb.XGBClassifier(objective = 'multi:softmax', eval_metric = 'mlogloss')

In [104]:
max_depth = [1, 5, 20, 50]
gamma = [1, 10]
reg_alpha = [1, 40, 180]
reg_lambda = [0, 1]
colsample_bytree = [0.5, 1]
min_child_weight = [0, 1, 10]
eta = [0.5, 1]


xgb_pipe = Pipeline([('vect', vectorizer),
                      ('xgb', xgb)])

#Create hyperparameter dict
hyperparameters = dict(xgb__max_depth = max_depth, xgb__gamma = gamma, xgb__reg_alpha = reg_alpha, 
                       xgb__reg_lambda = reg_lambda, xgb__colsample_bytree = colsample_bytree, xgb__min_child_weight = min_child_weight, xgb__eta = eta)

#Grid search
clf = GridSearchCV(xgb_pipe, hyperparameters, cv = 5)
best_model = clf.fit(X_s, y_s)

#Printing best params
print('Best Depth:', best_model.best_estimator_.get_params()['xgb__max_depth'])
print('Best Gamma:', best_model.best_estimator_.get_params()['xgb__gamma'])
print('Best Alpha:', best_model.best_estimator_.get_params()['xgb__reg_alpha'])
print('Best Lambda:', best_model.best_estimator_.get_params()['xgb__reg_lambda'])
print('Best Colsample:', best_model.best_estimator_.get_params()['xgb__colsample_bytree'])
print('Best Child weight:', best_model.best_estimator_.get_params()['xgb__min_child_weight'])
print('Best ETA:', best_model.best_estimator_.get_params()['xgb__eta'])

Best Depth: 1
Best Gamma: 1
Best Alpha: 1
Best Lambda: 0
Best Colsample: 0.5
Best Child weight: 1
Best ETA: 0.5


### XGB - Model

In [120]:
#Fitting vectorizer
matrix_train = vectorizer.fit_transform(X_train)
matrix_test = vectorizer.transform(X_test)

#Fitting final model with Best hyper params
xgb_final = xgb.XGBClassifier(objective = 'multi:softmax', max_depth = 1, gamma = 1, reg_alpha = 1, 
                     reg_lambda = 0, colsample_bytree = 0.5, min_child_weight = 1, eta = 0.5, eval_metric = 'mlogloss')

xgb_final.fit(matrix_train, y_train)
y_pred = xgb_final.predict(matrix_test)

#Classification metrics
print('f1 score weighted %s' % f1_score(y_test,y_pred, average='weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred))
#print('AUC score %s' % roc_auc_score(y_test, y_pred))
#print('ROC score %s' % roc_curve(y_test, y_pred))
print(classification_report(y_test, y_pred))

acc_xgb = accuracy_score(y_test, y_pred)
f1_xgb = f1_score(y_test, y_pred, average = 'weighted')

f1 score weighted 0.7420894779545216
Accuracy score 0.7489943684633951
              precision    recall  f1-score   support

   brazilian       0.74      0.57      0.64       117
     british       0.54      0.23      0.33       201
cajun_creole       0.81      0.65      0.72       386
     chinese       0.79      0.84      0.81       668
    filipino       0.69      0.48      0.56       189
      french       0.53      0.56      0.54       662
       greek       0.79      0.71      0.75       294
      indian       0.87      0.86      0.86       751
       irish       0.67      0.44      0.53       167
     italian       0.71      0.88      0.78      1960
    jamaican       0.89      0.69      0.78       131
    japanese       0.87      0.66      0.75       356
      korean       0.80      0.67      0.73       207
     mexican       0.89      0.90      0.89      1610
    moroccan       0.80      0.71      0.75       205
     russian       0.57      0.35      0.44       122
 southern_

### Over and Under Sampling

In [129]:
from imblearn import pipeline as pl
from imblearn.metrics import classification_report_imbalanced
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [130]:
lr_samp = LogisticRegression(max_iter = 300, random_state = 123, multi_class = 'multinomial',  solver = 'newton-cg', C = 10, penalty = 'l2')

In [131]:
random_os = RandomOverSampler(random_state=123)
random_us = RandomUnderSampler(random_state=123)

### Oversampling LR

In [134]:
lr_pipe_os = pl.make_pipeline(vectorizer,
                           random_os,
                           lr_samp)


# Train the classifier with balancing
lr_pipe_os.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = lr_pipe_os.predict(X_test)

print("results on oversampling:\n")

print('f1 score weighted %s' % f1_score(y_test, y_pred_bal, average = 'weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred_bal))
print(classification_report_imbalanced(y_test, y_pred_bal))

acc_os_lr = accuracy_score(y_test, y_pred_bal)
f1_os_lr = f1_score(y_test, y_pred_bal, average = 'weighted')

results on oversampling:

f1 score weighted 0.7850797931436236
Accuracy score 0.7838897827835881
                    pre       rec       spe        f1       geo       iba       sup

   brazilian       0.71      0.68      1.00      0.69      0.82      0.65       117
     british       0.53      0.57      0.99      0.55      0.75      0.54       201
cajun_creole       0.74      0.74      0.99      0.74      0.86      0.72       386
     chinese       0.82      0.84      0.99      0.83      0.91      0.82       668
    filipino       0.71      0.65      0.99      0.68      0.80      0.62       189
      french       0.58      0.68      0.96      0.62      0.81      0.63       662
       greek       0.71      0.77      0.99      0.74      0.87      0.74       294
      indian       0.89      0.90      0.99      0.89      0.95      0.89       751
       irish       0.57      0.57      0.99      0.57      0.75      0.54       167
     italian       0.85      0.82      0.96      0.84      0.8

### Undersampling LR

In [135]:
lr_pipe_us = pl.make_pipeline(vectorizer,
                           random_us,
                           lr_samp)



# Train the classifier with balancing
lr_pipe_us.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = lr_pipe_us.predict(X_test)

print("results on Undersampling:\n")

print('f1 score weighted %s' % f1_score(y_test, y_pred_bal, average = 'weighted'))
print('Accuracy score %s' % accuracy_score(y_test, y_pred_bal))
print(classification_report_imbalanced(y_test, y_pred_bal))

acc_us_lr = accuracy_score(y_test, y_pry_pred_bal)
f1_us_lr = f1_score(y_test, y_pred_bal, average = 'weighted')

results on Undersampling:

f1 score weighted 0.7208509660202405
Accuracy score 0.705852775543041
                    pre       rec       spe        f1       geo       iba       sup

   brazilian       0.42      0.76      0.99      0.54      0.87      0.73       117
     british       0.31      0.62      0.97      0.42      0.77      0.58       201
cajun_creole       0.63      0.76      0.98      0.69      0.86      0.73       386
     chinese       0.85      0.77      0.99      0.81      0.87      0.75       668
    filipino       0.49      0.68      0.99      0.57      0.82      0.65       189
      french       0.50      0.53      0.96      0.52      0.72      0.49       662
       greek       0.60      0.79      0.98      0.68      0.88      0.76       294
      indian       0.91      0.83      0.99      0.87      0.91      0.81       751
       irish       0.35      0.63      0.98      0.45      0.79      0.59       167
     italian       0.89      0.67      0.98      0.76      0.8

NameError: name 'y_pry_pred_bal' is not defined

### Dataframe with Scores

In [132]:
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machines', 'XG Boost Model', 'Logistic Regression - Oversampled', 
              'Logistic Regression - Undersampled', 'Naive Bayes', 'Random Forest'],
    'Accuracy Score': [acc_lr, acc_svm, acc_xgb, acc_os_lr, acc_us_lr, acc_mnb, acc_rf],
    'F1 Score': [f1_lr, f1_svm, f1_xgb, f1_os_lr, f1_us_lr, f1_mnb, f1_rf]})
models.sort_values(by = 'Accuracy Score', ascending = False)

Unnamed: 0,Model,Accuracy Score,F1 Score
0,Logistic Regression,0.787409,0.783536
1,Support Vector Machines,0.752212,0.751783
2,XG Boost Model,0.748994,0.742089
3,Logistic Regression - Oversampled,0.748994,0.742089
4,Logistic Regression - Undersampled,0.748994,0.742089
5,Naive Bayes,0.741754,0.736611
6,Random Forest,0.629626,0.570077
