# Preprocessing
___

## Load Libraries

In [1]:
# !pip install -U spacy

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# NLP
import spacy

# Vectorizing
from skopt.space import Integer, Real, Categorical
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline

## Read in Data

In [3]:
# If your test does not have equal columns to the number of columns you used in your training for the model. Will the model perform the same or worse?
# Yes, your test and train columns need to be the same

In [4]:
train = pd.read_csv('../data/train.csv', index_col='Unnamed: 0')

In [5]:
test = pd.read_csv('../data/test.csv', index_col='Unnamed: 0')

## Create Class

In [6]:
class ProcessedFoods:
    def __init__(self, df, column):
        self.df = df
        self.column = column
        
    # Add column with character count
    def get_char_count(self, new_column='char_count'):
        self.df[new_column] = self.df[self.column].apply(lambda x: sum([len(item) for item in x]))
        return self.df[new_column]
    
    # Add column with ingredient count 
    def get_ingredient_count(self, new_column='ingredient_count'):
        self.df[new_column] = self.df[self.column].apply(lambda x: len(str(x).split(',')))
        return self.df[new_column]
    
    # Create a new column where values are list type
    def convert_to_list(self, new_column='ingredient_list'): 
        self.df[new_column] = [i[2:-2].split("', '") for i in self.df[self.column]]
        return self.df[new_column]
    
    # Make vocabulary list of unique values in each row's list
    def make_vocabulary(self, column):
        list_name = []
        
        # prefer to use df from convert_to_list
        for i in range(len(column)):
            for item in column[i]:
                if item not in list_name:
                    list_name.append(item)
                        
        # pickle for your pleasure
        with open('../data/word_list.pkl', 'wb') as pickle_in:
            pickle.dump(list_name, pickle_in)
        
        # set equal to vocabulary for vectorizer
        return list_name

Adapted from: [Isaac Aderogba](https://deepnote.com/@isaac-aderogba/Spacy-Food-Entities-2cc2d19c-c3ac-4321-8853-0bcf2ef565b3)

## Tidy Train dataset

In [7]:
train.head(3)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."


In [8]:
process_train = ProcessedFoods(df = train, column = 'ingredients')

### character count

In [9]:
train['char_count'] = process_train.get_char_count()

In [10]:
train.shape

(39774, 4)

### ingerdient count

In [11]:
train['ingredient_count'] = process_train.get_ingredient_count()

In [12]:
train.shape

(39774, 5)

### convert column to list

In [13]:
train['ingredient_lists'] = process_train.convert_to_list()

In [14]:
type(train['ingredient_lists'][0])

list

### make vocabulary

In [15]:
train_vocab = process_train.make_vocabulary(train['ingredient_lists'])

In [16]:
train_vocab[:5]

['romaine lettuce', 'black olives', 'grape tomatoes', 'garlic', 'pepper']

## Tidy Test dataset

In [17]:
test.head(3)

Unnamed: 0,id,ingredients
0,18009,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"['sausage links', 'fennel bulb', 'fronds', 'ol..."


In [18]:
process_test = ProcessedFoods(df = test, column = 'ingredients')

### character count

In [19]:
test['char_count'] = process_test.get_char_count()

In [20]:
test.shape

(9944, 3)

### ingeredient count

In [21]:
test['ingredient_count'] = process_test.get_ingredient_count()

In [22]:
test.shape

(9944, 4)

### convert column to list

In [23]:
test['ingredient_lists'] = process_test.convert_to_list()

In [24]:
type(test['ingredient_lists'][0])

list

### make vocabulary

In [25]:
test_vocab = process_test.make_vocabulary(test['ingredient_lists'])

In [26]:
test_vocab[:5]

['baking powder', 'eggs', 'all-purpose flour', 'raisins', 'milk']

In [27]:
index_cuisine = range(len(train['cuisine'].unique()))
index_cuisine = dict(zip(train['cuisine'].unique(), index_cuisine))

In [28]:
train['cuisine'] = train['cuisine'].replace(index_cuisine)

## Set X and y

In [29]:
train.head(3)

Unnamed: 0,id,cuisine,ingredients,char_count,ingredient_count,ingredient_list,ingredient_lists
0,10259,0,"['romaine lettuce', 'black olives', 'grape tom...",144,9,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,25693,1,"['plain flour', 'ground pepper', 'salt', 'toma...",155,11,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,2,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...",172,12,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g..."


In [30]:
test.head(3)

Unnamed: 0,id,ingredients,char_count,ingredient_count,ingredient_list,ingredient_lists
0,18009,"['baking powder', 'eggs', 'all-purpose flour',...",80,6,"[baking powder, eggs, all-purpose flour, raisi...","[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"['sugar', 'egg yolks', 'corn starch', 'cream o...",157,11,"[sugar, egg yolks, corn starch, cream of tarta...","[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"['sausage links', 'fennel bulb', 'fronds', 'ol...",82,6,"[sausage links, fennel bulb, fronds, olive oil...","[sausage links, fennel bulb, fronds, olive oil..."


In [58]:
X_train = train.drop(columns=['ingredient_lists', 'cuisine', ''])
X_test = test.drop(columns=['ingredient_lists'])

In [59]:
X_train.shape

(39774, 5)

In [60]:
X_test.shape

(9944, 5)

In [61]:
y_train = train['cuisine']
y_train.head()

0    0
1    1
2    2
3    3
4    3
Name: cuisine, dtype: int64

In [62]:
y_train.shape

(39774,)

## Vectorize

### Train

In [63]:
len(train_vocab)

6847

In [73]:
#creating a pipeline 
train_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('bnb', MultinomialNB())
])

#Transformer pipe params 
train_pipe_params = {
    'cvec__max_features': Integer(1, 4047)
}

#Instantiate GridSearchCV
bs = BayesSearchCV(
    estimator = train_pipe,
    search_spaces = train_pipe_params, 
    cv = 5,
    n_jobs = -1
)

In [74]:
bs.estimator.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'cvec', 'bnb', 'cvec__analyzer', 'cvec__binary', 'cvec__decode_error', 'cvec__dtype', 'cvec__encoding', 'cvec__input', 'cvec__lowercase', 'cvec__max_df', 'cvec__max_features', 'cvec__min_df', 'cvec__ngram_range', 'cvec__preprocessor', 'cvec__stop_words', 'cvec__strip_accents', 'cvec__token_pattern', 'cvec__tokenizer', 'cvec__vocabulary', 'bnb__alpha', 'bnb__class_prior', 'bnb__fit_prior'])

In [75]:
bs.fit(train[['ingredient_list']], y_train)

ValueError: Found input variables with inconsistent numbers of samples: [1, 31819]

In [38]:
train_matrix = train_cvec.fit_transform(X_train['ingredients'])



In [39]:
train_df = pd.DataFrame(train_matrix.todense(), columns=train_cvec.get_feature_names())



In [40]:
train_df

Unnamed: 0,romaine lettuce,black olives,grape tomatoes,garlic,pepper,purple onion,seasoning,garbanzo beans,feta cheese crumbles,plain flour,...,Oscar Mayer Cotto Salami,Challenge Butter,orange glaze,cholesterol free egg substitute,ciabatta loaf,Lipton® Iced Tea Brew Family Size Tea Bags,Hidden Valley® Greek Yogurt Original Ranch® Dip Mix,lop chong,tomato garlic pasta sauce,crushed cheese crackers
0,0,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39771,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Test

In [41]:
len(test_vocab)

4500

In [42]:
# pass vocabulary to vectorizer
test_cvec = CountVectorizer(vocabulary = test_vocab)

In [43]:
test_matrix = test_cvec.fit_transform(X_test['ingredients'])



In [44]:
test_df = pd.DataFrame(test_matrix.todense(), columns=test_cvec.get_feature_names())



In [45]:
test_df

Unnamed: 0,baking powder,eggs,all-purpose flour,raisins,milk,white sugar,sugar,egg yolks,corn starch,cream of tartar,...,fraise,beef heart,lambs liver,soft cheese,sliced mango,pork strips,shark fillets,hash brown,porter,butter crackers
0,0,1,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9939,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9942,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Concatenate Vectorized columns to train set

### train

In [46]:
train.shape

(39774, 7)

In [47]:
train_df.shape

(39774, 6847)

In [48]:
train_df_full = pd.concat([train[['id', 'cuisine', 'char_count', 'ingredient_count']], train_df], axis=1)

In [49]:
train_df_full

Unnamed: 0,id,cuisine,char_count,ingredient_count,romaine lettuce,black olives,grape tomatoes,garlic,pepper,purple onion,...,Oscar Mayer Cotto Salami,Challenge Butter,orange glaze,cholesterol free egg substitute,ciabatta loaf,Lipton® Iced Tea Brew Family Size Tea Bags,Hidden Valley® Greek Yogurt Original Ranch® Dip Mix,lop chong,tomato garlic pasta sauce,crushed cheese crackers
0,10259,0,144,9,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,25693,1,155,11,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
2,20130,2,172,12,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,22213,3,43,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13162,3,282,20,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39769,29109,16,194,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39770,11462,6,147,7,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
39771,2238,16,147,12,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39772,41882,8,360,21,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### test

In [50]:
test.shape

(9944, 6)

In [51]:
test_df.shape

(9944, 4500)

In [58]:
test_df_full = pd.concat([test[['id', 'char_count', 'ingredient_count']], test_df], axis=1)

In [59]:
test_df_full

Unnamed: 0,id,char_count,ingredient_count,baking powder,eggs,all-purpose flour,raisins,milk,white sugar,sugar,...,fraise,beef heart,lambs liver,soft cheese,sliced mango,pork strips,shark fillets,hash brown,porter,butter crackers
0,18009,80,6,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,28583,157,11,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,41580,82,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,29752,336,21,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,35687,136,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9939,30246,160,9,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9940,36028,64,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9941,22339,137,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9942,42525,248,15,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Export datasets

In [54]:
with open('../data/train_vec.pkl', 'wb') as pickle_out:
    pickle.dump(train_df_full, pickle_out)

In [60]:
with open('../data/test_vec.pkl', 'wb') as pickle_out:
    pickle.dump(test_df_full, pickle_out)