In [None]:
import re
import cv2
import time
import warnings
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import keras
import lightgbm as lgb

from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input as xception_preprocessor
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input as inception_v3_preprocessor
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_resnet_v2 import preprocess_input as inception_resnet_v2_preprocessor
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, Callback
from keras.engine import InputSpec, Layer
from keras.layers import Dense, Input, Flatten, Dropout, GlobalAveragePooling2D, Conv1D
from keras.layers.normalization import BatchNormalization
from keras.layers import LSTM, Embedding, Activation, GRU, CuDNNGRU, CuDNNLSTM, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten, Masking
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from os import makedirs
from os.path import expanduser, exists, join
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.utils import shuffle
from gensim.models import Word2Vec
from tqdm import tqdm
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Let pandas print up to 400 characters per column value before truncating.
pd.set_option('max_colwidth',400)

# Register tqdm with pandas so that `progress_apply` is available (used when tokenising recipes).
tqdm.pandas()

# EDA & FE

In [None]:
# Load the train and test recipe tables from the whats-cooking input directory (JSON files).
train_set = pd.read_json('../input/whats-cooking/train.json', orient='columns')
test_set = pd.read_json('../input/whats-cooking/test.json', orient='columns')

In [None]:
# Tally how often each individual ingredient occurs across all training recipes.
ingredient_tally = Counter()
for recipe_ingredients in train_set.ingredients:
    ingredient_tally.update(recipe_ingredients)

# One row per ingredient: its name and the number of times it was seen.
ingredients_individual = (
    pd.DataFrame.from_dict(ingredient_tally, orient='index')
    .reset_index()
    .rename(columns={'index': 'Ingredient', 0: 'Count'})
)

In [None]:
# Number of distinct cuisine labels in the training set.
print(len(train_set.cuisine.unique()))
# Number of distinct ingredient strings counted in the training set.
print(len(ingredients_individual.Ingredient.unique()))

In [None]:
# Integer class ids, one per distinct cuisine label in the training data.
# Derived from the data instead of a hard-coded 0..19 list so the number of classes
# cannot drift out of sync with the labels that are encoded from train_set.cuisine.
num_class = train_set.cuisine.nunique()
cat = list(range(num_class))

## One-Hot Encoding
Encode the class labels as one-hot vectors so that the neural network does not infer a spurious ordering between categories. The cuisines have no ordinal relationship, and allowing the representation to lean on such a relationship might be damaging to learning.

In [None]:
# Map every cuisine name to an integer class id.
lb = LabelEncoder()
y_cat = lb.fit_transform(train_set.cuisine)

# Expand the integer ids into dense one-hot rows. The encoder is fitted on the full id
# list in `cat`, passed as a single-column matrix.
enc = OneHotEncoder(sparse=False)
enc.fit(np.array(cat)[:, np.newaxis])
y_ohe = enc.transform(y_cat[:, np.newaxis])

## Prepare Text

### Text Cleaning

In [16]:
def preprocessing(titles_array):
    """Turn each recipe's ingredient list into one space-separated token string.

    Every ingredient is stripped and lower-cased, accented letters are folded to their
    base letter, everything that is not an ASCII letter or a space is removed, words of
    length 1 are dropped, and the remaining words are joined with '_' so that a
    multi-word ingredient becomes a single token. Ingredients with nothing left after
    cleaning are skipped. The tokens of one recipe are joined with single spaces.

    Parameters
    ----------
    titles_array : iterable of iterables of str
        One inner iterable of raw ingredient names per recipe.

    Returns
    -------
    list of str
        One processed string per recipe, in input order.
    """
    import unicodedata  # local import so the cell does not depend on an extra top-level import

    non_letters = re.compile('[^a-zA-Z ]')
    processed_array = []

    for title in tqdm(titles_array):
        ingredients = []

        for ingredient in title:
            # Canonically decompose accented letters ("è" -> "e" + combining accent) so the
            # filter below strips only the accent instead of deleting the whole letter.
            ingredient = unicodedata.normalize('NFD', ingredient.strip().lower())
            # Keep only ASCII letters and spaces.
            ingredient = non_letters.sub('', ingredient)

            # split() already collapses any run of whitespace; words of length 1 are dropped.
            token = '_'.join(word for word in ingredient.split() if len(word) > 1)

            # An ingredient that cleaned to nothing would only add a stray space to the output.
            if token:
                ingredients.append(token)

        processed_array.append(' '.join(ingredients))

    return processed_array

# Store the cleaned, space-separated ingredient string of every recipe in both splits.
train_set['processed'] = preprocessing(train_set['ingredients'])
test_set['processed'] = preprocessing(test_set['ingredients'])
# Print the processed string of the training row labelled 0 as a quick sanity check.
print(train_set['processed'][0])

100%|██████████| 39774/39774 [00:01<00:00, 26513.23it/s]
100%|██████████| 9944/9944 [00:00<00:00, 26423.96it/s]


0                                                                                                             romaine_lettuce black_olives grape_tomatoes garlic pepper purple_onion seasoning garbanzo_beans feta_cheese_crumbles
1                                                                                                        plain_flour ground_pepper salt tomatoes ground_black_pepper thyme eggs green_tomatoes yellow_corn_meal milk vegetable_oil
2                                                                                          eggs pepper salt mayonaise cooking_oil green_chilies grilled_chicken_breasts garlic_powder yellow_onion soy_sauce butter chicken_livers
3                                                                                                                                                                                                   water vegetable_oil wheat salt
4    black_pepper shallots cornflour cayenne_pepper onions garlic_paste milk butter salt lem

### Custom Word Vector with Word2Vec

In [None]:
# Pool the processed recipes of both splits so the vectors are trained on train and test text.
sentences = pd.concat([train_set['processed'], test_set['processed']],axis=0)
# One list of ingredient tokens per recipe (progress bar provided by tqdm.pandas()).
train_sentences = list(sentences.progress_apply(str.split).values)

# Train 300-dimensional vectors on the tokenised recipes; min_count=1 keeps every token.
# NOTE(review): no seed is passed and workers=4, so results presumably vary between runs — confirm
# if reproducibility matters.
model = Word2Vec(sentences=train_sentences, 
                 sg=1, 
                 min_count=1,
                 size=300,  
                 workers=4)

# Persist the vectors as text. Despite the file name, these are the Word2Vec vectors trained above.
model.wv.save_word2vec_format('custom_glove_300d.txt')

In [None]:
# Longest recipe (by ingredient count) in each split, followed by the embedding vocabulary size.
max_train_ingredients = train_set.ingredients.map(len).max()
max_test_ingredients = test_set.ingredients.map(len).max()
print('Max number of ingredient in train is {0:.0f}.'.format(max_train_ingredients))
print('Max number of ingredient in test is {0:.0f}.'.format(max_test_ingredients))
print('Vocab Size: ', len(model.wv.vocab))

### Weijie Countvectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer



### Custom Embedding

- Text data will have to be encoded for the NN.
- Custom trained embedding is used in this case.
- Embedding matrix for the model will also be created

In [None]:
# Longest recipe, in ingredients, across both splits; used as the padded sequence length.
# Cast to a plain int so downstream shape arguments do not receive a numpy scalar.
max_len = int(max(train_set.ingredients.map(len).max(), test_set.ingredients.map(len).max()))

max_features = len(model.wv.vocab)
# The Keras Tokenizer only keeps word indices strictly below `num_words`, and indices start
# at 1, so `max_features + 1` is required to keep all `max_features` words. Passing
# `max_features` silently dropped the least frequent word.
tk = Tokenizer(lower = True, filters='', num_words=max_features + 1)
tk.fit_on_texts(train_sentences)

embedding_path = "custom_glove_300d.txt"
embed_size = 300

# Encode each processed recipe as a sequence of word indices, padded to max_len.
train_set1 = tk.texts_to_sequences(train_set['processed'])
train_set1 = pad_sequences(train_set1, maxlen = max_len)

test_set1 = tk.texts_to_sequences(test_set['processed'])
test_set1 = pad_sequences(test_set1, maxlen = max_len)

def get_coefs(word,*arr):
    """Split one parsed embedding line into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')

# Read the saved vectors back into a word -> vector dict. The file is closed when done,
# and any line that does not carry exactly `embed_size` values (such as the
# "<vocab size> <dimensions>" header of the word2vec text format) is skipped.
embedding_index = {}
with open(embedding_path) as embedding_file:
    for line in embedding_file:
        fields = line.strip().split(" ")
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embedding_index[word] = vector

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
# Row i holds the vector of the word with tokenizer index i; row 0 (padding) stays zero.
embedding_matrix = np.zeros((nb_words+1, embed_size))
for word, i in word_index.items():
    # Indices run from 1 to nb_words inclusive, and the matrix has a row for each of them.
    if i > nb_words: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:embedding_matrix[i] = embedding_vector

max_features = nb_words
print('No. of Features is {0:.0f}.'.format(max_features))

# Functions
***
*  Custom attention layer for text

In [None]:
class Attention(Layer):
    """Attention-weighted sum over the time axis of a 3D input.

    For an input of shape (batch, step_dim, features) the layer scores every time step
    with a learned weight vector (plus an optional per-step bias), squashes the scores
    with tanh, normalises exp(score) over the steps and returns the weighted sum of the
    steps, i.e. a tensor of shape (batch, features).

    Parameters
    ----------
    step_dim : int
        Number of time steps of the input; used to reshape the scores.
    W_regularizer, b_regularizer
        Optional regularizers for the weight vector and the bias.
    W_constraint, b_constraint
        Optional constraints for the weight vector and the bias.
    bias : bool
        Whether to add a learned per-step bias to the scores.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base constructor first: attributes assigned before it (notably
        # `supports_masking`) may be reset by the base class.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        # Set in build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and the optional per-step bias."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword so the call does not depend on add_weight's
        # positional-argument order.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the time axis is summed away, so no mask is passed on."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Score of every time step: flatten to (batch*steps, features), project onto W,
        # then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)
        a = K.exp(eij)
        # Apply the mask after the exp; the weights are re-normalised next.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano.
            a *= K.cast(mask, K.floatx())
        # The epsilon guards against a sum that is almost zero, e.g. early in training.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a

        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (batch, features)."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and rebuilt."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


Model Architecture:
- Input
- Embedding1
- GRU
- Attention Layer
- Dense Layer

In [None]:
def textv1():
    """Build the attention model.

    Pipeline: embedding (initialised from embedding_matrix, trainable) -> spatial dropout
    -> bidirectional CuDNNGRU returning sequences -> Attention -> batch norm ->
    dense(128, relu) -> dropout -> softmax over num_class outputs.

    Reads the notebook globals max_len, max_features, embed_size, embedding_matrix and
    num_class. Returns an uncompiled keras Model.
    """
    token_ids = Input(shape=(max_len,))

    embedded = Embedding(max_features + 1, embed_size,
                         weights=[embedding_matrix], trainable=True)(token_ids)
    embedded = SpatialDropout1D(0.3)(embedded)

    sequence_states = Bidirectional(CuDNNGRU(256, return_sequences=True))(embedded)
    pooled = Attention(max_len)(sequence_states)

    hidden = BatchNormalization()(pooled)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    class_probabilities = Dense(num_class, activation="softmax")(hidden)

    return Model(inputs=token_ids, outputs=class_probabilities)

# Build one instance and print its layer summary.
model1 = textv1()
model1.summary()

### textv2

#### THIS MODEL HAS REALLY LOW ACCURACY. NOT USING.

Model Architecture:
- Input
- Embedding
    - GRU
        - Conv1D - Average
        - Conv1D - Max
    - LSTM
        - Conv1D - Average
        - Conv1D - Max
- Concatenation Layer
- Dense Layer

In [None]:
def textv2():
    """Build the GRU + LSTM model with convolutional pooling heads.

    The shared embedding feeds a bidirectional CuDNNGRU and a bidirectional CuDNNLSTM.
    Each recurrent output goes through two Conv1D layers (kernel sizes 4 and 3), and each
    convolution is pooled with both global average and global max pooling. The eight
    pooled vectors are concatenated and passed through batch norm, dense(128, relu),
    dropout and a softmax over num_class outputs.

    Reads the notebook globals max_len, max_features, embed_size, embedding_matrix and
    num_class. Returns an uncompiled keras Model.
    """
    def conv_pools(sequence, kernel_size):
        # One Conv1D over the sequence, returned as (average-pooled, max-pooled).
        conv = Conv1D(32, kernel_size=kernel_size, padding='valid',
                      kernel_initializer='he_uniform')(sequence)
        return GlobalAveragePooling1D()(conv), GlobalMaxPooling1D()(conv)

    token_ids = Input(shape=(max_len,))
    embedded = Embedding(max_features + 1, embed_size,
                         weights=[embedding_matrix], trainable=True)(token_ids)
    embedded = SpatialDropout1D(0.3)(embedded)

    # GRU branch: kernel size 4 first, then 3.
    gru_states = Bidirectional(CuDNNGRU(128, return_sequences=True))(embedded)
    gru_pools = conv_pools(gru_states, 4) + conv_pools(gru_states, 3)

    # LSTM branch: same two heads.
    lstm_states = Bidirectional(CuDNNLSTM(128, return_sequences=True))(embedded)
    lstm_pools = conv_pools(lstm_states, 4) + conv_pools(lstm_states, 3)

    merged = concatenate(list(gru_pools + lstm_pools))
    hidden = BatchNormalization()(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    class_probabilities = Dense(num_class, activation="softmax")(hidden)

    return Model(inputs=token_ids, outputs=class_probabilities)

#model2 = textv2()
#model2.summary()

### textv3

#### THIS MODEL HAS REALLY LOW ACCURACY. NOT USING.

Model Architecture:
- Input
- Embedding1
- Masking
- LSTM
- Dense Layer

In [None]:
def textv3():
    """Build the plain LSTM model.

    Pipeline: embedding (initialised from embedding_matrix, trainable) -> spatial dropout
    -> Masking -> LSTM(256) -> batch norm -> dense(128, relu) -> dropout -> softmax over
    num_class outputs.

    Reads the notebook globals max_len, max_features, embed_size, embedding_matrix and
    num_class. Returns an uncompiled keras Model.
    """
    token_ids = Input(shape=(max_len,))
    embedded = Embedding(max_features + 1, embed_size,
                         weights=[embedding_matrix], trainable=True)(token_ids)

    embedded = SpatialDropout1D(0.3)(embedded)
    masked = Masking()(embedded)
    final_state = LSTM(256)(masked)

    hidden = BatchNormalization()(final_state)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    class_probabilities = Dense(num_class, activation="softmax")(hidden)

    return Model(inputs=token_ids, outputs=class_probabilities)

#model3 = textv3()
#model3.summary()

## Training

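Training the model on `n_folds` repeated random 90/10 train/validation splits and averaging the predictions of all runs. Note that this is repeated hold-out validation rather than a true k-fold partition of the data.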

In [None]:
n_folds=20
epochs=10
verbose=2 # 1 for Debugging. 2 for Committing
batch_size = 64

# Each run trains a fresh textv1 model on a different random 90/10 split (seeded by the
# run index). Predictions of the best checkpoint of every run are summed and averaged
# after the loop. textv2 / textv3 are not trained here.
for i in range(n_folds):
    print("Training on Fold: ", i + 1)

    # Release the graph and weights of the previous run; without this every new model is
    # added to the same backend session and memory use grows with each run.
    K.clear_session()

    x_train, x_valid, y_train, y_valid = train_test_split(train_set1, y_ohe, test_size = 0.1, random_state = i)

    # Model 1
    model1 = textv1()
    # Named `fold_callbacks` so the imported `keras.callbacks` module is not shadowed.
    fold_callbacks = [EarlyStopping(monitor='val_loss', patience=3, verbose=1, min_delta=1e-4),
                      ModelCheckpoint(filepath='temp1.hdf5', verbose=1,save_best_only=True, mode='auto')]
    model1.compile(loss = "categorical_crossentropy", optimizer='adam', metrics = ["accuracy"])
    model1.fit(x_train, y_train, batch_size = batch_size, epochs = epochs, verbose = verbose,
               validation_data = (x_valid, y_valid), callbacks = fold_callbacks, shuffle=True)

    # Restore the weights of the best checkpoint written during this run. The model is
    # already compiled, so no re-compile is needed before predicting.
    model1.load_weights('temp1.hdf5')

    # Predict
    # NOTE: train_set1 contains the rows this run trained on, so train_pred is not an
    # out-of-fold estimate.
    fold_train_pred = model1.predict(train_set1, verbose=2)
    fold_test_pred = model1.predict(test_set1, verbose=2)
    if (i == 0):
        train_pred = fold_train_pred
        test_pred = fold_test_pred
    else:
        train_pred += fold_train_pred
        test_pred += fold_test_pred

    print("======="*12, end="\n\n\n")

# Turn the summed probabilities into the mean over all runs.
train_pred /= n_folds
test_pred /= n_folds

## Save Results

In [None]:
# Most probable class id per test recipe. argmax already returns integers, so no rounding
# or casting is needed.
predictions = np.argmax(test_pred, axis=1)

# The former `test_set.drop([...], axis=1)` call was removed: its result was discarded, so
# it never changed test_set, and only the 'id' column is used below.

# Map class ids back to cuisine names and write the submission file.
# NOTE: the file name is misspelled ('predicitions'); it is kept unchanged so that the
# output artifact keeps its existing name.
sub = pd.DataFrame({'id': test_set['id'], 'cuisine': lb.inverse_transform(predictions)}, columns=['id', 'cuisine'])
sub.to_csv('predicitions.csv', index=False)

### Download Link

Kaggle does not save the CSV. A download link is used as a workaround to get the file directly without having to commit.

In [None]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

def create_download_link(df, filename = "submission.csv", title = "Download CSV file"):
    """Return an IPython HTML anchor that downloads `df` as a CSV file.

    The frame is serialised without its index, base64-encoded and embedded in a
    `data:text/csv` URL, so no file has to exist on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to offer for download.
    filename : str
        Name suggested to the browser for the downloaded file.
    title : str
        Visible text of the link.
    """
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor_template.format(filename=filename, payload=encoded_csv, title=title))

# Build the download link for the submission DataFrame; as the last expression of the cell
# it is rendered as the cell's output.
create_download_link(sub)