Tag it

This is a multi-label classification problem.

Importing the dependencies

In [1]:
import pandas as pd
import seaborn as sns
import string
from collections import Counter

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["minutes", "hours", "seconds", "teaspoon", "spoon"])

punctuations = '!"#$%&\'()*+,.-/:;<=>?@[\\]^_`{|}~'
table = str.maketrans('', '', punctuations)

import difflib

from tqdm import tqdm

import keras.backend as K
from keras.preprocessing.sequence import pad_sequences
import numpy as np

from keras.callbacks import EarlyStopping
from keras.models import Input, Model
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers import Embedding, Bidirectional, LSTM, \
    GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, concatenate, \
        TimeDistributed, MaxPooling1D, add
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam
from keras.utils import plot_model

from keras.losses import sparse_categorical_crossentropy

import tensorflow as tf

from gensim.models.fasttext import FastText

[nltk_data] Downloading package punkt to /home/msr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/msr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/msr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


Importing dataset

In [2]:
df = pd.read_csv("recipes_82k.csv")

### Before diving into the code - an explanation for some of my design choices 

###### Tokenisation, lemmatisation and general data cleaning - 
After trials, I finally settled on NLTK and its corpus instead of SpaCy. The primary reason was the time constraint and the fact that NLTK was more suited to rapid experimentation as it was more than twice as fast as SpaCy.

###### Word vectors - 
I trained word vectors from the corpus itself, as this task is domain-specific, so I wasn't sure how a general-purpose word embedding scheme would perform.
I used FastText embeddings in particular as they handle out-of-vocabulary words better than GloVe or Word2Vec.

You can download my pretrained word embeddings from here - https://drive.google.com/open?id=17MdIgSJ3J9hupjzdEOJZ3e4sQBf8xaj0 

You can also choose to train them from scratch by setting the appropriate flag during class instantiation.

###### Classifier - 
I used a Bidirectional LSTM-based classifier constructed from the Keras functional (API). I experimented with Conv1D and multi-input models but, in most cases, I got similar results with added computational overhead. So I went with a simpler model in the end.

#    

Defining the class, methods and attributes.

In [3]:

class DataGetter():
    
    """ This is a class that loads and transforms data so it can be 
    immediately usable with just the appropriate method calls.
    
    Also has attributes that contain extracted data.
    """
    
    def __init__(self, df, training=True, vectors_pretrained=True):
        
        self.training = training
        self.vectors_pretrained = vectors_pretrained
        
        if vectors_pretrained:
            self.vectors_file = 'word_vectors/fasttext.model'
         
        # Drop unwanted variables
        df.drop(["category", "image"], inplace=True, axis=1)
        
        # Handling NA and Null values
        df['cuisine'].fillna('UNK_Cuisine', inplace=True)
        df['prep_time'].fillna('UNK_Time', inplace=True)
        df['serves'].fillna('UNK_Serves', inplace=True)
        df.dropna(inplace = True)
        df = df.where(pd.notnull(df), None)
        
        # Handling duplicate values
        df = df[df.duplicated() == False]
        
        # Shuffling the dataset for randomness
        df = df.sample(frac=1, random_state=1234)
        
        # Extracting the fields from the data
        self.cooking_method = df['cooking_method']
        self.cuisine = list(df['cuisine'])
        self.ingredients = list(df['ingredients'])
        self.prep_time = list(df['prep_time'])
        self.recipe_name = list(df['recipe_name'])
        self.serves = list(df['serves'])
        self.tags = list(df['tags'])
        self.tags = [str(tag).split(",") for tag in self.tags]
        self.tags_flat = [tag for tag_list in self.tags for tag in tag_list]  
        self.tags_unique = list(set(self.tags_flat))
        
    
    def preprocess_inputs(self):
        
        """ 
        Method to pre-process the extracted fields (tokenisation, lemmatisation and general text cleaning)
        """
        
        # Preprocessing cooking method text
        self.tok_cooking_method = []
        for elem in tqdm(self.cooking_method):
            elem = elem.translate(table)
            elem = lemmatizer.lemmatize(elem)

            elem = word_tokenize(elem)
            elem = [word.lower() for word in elem if word not in stop_words and word.isalpha() and not word.endswith("ly")]
            self.tok_cooking_method.append(elem)

            
        # Preprocessing ingredients text
        self.tok_ingredients = []
        for elem in tqdm(self.ingredients):
            elem = elem.translate(table)
            elem = lemmatizer.lemmatize(elem)
            elem = word_tokenize(elem)
            elem = [word.lower() for word in elem if word not in stop_words and word.isalpha() and not word.endswith("ly")]
            self.tok_ingredients.append(elem)

        
        # Preprocessing recipe name text
        self.tok_recipe_name = []
        for elem in tqdm(self.recipe_name):
            elem = elem.translate(table)
            elem = lemmatizer.lemmatize(elem)
            elem = word_tokenize(elem)
            elem = [word.lower() for word in elem if word not in stop_words and word.isalpha()]

            self.tok_recipe_name.append(elem)
            

            
    def compute_statistics(self):
        """
        Method to compute statistics for the text data
        """
        self.len_cooking = []
        for subl in self.tok_cooking_method:
            self.len_cooking.append(len(subl))

        self.max_len_cooking = max(self.len_cooking)

        self.len_ingredients = []
        for subl in self.tok_ingredients:
            self.len_ingredients.append(len(subl))

        self.max_len_ingredients = max(self.len_ingredients)


        self.len_recipe_name = []
        for subl in self.tok_recipe_name:
            self.len_recipe_name.append(len(subl))

        self.max_len_recipe_name = max(self.len_recipe_name)

        self.ntags = len(self.tags_unique)

        self.tags2id = {tag:i for i, tag in enumerate(self.tags_unique)}


        self.total_tags = len(self.tags_flat)
        self.tag_weights = dict(Counter(self.tags_flat))

        self.tag_weights = {self.tags2id[k]:self.total_tags/v for k,v in self.tag_weights.items()}

        
    def plot_density(self, len_list):
        sns.kdeplot(len_list)
  
        
    def prepare_inputs(self, max_cook_len=200, max_ing_len=100, max_rec_len=10, vector_size=100):
        
        """
        Preparing the preprocessed inputs for entry into the classifier
        """
         
        if self.vectors_pretrained:
            vecs = FastText.load(self.vectors_file)
        else:
            vecs = model = FastText(ft_data, size=100, window=20, iter=10, workers=3)
            vecs.save("word_vectors/fasttext.model")

        self.vectors = vecs.wv
        del vecs

        for i, subl in enumerate(self.tok_cooking_method):
            subl = subl[:max_cook_len]
            self.tok_cooking_method[i] = subl

        for i, subl in enumerate(self.tok_ingredients):
            subl = subl[:max_ing_len]
            self.tok_ingredients[i] = subl

        for i, subl in enumerate(self.tok_recipe_name):
            subl = subl[:max_rec_len]
            self.tok_recipe_name[i] = subl

        self.tok_concat = []

        for i, j, k in zip(self.tok_cooking_method, self.tok_ingredients, self.tok_recipe_name):
            self.tok_concat.append(i + j + k)



        if self.training:

            self.total_size = len(self.len_cooking)

            self.train_indices = list(range(0,int(self.total_size*0.8)-1))
            self.train_size = len(self.train_indices)

            self.val_indices = list(range(max(self.train_indices), self.total_size))
            self.val_size = len(self.val_indices)

            self.max_text_len = max_cook_len + max_ing_len + max_rec_len
            self.vector_size = vector_size

            self.X_train = np.zeros((self.train_size, self.max_text_len, self.vector_size), dtype=K.floatx())

            self.X_val = np.zeros((self.val_size, self.max_text_len, self.vector_size), dtype=K.floatx())


            for index in tqdm(range(0, self.total_size)):
                for t, token in enumerate(self.tok_concat[index]):
                    if t >= self.max_text_len:
                        break


                    if index < self.train_size:
                        self.X_train[index, t, :] = self.vectors[token]

                    else:
                        self.X_val[index-self.train_size, t, :] = self.vectors[token]

            self.y = np.zeros((self.total_size, self.ntags), dtype=np.int8)

            for i, tag_list in enumerate(self.tags):
                for j, tag in enumerate(tag_list):
                    if tag in self.tags2id.keys():
                        k = self.tags2id[tag]
                        self.y[i][k] = 1


            self.y_train = self.y[:self.train_size]
            self.y_val = self.y[self.train_size-1:]

        else:

            self.total_size = len(self.len_cooking)

            self.max_text_len = max_cook_len + max_ing_len + max_rec_len
            self.vector_size = vector_size

            self.X_val = np.zeros((self.total_size, self.max_text_len, self.vector_size), dtype=K.floatx())


            for index in tqdm(range(0, self.total_size)):
                for t, token in enumerate(self.tok_concat[index]):
                    if t >= self.max_text_len:
                        break

                    self.X_val[index, t, :] = self.vectors[token]

            self.y = np.zeros((self.total_size, self.ntags), dtype=np.int8)


            for i, tag_list in enumerate(self.tags):
                for j, tag in enumerate(tag_list):
                    if tag in self.tags2id.keys():
                        k = self.tags2id[tag]
                        self.y[i][k] = 1




                
            
        
    def train_model(self, batch_size=32, nb_epochs=100):
        """
        Training the model with set parameters
        """

        input_ = Input(shape=(self.max_text_len,self.vector_size))

        model = TimeDistributed(Dense(256, use_bias=False, activation='elu'))(input_)
        model = Bidirectional(LSTM(units=50, recurrent_dropout=0.1, return_state = True, return_sequences=True))(model)
        model = Bidirectional(LSTM(units=50, recurrent_dropout=0.2))(model)
        model = Dense(64, activation="elu")(model)
        model = Dropout(0.1)(model)
        out = Dense(self.ntags, activation="sigmoid")(model)
        self.model = Model(input_, out)
        
        if self.training == False:
            self.model.load_weights("model_new_vector.h5")
            return

        self.model.summary()

        self.model.compile(loss='binary_crossentropy',
                          optimizer=Adam(lr=0.0001, decay=1e-6),
                          metrics=['accuracy'])

        self.history = self.model.fit(self.X_train, self.y_train,
                      batch_size=batch_size,
                      shuffle=True,
                      epochs=nb_epochs,
                      validation_data=(self.X_val, self.y_val),
                      callbacks=[EarlyStopping(min_delta=0.00025, patience=2,
                                               monitor='val_loss')],
                      class_weight = self.tag_weights,
                      verbose=1) 
        
        self.model.save_weights("model_new_vector.h5")



    def predict(self, batch_size=32):
        
        """
        Generating predictions on the validation dataset (validation = 100% of data if self.training=False)
        """

        self.preds = self.model.predict(self.X_val, batch_size=32)
        
        


We instantiate the DataGetter class into an object. 

We set training = True and vectors_pretrained = True because we are training a model and using pre-existing word vectors

In [4]:
data = DataGetter(df, training=True, vectors_pretrained=True)

In [None]:
We preprocess inputs like so - 

In [5]:
data.preprocess_inputs()


100%|██████████| 63506/63506 [02:10<00:00, 488.18it/s]
100%|██████████| 63506/63506 [00:59<00:00, 1074.68it/s]
100%|██████████| 63506/63506 [00:18<00:00, 3509.01it/s]


In [21]:
data.tok_cooking_method[:10]

[['in',
  'food',
  'processor',
  'pulse',
  'basil',
  'garlic',
  'parmesan',
  'cheese',
  'salt',
  'pepper',
  'smooth',
  'add',
  'olive',
  'oil',
  'pulsing',
  'set',
  'aside',
  'spiralize',
  'zucchini',
  'cut',
  'smaller',
  'strands',
  'long',
  'place',
  'work',
  'bowl',
  'toss',
  'pesto',
  'tomatoes',
  'season',
  'salt',
  'pepper',
  'needed'],
 ['move',
  'oven',
  'rack',
  'bottom',
  'preheat',
  'oven',
  'degrees',
  'f',
  'start',
  'seasoning',
  'grab',
  'peppercorns',
  'throw',
  'big',
  'plastic',
  'bag',
  'pound',
  'rolling',
  'pin',
  'break',
  'open',
  'pull',
  'leaves',
  'rosemary',
  'thyme',
  'sprigs',
  'throw',
  'crushed',
  'peppercorns',
  'bowl',
  'salt',
  'herb',
  'leaves',
  'add',
  'minced',
  'garlic',
  'use',
  'fingers',
  'toss',
  'together',
  'set',
  'aside',
  'place',
  'beef',
  'fatside',
  'rack',
  'roasting',
  'pan',
  'drizzle',
  'olive',
  'oil',
  'surface',
  'rub',
  'hands',
  'sprinkle',
  

We compute statistics like so - 

In [60]:
data.compute_statistics()

# Number of tags 
print(data.ntags)


# Max length of a cooking_method observation
print(data.max_len_cooking)

776
200


We prepare inputs for entry into the classifier like so - 

In [None]:
data.prepare_inputs()

Finally, we train the model based on the architecture defined within the class (viewable in data.summary())

In [7]:
data.train_model()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 310, 100)     0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 310, 256)     25600       input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 310, 100), ( 122800      time_distributed_1[0][0]         
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 100)          60400       bidirectional_1[0][0]            
                                                                 bidirectional_1[0][1]            
          

Generating predictions from the validation data (validation data is 100% of the data if training is set to False during instantiation)

In [8]:
data.predict()

In [9]:
data.preds

array([[0.0102352 , 0.01021497, 0.01067653, ..., 0.01037784, 0.00013124,
        0.01604219],
       [0.00927966, 0.00927555, 0.00982   , ..., 0.00944765, 0.00011345,
        0.01470061],
       [0.00972946, 0.00967388, 0.01021533, ..., 0.00990782, 0.00012514,
        0.01532222],
       ...,
       [0.0095132 , 0.009451  , 0.00994364, ..., 0.00975599, 0.00011653,
        0.01495697],
       [0.01049462, 0.0102013 , 0.01072578, ..., 0.01048876, 0.00014113,
        0.01587533],
       [0.04395235, 0.03795771, 0.04023674, ..., 0.03973234, 0.00224399,
        0.04695705]], dtype=float32)

Testing the model on random observations from the validation set

In [55]:
testindex = 4789

test_preds = data.preds.copy()

test_preds[test_preds >= 0.1] = 1
test_preds[test_preds < 0.1] = 0

print("Number of tags the classifier got right for index {}: {}".format(testindex, 
                                                                        np.logical_and(test_preds[testindex]==1, data.y_val[testindex]==1).sum()))


print("Number of tags predicted for index {}: {}".format(testindex, len(np.argwhere(test_preds[testindex]))))

print("Number of actual tags in index {}: {}".format(testindex, len(np.argwhere(data.y_val[testindex]))))

id2tags = {i: t for t, i in data.tags2id.items()}

Number of tags the classifier got right for index 4789: 1
Number of tags predicted for index 4789: 11
Number of actual tags in index 4789: 2


In [56]:
#np.argwhere(test_preds[testindex])

pred_tags = [id2tags[i[0]] for i in np.argwhere(test_preds[testindex])]

actual_tags = [id2tags[i[0]] for i in np.argwhere(data.y_val[testindex])]

print("Predicted tags: {}\n\nActual tags: {}".format(pred_tags, actual_tags))

Predicted tags: ['Fruit', 'Cheese', 'Dessert', 'Meat', 'Poultry', 'American', 'Main Dish', 'Vegetable', 'Easy', 'Gluten Free', 'Low Sodium']

Actual tags: ['Cheese', 'Bread']


After many trials, I could find that the signal is weakly registered in the output probabilities and that there is substantial noise. I haven't had the time to figure out how to solve this issue. To the best of my knowledge, I tried to experiment with several architectures and lowered the threshold for a positive prediction to 0.1 so that more correct items are flagged. 

Basically, I tried to prioritise recall over precision.

#    

To test this already trained model on your own (identical) dataframe, instantiate the DataGetter class and run it like so - 

We flag training as False and vectors_pretrained as True because we use the pre-existing model and vectors.

In [None]:
data_test = DataGetter(your_dataframe, training=False, vectors_pretrained=True)

The rest of the steps are the same as for the previous instantiation. 