# Imports

In [2]:
# Silence TensorFlow's native (C++) log output. The variable is set before any
# keras/tensorflow import below, because setting it after TensorFlow has been
# loaded is not guaranteed to have an effect.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'

import csv
import re
import warnings

import numpy as np
from keras.utils.np_utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Bidirectional, LayerNormalization
from tensorflow.keras.layers import LSTM, SpatialDropout1D, GRU, Concatenate
from tensorflow.keras.layers import Dropout, Flatten, concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import model_from_json
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
from sklearn.utils import class_weight

# Applied after the imports, exactly as before, so import-time warnings are unaffected.
warnings.simplefilter("ignore", UserWarning)

# fix random seeds for reproducibility: NumPy drives the random part of the
# embedding matrix, TensorFlow drives layer initialisation and dropout.
np.random.seed(7)
tf.random.set_seed(7)

Set the root path (where the model and results will be stored, and where the model will be loaded from).

In [3]:
# Notebook-wide settings.
EMBEDDING_VECTOR_LENGTH = 50  # size of each word vector (also reused as the GRU width)
MODEL_INDEX = 1               # numeric suffix of the saved model name ('PR2_<index>')

# Base folder: data/, Models/, Results/ and the GloVe file are all addressed
# by plain string concatenation onto this path, so keep the trailing slash.
ROOT_PATH = './'

In [4]:
# Make sure the output folders exist. exist_ok=True turns the call into a no-op
# when the folder is already there, so no separate (race-prone) existence
# check is needed.
for output_dir in ('Models', 'Results'):
    os.makedirs(ROOT_PATH + output_dir, exist_ok=True)

# Loading Data

In [5]:
def load_data(path, name_of_data, encoding=None):
    """Load one dataset split from ``<path>data/``.

    Texts are read from ``<name_of_data>.txt`` (one record per line) and, for
    every split except ``"test"``, integer labels are read from
    ``<name_of_data>.labels`` (first CSV column of each row).

    Parameters
    ----------
    path : str
        Root folder. It is concatenated directly with ``'data/'``, so it must
        end with a path separator.
    name_of_data : str
        Split name; ``"test"`` selects the label-free code path.
    encoding : str or None, optional
        Passed to ``open``. ``None`` (default) keeps the platform default,
        which is what this function used before the parameter existed.

    Returns
    -------
    list of str
        When ``name_of_data == "test"``.
    tuple (list of str, list of int)
        For any other split: ``(texts, labels)``.

    Raises
    ------
    ValueError
        If the number of texts and labels differ, or a label is not an integer.
    """
    base_name = path + 'data/' + name_of_data

    # Each line is one free-text record, so it is read as a plain line instead
    # of being pushed through the CSV parser: the text is not CSV, and a line
    # that starts with a double quote would otherwise be unquoted or merged
    # with the following lines. Empty lines (e.g. a trailing newline at the end
    # of the file) are skipped instead of raising IndexError.
    with open(base_name + '.txt', encoding=encoding) as f1:
        stripped_lines = (raw_line.rstrip('\r\n') for raw_line in f1)
        data = [text for text in stripped_lines if text]

    if name_of_data == "test":
        return data

    # Labels: first column of every non-empty CSV row.
    with open(base_name + '.labels', encoding=encoding) as f2:
        labels = [int(row[0]) for row in csv.reader(f2) if row]

    # Fail loudly rather than hand back silently misaligned texts and labels.
    if len(data) != len(labels):
        raise ValueError(
            "%s: %d texts but %d labels" % (name_of_data, len(data), len(labels))
        )

    return data, labels

In [6]:
# Labelled splits come back as (texts, labels); the test split has texts only.
(X_train, y_train), (X_val, y_val) = (
    load_data(ROOT_PATH, split_name) for split_name in ("train", "val")
)
X_test = load_data(ROOT_PATH, "test")

In [7]:
# Sanity check: each labelled split should report the same count twice
# (number of texts, number of labels).
print("Train Records: ", *map(len, (X_train, y_train)))
print("Val Records: ", *map(len, (X_val, y_val)))
print("Test Records: ", *map(len, (X_test,)))

Train Records:  41816 41816
Val Records:  5227 5227
Test Records:  5227


Sample From Dataset

In [8]:
# First training recipe (raw text) followed by its category label, one per line.
print(X_train[0], y_train[0], sep="\n")

Grease a baking sheet and line with parchment paper. Mix white chocolate chips and peanut butter together in a microwave-safe bowl; heat in microwave until half-melted, 30 seconds to 1 minutes. Stir. Place semi-sweet chocolate chips in a microwave-safe bowl; heat in microwave until half-melted, 15 to 30 seconds. Stir vanilla extract into half-melted semi-sweet chocolate. Spread peanut butter mixture onto the prepared baking sheet. Evenly distribute melted semi-sweet chocolate over peanut butter mixture. Using the tip of a sharp knife, drag semi-sweet chocolate through peanut butter mixture making a marble-pattern. Refrigerate until set, 30 minutes to 2 hours. Cut into pieces and store in an air-tight container. I just recently tried this recipe, and everyone in the family loved it! They call it Reese's® Peanut Butter Bark! It's very good!
4


# Preprocessing

The tokenizer is fitted on the training texts; all three splits are then converted from words to integer indices for input to the model.

In [9]:
# Fit the word -> integer mapping on the training texts only.
tokenizer = Tokenizer(num_words=10000, lower = True)
tokenizer.fit_on_texts(X_train)

# +1 because index 0 is reserved (no word is mapped to it).
# NOTE(review): word_index counts every distinct word seen during fitting, so
# vocab_size is larger than num_words above; presumably only indices below
# num_words are ever emitted by texts_to_sequences - confirm against the Keras
# Tokenizer documentation before resizing anything.
vocab_size = 1 + len(tokenizer.word_index)
print("Vocab Size ", vocab_size)

# Replace the raw texts of every split with their integer sequences.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

Vocab Size  19733


In [10]:
# Show the first training sample again, now as the list of integer word
# indices produced by the tokenizer.
print(X_train[0])

[153, 3, 35, 130, 2, 346, 6, 560, 198, 33, 168, 120, 250, 2, 389, 40, 39, 4, 3, 344, 699, 19, 12, 4, 344, 7, 151, 170, 116, 476, 5, 17, 9, 14, 31, 2049, 210, 120, 250, 4, 3, 344, 699, 19, 12, 4, 344, 7, 151, 170, 119, 5, 116, 476, 14, 131, 276, 11, 151, 170, 2049, 210, 120, 121, 389, 40, 18, 147, 1, 128, 35, 130, 169, 1723, 170, 2049, 210, 120, 16, 389, 40, 18, 199, 1, 1299, 8, 3, 967, 481, 5848, 2049, 210, 120, 183, 389, 40, 18, 548, 3, 3126, 1866, 175, 7, 84, 116, 9, 5, 26, 127, 145, 11, 227, 2, 464, 4, 72, 980, 1179, 496, 54, 140, 2401, 1032, 23, 74, 2, 656, 4, 1, 242, 964, 34, 221, 1734, 34, 7706, 389, 40, 3077, 208, 165, 252]


The sequences are truncated or zero-padded so that every sequence has the same length.

In [11]:
# Bring every sequence to exactly max_review_length entries: shorter ones are
# padded at the end ('post'), longer ones are cut down by pad_sequences.
max_review_length = 1000
X_train, X_val, X_test = (
    sequence.pad_sequences(token_ids, maxlen=max_review_length, padding='post')
    for token_ids in (X_train, X_val, X_test)
)

In [12]:
# Convert the inputs and the label lists to NumPy arrays.
X_train, X_val, X_test = map(np.array, (X_train, X_val, X_test))
y_train, y_val = map(np.array, (y_train, y_val))

In [13]:
# dtypes of the input matrices ...
for features in (X_train, X_val, X_test):
    print(features.dtype)

# ... and of the label vectors.
for targets in (y_train, y_val):
    print(targets.dtype)

int32
int32
int32
int64
int64


The labels are shifted from 1–12 to 0–11 (12 categories), because the one-hot encoding and the model's output classes are indexed from 0.

In [14]:
num_classes = 12

# Shift the labels down by one so that class ids start at 0.
# Note: running this cell twice shifts the labels twice.
y_train, y_val = y_train - 1, y_val - 1

# One-hot encode the shifted class ids.
train_labels, val_labels = (
    to_categorical(class_ids, num_classes = num_classes)
    for class_ids in (y_train, y_val)
)

In [15]:
# Shifted class id of the first training sample, then its one-hot row.
print(y_train[0], train_labels[0], sep="\n")

3
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [16]:
# Input and one-hot label shapes for the training and validation splits.
for features, one_hot in ((X_train, train_labels), (X_val, val_labels)):
    print(features.shape, one_hot.shape)

(41816, 1000) (41816, 12)
(5227, 1000) (5227, 12)


The pre-trained GloVe word-embedding file is loaded for use in the embedding layer.

In [21]:
# Map every word in the GloVe file to its vector. Each line of the file is
# "<word> <v1> <v2> ...", split on whitespace.
embeddings_index = {}

# `with` closes the file even if a line fails to parse.
with open(ROOT_PATH + 'glove.6B.50d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Blank line: nothing to index (values[0] would raise IndexError).
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Total %s word vectors in Glove 6B 50d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 50d.


In [22]:
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

# One row per tokenizer index plus the reserved row 0. Every row starts out
# with random values, so words that have no GloVe vector (and row 0) keep that
# random initialisation - they are NOT zero-filled.
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_VECTOR_LENGTH))

for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        continue
    # Known word: overwrite the random row with the pre-trained vector.
    embedding_matrix[row] = glove_vector

Number of Unique Tokens 19732


# Training

In [None]:
# The Model

def build_gru_branch(vocab_size, embedding_matrix, sequence_length, embedding_dim):
    """Build one input branch of the two-branch classifier.

    Layers, in order: Embedding (initialised from ``embedding_matrix`` and
    trainable) -> Dropout(0.3) -> Bidirectional GRU returning sequences ->
    Bidirectional GRU -> Dense(128, relu).

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding layer.
    embedding_matrix : array
        Initial embedding weights, one row per vocabulary index.
    sequence_length : int
        Length of the padded input sequences.
    embedding_dim : int
        Width of the embedding vectors; also used as the number of GRU units.

    Returns
    -------
    Sequential
        The assembled (uncompiled) branch.
    """
    branch = Sequential()
    branch.add(Embedding(vocab_size, embedding_dim,
                         weights = [embedding_matrix],
                         input_length = sequence_length,
                         trainable=True))
    branch.add(Dropout(0.3))
    branch.add(Bidirectional(GRU(embedding_dim, return_sequences = True)))
    branch.add(Bidirectional(GRU(embedding_dim)))
    branch.add(Dense(128, activation='relu'))
    return branch


# Two structurally identical branches (built left first, then right, as before)
# whose outputs are concatenated and fed to a 12-way softmax.
left_branch = build_gru_branch(vocab_size, embedding_matrix,
                               max_review_length, EMBEDDING_VECTOR_LENGTH)
right_branch = build_gru_branch(vocab_size, embedding_matrix,
                                max_review_length, EMBEDDING_VECTOR_LENGTH)

merged = concatenate([left_branch.output, right_branch.output])

output_layer = Dense(12, activation='softmax')(merged)

final_model = Model(inputs = [left_branch.input, right_branch.input], outputs = [output_layer])
final_model.summary()

opt = Adam(learning_rate=0.001)

final_model.compile(loss = 'categorical_crossentropy',
              optimizer = opt,
              metrics=['accuracy'])

modelname = 'PR2_' + str(MODEL_INDEX)
model_json = final_model.to_json()

# where the model architecture will be saved
with open(ROOT_PATH + 'Models/_' + modelname + '.json', "w") as json_file:
    json_file.write(model_json)

# Stop once val_loss has not improved by at least 0.001 for 7 epochs and
# roll back to the best weights seen.
earlystop = EarlyStopping(monitor='val_loss',
                          min_delta=0.001,
                          patience=7,
                          verbose=2,
                          mode='auto',
                          baseline=None,
                          restore_best_weights=True)

# Keep only the best (lowest val_loss) weights on disk.
checkpointer = ModelCheckpoint(filepath = ROOT_PATH + 'Models/_' + modelname + '.hdf5',
                               monitor='val_loss',
                               save_best_only=True)

# The default ReduceLROnPlateau patience (10 epochs) is longer than the early
# stopping patience above (7), so with defaults training would normally be
# stopped before the learning rate is ever reduced. A shorter patience lets the
# reduction act before early stopping does.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=3)

# callbacks list
callbacks_list = [earlystop,
                  checkpointer,
                  reduce_lr]

# Training of the model. Both branches receive the same input matrix.
# The History object is kept so the learning curves can be inspected later.
history = final_model.fit([X_train, X_train], train_labels,
          validation_data=([X_val, X_val], val_labels),
          epochs=100,
          batch_size=50,
          callbacks=callbacks_list)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_input (InputLayer)    [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding_1_input (InputLayer)  [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1000, 50)     986650      embedding_input[0][0]            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 50)     986650      embedding_1_input[0][0]          
______________________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7fd49dec8050>

# Testing

In [23]:
# Index of the saved model to load for testing.
# NOTE(review): this overrides the MODEL_INDEX = 1 set in the configuration
# cell and used when the training cell saves its files, so the PR2_31 model
# files must already exist under Models/ - confirm this is intended.
MODEL_INDEX = 31

In [24]:
modelname='PR2_' + str(MODEL_INDEX)
model_path = ROOT_PATH + 'Models/_' + modelname

# Rebuild the architecture from its JSON description; `with` guarantees the
# file is closed even if reading fails.
with open(model_path + '.json', 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights(filepath = model_path + '.hdf5')
print("Loaded " + modelname +" from disk")

# Compile the loaded model with the same loss / optimizer / metric settings
# that the training cell uses.
opt = Adam(learning_rate=0.001)
loaded_model.compile(loss= 'categorical_crossentropy',
              optimizer = opt,
              metrics=['accuracy'])

Loaded PR2_31 from disk


In [25]:
# Run the loaded model on the padded test sequences. The same matrix is passed
# twice, mirroring how the training cell feeds its two-input model
# (presumably the loaded model has the same two inputs - verify).
predicted_labels = loaded_model.predict([X_test,X_test])
print("Prediction Done")

Prediction Done


In [26]:
# Pick the highest-scoring class for every test sample.
final_predictions = np.argmax(predicted_labels, axis=-1)
print(final_predictions.shape)

# Undo the earlier "- 1" shift so the written labels start at 1 again.
final_predictions = final_predictions + 1

# One prediction per row of the results file.
results_path = ROOT_PATH + "Results/" + modelname + "_Final_Results.txt"
with open(results_path, 'w', newline='') as w1:
    csv.writer(w1, delimiter=' ').writerows([label] for label in final_predictions)

print("File saved at ", results_path)

(5227,)
File saved at  ./Results/PR2_31_Final_Results.txt


# Sample Results

In [28]:
# X_test now holds padded integer sequences, so the raw test texts are read
# from disk again for display.
test_data = load_data(ROOT_PATH, "test")

# Heading, recipe text and an empty separator line, then the prediction.
print("Recipe 1", test_data[0], "", sep="\n")
print("Predicted category:", final_predictions[0])

Recipe 1
Bring a large pot of water to a boil. Add potatoes, and cook until tender but still firm, 12 to 15 minutes; drain. Place bacon in a large, deep skillet. Cook over medium high heat until evenly brown. Cut into small chunks; set aside. Place potatoes into skillet, and cook on medium heat until browned. Flip potatoes occasionally to prevent sticking. Stir in green pepper, red pepper, onion, and mushrooms. Cook until vegetables are tender. Stir in cooked bacon, and season with salt and pepper. Cover with shredded cheese, and turn mixture until cheese is melted. Keep on low heat while cooking eggs. Cook eggs to your preferred style. Place potatoes in a large serving dish, and top with eggs (2 per serving). Hearty breakfast skillets. Serve with toast or muffins.

Predicted category: 3
