In [0]:
'''
Bi Directional LSTM Autoencoder
'''
from numpy import array
import numpy as np
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.models import Model
from keras.layers import LSTM,GRU,Bidirectional
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.preprocessing import sequence
import sklearn.metrics
import os
import random

In [0]:
# Mount Google Drive at /content/drive. Every data and model path used later in
# this notebook lives under '/content/drive/My Drive/...', so this cell has to
# run first. The google.colab module is only importable inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
################################
################################
################################
#Load Data

#Lang class: incrementally built word <-> index vocabulary
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary that assigns each distinct word an integer index.

    Indices 0 and 1 are reserved in ``index2word`` for the "SOS" and "EOS"
    markers, so the first real word receives index 2. The markers themselves
    are not entered in ``word2index`` or ``word2count``.

    Attributes are plain dicts/ints:
      name        -- label given by the caller
      word2index  -- word -> index
      word2count  -- word -> number of times it has been added
      index2word  -- index -> word (includes the two reserved markers)
      n_words     -- next free index (== number of entries in index2word)
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Start counting after the two reserved marker slots.
        self.n_words = 2

    def addSentence(self, sentence):
        """Add every token of ``sentence``, splitting on single spaces.

        Because the split is on ' ' exactly, consecutive spaces produce
        empty-string tokens, which are added like any other word.
        """
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Register one occurrence of ``word``, allocating an index if new."""
        if word in self.word2index:
            # Known word: only the occurrence count changes.
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

#import data
import pickle
# Load the pickled project bundle from Drive.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at files you trust.
with open('/content/drive/My Drive/CS 263 Project/Final Project Data/baking_data_title_ingredients.pickle','rb') as f:
    baking_data = pickle.load(f)

# The bundle is addressed by position; unpack the slots this notebook uses.
joined_limited, cleaned_recipes = baking_data[0], baking_data[1]
numerical_tokens_train, numerical_tokens_test = baking_data[2], baking_data[3]
numerical_tokens_test_masked = baking_data[4]
IDs_train, IDs_test = baking_data[5], baking_data[6]
# Slots 7 and 8 (vocabulary_dict, vocabulary) are left unloaded here; a fresh
# vocabulary is built later by prep().
cleaned_recipes_IDs = baking_data[9]

# Length of the longest training sequence. The next cell replaces this value
# with a fixed cap.
MAX_LENGTH = max(len(s) for s in numerical_tokens_train)

In [0]:
#cut data down to recipes of at most MAX_LENGTH (75) whitespace-separated tokens.
#This overrides the MAX_LENGTH computed from numerical_tokens_train in the
#data-loading cell above.
#data = numerical_tokens_train
MAX_LENGTH = 75
#prep data -- keep recipes of at most MAX_LENGTH tokens and build the vocabulary
def prep(dta):
    """Filter recipes by length and build a vocabulary over the survivors.

    dta is indexed positionally (``dta[0] .. dta[len(dta)-1]``) and each item
    must be a string. A recipe is kept when ``len(recipe.split())`` is at most
    the module-level MAX_LENGTH.

    Prints the vocabulary size (``n_words``, which includes the two reserved
    SOS/EOS slots) and returns ``(kept_recipes, vocab)`` where ``kept_recipes``
    is a list in the original order and ``vocab`` is a ``Lang`` instance.

    NOTE(review): the length filter uses ``split()`` (any whitespace run) while
    ``Lang.addSentence`` tokenizes on single spaces, so a recipe containing
    repeated spaces can yield more tokens than the filter counted.
    """
    kept = []
    for idx in range(len(dta)):
        recipe = dta[idx]
        if len(recipe.split()) <= MAX_LENGTH:
            kept.append(recipe)

    v = Lang('vocab')
    for recipe in kept:
        v.addSentence(recipe)

    print("Counted words: ",v.n_words)
    return kept, v

# Filter the recipes and build the vocabulary over the kept ones.
data_strings, vocabulary = prep(cleaned_recipes)

# Titles for the kept recipes: the same length filter as prep(), applied by
# position so titles[k] lines up with data_strings[k].
# NOTE(review): assumes joined_limited's index labels are 0..n-1 in the same
# order as cleaned_recipes -- confirm.
titles = [
    joined_limited.loc[idx, "title"]
    for idx in range(len(cleaned_recipes))
    if len(cleaned_recipes[idx].split()) <= MAX_LENGTH
]

In [0]:
# Encode each kept recipe as a list of vocabulary indices. Splitting on " "
# mirrors Lang.addSentence, so every token produced here already has a
# word2index entry from prep().
data = [
    list(map(vocabulary.word2index.__getitem__, recipe.split(" ")))
    for recipe in data_strings
]

In [0]:
#pad: right-pad every index sequence with 0 up to MAX_LENGTH.
# 0 is the slot Lang reserves for "SOS"; real words start at index 2, so the
# padding value never collides with a word index.
data = sequence.pad_sequences(
    data,
    maxlen=MAX_LENGTH,
    dtype='int32',
    padding="post",
    value=0.0,
)

# reshape input into [samples, timesteps, features] with a single feature
# (the word index) per timestep
n_examples, n_words = len(data), MAX_LENGTH
data = data.reshape(n_examples, n_words, 1)

In [0]:
################################
################################
################################
#Model

#define EncoderDecoder Layers
# Architecture: BiLSTM encoder -> RepeatVector(n_words) -> BiLSTM decoder
# (one output per timestep) -> per-timestep softmax over the vocabulary.
hidden_size = 512
EncoderDecoder = Sequential()
# input_shape belongs on the Bidirectional wrapper, because that is the layer
# actually added to the Sequential model. Passed to the wrapped LSTM it is not
# seen by Sequential, and the model stays unbuilt until the first fit() call
# (so e.g. summary() before training fails).
EncoderDecoder.add(Bidirectional(LSTM(hidden_size, activation='sigmoid'), input_shape=(n_words,1)))
# Repeat the fixed-size encoding once per output timestep for the decoder.
EncoderDecoder.add(RepeatVector(n_words))
EncoderDecoder.add(Bidirectional(LSTM(hidden_size, activation='sigmoid', return_sequences=True)))
# One class per vocabulary index (vocabulary.n_words includes the reserved 0/1 slots).
EncoderDecoder.add(TimeDistributed(Dense(vocabulary.n_words ,activation="softmax")))

#define optimizer and loss
# Targets are integer indices, hence the sparse loss/metric variants.
optim = keras.optimizers.RMSprop(learning_rate=0.001)
EncoderDecoder.compile(optimizer=optim,loss="sparse_categorical_crossentropy",metrics=['sparse_categorical_accuracy'])

In [0]:
#fit EncoderDecoder
# Autoencoder objective: the target is the input itself (y=data), so the model
# learns to reproduce each padded index sequence. The last 5% of the samples
# are held out for validation (validation_split); `history` is consumed by the
# plotting cell below.
history = EncoderDecoder.fit(x=data, y=data, epochs=10, verbose=1,batch_size=100,shuffle=True,validation_split = 0.05)

In [0]:
from matplotlib import pyplot as plt
# Training vs. validation accuracy per epoch, taken from the fit() history.
plt.plot(history.history['sparse_categorical_accuracy'], label='train')
plt.plot(history.history['val_sparse_categorical_accuracy'], label='val')
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
# Legend entries come from the labels attached to the two lines above.
plt.legend(loc='upper left')
plt.show()

In [0]:
# save model and architecture to single file
EncoderDecoder.save("/content/drive/My Drive/CS 263 Project/Saved Models/BiLSTM_EncoderDecoder.h5")

# Encoder is otherwise first created in the "get embeddings" cell further down,
# so on a fresh kernel (Restart & Run All) saving it here raised NameError.
# Build it from the trained autoencoder before saving: same inputs, output
# taken from the first (encoder) Bidirectional layer.
Encoder = Model(inputs=EncoderDecoder.inputs, outputs=EncoderDecoder.layers[0].output)
Encoder.save("/content/drive/My Drive/CS 263 Project/Saved Models/BiLSTM_Encoder.h5")

In [0]:
#load model 
from keras.models import load_model
# Reload both models from the files written by the save cell. This replaces
# any in-memory EncoderDecoder / Encoder objects, and both .h5 files must
# already exist on Drive.
EncoderDecoder = load_model('/content/drive/My Drive/CS 263 Project/Saved Models/BiLSTM_EncoderDecoder.h5')
EncoderDecoder.summary()

# Note: the "get embeddings" cell below rebuilds Encoder from EncoderDecoder,
# so the object loaded here is overwritten when that cell runs.
Encoder = load_model('/content/drive/My Drive/CS 263 Project/Saved Models/BiLSTM_Encoder.h5')
Encoder.summary()

In [0]:
#get embeddings
# Build the encoder as a sub-model of the autoencoder: same inputs, output
# taken from layers[0], the first Bidirectional LSTM (the one without
# return_sequences, so it emits one vector per recipe).
# NOTE(review): with Keras' default merge mode the vector is presumably the
# forward and backward states concatenated (2 * hidden_size) -- confirm.
Encoder = Model(inputs=EncoderDecoder.inputs, outputs=EncoderDecoder.layers[0].output)
# get the feature vector for every padded input sequence; row k corresponds to
# data[k] / data_strings[k]
embeddings = Encoder.predict(data)

In [0]:
#get embeddings for recipes with nutritional info
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point this at files you trust.
with open('/content/drive/My Drive/CS 263 Project/Final Project Data/nutritional_info.pickle','rb') as f:
    nutritional_info = pickle.load(f)

# IDs of the recipes that survived the length filter, in the same order as the
# rows of `embeddings` (same predicate as prep(), applied by position).
embeddings_IDs = [cleaned_recipes_IDs[i] for i in range(0,len(cleaned_recipes_IDs)) if len(cleaned_recipes[i].split())<= MAX_LENGTH]

# Build the lookup once. Previously the id column was converted to a list and
# scanned linearly for every element of embeddings_IDs.
# NOTE(review): assumes the ids are hashable (e.g. str/int) -- confirm.
nutritional_IDs = set(nutritional_info["id"].tolist())
nutri_embeddings_indices = [i for i, val in enumerate(embeddings_IDs) if val in nutritional_IDs]

# Parallel lists: nutri_embeddings[k] is the embedding of recipe nutri_embeddings_IDs[k].
nutri_embeddings_IDs = [embeddings_IDs[i] for i in nutri_embeddings_indices]
nutri_embeddings = [embeddings[i] for i in nutri_embeddings_indices]

In [0]:
#create matrix of cosine similarities for recipes with nutritional info and save with IDs
# With Y=None the similarities are computed between all pairs of rows of
# nutri_embeddings, so entry [a, b] compares recipes nutri_embeddings_IDs[a]
# and nutri_embeddings_IDs[b].
cosine_similarities = sklearn.metrics.pairwise.cosine_similarity(nutri_embeddings, Y=None, dense_output=False)
print(cosine_similarities)

# pickle is already imported in the data-loading cell; this import is redundant
# but harmless.
import pickle
# Saved as a 3-element list, in this order: embeddings, their recipe IDs, and
# the similarity matrix. Readers of the file must unpack by position.
similarity_pickle = [nutri_embeddings,nutri_embeddings_IDs,cosine_similarities]
with open('/content/drive/My Drive/CS 263 Project/Final Project Data/BiLSTM_similarity.pickle', 'wb') as f:
    pickle.dump(similarity_pickle, f)