In [None]:
import numpy as np
import pandas as pd
import datetime
import plotly.express as px
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import markovify
import pickle
import tensorflow as tf
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint

In [None]:
def wordListToFreqDict(wordlist):
    """Count how many times each word occurs in ``wordlist``.

    Args:
        wordlist: Iterable of hashable items (typically word strings).

    Returns:
        dict mapping each distinct word to its number of occurrences, with
        keys in order of first appearance.
    """
    # Local import keeps this cell runnable on its own.
    from collections import Counter

    # One pass over the words; calling list.count() for every word is O(n^2).
    return dict(Counter(wordlist))

In [None]:
# sort dictionary according to descending frequency of words in recipe and return top N results
def sortedDict(worddict, N):
    """Return the first ``N`` entries of ``worddict`` ranked by value, largest first.

    Args:
        worddict: Mapping of word -> frequency.
        N: Slice bound applied to the ranked entries.

    Returns:
        A new dict holding the selected entries in descending value order.
    """
    ranked = sorted(worddict.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:N])

In [None]:
def tokenize(text):
    """Split a recipe into lower-cased, punctuation-free, non-stopword tokens.

    The text is split on whitespace; any chunk containing 'http' or 'www'
    (case-insensitively) is dropped; ASCII punctuation is stripped; and NLTK
    English stopwords are removed. 'i' and 'me' are deliberately kept as
    tokens, while 'com' is treated as an extra stopword.

    Args:
        text: Raw recipe text.

    Returns:
        list of str tokens; never contains empty strings.
    """
    punctuation_map = str.maketrans('', '', string.punctuation)
    # Set arithmetic instead of list.remove(): no ValueError if a word is absent.
    stopwords_set = (set(stopwords.words('english')) - {'i', 'me'}) | {'com'}

    tokenized_words = []
    for word in text.split():
        lowered = word.lower()
        # Lower-case before the link check so 'HTTP://...' / 'WWW.' are dropped too.
        if 'http' in lowered or 'www' in lowered:
            continue
        cleaned = lowered.translate(punctuation_map)
        # A punctuation-only chunk (e.g. '-' or '&') is '' after stripping;
        # skip it so it does not become an empty token.
        if cleaned and cleaned not in stopwords_set:
            tokenized_words.append(cleaned)
    return tokenized_words

In [None]:
def get_sequence(tokens, length=21):
    """Build every run of ``length`` consecutive tokens as a space-joined string.

    Args:
        tokens: List of token strings.
        length: Number of tokens per window.

    Returns:
        list of str, one per window start position, in order. Empty when
        ``tokens`` holds fewer than ``length`` items.
    """
    window_count = len(tokens) - length + 1
    return [' '.join(tokens[start:start + length]) for start in range(window_count)]

In [None]:
# generate predictions from model
def generate_sentence(model, tokenizer, sequence_length, starting_text, num_predicted_words):
    """Extend ``starting_text`` by repeatedly appending the model's top next word.

    Args:
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        sequence_length: Input length the text is padded / truncated to.
        starting_text: Seed text the generation starts from.
        num_predicted_words: Number of words to append.

    Returns:
        str: the seed text followed by the predicted words, space-separated.
    """
    # Invert word_index once instead of scanning it for every predicted word.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    prediction = [starting_text]
    for _ in range(num_predicted_words):
        encoded_text = tokenizer.texts_to_sequences([starting_text])[0]
        encoded_text = pad_sequences([encoded_text],
                                     maxlen=sequence_length,
                                     truncating='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; the
        # arg-max over predict() is its replacement.
        scores = model.predict(encoded_text, verbose=0)
        predicted_index = int(np.argmax(scores, axis=-1)[0])
        # An index with no vocabulary entry contributes an empty word.
        out_word = index_to_word.get(predicted_index, '')
        starting_text += ' ' + out_word
        prediction.append(out_word)
    return ' '.join(prediction)

In [None]:
# Read the recipes, one per line, dropping the last two characters of each line.
# NOTE(review): the file is opened in text mode, so a line normally ends in a
# single '\n'; [:-2] then also removes the character before it — presumably a
# trailing delimiter in the data file; confirm.
data = []
with open('data/recipeInfo.txt', 'r') as f:
    for line in f:
        data.append(line[:-2])

# One row per recipe, plus simple size statistics and the tokenized forms.
df_recipe = pd.DataFrame(data, columns=['Recipe'])
df_recipe['Length'] = df_recipe['Recipe'].map(lambda text: len(text.split()))
df_recipe['Unique Words'] = df_recipe['Recipe'].map(lambda text: len(set(text.split())))
df_recipe['Tokenized Recipe'] = df_recipe['Recipe'].map(tokenize)
df_recipe['Cleaned Recipe'] = df_recipe['Tokenized Recipe'].str.join(' ')
df_recipe.head(10)

In [None]:
# import the wordcloud library
from wordcloud import WordCloud

# Concatenate every raw recipe into one comma-separated string.
long_string = ','.join(df_recipe['Recipe'].tolist())

# Configure the cloud: white background, at most 100 words, steel-blue contour.
wordcloud = WordCloud(
    background_color='white',
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
)

# Build the cloud from the combined text and render it as an image.
wordcloud.generate(long_string)
wordcloud.to_image()

In [None]:
# generate a histogram with word frequency counts, but without stop words

# Built once up front so they are not recreated for every word in the
# comprehensions below.
punctuation_table = str.maketrans('', '', string.punctuation)
english_stopwords = set(stopwords.words('english'))

# Split each raw recipe on single spaces. The loop variable is named `recipe`
# so it does not shadow the `string` module.
recipe_split = [recipe.split(' ') for recipe in df_recipe['Recipe'].tolist()]
# Flatten to one list of lower-cased words with punctuation removed.
# NOTE: split(' ') yields '' for consecutive spaces and punctuation-only words
# also become ''; these empty strings are counted like any other word.
word_list = [word.translate(punctuation_table).lower()
             for recipe in recipe_split for word in recipe]
word_list = [word for word in word_list if word not in english_stopwords]
# Keep the 21 most frequent entries.
word_dict = sortedDict(wordListToFreqDict(word_list), 21)

In [None]:
# Bar chart of word frequencies; the first entry of word_dict is skipped.
df = pd.DataFrame(
    list(word_dict.items())[1:],
    columns=['Word', 'Frequency'],
)
fig = px.bar(df, x='Word', y='Frequency', orientation='v')
fig.show()

In [None]:
# performing LDA for topic modeling

# Bag-of-words counts over the cleaned recipes.
count_vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.8)
count_data = count_vectorizer.fit_transform(df_recipe['Cleaned Recipe'])

# Number of topics to fit and number of top words to report per topic.
number_topics = 5
number_words = 15

# fit() returns the estimator, so construction and fitting are chained.
lda = LDA(
    n_components=number_topics,
    learning_method='batch',
    max_iter=50,
    n_jobs=-1,
    random_state=42,
).fit(count_data)

# function to print n top words from each topics found by the LDA fit
def print_topics_LDA(model, count_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted words of every topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of word
            weights per topic).
        count_vectorizer: Fitted vectorizer that supplies the vocabulary.
        n_top_words: How many words to print per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print('\nTopic #%d:' % topic_idx)
        # argsort is ascending, so read the last n_top_words indices backwards.
        print(' '.join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# print the topics found by the LDA model
print('Topics found via LDA:')
print_topics_LDA(model=lda, count_vectorizer=count_vectorizer, n_top_words=number_words)

# Label each recipe with the topic that has the largest value in its row of
# the LDA transform output.
topic_values = lda.transform(count_data)
df_recipe['Topic'] = np.argmax(topic_values, axis=1)

In [None]:
# visualize topics using pyLDAvis package
import pyLDAvis
try:
    # pyLDAvis >= 3.4 renamed its scikit-learn adapter module to lda_model.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    # Older releases still ship the adapter as pyLDAvis.sklearn.
    import pyLDAvis.sklearn as pyldavis_sklearn

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic view from the fitted LDA model, using t-SNE
# for the 2-D topic layout.
panel = pyldavis_sklearn.prepare(lda, count_data, count_vectorizer, mds='tsne')
panel

In [None]:
# performing NMF for topic modeling

# TF-IDF weighted document-term matrix over the cleaned recipes.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.8)
doc_term_matrix = tfidf_vectorizer.fit_transform(df_recipe['Cleaned Recipe'])

# fit() returns the estimator, so construction and fitting are chained.
nmf = NMF(random_state=42, n_components=number_topics).fit(doc_term_matrix)

# function to print n top words from each topics found by the NMF fit
def print_topics_NMF(model, tfidf_vectorizer, n_top_words):
    """Print the ``n_top_words`` highest-weighted words of every topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of word
            weights per topic).
        tfidf_vectorizer: Fitted vectorizer that supplies the vocabulary.
        n_top_words: How many words to print per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        words = tfidf_vectorizer.get_feature_names_out()
    else:
        words = tfidf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print('\nTopic #%d:' % topic_idx)
        # argsort is ascending, so read the last n_top_words indices backwards.
        print(' '.join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Report the top words of each NMF topic.
print('Topics found via NMF:')
print_topics_NMF(
    model=nmf,
    tfidf_vectorizer=tfidf_vectorizer,
    n_top_words=number_words,
)

In [None]:
# build Markov Chain model using markovify to predict text

# Cleaned recipes as a list of strings, one entry per recipe.
recipes = df_recipe['Cleaned Recipe'].tolist()
# NOTE(review): NewlineText receives a list here rather than one newline-joined
# string — confirm the installed markovify version accepts an iterable of lines.
text_model = markovify.NewlineText(recipes, state_size=2)
# Print two sample generations; presumably make_sentence() can return None
# when no acceptable sentence is produced — verify against the markovify docs.
for idx in range(2):
    print(idx, text_model.make_sentence())

In [None]:
def getSent(model, iters, minLength=1):
    """Generate sentences from a Markov model and rank them by average transition count.

    Each of the ``iters`` rounds walks ``model.chain.gen()`` to build one
    sentence. The score sums the chain's stored value for every transition
    after the first word (including the final transition to the end marker)
    and divides by the number of words.

    Args:
        model: Object exposing ``chain.gen()`` and a ``chain.model`` mapping of
            ``(prev_prev_word, prev_word) -> {next_word: count}``, i.e. a
            chain with a two-word state.
        iters: Number of sentences to attempt.
        minLength: Minimum number of words a sentence needs to be kept.

    Returns:
        list of ``(sentence, score)`` tuples sorted with the highest score
        first. A sentence generated more than once is kept only once.
    """
    begin_token = "___BEGIN__"
    end_token = "___END__"
    sentences = {}
    for _ in range(iters):
        modelGen = model.chain.gen()
        # An empty walk has no first word; skip it instead of letting
        # StopIteration escape from next().
        prevWord = next(modelGen, None)
        if prevWord is None:
            continue
        prevPrevWord = begin_token
        words = [prevWord]

        totalScore = 0
        for curWord in modelGen:
            words.append(curWord)
            totalScore += model.chain.model[(prevPrevWord, prevWord)][curWord]
            prevPrevWord, prevWord = prevWord, curWord

        if len(words) < minLength:
            continue
        madeSentence = " ".join(words).strip()
        if madeSentence in sentences:
            continue

        totalScore += model.chain.model[(prevPrevWord, prevWord)][end_token]
        sentences[madeSentence] = totalScore / float(len(words))

    # Return the sorted pairs themselves; sorted() does not reorder the dict.
    return sorted(sentences.items(), key=lambda pair: pair[1], reverse=True)

# Show the first (sentence, score) pair from 500 attempts, keeping only
# sentences of at least 4 words.
list(getSent(text_model, 500, 4))[0]

In [None]:
seq_length = 20
recipes_token = df_recipe['Tokenized Recipe'].tolist()
# Flatten the per-recipe sliding windows into one list; each window holds
# seq_length + 1 tokens.
token_sequences = [
    window
    for recipe_tokens in recipes_token
    for window in get_sequence(recipe_tokens, length=seq_length + 1)
]

In [None]:
# Write one token window per line; the context manager closes the file even
# if write() raises.
with open('tokenized_recipes.txt', 'w') as file:
    file.write('\n'.join(token_sequences))

In [None]:
# Learn the vocabulary from the token windows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_sequences)

# One more than the number of known words (presumably index 0 is left for
# padding — confirm against the Keras Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)

# Encode every window as integer ids and left-pad to a common length.
num_sequences = pad_sequences(
    tokenizer.texts_to_sequences(token_sequences),
    padding='pre',
)

In [None]:
sequences_array = np.array(num_sequences)
# Every column but the last is the model input; the last column is the target
# word id, one-hot encoded over the vocabulary.
X = sequences_array[:, :-1]
y = to_categorical(sequences_array[:, -1], num_classes=vocab_size)
# Input length actually fed to the model.
seq_length = X.shape[1]

In [None]:
# Serialize the fitted tokenizer to tokenizer.pickle using the highest pickle protocol.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> dense -> softmax over the
# vocabulary.
# NOTE(review): the embedding dimension is set to seq_length — confirm that is
# intended rather than an independent embedding size.
model = Sequential([
    Embedding(vocab_size, seq_length, input_length=seq_length),
    LSTM(50, return_sequences=True),
    LSTM(50),
    Dense(50, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit an extra "None" line.
model.summary()

In [None]:
# Checkpoint the model whenever the training loss reaches a new minimum.
path = './checkpoints/recipe_model.h5'
checkpoint = ModelCheckpoint(path, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(X, y, batch_size=128, epochs=100, verbose=1, callbacks=[checkpoint])
# Use a context manager so the pickle file handle is closed deterministically.
with open('tokenizer_model.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle)
model.save('trained_recipe_model.h5')

In [None]:
# Reload the trained model and tokenizer from the model/ directory.
# NOTE(review): the training cell writes 'trained_recipe_model.h5' and
# 'tokenizer_model.pkl' to the working directory, not to model/ — confirm the
# files are moved there (or align the paths) before running this cell.
trained_model = load_model('model/trained_recipe_model.h5')
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted files.
with open('model/tokenizer_model.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Generate 50 words continuing the seed text 'i made bread'.
generate_sentence(
    model=trained_model,
    tokenizer=tokenizer,
    sequence_length=seq_length,
    starting_text='i made bread',
    num_predicted_words=50,
)

In [None]:
# group recipes by yeast, levain and others