In [None]:
import pickle as pkl
import string

import markovify
import numpy as np
import pandas as pd
import plotly.express as px
import pyLDAvis.sklearn
import tensorflow as tf
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import train_NN as trainer

In [None]:
# get frequency of each word in a list
def wordlist_to_freq(wordlist):
    """Count how often each item occurs in ``wordlist``.

    Parameters
    ----------
    wordlist : iterable of hashable
        The words (or any hashable items) to count.

    Returns
    -------
    dict
        Maps every distinct item to its number of occurrences. Keys appear
        in order of first occurrence in ``wordlist``.
    """
    # Single pass over the input; calling wordlist.count() for every element
    # would rescan the whole list each time (quadratic in its length).
    wordfreq = {}
    for word in wordlist:
        wordfreq[word] = wordfreq.get(word, 0) + 1
    return wordfreq

In [None]:
# sort dictionary according to descending frequency of words in recipe and return top N results
def sorted_dict(worddict, N):
    """Return the ``N`` entries of ``worddict`` with the largest values.

    Parameters
    ----------
    worddict : dict
        Mapping whose values are comparable (here: word -> frequency).
    N : int
        Number of leading entries to keep after sorting.

    Returns
    -------
    dict
        A new dict ordered by descending value and truncated to the first
        ``N`` entries. Entries with equal values keep their original
        relative order because the sort is stable.
    """
    def by_value(entry):
        return entry[1]

    ranked = sorted(worddict.items(), key=by_value, reverse=True)
    return dict(ranked[:N])

In [None]:
# print top words from a list of topic
def print_topic_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted vocabulary words of every topic in ``model``.

    Parameters
    ----------
    model : fitted decomposition model
        Must expose ``components_``, iterated row by row; each row must
        support ``argsort()`` (one weight per vocabulary entry).
    vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()`` or, on
        older scikit-learn versions, ``get_feature_names()``.
    n_top_words : int
        Number of words printed per topic.

    Returns
    -------
    None
        The topics are written to stdout.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only when running against an old release.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk the last n_top_words indices backwards
        # to get the heaviest words first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(f'\nTopic #:{topic_idx}')
        print(' '.join(words[i] for i in top_indices))

In [None]:
# generate multiple sentences using a Markov bigram model and calculate their scores
def get_markov_sentence(model, iters, minLength=1):
    """Generate sentences from a markovify-style model and score each one.

    The score of a sentence is the sum of the transition weights stored in
    ``model.chain.model`` for every step after the first word (including the
    final step to the end marker), divided by the number of words.

    Parameters
    ----------
    model : object
        Must expose ``model.chain.gen()`` (an iterator over the words of one
        generated sentence) and ``model.chain.model`` (a mapping from a
        two-word state tuple to a mapping ``next_word -> weight``). The
        lookups use two-word states, so the model has to be built with a
        state size of 2.
    iters : int
        Number of sentences to generate. Duplicates and too-short sentences
        are dropped, so fewer results may be returned.
    minLength : int, optional
        Minimum number of words a sentence needs in order to be kept.

    Returns
    -------
    list of (str, float)
        ``(sentence, score)`` pairs sorted by descending score.
    """
    begin_token = "___BEGIN__"
    end_token = "___END__"
    transitions = model.chain.model

    sentences = {}
    for _ in range(iters):
        modelGen = model.chain.gen()
        prevPrevWord = begin_token
        # An empty generation yields no first word; skip it instead of
        # letting next() raise StopIteration.
        prevWord = next(modelGen, None)
        if prevWord is None:
            continue

        words = [prevWord]
        totalScore = 0
        for curWord in modelGen:
            words.append(curWord)
            totalScore += transitions[(prevPrevWord, prevWord)][curWord]
            prevPrevWord, prevWord = prevWord, curWord

        numWords = len(words)
        if numWords < minLength:
            continue
        madeSentence = " ".join(words).strip()
        if madeSentence in sentences:
            continue

        # Account for the transition from the last state to the end marker.
        totalScore += transitions[(prevPrevWord, prevWord)][end_token]
        sentences[madeSentence] = totalScore / float(numWords)

    # Best-scoring sentence first.
    return sorted(sentences.items(), key=lambda item: -item[1])

In [None]:
# generate predictions from model
def generate_sentence(model, tokenizer, sequence_length, starting_text, num_predicted_words):
    """Extend ``starting_text`` word by word using a trained language model.

    Parameters
    ----------
    model : trained model
        Must expose ``predict(x, verbose=0)`` returning one row of per-class
        scores for the single padded input sequence.
    tokenizer : fitted tokenizer
        Must expose ``texts_to_sequences`` and ``word_index``
        (word -> integer index).
    sequence_length : int
        Length the encoded text is padded/truncated to before prediction.
    starting_text : str
        Seed text the generation starts from.
    num_predicted_words : int
        Number of words to append.

    Returns
    -------
    str
        The seed text followed by the predicted words, space separated.
        A predicted index that has no entry in ``tokenizer.word_index``
        contributes an empty string.
    """
    # Build the reverse lookup once instead of scanning word_index for
    # every predicted word.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}

    prediction = [starting_text]
    for _ in range(num_predicted_words):
        encoded_text = tokenizer.texts_to_sequences([starting_text])[0]
        encoded_text = pad_sequences([encoded_text],
                                     maxlen=sequence_length,
                                     truncating='pre')
        # Sequential.predict_classes() was removed in TensorFlow/Keras 2.6;
        # the arg-max over predict() is its documented replacement.
        class_scores = model.predict(encoded_text, verbose=0)
        predicted_idx = int(np.argmax(class_scores, axis=-1)[0])
        out_word = index_to_word.get(predicted_idx, '')
        starting_text += ' ' + out_word
        prediction.append(out_word)
    return ' '.join(prediction)

In [None]:
# Read one recipe per line, dropping the last two characters of every line.
# NOTE(review): in text mode a line normally ends in a single '\n', so [:-2]
# also removes the character before it (and two real characters from a final
# line without a newline) - confirm this matches the layout of recipeInfo.txt.
with open('data/recipeInfo.txt', 'r') as f:
    data = [line[:-2] for line in f]

df_recipe = pd.DataFrame(data, columns=['Recipe'])

# whitespace-based statistics on the raw text
whitespace_tokens = df_recipe['Recipe'].map(str.split)
df_recipe['Length'] = whitespace_tokens.map(len)
df_recipe['Unique Words'] = whitespace_tokens.map(lambda tokens: len(set(tokens)))

# project tokenizer output, plus the same tokens re-joined with single spaces
df_recipe['Tokenized Recipe'] = df_recipe['Recipe'].map(trainer.tokenize)
df_recipe['Cleaned Recipe'] = df_recipe['Tokenized Recipe'].str.join(' ')
df_recipe.head(10)

In [None]:
from wordcloud import WordCloud

# concatenate every raw recipe into one comma-separated string and render
# the 100 most prominent words
long_string = ','.join(df_recipe['Recipe'].tolist())
wordcloud = WordCloud(
    background_color='white',
    max_words=100,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)
wordcloud.to_image()

In [None]:
# Word frequencies over all raw recipes: split on single spaces, strip
# punctuation, lowercase, and drop English stop words.
# Build the translation table and the stop-word set once; recreating the set
# for every word makes the filter needlessly slow.
punctuation_table = str.maketrans('', '', string.punctuation)
english_stopwords = set(stopwords.words('english'))

recipe_split = [recipe_text.split(' ') for recipe_text in df_recipe['Recipe'].tolist()]
word_list = [word.translate(punctuation_table).lower()
             for recipe in recipe_split for word in recipe]
word_list = [word for word in word_list if word not in english_stopwords]

# keep the 21 most frequent tokens
word_dict = sorted_dict(wordlist_to_freq(word_list), 21)

In [None]:
# Bar chart of the most frequent words; the top-ranked entry of word_dict is skipped.
# NOTE(review): presumably the skipped entry is the empty-string token left
# after punctuation stripping - confirm.
top_words = list(word_dict.items())[1:]
df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
fig = px.bar(df, x='Word', y='Frequency', orientation='v')
fig.show()

In [None]:
# topic-modeling settings shared by the LDA and NMF cells
number_topics = 5  # passed as n_components to both models
number_words = 15  # top words printed per topic

In [None]:
# performing LDA for topic modeling
# bag-of-words counts: ignore terms in more than 80% of recipes or in fewer than 2
count_vectorizer = CountVectorizer(
    max_df=0.8,
    min_df=2,
    stop_words='english',
)
count_data = count_vectorizer.fit_transform(df_recipe['Cleaned Recipe'])

lda = LDA(
    n_components=number_topics,
    n_jobs=-1,
    learning_method='batch',
    max_iter=50,
    random_state=42,
).fit(count_data)

print('Topics found via LDA:')
print_topic_top_words(lda, count_vectorizer, number_words)

In [None]:
# visualize topics using pyLDAvis package
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.lda_model instead of pyLDAvis.sklearn - confirm against the
# installed version.
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, count_data, count_vectorizer, mds='tsne')
panel

In [None]:
# performing NMF for topic modeling
# TF-IDF weights: ignore terms in more than 80% of recipes or in fewer than 2
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    min_df=2,
    stop_words='english',
)
doc_term_matrix = tfidf_vectorizer.fit_transform(df_recipe['Cleaned Recipe'])

nmf = NMF(n_components=number_topics, random_state=42).fit(doc_term_matrix)

print('Topics found via NMF:')
print_topic_top_words(nmf, tfidf_vectorizer, number_words)

In [None]:
# build Markov Chain model using markovify to predict text
# every cleaned recipe is one input line; states consist of two words
recipes = list(df_recipe['Cleaned Recipe'])
text_model = markovify.NewlineText(recipes, state_size=2)

# print two sample sentences as a quick sanity check
idx = 0
while idx < 2:
    print(idx, text_model.make_sentence())
    idx += 1

In [None]:
# generate 500 candidate sentences of at least 4 words and show the first
# (sentence, score) pair returned
list(get_markov_sentence(text_model, 500, 4))[0]

In [None]:
# load trained NN to generate sentences with word-level language model
trained_model = load_model('model/recipe_model_epoch100.h5')
# The fitted tokenizer is stored as a pickle. Unpickling can execute arbitrary
# code, so only load this file if it comes from a trusted source.
with open('model/tokenizer_model.pkl', 'rb') as t:
    tokenizer = pkl.load(t)

In [None]:
# length the encoded seed text is padded/truncated to before each prediction
# NOTE(review): presumably this has to match the input length the network was trained with - confirm.
seq_length = 20
generate_sentence(
    trained_model,
    tokenizer,
    sequence_length=seq_length,
    starting_text='i made bread',
    num_predicted_words=50,
)