In [4]:
### data source
#### https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions

In [1]:
import pandas as pd
import numpy as np
import json
import pickle

In [2]:
import re
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
from spacy.lang.en import English
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

###
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

### models
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim import models

In [3]:
df_recipes = pd.read_csv('../data/RAW_recipes.csv')

In [5]:
df_recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [11]:
all_ingriedients = df_recipes['ingredients'].map(lambda x: eval(x)).values

In [27]:
all_tags = df_recipes['tags'].map(lambda x: eval(x)).values

In [26]:
all_tags[0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']"

In [13]:
all_ingriedients[0]


['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [18]:
ingriedient_list = list(set([item for sublist in all_ingriedients for item in sublist]))

In [19]:
len(ingriedient_list)

14942

In [28]:
tag_list = list(set([item for sublist in all_tags for item in sublist]))

In [29]:
len(tag_list)

552

In [33]:
ingriedient_tags = set(tag_list).intersection(ingriedient_list)

In [35]:
other_tags = set(tag_list) - ingriedient_tags

In [95]:
other_tags

{'',
 '1-day-or-more',
 '15-minutes-or-less',
 '3-steps-or-less',
 '30-minutes-or-less',
 '4-hours-or-less',
 '5-ingredients-or-less',
 '60-minutes-or-less',
 'Throw the ultimate fiesta with this sopaipillas recipe from Food.com.',
 'a1-sauce',
 'african',
 'american',
 'amish-mennonite',
 'angolan',
 'appetizers',
 'april-fools-day',
 'argentine',
 'asian',
 'australian',
 'austrian',
 'baja',
 'baked-beans',
 'baking',
 'bar-cookies',
 'barbecue',
 'bass',
 'bean-soup',
 'beans-side-dishes',
 'bear',
 'beef-barley-soup',
 'beef-crock-pot',
 'beef-kidney',
 'beef-liver',
 'beef-organ-meats',
 'beef-ribs',
 'beef-sauces',
 'beef-sausage',
 'beginner-cook',
 'beijing',
 'belgian',
 'beverages',
 'birthday',
 'bisques-cream-soups',
 'black-bean-soup',
 'black-beans',
 'bok-choys',
 'brazilian',
 'bread-machine',
 'bread-pudding',
 'breads',
 'breakfast',
 'breakfast-casseroles',
 'breakfast-eggs',
 'breakfast-potatoes',
 'brewing',
 'british-columbian',
 'broil',
 'brown-bag',
 'brown-ri

In [None]:
[x for x in other_tags if 'diabetic' in x]

In [38]:
#topic modeling on steps

In [40]:
steps = df_recipes['steps'].map(lambda x: eval(x)).values

In [41]:
#### text preprocessing


stop_words = stopwords.words('english')
stop_words.extend(['from', 'to','in','a','the', 'and'])

nlp = spacy.load("en_core_web_sm")
config = {"punct_chars": [".","?"]}
nlp.add_pipe("sentencizer", config=config)

def sent_to_words(text):
    
    docs = nlp(text)
    sentences = [token.sent for token in docs.sents]
    words = [[token.text for token in x] for x in sentences]
    return words
    
def remove_stopwords(words):
    return [x for x in words if x not in stop_words]

def process_text(text):
        
    words = sent_to_words(text)
    words_clean = remove_stopwords(words)
    return words_clean

In [None]:
steps_processed = [process_text(".".join(x)) for x in steps]

In [180]:
steps_processed[0]

['in',
 'a',
 'medium',
 'saucepan',
 'combine',
 'all',
 'the',
 'ingredients',
 'for',
 'sauce#1',
 ',',
 'bring',
 'to',
 'a',
 'full',
 'rolling',
 'boil',
 ',',
 'reduce',
 'heat',
 'to',
 'medium',
 'low',
 'and',
 'simmer',
 'for',
 '1',
 'hour',
 ',',
 'stirring',
 'often',
 'rub',
 'the',
 'ribs',
 'with',
 'soy',
 'sauce',
 ',',
 'garlic',
 ',',
 'ginger',
 ',',
 'chili',
 'powder',
 ',',
 'pepper',
 ',',
 'salt',
 'and',
 'chopped',
 'cilantro',
 ',',
 'both',
 'sides',
 '!']

In [None]:
bigram = gensim.models.Phrases(steps_processed, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(steps_processed, threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# See trigram example
print(trigram_mod[bigram_mod[steps_processed]])

In [67]:
print(trigram_mod[bigram_mod[steps_processed[0]]])

['in', 'a', 'medium', 'saucepan', 'combine', 'all', 'the', 'ingredients', 'for', 'sauce#1', ',', 'bring', 'to', 'a', 'full', 'rolling', 'boil', ',', 'reduce', 'heat', 'to', 'medium', 'low', 'and', 'simmer', 'for', '1', 'hour', ',', 'stirring', 'often', 'rub', 'the', 'ribs', 'with', 'soy', 'sauce', ',', 'garlic', ',', 'ginger', ',', 'chili', 'powder', ',', 'pepper', ',', 'salt', 'and', 'chopped', 'cilantro', ',', 'both', 'sides', '!']


In [68]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
data_words_bigrams = make_bigrams(steps_processed)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

In [69]:
print(data_lemmatized[:1])

[['medium', 'saucepan', 'combine', 'ingredient', 'sauce#1', 'bring', 'full', 'rolling', 'boil', 'reduce', 'heat', 'medium', 'low', 'simmer', 'hour', 'stir', 'often', 'rub', 'rib', 'soy', 'sauce', 'garlic', 'ginger', 'chili', 'powder', 'pepper', 'salt', 'chop', 'cilantro', 'side']]


In [70]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]]


In [148]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=100, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [155]:
from collections import defaultdict

In [163]:
def lda_to_df(model,corpus):
    '''This function takes a gensim lda model as input, and outputs a df with topics probs by document'''
    topic_probs = model.get_document_topics(corpus) #get the list of topic probabilities by doc
    topic_dict = [dict(x) for x in topic_probs] #convert to dictionary to convert to data frame
    df = pd.DataFrame(topic_dict).fillna(0) #convert to data frame
    df['docs'] = df.index.values #create column with document indices (correspond to indices of dataframe)
    df.columns = df.columns.astype(str) #convert to string to make indexing easier
    return df

def get_best_docs(df, n, k, texts):
    '''Return the index of the n most representative documents from a list of topic responsibilities for each topic'''
    '''n is the number of douments you want, k is the number of topics in the model, the texts are the FULL texts used to fit the model'''
    #create column list to iterate over
    k_cols = range(0, k)
    
    #intialize empty list to hold results
    n_rep_docs_dict = defaultdict(list)
    
    #loop to extract documents for each topic
    for i in k_cols:
        if str(i) in df.columns:
            inds = df.nlargest(n = n, columns = str(i))['docs'].astype(int).tolist()
            #use list comprehension to extract documents
            n_rep_docs_dict[i] +=[texts[ind] for ind in inds]
    
    return n_rep_docs_dict

In [159]:
df_topics_docs = lda_to_df(lda_model,corpus)

In [161]:
len(df_topics_docs.columns)

84

In [164]:
docs_dict = get_best_docs(df_topics_docs, 5, 100, texts)

In [168]:
[" ".join(x) for x in docs_dict[3]]

['preheat oven degree grease round cake pan oven preheat begin make cake batter dissolve cocoa espresso stir buttermilk cocoa mixture set aside cool whisk together flour bake soda salt bowl beat butter medium speed about minute gradually add sugar butter mixture light fluffy slowly add egg vanilla butter continue beat sure scrape side bowl go so ingredient incorporate fold flour cocoa mixture butter mixture alternate flour cocoa follow order flour mixture cocoa mixture flour mixture cocoa mixture flour mixture overmix',
 'cake preheat oven medium bowl sift cocoa boil hot water mix let cool completely put refrigerator speed cooling meanwhile grease flour line parchment_paper 9x1 layer cake pan medium bowl sift flour bake soda salt bake powder set aside large bowl beat together butter sugar high speed well mixed add egg vanilla beat high speed scrape side bowl light fluffy about minute low speed mixer beat alternately start end flour mixture fourth cool cocoa mixture third scrape side bo

In [172]:
[" ".join(x) for x in docs_dict[16]]

['ratio rice water quantity rice increase cup rice cup water cup rice cup water method put water boil meanwhile wash rice time water start boil add rice salt check min see rice cook grain still firm raw break grain finger check drain cover fitting flat lid invert sink then leave inverted vessel good minute make sure water drain also drain colander drain pasta straighten',
 'large saucepan bring salt water boil simmer lentil low minute tender drain water lentil set aside pan saute onion garlic oil add raisin date spice mix set aside cook rice like sure fire way use rice cooker rinse rice cold water water runs_clear add rice cooker cover cup water add teaspoon salt drizzle oil cook rice do transfer cooked rice large bowl same pot add oil just cover bottom surface add potato slice add layer rice add layer lentil raisin mixture continue layer end final layer rice cover cook minute drizzle melt butter saffron water rice cover top pot rice cooker tea towel prevent steam escape top cook low m

In [179]:
[" ".join(x) for x in docs_dict[1]]

['make own pizza crust pizza sauce prepare first preheat oven degree fahrenheit follow direction pizza cooker spread pizza sauce brown side crust sprinkle fennel seed pizza cheese pizza special gourmet blend cheese lie slice pepperoni outer edge pizza then distribute remain slice inner circle voila mark pizza slice',
 'adjust cooking time + accord individual oven notice have very old oven well insulate recipe work heat retention residual heat important success recipe determine exact weight roast meat wrapper weight determine long cook roast preheat oven degree remove tenderloin refrigerator season meat desire place season meat uncovered roasting pan shelf bottom oven bake exactly minute pound adjust + accord oven accuracy heat retention have oven probe thermometer have wire go side oven door mean use set temperature alarm degree remove meat oven alarm alert go off turn oven off open oven door hour soon use probe thermometer alert internal meat temp degree remove pork oven lightly cover

In [None]:
[" ".join(x) for x in texts[20:25]]

In [None]:
" ".join(data_words_bigrams[0])

In [178]:
for i in range(0, lda_model.num_topics-1):
    print(lda_model.print_topic(i))
    print("topic %s"%str(i))

0.281*"be" + 0.213*"favorite" + 0.149*"equal" + 0.110*"gentle" + 0.058*"peppercorn" + 0.043*"crab" + 0.041*"left" + 0.000*"forum" + 0.000*"doughy" + 0.000*"obscure"
topic 0
0.499*"oven" + 0.254*"preheat" + 0.183*"degree" + 0.040*"breast" + 0.015*"pizza" + 0.000*"doughy" + 0.000*"adequately" + 0.000*"forum" + 0.000*"obscure" + 0.000*"vert"
topic 1
0.419*"remain" + 0.159*"transfer" + 0.136*"meanwhile" + 0.069*"scoop" + 0.058*"over" + 0.034*"rolling_pin" + 0.028*"heated" + 0.019*"firmly" + 0.012*"puff" + 0.012*"applesauce"
topic 2
0.209*"sugar" + 0.173*"flour" + 0.097*"beat" + 0.079*"bake" + 0.058*"powder" + 0.055*"vanilla" + 0.050*"batter" + 0.027*"light" + 0.024*"speed" + 0.023*"soda"
topic 3
0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + 0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + 0.000*"pic" + 0.000*"gazpacho"
topic 4
0.407*"very" + 0.142*"go" + 0.133*"fresh" + 0.110*"nice" + 0.064*"last" + 0.041*"always" + 0.026*"say" + 0.022*"sme

In [165]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
#doc_lda = lda_model[corpus]

[(92,
  '0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + '
  '0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + '
  '0.000*"pic" + 0.000*"gazpacho"'),
 (78,
  '0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + '
  '0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + '
  '0.000*"pic" + 0.000*"gazpacho"'),
 (76,
  '0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + '
  '0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + '
  '0.000*"pic" + 0.000*"gazpacho"'),
 (15,
  '0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + '
  '0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + '
  '0.000*"pic" + 0.000*"gazpacho"'),
 (77,
  '0.000*"obscure" + 0.000*"grease"ooze" + 0.000*"arugula" + 0.000*"haricot" + '
  '0.000*"vert" + 0.000*"adequately" + 0.000*"doughy" + 0.000*"forum" + '
  '0.000*"pic" + 0.000*"gazpacho"'),
 (18,
  '0

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [110]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

RecursionError: maximum recursion depth exceeded