In [93]:


import numpy as np
import pandas as pd
import re
import string
from collections import Counter

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.manifold import TSNE 

# Maybe try Gensim for LDA
from  gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.corpora import Dictionary
import gensim.matutils as matutils
from gensim.models import LdaModel, Nmf, CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.sklearn

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import dill
import pickle

In [21]:
def get_topic_terms(model, vectorizer, top_n_terms=10):
    """Print and return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : fitted decomposition model
        Any object exposing ``components_`` as a 2-D array with one row per
        topic and one column per vocabulary term (e.g. a fitted sklearn NMF).
    vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (scikit-learn >= 1.0) or
        the legacy ``get_feature_names()``; its vocabulary must be the one the
        model was fitted on.
    top_n_terms : int, default 10
        Number of terms to keep per topic.

    Returns
    -------
    dict
        Maps ``'Topic #<k>'`` (1-based) to the list of its top terms, ordered
        from highest to lowest weight.

    Raises
    ------
    IndexError
        If the model has more term columns than the vectorizer has feature
        names, i.e. the two were not built from the same vocabulary.
    """
    print(f'Here are top {top_n_terms} most contributing terms for each topic:')
    print('\n')

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vect_feature_names = list(vectorizer.get_feature_names_out())
    else:
        vect_feature_names = list(vectorizer.get_feature_names())

    components = np.asarray(model.components_)
    # Fail early with an actionable message instead of a bare
    # "list index out of range" halfway through the loop.
    if components.ndim == 2 and components.shape[1] > len(vect_feature_names):
        raise IndexError(
            f'model has {components.shape[1]} term columns but the vectorizer '
            f'only provides {len(vect_feature_names)} feature names; '
            'the model and vectorizer do not share a vocabulary'
        )

    topics_dict = {}
    for topic_num, topic_weights in enumerate(components):
        topic_name = 'Topic #' + str(topic_num + 1)  # 1-based display name

        # Term indices sorted by descending weight, truncated to the top n.
        top_term_indices = np.argsort(topic_weights)[::-1][:top_n_terms]

        # Assigned once per topic (not once per term) so the key exists even
        # when top_n_terms is 0.
        topics_dict[topic_name] = [vect_feature_names[term_idx] for term_idx in top_term_indices]

        print(f'{topic_name}:')
        print(topics_dict[topic_name])

    return topics_dict

In [33]:
def save_load(s_or_l, obj_name, file_name):
    """Serialize an object to disk with dill, or load one back.

    Parameters
    ----------
    s_or_l : str
        ``'s'`` to save ``obj_name`` to ``file_name``; ``'l'`` to load the
        object stored in ``file_name``.
    obj_name : object
        The object to save. Ignored when loading.
    file_name : str or path-like
        Path of the file to write or read.

    Returns
    -------
    object or None
        The deserialized object when loading; ``None`` when saving or when
        ``s_or_l`` is not recognised (in which case ``'Error'`` is printed).

    Notes
    -----
    dill, like pickle, can execute arbitrary code while loading. Only load
    files that come from a trusted source.
    """
    if s_or_l == 's':
        with open(file_name, 'wb') as f:
            dill.dump(obj_name, f)
        print(f'Obj saved as {file_name}')
        return None

    if s_or_l == 'l':
        with open(file_name, 'rb') as f:
            loaded_obj = dill.load(f)
        print(f'File {file_name} has been loaded as {loaded_obj}')
        # Previously the loaded object was only bound to a local name and was
        # lost when the function returned; hand it back to the caller.
        return loaded_obj

    print('Error')
    return None

In [61]:
# Load the topic-labelled recipe table; the first CSV column becomes the index.
df_rec = pd.read_csv(
    'df_recipe_topic_labeled.csv',
    index_col=0,
)

In [62]:
# Display the loaded recipe table (columns: Name, RecipeInstructions, Topic).
df_rec

Unnamed: 0,Name,RecipeInstructions,Topic
0,Low-Fat Berry Blue Frozen Dessert,toss s berries with sugar stand for s stirr...,Sauce?
1,Biryani,soak saffron in warm milk for s and puree in ...,Chicken
2,Best Lemonade,into a quart jar with tight fitting lid put s...,Cool Bev
3,Carina's Tofu-Vegetable Kebabs,drain the tofu carefully squeezing out excess ...,Meat Dish
4,Cabbage Soup,mix everything together and to a boil reduce ...,Soup & Stew
...,...,...,...
522512,Meg's Fresh Ginger Gingerbread,preheat oven to degf grease an x cake pan this...,Dessert
522513,Roast Prime Rib au Poivre with Mixed Peppercorns,position rack in center of oven and preheat to...,Meat Dish
522514,Kirshwasser Ice Cream,heat half and half and heavy cream to a simmer...,Sauce?
522515,Quick & Easy Asian Cucumber Salmon Rolls,in a bowl combine mayo and wasabi paste stir ...,Salad


In [31]:
# TF-IDF vectorizer configuration: English stop words removed, and terms kept
# only if their document frequency lies between min_df=0.015 and max_df=0.22.
# NOTE(review): this instance is never fitted in the visible cells, and the
# following cell rebinds `tfidf` to an object loaded from disk.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.22, min_df=0.015)


In [38]:
# Restore previously serialized artifacts from the working directory.
# SECURITY: dill.load (like pickle.load) can execute arbitrary code; only load
# files produced by a trusted run of this project.

# Replaces the `tfidf` object constructed in the previous cell.
with open('tfidf', 'rb') as f:
    tfidf = dill.load(f)
    
# presumably the TF-IDF document-term matrix matching `tfidf` — TODO confirm
with open('x_tfidf', 'rb') as f:
    x_tfidf = dill.load(f)

# NOTE(review): `recipe_model` is not referenced again in the visible cells.
with open('recipe_model', 'rb') as f:
    recipe_model = dill.load(f)

In [42]:
# Coarse model: factorize the TF-IDF matrix into 5 topics with NMF.
topic_no, iter_max = 5, 500

nmf_sklearn = NMF(
    n_components=topic_no,
    init='nndsvd',
    max_iter=iter_max,
    shuffle=True,
    random_state=123,  # fixed seed for repeatable runs
)
# fit() hands back the fitted estimator, which is kept under its own name.
recipe_model_low = nmf_sklearn.fit(x_tfidf)

In [44]:
# Top terms per topic for the 5-topic model (printed and kept as a dict).
topic_terms_low = get_topic_terms(model=recipe_model_low, vectorizer=tfidf)

Here are top 10 most contributing terms for each topic:


Topic #1:
['simmer', 'boil', 'sauce', 'garlic', 'onion', 'pot', 'stirring', 'tender', 'skil', 'onions']
Topic #2:
['beat', 'cake', 'cool', 'cream', 'flour', 'chocolate', 'batter', 'vanilla', 'eggs', 'milk']
Topic #3:
['chicken', 'sauce', 'marinade', 'grill', 'pieces', 'cooked', 'rice', 'broth', 'skil', 'coat']
Topic #4:
['cheese', 'dish', 'bread', 'sprinkle', 'cream', 'spread', 'casserole', 'sauce', 'layer', 'degrees']
Topic #5:
['dough', 'roll', 'sheet', 'cookie', 'flour', 'cut', 'inch', 'lightly', 'floured', 'surface']


In [59]:
# One column per topic, one row per term rank.
topic_table_low = pd.DataFrame(topic_terms_low)
topic_table_low

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5
0,simmer,beat,chicken,cheese,dough
1,boil,cake,sauce,dish,roll
2,sauce,cool,marinade,bread,sheet
3,garlic,cream,grill,sprinkle,cookie
4,onion,flour,pieces,cream,flour
5,pot,chocolate,cooked,spread,cut
6,stirring,batter,rice,casserole,inch
7,tender,vanilla,broth,sauce,lightly
8,skil,eggs,skil,layer,floured
9,onions,milk,coat,degrees,surface


In [64]:
# Project the corpus onto the topics once. The document-topic matrix does not
# depend on the topic being inspected, so recomputing it inside the loop
# (twice per topic) only repeated an expensive transform.
recipe_model_low_transform = recipe_model_low.transform(x_tfidf)

for topic_num in range(len(recipe_model_low.components_)):

    topic_name = 'Topic #'+ str(topic_num+1)
    topic_scores = recipe_model_low_transform[:, topic_num]

    # Document row numbers ordered from highest to lowest weight on this topic.
    top_indices = np.argsort(topic_scores)[::-1]
    # Row numbers are positional, hence .iloc.
    # assumes df_rec rows are in the same order as x_tfidf rows — TODO confirm
    recipe_name = df_rec.Name.iloc[top_indices[0]]
    score = np.max(topic_scores)
    print(f'Top topic for {topic_name}: {recipe_name}')
    print(f'Model score: {score}')
    print('------------------------------------')

Top topic for Topic #1: BBQ Spaghetti
Model score: 0.04598472033141085
------------------------------------
Top topic for Topic #2: Chocolate Sour Cream Bundt Cake
Model score: 0.05027050094348303
------------------------------------
Top topic for Topic #3: Longhorn Steakhouses' Honey Mustard
Model score: 0.10078814078094561
------------------------------------
Top topic for Topic #4: Biscuit on Creamy Chicken
Model score: 0.0628247655993901
------------------------------------
Top topic for Topic #5: Foolproof  Flaky Butter Pastry
Model score: 0.07602148081119464
------------------------------------


In [142]:
# Finer-grained model: factorize the TF-IDF matrix into 15 topics with NMF.
topic_no, iter_max = 15, 500

nmf_sklearn = NMF(
    n_components=topic_no,
    max_iter=iter_max,
    shuffle=True,
    random_state=123,  # fixed seed for repeatable runs
)  # init='nndsvd' is commented out for this run, so NMF's default init applies
# fit() hands back the fitted estimator, which is kept under its own name.
recipe_model_high = nmf_sklearn.fit(x_tfidf)



In [144]:
# Top terms per topic for the 15-topic model (printed and kept as a dict).
# NOTE(review): the output recorded for this cell is
# `IndexError: list index out of range`, yet later cells still display a
# populated `topic_terms_high`. Presumably `tfidf` and `recipe_model_high` did
# not share a vocabulary in that session and the dict came from an earlier
# run — confirm with Restart Kernel -> Run All.
topic_terms_high = get_topic_terms(model=recipe_model_high, vectorizer=tfidf)

Here are top 10 most contributing terms for each topic:




IndexError: list index out of range

In [145]:
# One column per topic, one row per term rank.
topic_table_high = pd.DataFrame(topic_terms_high)
topic_table_high

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15
0,meat,beat,chicken,cheese,dough,dressing,skil,potatoes,pasta,bread,boil,sauce,blender,ice,rice
1,grill,cake,pieces,dish,roll,juice,onion,potato,drain,crumbs,simmer,soy,blend,glass,cooked
2,beef,flour,broth,cream,sheet,salad,garlic,tender,according,loaf,stirring,noodles,smooth,shake,cooker
3,marinade,cool,cooked,sprinkle,cookie,lemon,onions,bacon,directions,spread,pot,hot,food,cream,shrimp
4,pork,batter,marinade,casserole,flour,toss,saute,mash,package,toast,saucepan,fish,process,garnish,dish
5,roast,chocolate,coat,spread,cut,serving,beans,sweet,toss,cut,reduce,tomato,puree,strain,casserole
6,bag,vanilla,bag,melted,inch,chill,tomatoes,peel,parmesan,egg,soup,worcestershire,enjoy,juice,liquid
7,ground,eggs,skin,layer,lightly,refrigerate,bacon,drain,pot,cubes,low,ginger,container,lime,fry
8,fat,egg,juices,degrees,floured,whisk,olive,dish,salted,eggs,broth,fry,use,orange,soy
9,marinate,cream,dish,sour,surface,vinegar,mushrooms,cut,tomatoes,dip,tender,cornstarch,seconds,cubes,coconut


In [50]:
# Project the corpus onto the topics once instead of once per topic; the
# document-topic matrix is identical on every iteration.
recipe_model_high_transform = recipe_model_high.transform(x_tfidf)

docs_dict_test = {}
for topic_num in range(len(recipe_model_high.components_)):

    topic_name = 'Topic #'+ str(topic_num+1) # Proprietary topic name

    # Document row numbers ordered from highest to lowest weight on this topic.
    top_indices = np.argsort(recipe_model_high_transform[:, topic_num])[::-1]

    # These are positional row numbers, so look names up with .iloc rather
    # than by index label (consistent with the 5-topic cell above).
    # assumes df_rec rows are in the same order as x_tfidf rows — TODO confirm
    top_documents = [df_rec.Name.iloc[doc_index] for doc_index in top_indices[0:5]]

    # For each topic, keep the names of its 5 highest-scoring recipes.
    docs_dict_test[topic_name] = top_documents

    print(f'{topic_name}:')
    print(docs_dict_test[topic_name])

Topic #1:
["ChuckwagonCookie's Authentic NC Style Pork BBQ (Oven or Grill)", 'Carne Asada', 'Guinness Marinated Skirt Steak', 'Rosemary Pork Chops for the Grill', 'Lemony-Soy Herbed Pork Tenderloins']
Topic #2:
["Decadent Devil's Food Cake", 'Vanilla Butter-Creme Chocolate Ganache Birthday Cake', 'Red Hot Velvet Cupcakes With Cinnamon Buttercream', 'Chocolate Sour Cream Bundt Cake', 'Chocolate Butter Cake']
Topic #3:
['Apple Butter Barbecue Dipping Sauce', 'Smoked Chicken Apple Beer Mop', 'White Lightning Chili', "Longhorn Steakhouses' Honey Mustard", 'King Augustus Sauce']
Topic #4:
['Biscuit on Creamy Chicken', 'Parmesan Roasted Asparagus Spears', 'Broccoli Casserole With Rice', 'Weeknight Chicken Casserole', 'Cheese and Crabmeat Spread']
Topic #5:
['Foolproof  Flaky Butter Pastry', 'French Bread ("Rapid Rise")', 'Whole Wheat Crescent Rolls', 'Danish Orange Marmalade Cheese Pockets', 'Sufganiyot']
Topic #6:
['Fattoush', 'Peruvian Sarsa Salad', 'Chicken and Asparagus Salad with Strawb

In [None]:
# Document-topic weights of every recipe under the 15-topic model.
recipe_model_high_transform = recipe_model_high.transform(X=x_tfidf)

In [152]:
# Re-display the 15-topic term table built earlier in the notebook.
topic_table_high

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14,Topic #15
0,meat,beat,chicken,cheese,dough,dressing,skil,potatoes,pasta,bread,boil,sauce,blender,ice,rice
1,grill,cake,pieces,dish,roll,juice,onion,potato,drain,crumbs,simmer,soy,blend,glass,cooked
2,beef,flour,broth,cream,sheet,salad,garlic,tender,according,loaf,stirring,noodles,smooth,shake,cooker
3,marinade,cool,cooked,sprinkle,cookie,lemon,onions,bacon,directions,spread,pot,hot,food,cream,shrimp
4,pork,batter,marinade,casserole,flour,toss,saute,mash,package,toast,saucepan,fish,process,garnish,dish
5,roast,chocolate,coat,spread,cut,serving,beans,sweet,toss,cut,reduce,tomato,puree,strain,casserole
6,bag,vanilla,bag,melted,inch,chill,tomatoes,peel,parmesan,egg,soup,worcestershire,enjoy,juice,liquid
7,ground,eggs,skin,layer,lightly,refrigerate,bacon,drain,pot,cubes,low,ginger,container,lime,fry
8,fat,egg,juices,degrees,floured,whisk,olive,dish,salted,eggs,broth,fry,use,orange,soy
9,marinate,cream,dish,sour,surface,vinegar,mushrooms,cut,tomatoes,dip,tender,cornstarch,seconds,cubes,coconut
