In [1]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import random
import matplotlib.pyplot as plt

In [2]:
data_clean = pd.read_pickle("data_clean_allrecipe.pkl")
data_clean.head()

Unnamed: 0_level_0,ingredient
ID,Unnamed: 1_level_1
BBC1,unsalted_butter chocolate plain_flour cocoa_po...
BBC10,butter_biscuit butter syrup dark_chocolate mil...
BBC100,digestive_biscuit butter caramel salt chocolat...
BBC1000,unsalted_butter hot_water chocolate_cocoa cast...
BBC1001,plain_flour ginger unsalted_butter muscovado_s...


In [3]:
# list of text documents
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(data_clean['ingredient'])
# summarize
#print(vectorizer.vocabulary_)
#print(vectorizer.idf_)
# encode document
vector = vectorizer.transform(data_clean['ingredient']) # this is sparse vector
#print(vector)
# summarize encoded vector
#print(vector.shape)
#print(vector.toarray())
terms = vectorizer.get_feature_names()
terms
sums = vector.sum(axis=0)
# connecting term to its sums frequency
data = []
for col, term in enumerate(terms):
    data.append( (term, sums[0,col] ))

ranking = pd.DataFrame(data, columns=['term','rank'])
print(ranking.sort_values('rank', ascending=False))

                          term         rank
3513                     sugar  2023.351620
1224                       egg  1545.392928
382                     butter  1531.718240
1402                     flour  1513.851328
3239                      salt  1389.773572
3887                   vanilla  1211.948133
2329                      milk   988.125926
742                   cinnamon   908.944744
157              baking_powder   860.261038
3909           vanilla_extract   788.642152
4003                     water   768.982005
366                brown_sugar   741.241354
158                baking_soda   710.983843
2281                 margarine   562.732018
2082               lemon_juice   561.102624
957               cream_cheese   547.486785
1243                 egg_white   528.721888
1247                  egg_yolk   518.233576
1725          granulated_sugar   505.763098
2524                    nutmeg   490.627666
2708                     pecan   489.961484
2850            powdered_sugar  

In [4]:
#Since most cells in this matrix will be zero, this will give the percentage of cells containing non-zero values.
# Materialize the sparse data
data_dense = vector.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  0.20033588914359896 %


In [5]:
#Lets buil LDA model
from sklearn.decomposition import LatentDirichletAllocation

no_topics = 100

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=10, learning_method='online', learning_decay=0.7,learning_offset=50.,random_state=100).fit(vector)
lda_output = lda_model.fit(vector)
lda_output

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=100, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=100, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [6]:
#This is to diagnose model performance with perplexity and log-likelihood
#Note: The model with higher log-likelihood and lower perplexity ( (exp(-1. * log-likelihood per word)) is considered to be good).
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(vector))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(vector))

# See model parameters
#pprint(lda_model.get_params())

Log Likelihood:  -644007.7829965665
Perplexity:  73529.6426427001


In [7]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ,".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [8]:
no_top_words = 10
display_topics(lda_model, terms, no_top_words)

Topic 0:
base ,center ,apricot_sauce ,dark_chocolate_fudge ,paste_food_coloring ,nut_cooky ,dry_cashew ,carrot_pineapple_cooky ,apple_oatmeal_cooky ,apricot_cashew_cooky
Topic 1:
pistachio_nut ,biscuit ,orange_water ,milk_cheese ,red_plum ,pate_dough ,white_corn_meal ,vanilla_pastry ,pulp ,black_food
Topic 2:
confect_sugar ,caramel_wafer ,toffee ,peanut ,vanilla_ice_cream ,banana ,milk ,soft_sugar ,white_marzipan ,nutmeg_heart
Topic 3:
jellied_cranberry_sauce ,coconut_cooky ,soda_filling ,raspberry ,blueberry ,easy_vanilla ,strawberry ,liqueur ,caster_sugar ,icing_sugar
Topic 4:
swiss_milk_pudding ,red_cinnamon ,unsalted_margarine ,banana_liqueur ,chocolate_peanut ,basil ,dark_cocoa ,butternut_flavoring ,cinnamon_graham_cracker ,fruit_apricot_jam
Topic 5:
chocolate_chip ,cheese ,chocolate ,butterscotch_chip ,ladyfinger ,peanut_butter_chip ,cocoa_powder ,matzo_meal ,chow_noodle ,candied_fruit
Topic 6:
unflavored_gelatin ,fruit_juice ,fat_free_cheese ,cordial ,sherbet ,unsweetened_orange

In [9]:
#to visualize the LDA model with pyLDAvis
#Note: A good topic model will have non-overlapping, fairly big sized blobs for each topic. 
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline



pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, vector, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [10]:
# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_topics)]

# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_model.components_)

# Assign Column and Index
df_topic_keywords.columns = terms
df_topic_keywords.index = topicnames

# View
df_topic_keywords

TypeError: 'NoneType' object cannot be interpreted as an integer

In [None]:
#To see the dominant topic in each document
#Note: To classify a document as belonging to a particular topic, a logical approach is to see which topic has the highest contribution to that document and assign it.

# index names
docnames = ["Doc" + str(i) for i in range(len(data_clean))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

In [None]:
#Time to test the model
text = input("write the ingredients to learn about the recipe (using comma seperator)")
data_test = (text)
print("You have typed input '%s':" %data_test)

In [None]:
d = np.argmax(lda_model.transform(vectorizer.transform([data_test],)))
print(d)

In [None]:
no_top_words = 8
for topic_idx, topic in enumerate(lda_model.components_):
    if topic_idx== d:
        print ("Topic %d:" % (d))
        print (" ,".join([terms[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    pass