## RecipeToVec
Word to vec using gensim: https://radimrehurek.com/gensim/models/word2vec.html
Here we are creating the word vectors from the recipes.
Each document is one recipe's list of clean ingredients + verbs. 
We use the Gensim model to create the similarity matrix(cosine similarity).

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
#from gensim.models.word2vec import Word2Vec
from gensim import corpora, models
import pickle
import os
import logging
import operator

Using TensorFlow backend.


In [2]:
# Enable logging (for gensim)
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
pd.set_option('display.max_rows', 50)

cleanerIngredientsDict=pd.read_pickle('cleanerIngredients.pkl')  
verbsDict=pd.read_pickle('verbs.pkl') 

In [3]:
print len(cleanerIngredientsDict)
print len(verbsDict)
print verbsDict['http://allrecipes.com/recipe/33385/best-spanish-rice/']
print verbsDict['http://allrecipes.com/recipe/257873/family-style-korean-fried-chicken/']
print cleanerIngredientsDict['http://allrecipes.com/recipe/257873/family-style-korean-fried-chicken/']

49276
49276
[u'heat', u'stir', u'cook', u'mix', u'stir', u'begin', u'reduce', u'cover', u'simmer', u'absorb']
[u'whisk', u'combine', u'whisk', u'fold', u'stir', u'cover', u'marinate', u'stir', u'fill', u'heat', u'sizzles', u'dropped', u'stir', u'fry', u'browned', u'drain', u'lined']
set([u'oil', u'flour', 'sesame seed', u'sugar', u'cornstarch', u'garlic', u'chicken', u'soy sauce', u'salt', u'green onion', 'egg'])


In [4]:
allRecipes=pd.read_pickle('CleanedIngredients.pkl')
allRecipes.drop_duplicates(subset='url', keep='first', inplace=True)

In [5]:
recipes=[]
names=[]
categories=[]
for idx, row in allRecipes.iterrows():
    name = [word for word in row["name"].lower().split()]
    url = row["url"]
    verbs = verbsDict[url]
    if type(verbs) != list:
        print url
        print '%s is not a list' % verbs
        verbs=[]
    ingredients = list(cleanerIngredientsDict[url])
    # Concatenate phrases into single tokens
    ingredients=[i.replace(' ','_') for i in ingredients]
    recipes.append(name + ingredients + verbs)
    names.append(row["name"])
    categories.append(row["categories"])

In [6]:
allRecipes.sample(1)

Unnamed: 0,categories,cookingTime,description,ingredients,instructionSteps,name,rating,ratingCount,url,cookingTimeMinutes,cleanedIngredients
110,"[Asian Recipes, Tuna Recipes, Fish Recipes, Ev...",PT35M,,"[2 (5 ounce) cans tuna, drained and flaked, 1 ...","[In a large bowl, mix tuna, egg, bread crumbs,...",Asian Tuna Patties,4.25,328,http://allrecipes.com/recipe/44816/asian-tuna-...,35.0,"[tuna, egg, bread crumbs, green onions, garlic..."


In [7]:
print 'Total recipes loaded: %s ' % len(recipes)
print recipes[0]

Total recipes loaded: 48417 
[u'fresco', u'salsa', u'tomato', u'lime_juice', u'cilantro', u'red_bell_pepper', u'onion', u'yellow_bell_pepper', u'salt', u'mix', u'lime', u'cover', u'refrigerate', u'serve']


In [8]:
# Create a dictionary and save it
dictionary = corpora.Dictionary(recipes)
dictionary.save('recipe2vec.dict')
print(dictionary)
print "The token ID of milk is: %s " % dictionary.token2id["milk"] 

Dictionary(24739 unique tokens: [u'', u'bar_milk_chocolate_crispy_rice', u'gai', u'blast-off', u'apricot_preserves']...)
The token ID of milk is: 62 


In [9]:
# Create a corpus and save it
corpus = [dictionary.doc2bow(recipe) for recipe in recipes]
corpora.MmCorpus.serialize('recipe2vec.mm', corpus)

## Now let's build a model

In [10]:
if (os.path.exists("recipe2vec.dict")):
    dictionary = corpora.Dictionary.load('recipe2vec.dict')
    corpus = corpora.MmCorpus('recipe2vec.mm')
    print("Loaded dictionary and corpus from disk")
else:
    print("Error: Could find dictionary \"recipe2vec.dict\"")

Loaded dictionary and corpus from disk


In [11]:
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

In [12]:
corpus_tfidf = tfidf[corpus]

In [13]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->lsi

In [28]:
print('Training a Word2vec model...')
#w2v_model = models.word2vec.Word2Vec(recipes, size=200, window=7, min_count=10, workers=4, hs=1, negative=0)
w2v_model = models.word2vec.Word2Vec(recipes, size=150, window=7, min_count=20, workers=4, iter=5, hs=0, negative=5)

Training a Word2vec model...


In [29]:
w2v_model['beef'][:10]

array([-0.9227187 ,  1.48035741, -0.17479393,  0.67896163, -0.21199563,
       -0.65792912,  0.12554877, -0.12735058, -0.40433481, -1.39427865], dtype=float32)

In [30]:
w2v_model.wv.most_similar(positive=['meat','lasagna'], topn=3)

[(u'spaghetti', 0.8760913610458374),
 (u'italian', 0.8686254024505615),
 (u'manicotti', 0.8604891300201416)]

In [31]:
w2v_model.wv.most_similar(positive=['beef','stew'],negative=['pork'], topn=3)

[(u'tomato_soup', 0.6494177579879761),
 ('ranchstyle_bean', 0.6140034794807434),
 (u'chili_seasoning_mix', 0.5937557220458984)]

In [32]:
w2v_model.wv.most_similar(positive=['italian','pasta'], topn=10)

[(u'rigatoni', 0.9439151287078857),
 (u'spaghetti', 0.9359753727912903),
 (u'fettuccini', 0.9133577346801758),
 (u'e', 0.9106569886207581),
 (u'alla', 0.9087485074996948),
 (u'florentine', 0.908035159111023),
 (u'primavera', 0.9050279259681702),
 (u'mediterranean', 0.9041897654533386),
 (u'seafood', 0.9012995958328247),
 (u'veggie', 0.8981432914733887)]

In [33]:
v=[sum([w2v_model[word] for word in cent if word in w2v_model]) for cent in recipes]
print len(v)
print len(v[0])

48417
150


In [34]:
# This runs a shell command from the notebook.
!pip install plotly

# Plotly imports.
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
import plotly.graph_objs as go




In [35]:
n = 1000
data = [go.Scatter(x=v[0][:n], y=v[1][:n], text=names,
                   mode='markers', textposition='bottom', hoverinfo='text')]
fig = go.Figure(data=data, layout=go.Layout(title="Word Embeddings", hovermode='closest'))
plotly.iplot(fig)

In [36]:
print recipes[0]
print names[0]
print categories[0]
print v[0][:10]

[u'fresco', u'salsa', u'tomato', u'lime_juice', u'cilantro', u'red_bell_pepper', u'onion', u'yellow_bell_pepper', u'salt', u'mix', u'lime', u'cover', u'refrigerate', u'serve']
Fresco Salsa
[u'Tomato Salsa', u'Salsa', u'Dips and Spreads', u'Appetizers and Snacks', u'Mexican Recipes', u'Everyday Cooking', u'Recipes', u'Summer Appetizers', u'Mexican Appetizers', u'Cilantro']
[ -2.24637651   4.20188141  -1.71370387   6.43905401  -5.52758837
  -0.61713243  -5.52257061   1.72299922  -5.43359947 -12.06998825]


In [37]:
uniqueCategories=set([category for categoryList in categories for category in categoryList ])
uniqueCategories.remove('Recipes') # remove junk categories
uniqueCategories.remove('Everyday Cooking')
print len(uniqueCategories)

2741


In [38]:
# compute model's accuracy using euclidean RSS from centroid of caetgories 
import math
from tqdm import tqdm as ProgressBar
numCat=len(categories)
print numCat
print len(v)
totalRSS=0
minRecipesPerCategory=100
results=defaultdict(int)
for category in ProgressBar(list(uniqueCategories), desc="Processing categories"):
#for category in list(uniqueCategories):
    categoryEmbeddings = [v[idx] for idx in range(numCat) if category in categories[idx]]
    numRecipes = len(categoryEmbeddings)
    categoryCetroid = sum(categoryEmbeddings)/numRecipes
    RSS=math.sqrt(sum(sum([(v[idx]-categoryCetroid)**2 for idx in range(numCat) if category in categories[idx]])))
    totalRSS += RSS
    if numRecipes>=minRecipesPerCategory:
        results[category]=RSS/numRecipes
#    print 'found %s recipes in category %s. RootSumSq=%s' % (len(categoryEmbeddings), category, ss)
print len(categoryCetroid)
print totalRSS
for item in sorted(results.items(), reverse=False, key=operator.itemgetter(1))[:30]:
    print item

Processing categories:   0%|          | 1/2741 [00:00<07:51,  5.82it/s]

48417
48417


Processing categories: 100%|██████████| 2741/2741 [08:03<00:00,  5.89it/s]

150
2448652.88699
(u'Main Dishes', 0.9065379626887213)
(u'Desserts', 1.1388566147018726)
(u'Drinks', 1.1828544412419373)
(u'Side Dishes', 1.3822747812848923)
(u'Appetizers and Snacks', 1.415773014845896)
(u'Salad Recipes', 1.4686081458208187)
(u'Soups, Stews and Chili', 1.5721581361707886)
(u'Smoothies', 1.7162385369513764)
(u'Cookies', 1.7227847894077108)
(u'Soup', 1.8625447884899862)
(u'Cocktails', 1.895696231776015)
(u'Dips and Spreads', 1.9183016622287)
(u'Pasta by Shape', 1.941096824219925)
(u'Vegetable Side Dishes', 1.9818809602792218)
(u'Chicken Breasts', 1.989702287239228)
(u'Asian Recipes', 2.1039418272446806)
(u'Fruit Desserts', 2.1276387108728105)
(u'Quick Bread', 2.158310628564533)
(u'Bread Recipes', 2.1690812028261144)
(u'Banana Smoothies', 2.1845128397151985)
(u'Cheese Appetizers', 2.200555947730562)
(u'5 Ingredient Drinks', 2.2457280824470227)
(u'Breakfast and Brunch', 2.2542653754307693)
(u'Fish Recipes', 2.267536716704706)
(u'Vanilla Extract', 2.2742933451226635)
(u'Ca




In [39]:
for item in sorted(results.items(), reverse=True, key=operator.itemgetter(1))[:30]:
    print item

(u'Birthday Cake', 14.151571303865921)
(u'Easter Bread', 14.130307362116035)
(u'Poppy Seeds', 14.062785201389465)
(u'California', 12.906821244377323)
(u'Jewish Recipes', 12.75033972184721)
(u'Polish Recipes', 12.66494517632021)
(u'Mardi Gras Recipes', 12.653717322442638)
(u'Lobster Recipes', 12.503543173349735)
(u'Gourmet Beef Main Dishes', 12.233710030037681)
(u'Pastries', 12.230397189485247)
(u'Gourmet Pasta Main Dishes', 12.183472602452152)
(u'Rum Desserts', 11.982506418549839)
(u'Meat Lasagna', 11.961223048058821)
(u'Nutmeg', 11.829879894474747)
(u'Pot Pie', 11.740475412416423)
(u'French Main Dishes', 11.654160665478834)
(u'Cheddar Cheese', 11.519428418444319)
(u'Gourmet Side Dishes', 11.516854702807668)
(u'High-Fiber Recipes', 11.444875767140362)
(u'Fennel Seed', 11.408746536347886)
(u'Cajun and Creole Recipes', 11.405419026280308)
(u'Holiday Cupcakes', 11.358744106518992)
(u"Mother's Day Breakfast and Brunch", 11.349188592992125)
(u"Valentine's Day Recipes", 11.334283883490311)
(

## Results
50 per category RSS: 1,290,669 Cosine Dist: 148,855. Word2Vec(size=200, window=7, min_count=4, workers=4, hs=1, negative=0)  
**100** per category RSS: 1,289,284 Cosine Dist: 149,007. Word2Vec(size=200, window=7, min_count=4, workers=4, hs=1, negative=0)  
100 per category RSS: 1,299,309 Cosine Dist: 149,110. Word2Vec(size=**250**, window=7, min_count=4, workers=4, hs=1, negative=0)  
100 per category RSS: 1,278,714 Cosine Dist: 150,194. Word2Vec(size=200, window=7, min_count=**10**, workers=4, hs=1, negative=0)  
**replace space with underscore in ingredients**  
100 per category RSS: 1,274,982 Cosine Dist: 149,510. Word2Vec(size=200, window=7, min_count=10, workers=4, hs=1,negative=0) 
**Use negative sampling instead of softmax**
100 per category RSS: 2,536,324 Cosine Dist: 57,901. Word2Vec(size=200, window=7, min_count=10, workers=4, iter=5)   
**Increase #iterations from 5 to 10**  
100 per category RSS: 2,912,868  Cosine Dist:69,221. Word2Vec(size=200, window=7, min_count=10, workers=4, **iter=10**)   
** Size decreased to 150 **  
100 per category RSS: 2,535,669  Cosine Dist:58,247. Word2Vec(**size=150**, window=7, min_count=10, workers=4, iter=5) 
** Increase negative sampling **  
100 per cat. RSS: 2,537,084  Cosine Dist:57,940. Word2Vec(size=150, window=7, min_count=10, workers=4, iter=5, hs=0, **negative=5**)  
100 per cat. RSS:2,448,652  Cosine Dist:62,851 . Word2Vec(size=150, window=7, min_count=20, workers=4, iter=5, hs=0, negative=5)   

In [40]:
# compute model's accuracy using euclidean RSS from centroid of caetgories 
import math
from tqdm import tqdm as ProgressBar
from scipy.spatial import distance
numCat=len(categories)
results2=defaultdict(int)
minRecipesPerCategory=100
print numCat
print len(v)
totalCosineDist=0
for category in ProgressBar(list(uniqueCategories), desc="Processing categories"):
#for category in list(uniqueCategories):
    categoryEmbeddings = [v[idx] for idx in range(numCat) if category in categories[idx]]
    numRecipes = len(categoryEmbeddings)
    categoryCetroid = sum(categoryEmbeddings)/numRecipes
    CosineDist=sum([distance.cosine(v[idx],categoryCetroid) for idx in range(numCat) if category in categories[idx]])
    totalCosineDist += CosineDist
    if numRecipes>=minRecipesPerCategory:
        results2[category]= CosineDist/numRecipes
#    print 'found %s recipes in category %s. RootSumSq=%s' % (len(categoryEmbeddings), category, ss)
print len(categoryCetroid)
print totalCosineDist
for item in sorted(results2.items(), reverse=True, key=operator.itemgetter(1))[:30]:
    print item

Processing categories:   0%|          | 0/2741 [00:00<?, ?it/s]

48417
48417


Processing categories: 100%|██████████| 2741/2741 [07:21<00:00,  7.33it/s]

150
62851.5051468
(u'MyPlate Fruit', 0.36960849232037563)
(u'Low-Fat Appetizers', 0.35320600108336075)
(u'Low-Cholesterol Appetizers', 0.34175655371735436)
(u'5 Ingredient Recipes', 0.33951940780767803)
(u'Blender Recipes', 0.33820453464464462)
(u'Healthy Kid Recipes', 0.33756140406991397)
(u'Kid-Friendly Recipes', 0.32857910572145954)
(u'Superfoods - Fruit', 0.31778891256661512)
(u'On-The-Go Breakfasts', 0.31693142793696283)
(u'High-Fiber Breakfast and Brunch', 0.3164578050619074)
(u'Melon Recipes', 0.31227057604503672)
(u'Summer Fruits and Vegetables', 0.31126391754857458)
(u'Mandarin Orange Recipes', 0.30633239345195079)
(u'Back to School Recipes', 0.30505099838232463)
(u'Quick and Easy Vegetarian Recipes', 0.30400710741479536)
(u'5 Ingredient Appetizers', 0.30388172130769708)
(u'Mango Recipes', 0.30148013629747772)
(u'Watermelon Recipes', 0.29683338760033984)
(u'Picnic Recipes', 0.29576332158148544)
(u'More Meal Ideas', 0.29445175999924339)
(u'Spring Fruits and Vegetables', 0.29403




In [41]:
for item in sorted(results2.items(), reverse=False, key=operator.itemgetter(1))[:30]:
    print item

(u'Fruit Bread', 0.034838052071181756)
(u'Banana Bread', 0.0360309051582553)
(u'Oatmeal Cookies', 0.036499914746091891)
(u'Spice Cake', 0.037316860244705805)
(u'Chocolate Chip Cookies', 0.038887521395101821)
(u'Coffee Cake', 0.039730058276979618)
(u'Whole Wheat Muffins', 0.039845575335132132)
(u'Bundt Cake', 0.039923853257155112)
(u'Cut-Out Cookies', 0.041770094705960784)
(u'Spice Cookies', 0.041932455841117301)
(u'Sheet Cake', 0.043421002118910396)
(u'Filled Cookies', 0.044996579970067445)
(u'Cupcakes', 0.04515369934161139)
(u'Zucchini Bread', 0.046791457496185539)
(u'Italian Cookies', 0.047413257123738554)
(u'Apple Cake', 0.048696239730089311)
(u'Drop Cookies', 0.049219343577816266)
(u'Chicken Breast Stir-Fry', 0.050034100275570217)
(u'Cheesecake', 0.051223828148812767)
(u'Muffins', 0.051820087773918166)
(u'Chicken Stir-Fry', 0.052637534907644321)
(u'Chocolate Cake', 0.052647044421519802)
(u'Indian Vegetarian Main Dishes', 0.054434425159493843)
(u'Meat Lasagna', 0.05515414079755062)


In [62]:
def is_ascii(text):
    if isinstance(text, unicode):
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return False
    else:
        try:
            str('text')
        except UnicodeDecodeError:
            return False
    return True

# Build a vocabulary
vocab=list(set([word for cent in recipes for word in cent ]))
vocab[:10]
e=[value for vector in v for value in vector for char in value if all(ord(char) < 128)]

TypeError: 'numpy.float32' object is not iterable

In [59]:
import tf_embed_viz
reload(tf_embed_viz)

ev = tf_embed_viz.TFEmbeddingVizWrapper()
ev.write_vocab_file(words=vocab)
ev.write_embeddings(v)

UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 4: ordinal not in range(128)