## RecipeToVec
Word2vec-style embeddings using gensim: https://radimrehurek.com/gensim/models/word2vec.html
Here we create the vectors from the recipes.
Each document is one recipe's name words plus its list of clean ingredients and verbs.
We use the gensim model to create the similarity matrix (cosine similarity).

In [69]:
import numpy as np
import pandas as pd
from collections import defaultdict
#from gensim.models.word2vec import Word2Vec
from gensim import corpora, models, utils
import pickle
import os
import logging
import operator

In [70]:
# Notebook configuration and input data.
# Enable logging (for gensim): uncomment the next line to see gensim's log messages.
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
pd.set_option('display.max_rows', 50)

# Both lookups are indexed by recipe URL in the cells below:
#   cleanerIngredientsDict[url] -> iterable of cleaned ingredient strings
#   verbsDict[url]              -> list of verbs for that recipe
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles that were produced by a trusted step of this project.
cleanerIngredientsDict=pd.read_pickle('cleanerIngredients.pkl')  
verbsDict=pd.read_pickle('verbs.pkl') 

In [71]:
# Sanity check: report the size of each lookup, then show the verbs stored
# for one known recipe URL.
for lookup in (cleanerIngredientsDict, verbsDict):
    print(len(lookup))
print(verbsDict['http://allrecipes.com/recipe/33385/best-spanish-rice/'])

48417
48417
[u'heat', u'stir', u'cook', u'mix', u'stir', u'begin', u'reduce', u'cover', u'simmer', u'absorb']


In [72]:
# Load the recipe table, keeping only the first row seen for each URL so that
# "url" identifies a recipe uniquely.
allRecipes = pd.read_pickle('CleanedIngredients.pkl').drop_duplicates(subset='url', keep='first')
# Same rows with the URL as index; used later to find a recipe's position by URL.
indexedRecipes = allRecipes.set_index("url")

In [73]:
# Build one token list ("document") per recipe, in table order:
# lower-cased name words, then the cleaned ingredients, then the verbs.
# `names` and `categories` are filled in step so positions line up with `recipes`.
recipes = []
names = []
categories = []
recipe_columns = zip(allRecipes["name"], allRecipes["url"], allRecipes["categories"])
for recipe_name, url, recipe_categories in recipe_columns:
    name = recipe_name.lower().split()
    verbs = verbsDict[url]
    if type(verbs) is not list:
        # Report the unexpected entry and continue with no verbs for this recipe.
        print(url)
        print('%s is not a list' % verbs)
        verbs = []
    ingredients = list(cleanerIngredientsDict[url])
    # Multi-word ingredients stay as single tokens containing spaces.
    # To concatenate phrases into single underscore-joined tokens instead:
    # ingredients=[i.replace(' ','_') for i in ingredients]
    recipes.append(name + ingredients + verbs)
    names.append(recipe_name)
    categories.append(recipe_categories)

In [74]:
# Display one randomly chosen row to eyeball the columns of the recipe table.
allRecipes.sample(1)

Unnamed: 0,categories,cookingTime,description,ingredients,instructionSteps,name,rating,ratingCount,url,cookingTimeMinutes,cleanedIngredients
16507,"[Whiskey Drinks, Rum Drinks, Cocktails, Drinks...",PT5M,,"[ice, 1 fluid ounce coconut flavored rum, 1/2 ...",[Fill a cocktail shaker with ice. Pour in the ...,Wendy's Drunken Snow Cone,5.0,2,http://allrecipes.com/recipe/154624/wendys-dru...,5.0,"[ice, coconut flavored rum, chambord raspberry..."


In [75]:
# Report how many token lists were built and show the first one.
recipe_count = len(recipes)
print('Total recipes loaded: %s ' % recipe_count)
print(recipes[0])

Total recipes loaded: 48417 
[u'fresco', u'salsa', u'tomato', u'lime juice', u'cilantro', u'red bell pepper', u'onion', u'yellow bell pepper', u'salt', u'mix', u'lime', u'cover', u'refrigerate', u'serve']


In [76]:
# Build the token -> integer id mapping over all recipes and persist it.
dictionary = corpora.Dictionary(recipes)
dictionary.save('recipe2vec.dict')
print(dictionary)
# Spot-check the mapping with a common ingredient.
milk_id = dictionary.token2id["milk"]
print("The token ID of milk is: %s " % milk_id)

Dictionary(24739 unique tokens: [u'', u'butter flavoring', u'gai', u'blast-off', u'gag']...)
The token ID of milk is: 62 


In [77]:
# Convert every recipe to its bag-of-words form via the dictionary, then
# write the resulting corpus to disk.
corpus = list(map(dictionary.doc2bow, recipes))
corpora.MmCorpus.serialize('recipe2vec.mm', corpus)

## Now let's build a model

# Reload the saved dictionary and corpus so the models below are built from
# the on-disk copies. Both files are required: the corpus file is opened
# right after the dictionary, so checking only one of them is not enough.
if os.path.exists("recipe2vec.dict") and os.path.exists("recipe2vec.mm"):
    dictionary = corpora.Dictionary.load('recipe2vec.dict')
    corpus = corpora.MmCorpus('recipe2vec.mm')
    print("Loaded dictionary and corpus from disk")
else:
    # Nothing is reassigned here, so the in-memory `dictionary` and `corpus`
    # remain in use.
    print("Error: Could not find \"recipe2vec.dict\" or \"recipe2vec.mm\"")

# Fit a TF-IDF model on the bag-of-words corpus (normalisation enabled) and
# wrap the corpus so documents are re-weighted when iterated.
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

# Initialize an LSI transformation with 100 topics on top of the TF-IDF corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
# Double wrapper over the original corpus: bow -> tfidf -> lsi.
corpus_lsi = lsi[corpus_tfidf]

In [78]:
# Number of documents that will be fed to Doc2Vec.
print(len(recipes))

48417


In [None]:
# Wrap each recipe's tokens in a TaggedDocument whose single tag is the
# recipe's position in `recipes`.
taggedRecipes = [
    models.doc2vec.TaggedDocument(tokens, [position])
    for position, tokens in enumerate(recipes)
]
print(taggedRecipes[0])

TaggedDocument([u'fresco', u'salsa', u'tomato', u'lime juice', u'cilantro', u'red bell pepper', u'onion', u'yellow bell pepper', u'salt', u'mix', u'lime', u'cover', u'refrigerate', u'serve'], [0])


## Train a Doc2Vec model

In [None]:
# Train the document-embedding model on the tagged recipes.
# Note: despite its name, `w2v_model` holds a Doc2Vec model.
print('Training a Doc2vec model...')
w2v_model = models.doc2vec.Doc2Vec(
    taggedRecipes,
    size=100,
    window=4,
    min_count=5,
    workers=4,
    iter=30,
)

Training a Doc2vec model...


In [None]:
# Show the first tagged recipe and the first ten components of the vector
# the model infers for its tokens.
print('Let us see what this looks like...')
first_recipe = taggedRecipes[0]
print(first_recipe)
print(w2v_model.infer_vector(first_recipe.words)[:10])

In [None]:
# Infer a vector for an ad-hoc list of tokens and show its first ten components.
print("We can turn anything into a vector now")
print(w2v_model.infer_vector([u'chicken', u'masala'])[:10])

# Print a few of the training documents as space-joined token strings.
print("This is what the documents look like:")
for doc_id in range(50, 55):
    # The template is a unicode literal because the tokens are unicode strings;
    # formatting them into a byte string on Python 2 raises UnicodeEncodeError
    # as soon as a token contains a non-ASCII character.
    print(u'Document ({}): «{}»\n'.format(doc_id, u' '.join(taggedRecipes[doc_id].words)))

In [None]:
# Re-infer a vector for the first recipe's tokens and display the five
# trained document vectors most similar to it, as (tag, similarity) pairs.
w2v_model.docvecs.most_similar([w2v_model.infer_vector(taggedRecipes[0].words)], topn=5)

## How good a representation is it? Are documents the most similar to themselves?  
Let's see how many documents the model ranks as the most similar to themselves.

In [None]:
from tqdm import tqdm as ProgressBar

# Number of recipes to evaluate. Each iteration ranks every trained document
# against the inferred vector, so running over the full corpus is slow.
EVAL_SAMPLE_SIZE = 10000

# ranks[k]        -> position of recipe k in its own similarity ranking (0 = top match)
# second_ranks[k] -> the (tag, similarity) pair ranked second for recipe k
ranks = []
second_ranks = []
# Bounding the range (rather than breaking after the body has run) evaluates
# exactly EVAL_SAMPLE_SIZE recipes, copes with smaller corpora, and gives the
# progress bar the right total.
eval_count = min(EVAL_SAMPLE_SIZE, len(taggedRecipes))
for doc_id in ProgressBar(range(eval_count), desc="Processing recipes"):
    inferred_vector = w2v_model.infer_vector(taggedRecipes[doc_id].words)
    sims = w2v_model.docvecs.most_similar([inferred_vector], topn=len(w2v_model.docvecs))
    ranked_ids = [docid for docid, sim in sims]
    ranks.append(ranked_ids.index(doc_id))

    second_ranks.append(sims[1])

In [None]:
import collections

# Count how often each self-similarity rank occurred.
c = collections.Counter(ranks)
# Use the number of recipes actually evaluated as the denominator, so the
# percentages stay correct whatever the sample size. float() avoids integer
# division on Python 2; max(..., 1) avoids dividing by zero when `ranks` is empty.
evaluated = float(max(len(ranks), 1))
print("{}% of the recipes were the model's top match. For another {}% they were #2.".format(c[0]/evaluated*100, c[1]/evaluated*100))

## Save the results:
0: 83.85%, 1: 4.53%  size=100, window=7, min_count=2, workers=4, iter=50  
0: 68.84%, 1: 5.31%  size=100, window=7, min_count=5, workers=4, iter=30  
size=100, window=4, min_count=5, workers=4, iter=30  
size=100, window=4, min_count=30, workers=4, iter=30

## Search by URL (find recipes like this one)  
Here we look for the parsed recipe in our data frame. In a more realistic (and useful) implementation, the recipe would be read and parsed, and a BOW (ingredients, actions, name) would be used to look for similarities.

In [None]:
# Find the recipe's position from its URL, re-infer a vector from its tokens,
# and rank every trained document against that vector.
doc_id = indexedRecipes.index.get_loc('http://allrecipes.com/recipe/166638/baked-buffalo-wings/')
bag_of_words = taggedRecipes[doc_id].words
inferred_vector = w2v_model.infer_vector(bag_of_words)
sims = w2v_model.docvecs.most_similar([inferred_vector], topn=len(w2v_model.docvecs))

# Show the query recipe, then the two highest-ranked matches.
query_recipe = allRecipes.iloc[doc_id]
print(query_recipe["name"])
print(query_recipe["url"])
for heading, position in (('Best match: ', 0), ('Next best: ', 1)):
    print(heading)
    matched_recipe = allRecipes.iloc[sims[position][0]]
    print(matched_recipe["name"])
    print(matched_recipe["url"])

## Keywords Search

In [None]:
# Tokenise a free-text query, infer a vector for it, and rank every trained
# document against that vector.
# The query must be spelt like the recipe tokens ("chicken", not "chickem"),
# otherwise the main keyword is not the word the model was trained on.
wordsVec = utils.simple_preprocess(u'chicken tikka masala')
inferred_vector = w2v_model.infer_vector(wordsVec)
sims = w2v_model.docvecs.most_similar([inferred_vector], topn=len(w2v_model.docvecs))
# Show the two highest-ranked recipes.
print('Best match: ')
print(allRecipes.iloc[sims[0][0]]["name"])
print(allRecipes.iloc[sims[0][0]]["url"])
print('Next best: ')
print(allRecipes.iloc[sims[1][0]]["name"])
print(allRecipes.iloc[sims[1][0]]["url"])

In [None]:
# This runs a shell command from the notebook.
!pip install plotly

# Plotly imports.
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
import plotly.graph_objs as go


In [None]:
# Scatter-plot the first n recipes in two dimensions, with the recipe name as hover text.
n = 1000

# NOTE(review): `v` is not defined anywhere in the visible notebook, so this
# cell raised NameError on a fresh run. It is rebuilt here as a 2-D PCA
# projection of the first n trained document vectors (row 0 = x, row 1 = y).
# Confirm this matches the projection that was originally intended.
doc_vectors = np.array([w2v_model.docvecs[i] for i in range(min(n, len(w2v_model.docvecs)))])
centered = doc_vectors - doc_vectors.mean(axis=0)
# Rows of the third SVD factor are the principal axes; keep the first two.
principal_axes = np.linalg.svd(centered, full_matrices=False)[2][:2]
v = centered.dot(principal_axes.T).T

# `names` is truncated to n as well so the hover text lines up with the points.
data = [go.Scatter(x=v[0][:n], y=v[1][:n], text=names[:n],
                   mode='markers', textposition='bottom', hoverinfo='text')]
fig = go.Figure(data=data, layout=go.Layout(title="Word Embeddings", hovermode='closest'))
plotly.iplot(fig)

## Results
