In [157]:
import pandas as pd
import numpy as np
import json

##text pre-processing
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF

##visualization
import pyLDAvis

##similarity
from sklearn.metrics.pairwise import cosine_similarity  

## Concatenate json files and create dataframe

In [2]:
def concatJson(file_list, out_path="allRecipes.json"):
    """Merge the contents of several JSON files into a single JSON file.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the JSON files to read. Each file's parsed content is
        appended onto one running list with ``+=`` (so each file is
        expected to hold a top-level JSON array).
    out_path : str, optional
        Destination file. Defaults to ``"allRecipes.json"``.

    Notes
    -----
    Every input is parsed *before* the destination is opened for writing,
    so a missing or malformed input raises without truncating an existing
    output file.
    """
    full = []
    for path in file_list:
        with open(path, 'rb') as infile:
            full += json.load(infile)

    with open(out_path, "w") as outfile:
        json.dump(full, outfile)

In [4]:
def openJson(name):
    """Return the object decoded from the JSON file at path `name`."""
    with open(name, 'rb') as handle:
        return json.load(handle)

In [28]:
# JSON files that concatJson merges into allRecipes.json.
file_list = ["fullhelping_items.json", "iLoveVegan_items.json","MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json","roastedRoot_items.json","ohmyveggies_items.json",]

In [29]:
# Write the merged file, then read it straight back as one Python object.
concatJson(file_list)
full = openJson('allRecipes.json')

In [31]:
fullDF = pd.DataFrame(full)
# Keep only rows whose ingredientsList has a non-zero length.
fullDF1 = fullDF[(fullDF.ingredientsList.str.len() != 0)]
# Renumber rows 0..n-1 so that index labels and row positions coincide.
recipeDF = fullDF1.reset_index(drop=True)

In [33]:
recipeDF.to_pickle('recipeDF_v3')

In [294]:
recipeDF = pd.read_pickle('recipeDF_v3')

In [295]:
recipeDF.tail(3)

Unnamed: 0,category,ingredientsList,instructions,summary,title,url
2432,[],"[Eggplant bacon:, 1/4 cup soy sauce, 1/4 cup a...","[Make the eggplant bacon:, In a medium bowl, s...",[],[],http://ohmyveggies.com/the-best-vegan-blt-with...
2433,[],"[8 oz. tempeh, cut into thin strips, 3/4 c. ba...",[Spread 1/4 cup of barbecue sauce on the botto...,[],[],http://ohmyveggies.com/recipe-bbq-tempeh-sandw...
2434,[],"[4 red bell peppers, 2 tablespoons safflower s...","[Preheat oven to 350ºF., Bring a large pot of ...",[],[],http://ohmyveggies.com/thai-stuffed-peppers/


## Write ingredients to the input file format for the NYT ingredient tagger

In [296]:
def nytIngredientInput(df):
    """Write every ingredient line to ``input.txt`` and return an index of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ingredientsList`` column whose cells are sequences
        of text strings.

    Returns
    -------
    list of [int, int, str]
        One ``[row_position, ingredient_position, ascii_text]`` entry per
        ingredient, in the same order as the lines written to the file.

    Notes
    -----
    * Non-ASCII characters are dropped from each line.
    * The file is opened in write mode, so re-running the cell replaces
      ``input.txt`` instead of appending a second copy of every line
      (appending breaks the line-for-line correspondence between the
      file and the returned list).
    * The text is decoded back to a text string after the ASCII
      round-trip so that concatenating the newline works on Python 3 as
      well as Python 2.
    """
    ingredientList = []
    with open("input.txt", "w") as f:
        for row in range(len(df)):
            ingredients = df.ingredientsList.iloc[row]
            for i, text in enumerate(ingredients):
                line = text.encode('ascii', 'ignore').decode('ascii')
                ingredientList.append([row, i, line])
                f.write(line + '\n')
    return ingredientList

In [334]:
ingredientList = nytIngredientInput(recipeDF)

In [335]:
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [336]:
ingredientDF.tail(3)

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
29750,2434,9,3/4 cup arborio rice
29751,2434,10,1 3/4 cup coconut milk
29752,2434,11,"Salt and black pepper, to taste"


In [337]:
len(ingredientDF)

29753

## Merge NYT ingredient tags with ingredientDF
<p>(see NYT_CRF_IngredientPhraseTagger.txt for tags) </p>

In [586]:
ingredientTags = openJson('results.json')

In [587]:
ingredientTagsDF = pd.DataFrame(ingredientTags)
# Delete newlines, tabs, commas and colons (regex replace over the whole frame).
# No rows are removed, so the length stays equal to len(ingredientDF).
# An alternative that was tried instead dropped the affected rows:
#   ingredientDF[ingredientDF['recipeText'].str.contains("\n|\t") == False]
ingredientDF2 = ingredientDF.replace({'\n': '', '\t':'',',':'',':':''}, regex=True)
len(ingredientDF2)

29753

In [1022]:
# Step 1: keep only ingredient lines longer than three characters.
mask1 = ingredientDF2['recipeText'].str.len() > 3
ingredientDF3 = ingredientDF2.loc[mask1]
# Step 2: discard lines that are exactly one of these leftover fragments.
# ('1lb ' is not in the exclusion list.)
ingredientDF4 = ingredientDF3[~ingredientDF3.recipeText.isin(
    [' or ', ' cup ', '1.5 ', ' (or ', '3-4 ', '1/2 ', ' tsp '])]
len(ingredientDF4)

28798

In [1023]:
# Apply the same "longer than three characters" filter to the tagger's input text.
# NOTE(review): the count shown below (28790) differs from len(ingredientDF4)
# (28798), so the two filtered frames are not row-for-row aligned.
mask = (ingredientTagsDF['input'].str.len() > 3)
ingredientTagsDF2 = ingredientTagsDF.loc[mask]
len(ingredientTagsDF2)

28790

In [1019]:
#ingredientDF4 = ingredientDF4.reset_index(drop=True)
#ingredientTagsDF2 = ingredientTagsDF2.reset_index(drop=True)

In [1026]:
ingredientDF3[ingredientDF3.recipeText == '1lb ']

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
14612,1267,1,1lb
14613,1267,2,1lb
14638,1268,0,1lb
15288,1327,10,1lb
15365,1332,0,1lb
15583,1350,0,1lb
16107,1390,1,1lb
16328,1409,0,1lb


In [1035]:
ingredientTagsDF2.loc[14638]

comment                                                    NaN
display      <span class='qty'>1</span><span class='unit'>t...
input                                    1 teaspoon fish sauce
name                                                fish sauce
other                                                      NaN
qty                                                          1
range_end                                                  NaN
unit                                                  teaspoon
Name: 14638, dtype: object

In [944]:
ingredientDF3[ingredientDF3.recipeText == ' tsp ']

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
4477,358,4,tsp
16677,1437,4,tsp


In [1024]:
ingredientDF4[10000:10010] ##[4385:4395] ##16010:16020

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
10191,910,8,1/2 tsp vanilla extract
10192,910,9,bourbon caramel sauce*
10193,911,0,1.5 oz gin (or 3 Tbsp)
10194,911,1,4-6 cucumber slices
10195,911,2,1/4 lime sliced
10196,911,3,4 oz tonic water
10197,911,4,6 mint leaves
10198,911,5,1 Tbsp sugar (optional)
10199,912,0,1/2 cup white chocolate chips (or chopped)
10200,912,1,2.5 cups milk (I used 2%)


In [1025]:
ingredientTagsDF2[10000:10010] ##[4385:4395]

Unnamed: 0,comment,display,input,name,other,qty,range_end,unit
10190,Tbsp melted,<span class='qty'>3</span><span class='comment...,"3 Tbsp coconut oil, melted",coconut oil,",",3,,
10191,,<span class='qty'>1/2</span><span class='unit'...,1/2 tsp vanilla extract,vanilla extract,,1/2,,tsp
10192,,<span class='qty'>bourbon</span><span class='n...,bourbon caramel sauce*,caramel sauce*,,bourbon,,
10193,(or 3 Tbsp),<span class='qty'>1.5</span><span class='name'...,1.5 oz gin (or 3 Tbsp),oz gin,,1.5,,
10194,,<span class='qty'>4-6</span><span class='name'...,4-6 cucumber slices,cucumber,,4-6,,slice
10195,sliced,<span class='qty'>1/4</span><span class='name'...,"1/4 lime, sliced",lime,",",1/4,,
10196,,<span class='qty'>4</span><span class='name'>o...,4 oz tonic water,oz tonic water,,4,,
10197,,<span class='qty'>6</span><span class='name'>m...,6 mint leaves,mint leaves,,6,,
10198,optional),<span class='qty'>1</span><span class='name'>T...,1 Tbsp sugar (optional),Tbsp sugar,(,1,,
10199,(or chopped),<span class='qty'>1/2</span><span class='unit'...,1/2 cup white chocolate chips (or chopped),white chocolate chips,,1/2,,cup


In [898]:
ingredientDF2 = ingredientDF2.reset_index(drop=True)
# Side-by-side join on index labels; nothing here verifies that row k of both
# frames describes the same ingredient line.
# NOTE(review): the inspection cells show label 14638 as '1lb' in the ingredient
# frame but '1 teaspoon fish sauce' in the tag frame — confirm alignment before
# trusting the merged columns.
ingredientTaggedDF = pd.concat([ingredientDF2,ingredientTagsDF], axis = 1)

In [270]:
ingredientDF.tail(3)

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
29750,2434,9,3/4 cup arborio rice
29751,2434,10,1 3/4 cup coconut milk
29752,2434,11,"Salt and black pepper, to taste"


In [232]:
ingredientTaggedDF.to_pickle('ingredientTaggedDF_v2')

In [233]:
# Reload the frame from the same relative path the preceding cell pickled it to,
# rather than from a machine-specific absolute home-directory path.
ingredientTaggedDF = pd.read_pickle('ingredientTaggedDF_v2')

## Create Ingredient Documents

In [234]:
# Force every name to a string so the names can be joined with ' '.join later.
# Missing names become the literal text 'nan' (visible in the token output and
# listed as a stop word when the vectorizer is built).
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [235]:
def ingredientTokens(df,taggeddf):
    """Add an ``ingredientTokens`` column: each recipe's tagged names joined by spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Recipe frame. It is modified in place (the column is added or
        overwritten) and also returned.
    taggeddf : pandas.DataFrame
        One row per ingredient, with a ``recipeIndex`` column and a
        ``name`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``ingredientTokens`` holding strings.

    Notes
    -----
    * ``recipeIndex`` values are matched against ``df``'s index labels.
    * Recipes with no tagged rows get an empty string.
    * The column is assigned in a single statement rather than element by
      element through chained indexing (``df[col][i] = ...``), which
      raises SettingWithCopyWarning and may write to a temporary copy.
    * Grouping by ``recipeIndex`` means gaps in the recipe numbering do
      not cause later recipes to be skipped.
    """
    names = taggeddf['name'].astype(str)
    joined = names.groupby(taggeddf['recipeIndex']).agg(' '.join)
    # Look up each recipe's joined names by index label; '' where none exist.
    df['ingredientTokens'] = joined.reindex(df.index, fill_value='').to_numpy()
    df['ingredientTokens'] = df['ingredientTokens'].astype(str)

    return df

In [236]:
def nlpTokens(document):
    """Normalise a text document into a space-joined string of tokens.

    Steps, in order: split on runs of word characters (``\\w+``),
    Porter-stem each token, lemmatize the stem as a verb, then drop
    tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    document : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    The stop-word check runs on the stemmed/lemmatized form, so a stop
    word that stemming alters is not removed.
    """
    # Build the helpers once per call instead of once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))  # set gives O(1) membership tests

    tokens = RegexpTokenizer(r'\w+').tokenize(document)
    tokens = [lemmatizer.lemmatize(stemmer.stem(x), 'v') for x in tokens]
    return ' '.join(x for x in tokens if x not in stop)

In [237]:
# Build one raw ingredient-name document per recipe ...
recipeDF = ingredientTokens(recipeDF,ingredientTaggedDF)
# ... then stem, lemmatize and stop-word filter each document.
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(nlpTokens)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [238]:
recipeDF.to_pickle('recipeDF_tokens_v2')

In [239]:
recipeDF = pd.read_pickle('recipeDF_tokens_v2')

In [142]:
# Drops seven rows selected by *position* (positions are translated to labels first).
# Not idempotent: recipeDF is reassigned, so re-running this cell removes a further
# seven rows at the shifted positions.
# NOTE(review): the reason for these specific rows is not recorded here —
# presumably they are recipes that break the row normalisation used for
# pyLDAvis; confirm and derive the list programmatically.
recipeDF = recipeDF.drop(recipeDF.index[[531,555,596,962,1182,1263,2434]])

## TFIDF vectorizer

In [240]:
def tfidfVectorizer(stopList,df):
    """Fit a TF-IDF model on ``df.ingredientTokens``.

    Parameters
    ----------
    stopList : list of str
        Stop words passed to the vectorizer.
    df : pandas.DataFrame
        Frame with an ``ingredientTokens`` column of text documents.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, tfidf_matrix, terms, vocab)`` where
        ``terms`` is a list of ``(column_index, term)`` pairs sorted by
        column index and ``vocab`` is the list of terms in that order.
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_df=.2,
        max_features=1300,
        min_df=0.01,
        stop_words=stopList,
        use_idf=True,
        ngram_range=(1,3),
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(df.ingredientTokens)

    # vocabulary_ maps term -> column; flip each pair so sorting orders by column.
    terms = sorted((column, term) for term, column in tfidf_vectorizer.vocabulary_.items())
    vocab = [term for _, term in terms]

    return tfidf_vectorizer,tfidf_matrix,terms,vocab

In [241]:
# 'nan' (left by astype(str) on missing names) and the unit abbreviations
# 'tbsp'/'tsp' are passed as stop words so they are excluded from the vocabulary.
tfidf_vectorizer,tfidf_matrix,terms,vocab = tfidfVectorizer(['nan','tbsp','tsp'],recipeDF)

## Non-Negative Matrix Factorization

In [242]:
# Truncated SVD of the TF-IDF matrix; the singular values in Sigma are
# inspected in the next cell.
# random_state is fixed so that re-running the notebook reproduces the same
# decomposition (the algorithm is randomized).
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=25,
                                      n_iter=5,
                                      random_state=0)

In [243]:
Sigma

array([ 11.61051335,   8.67547051,   6.27670232,   6.14134336,
         5.66499954,   5.53454361,   5.34680769,   5.27247656,
         5.18015558,   5.10820031,   4.985408  ,   4.9572764 ,
         4.91417745,   4.77863364,   4.70853471,   4.63092139,
         4.56309018,   4.46231014,   4.42105339,   4.38307695,
         4.23157556,   4.18686674,   4.15867876,   4.05321742,   4.04993842])

In [244]:
def NMFmodeltopics(tfidf_matrix, n_components=15, random_state=0):
    """Fit an NMF topic model and return it with the document-topic matrix.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix
        Document-term matrix to factorize.
    n_components : int, optional
        Number of topics. Defaults to 15.
    random_state : int, optional
        Seed for the random initialisation. Defaults to 0.

    Returns
    -------
    tuple
        ``(fitted_model, doc_topic_matrix)`` where ``doc_topic_matrix`` is
        the result of ``fit_transform`` on ``tfidf_matrix``.
    """
    NMFmodel = NMF(n_components=n_components, init='random', random_state=random_state)
    NMF_topics = NMFmodel.fit_transform(tfidf_matrix)

    return NMFmodel,NMF_topics

In [245]:
NMFmodel,NMF_topics = NMFmodeltopics(tfidf_matrix)

## Visualize

In [149]:
def pyLDA(docs,lda, doc_topic_dists):
    """Build the pyLDAvis "prepared" data object for a fitted topic model.

    Parameters
    ----------
    docs : pandas.Series of str
        The documents; their character lengths (``docs.str.len()``) are
        passed as ``doc_lengths``.
    lda : fitted model
        Any model exposing ``components_`` (topics x terms).
    doc_topic_dists : numpy.ndarray
        Document-topic weights (docs x topics).

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``.

    Notes
    -----
    Reads the module-level ``vocab`` and ``tfidf_matrix``. Both weight
    matrices are divided row-wise by their row sums before being passed on.
    """
    def _row_normalise(data):
        # Divide every row by its own sum.
        return pd.DataFrame(data).div(data.sum(axis = 1), axis = 0)

    term_totals = np.asarray(tfidf_matrix.sum(axis = 0)).ravel().tolist()

    return pyLDAvis.prepare(
            doc_lengths = docs.str.len(),
            vocab = vocab,
            term_frequency = term_totals,
            topic_term_dists = _row_normalise(lda.components_), # topics x terms
            doc_topic_dists = _row_normalise(doc_topic_dists)) # docs x topics

In [150]:
# Sanity check of the row normalisation used for pyLDAvis: divide each
# document's topic weights by their sum, then list any document whose
# normalised weights do not add up to (about) 1. A row of all-zero weights
# divides to NaN, sums to 0, and therefore shows up here.
dftest = pd.DataFrame(NMF_topics).div(NMF_topics.sum(axis = 1), axis = 0)
dftest['total'] = dftest.sum(axis=1)
dftest[dftest.total < .99]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,total


In [153]:
# Prepare the NMF model for pyLDAvis and write the visualisation to an HTML file.
pyNMF = pyLDA(recipeDF.ingredientTokens,NMFmodel,NMF_topics)
pyLDAvis.save_html(pyNMF, 'pyldavisIngNMF.html')

## Cosine similarity

In [246]:
document = 'chicken onion avocado'

In [252]:
def topCosine(testTokens, NMF_topics,recipeDF, requiredTokens=("chicken", "avocado")):
    """Rank recipes by cosine similarity to a query in NMF topic space.

    Parameters
    ----------
    testTokens : str
        The query document, already tokenized the same way as the corpus.
    NMF_topics : numpy.ndarray
        Document-topic matrix of the corpus (docs x topics).
    recipeDF : pandas.DataFrame
        Frame with an ``ingredientTokens`` column; rows are taken by
        position with ``[i:i+1]``.
    requiredTokens : iterable of str, optional
        Tokens that must *all* occur in a recipe's token string for the
        recipe to be included in ``finalList``. Defaults to
        ``("chicken", "avocado")``.

    Returns
    -------
    tuple
        ``(cosineList, recipes, finalList)``:
        ``cosineList`` is ``(similarity, position)`` pairs sorted from most
        to least similar; ``recipes`` is ``[position, [token_string]]`` in
        the same order; ``finalList`` is the subset of ``recipes``
        containing every required token.

    Notes
    -----
    Uses the module-level ``tfidf_vectorizer`` and ``NMFmodel``.

    The filter previously read ``"chicken" and "avocado" in recipes[i][1]``,
    which Python evaluates as ``"avocado" in [token_string]`` — a test for
    list membership of the whole string — so nothing ever matched. Each
    required token is now looked up among the recipe's individual tokens.
    """
    query_tfidf = tfidf_vectorizer.transform([testTokens])
    # Named query_topics (not NMF) to avoid shadowing the imported NMF class.
    query_topics = NMFmodel.transform(query_tfidf)
    cosine = cosine_similarity(query_topics, NMF_topics)
    cosineList = sorted(((e,i) for i,e in enumerate(list(cosine[0]))), reverse = True)

    recipes = [[i, list(recipeDF.ingredientTokens[i:i+1])] for _, i in cosineList]

    required = set(requiredTokens)
    finalList = [
        recipe for recipe in recipes
        # recipe[1] is empty when position i is past the end of recipeDF.
        if recipe[1] and required.issubset(recipe[1][0].split())
    ]

    return cosineList,recipes,finalList

In [253]:
# Tokenize the query with nlpTokens, the same function applied to the recipe
# documents, so its terms match the fitted TF-IDF vocabulary. (createTokens is
# not defined anywhere in this notebook.)
cosineList,recipes,finalList = topCosine(nlpTokens(document),NMF_topics,recipeDF)

In [254]:
recipes

[[1178,
  [u'oliv oil nan yellow onion chili powder cumin salt pepper oliv oil garlic nan yellow onion']],
 [1415,
  [u'lime chili powder cayenn pepper salt pepper spaghetti squash avocado spinach chicken yellow onion']],
 [1412,
  [u'tomato oliv oil bacon jalapeno garlic yellow onion garlic powder cayenn pepper cilantro lime']],
 [1339,
  [u'egg salt avocado pineappl dice skinni yam potato plantain jalapeno red onion cilantro garlic garlic cayenn pepper salt pepper']],
 [339,
  [u'nan nan nan nan oliv oil chipotl powder chili powder paprika garlic powder onion powder salt recip vegan cheddar shred tomato onion onion nan cilantro']],
 [1111,
  [u'salsa chili powder onion powder garlic powder cilantro Salt batch nan ghee']],
 [1615,
  [u'yellow onion garlic jalapeo pull chicken tomato garbanzo bean chicken broth chili powder salt bbq sauc Smoke gouda chees cilantro spaghetti squash grapese oil garlic']],
 [676,
  [u'garlic powder cayenn pepper mango kiwi strawberri blueberri lime Tbsp']

In [258]:
list(recipeDF.url[recipeDF.ingredientTokens == 'honey cinnamon salt coconut oil garlic yellow plantain chicken garlic powder'])

[u'http://paleomg.com/soft-chewy-double-chocolate-cookies/']

In [221]:
recipeDF.ingredientsList[1200:1201]

1205    [1 large spaghetti squash, cut in half lengthw...
Name: ingredientsList, dtype: object

In [None]:
## Personalization