In [812]:
import pandas as pd
import numpy as np
from statistics import mode
import unicodedata as ucd
import random
import json
from pprint import pprint

##text pre-processing
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity  

import pyLDAvis

In [5]:
nyt = pd.read_csv("https://raw.githubusercontent.com/NYTimes/ingredient-phrase-tagger/master/nyt-ingredients-snapshot-2015.csv")

In [6]:
nyt.head(3)

Unnamed: 0,index,input,name,qty,range_end,unit,comment
0,0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"


In [59]:
pd.set_option('max_colwidth', 800)
nyt['input'][0:1]

0    1 1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted
Name: input, dtype: object

## concatenate json files and create dataframe

In [198]:
file_list = ["fullhelping_items.json", "iLoveVegan_items.json", "MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json"]
full = []
with open("allRecipes.json", "w") as outfile:
    for f in file_list:
        with open(f, 'rb') as infile:
            file_data = json.load(infile)
            full += file_data
    json.dump(full, outfile)


In [214]:
with open('allRecipes.json', 'rb') as infile:
    full = json.load(infile)

In [215]:
fullDF = pd.DataFrame(full)

In [216]:
fullDF1 = fullDF[(fullDF.ingredientsList.str.len() != 0)]

In [217]:
recipeDF = fullDF1.reset_index(drop=True)

In [218]:
recipeDF.shape

(1342, 6)

In [209]:
recipeDF.to_pickle('recipeDF v2')

In [210]:
recipeDF = pd.read_pickle('recipeDF v2')

In [211]:
recipeDF.head(3)

Unnamed: 0,category,ingredientsList,instructions,summary,title,url
0,[],"[1 pound purple asparagus, washed, woody ends ...",[Preheat the oven to 400F. Toss the asparagus ...,[],[Purple Asparagus and Quinoa Salad with Peas a...,http://www.thefullhelping.com/purple-asparagus...
1,[],[1 package DOLE® Power Up Greens™ (or 6 oz. ba...,"[1. Place the greens, quinoa, blueberries, and...",[],"[Power Up Salad with Baby Kale, Quinoa, Bluebe...",http://www.thefullhelping.com/power-up-salad-w...
2,[],"[1¾ cups gluten-free, all purpose flour or who...",[Preheat your oven to 350F. Line a 12-muffin b...,[],"[Vegan, Gluten Free Pumpkin Gingerbread Spice ...",http://www.thefullhelping.com/vegan-gluten-fre...


## Write ingredients to file for NYT ingredient tagger

In [70]:
f = open("input.txt", "a")
ingredientList = []

for row in range(0,len(recipeDF)):
    ingredients = recipeDF.ingredientsList.iloc[row]
    for i in range(0,len(ingredients)):
        line = ingredients[i].encode('ascii','ignore')
        ingredientList.append([row,i,line])
        f.write(line + '\n')
f.close()
        

In [71]:
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [72]:
len(ingredientDF)

15654

## merge NYT ingredient tags with ingredientDF

In [73]:
with open('results.json', 'rb') as infile:
    ingredientTags = json.load(infile)

In [74]:
ingredientTagsDF = pd.DataFrame(ingredientTags)

In [75]:
ingredientDF2 = ingredientDF[ingredientDF['recipeText'].str.contains("\n|\t") == False]
ingredientDF2 = ingredientDF2.reset_index(drop=True)

In [76]:
len(ingredientDF2)

15296

In [77]:
ingredientTaggedDF = pd.concat([ingredientDF2,ingredientTagsDF], axis = 1)

In [78]:
ingredientTaggedDF.to_pickle('ingredientTaggedDF')

In [None]:
ingredientTaggedDF = pd.read_pickle('/home/rachaelrho/ds/metis/final_project/foodBlogs/ingredientTaggedDF')

## create Ingredient Documents / Strings

In [79]:
len(ingredientTaggedDF.recipeIndex.unique())

1340

In [91]:
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [92]:
len(ingredientTaggedDF.recipeIndex.unique())

1340

In [93]:
recipeDF['ingredientTokens'] = 0

for i in range(0,len(ingredientTaggedDF.recipeIndex.unique())):
    ingredients = ingredientTaggedDF.name[ingredientTaggedDF.recipeIndex == i]
    recipeDF['ingredientTokens'][i] = str(' '.join(ingredients))   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [98]:
recipeDF['ingredientTokens'] = recipeDF['ingredientTokens'].astype(str)

In [23]:
def createTokens(document):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(document)
    
    tokens = [PorterStemmer().stem(x) for x in tokens]
    tokens = [WordNetLemmatizer().lemmatize(x, 'v') for x in tokens]
    
    stop = stopwords.words('english')
    tokens = [x for x in tokens if x not in stop] 
    return ' '.join(tokens)

In [100]:
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(createTokens)

In [101]:
recipeDFnull = recipeDF[recipeDF.ingredientTokens.isnull() == True]

In [103]:
recipeDF.to_pickle('recipeDF_tokens')

In [660]:
recipeDF = pd.read_pickle('recipeDF_tokens')

In [362]:
recipeDF[381:382]

Unnamed: 0,category,ingredientsList,instructions,summary,title,url,ingredientTokens
381,[Dessert],"[~2.5 cups , 16 , (made in circles, not shapes)]",[See cookie recipe and follow instructions. On...,[Creamy ice cream sandwiches made with a sweet...,[Chai Ginger Ice Cream Sandwiches],http://minimalistbaker.com/chai-ginger-ice-cre...,nan nan nan


In [689]:
recipeDF = recipeDF.drop(recipeDF.index[[363,381,428,503,512,677,715,1098,1179,1340,1341]]) ##new line to take out nan tokens.drop(fullDF1.index[[381,512,677,1098,1179,1340,1341]]) ##new line to take out nan tokens

In [690]:
recipeDF.shape

(1331, 7)

## TFIDF vectorizer

In [691]:
stop = ['nan','tbsp','tsp']

In [692]:
tfidf_vectorizer = TfidfVectorizer(max_df=.2, max_features=1300, min_df=0.01, stop_words= stop, use_idf=True, ngram_range=(1,3))

In [693]:
tfidf_matrix = tfidf_vectorizer.fit_transform(recipeDF.ingredientTokens)

In [694]:
terms = sorted([(i,v) for v,i in tfidf_vectorizer.vocabulary_.items()])

In [695]:
vocab = [x[1] for x in terms]

In [696]:
feature_array = np.array(tfidf_vectorizer.get_feature_names())
tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]

n = 300
top_n = feature_array[tfidf_sorting][:n]

In [697]:
random.sample(top_n,3)

[u'garlic oliv oil', u'cider vinegar', u'mapl syrup salt']

In [698]:
' '.join(random.sample(top_n,3))

u'chocol garlic salt chocol chip'

## NMF

In [699]:
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=25,
                                      n_iter=5,
                                      random_state=None)

In [700]:
Sigma

array([ 8.3424093 ,  6.68604512,  5.03850913,  4.9130097 ,  4.59116261,
        4.49135099,  4.40257653,  4.27639909,  4.13437523,  3.92875692,
        3.80029697,  3.75193363,  3.73394517,  3.64916834,  3.57065936,
        3.55146463,  3.5073692 ,  3.41596696,  3.34552493,  3.27206199,
        3.21667515,  3.16616908,  3.1425937 ,  3.10537652,  3.06208194])

In [741]:
NMFmodel = NMF(n_components=15, init='random', random_state=0)
NMF_topics = NMFmodel.fit_transform(tfidf_matrix)

In [763]:
len(clusters)

1331

In [766]:
recipeDF['cluster'] = clusters

In [814]:
recipeDF.title[recipeDF.cluster == 7]

2       [Vegan, Gluten Free Pumpkin Gingerbread Spice ...
4            [Vegan Slow Cooker Apple Pie Steel Cut Oats]
5                             [Vegan Carob Chip Pancakes]
9                        [Walnut Cheddar and Apple Toast]
40               [Gluten Free, Vegan Zucchini Date Bread]
67      [Mango Curry Tofu from Vegan Richa’s Indian Ki...
72                             [Berry Hemp Spelt Muffins]
87      [Vegan, Gluten Free Pumpkin Skillet Cornbread ...
91      [Gingery Pink Rice with Roasted Butternut Squa...
109     [Sweet Dijon Vinaigrette and Chickpea, Sweet P...
112     [Creamy Polenta with Barbecue Tofu and Mango a...
120     [Spicy Peanut Kale Salad (gluten and soy free ...
138     [Roasted Beet, Baby Spinach, and Toasted Quino...
147     [Quinoa, Corn, Black Bean, and Tempeh Salad wi...
165                   [Fifteen Minute Tempeh Lunch Salad]
172                   [Almond Butter and Sesame Dressing]
180     [Beet Tartare with Cashew Cheese (vegan, glute...
183          [

In [813]:
mode(list(recipeDF.cluster[recipeDF.ingredientTokens.str.contains("vinegar")]))

7

In [None]:
###### clusters = [list(r).index(max(r)) for r in NMF_topics]
counts = [0]*NMF_topics.shape[1]
for c in clusters:
    counts[c]+=1
counts
#clusters[0:20]
#recipeDF.ingredientTokens[6]

## User input test

In [744]:
test = ['chicken']

In [820]:
document = 'cocoa' #chicken avocado tomato'

In [821]:
def testCosine(testTokens, NMF_topics):
    tf = tfidf_vectorizer.transform([testTokens])
    NMF = NMFmodel.transform(tf)
    cosine = cosine_similarity(NMF, NMF_topics) 
    cosineList = sorted(((e,i) for i,e in enumerate(list(cosine[0]))), reverse = True)
    return cosineList

In [832]:
def topRecipes(recipeDF,cosineList): ##top recipes that contain keywords
    
    top100 = [x[1] for x in cosineList[0:100]]
    topRecipes = []

    for i in top100:
        ing = [i,list(recipeDF.ingredientTokens[i:i+1])]
        topRecipes.append(ing)
        
    
    return topRecipes

In [833]:
cosineList = testCosine(createTokens(document), NMF_topics)

In [834]:
topRecipes = topRecipes(recipeDF,cosineList)

In [874]:
topRecipes

[[1200,
  [u'chicken breast yellow onion green bell pepper jalapeno nan coconut oil tomato tomato sauc garlic cumin chili powder oregano salt pepper cilantro avocado']],
 [1025,
  [u'egg avocado salsa chili powder onion powder garlic powder cilantro Salt']],
 [195,
  [u'English muffin nan tempeh bacon Daiya chees butter lettuc tomato avocado garlic guacamol mayo']],
 [1253,
  [u'avocado pineappl dice skinni yam potato plantain jalapeno red onion cilantro garlic garlic cayenn pepper salt pepper coconut oil aluminum foil']],
 [1159, [u'bacon sweet pepper smoke salmon salt smoke paprika']],
 [1288,
  [u'potato make lb yellow onion garlic chili powder red pepper cayenn pepper cumin oregano tomato tomato past veget broth salt pepper choic']],
 [1122,
  [u'cauliflow broth garlic powder cayenn pepper fresh cilantro lime pork chorizo nan white onion avocado choic salt pepper']],
 [1329,
  [u'sweet potato choic fat lime chili powder cayenn pepper salt pepper']],
 [255,
  [u'nan nan nan nan oliv

In [884]:
list(recipeDF.url[1200:1201])

[u'http://paleomg.com/enchilada-chicken-stew/']

In [872]:
finalList = []

for i in range(0,len(topRecipes)): 
    if "chicken" and "avocado"  in topRecipes[i][1]: ##and "tomato"
        finalList.append(topRecipes[i])

In [873]:
finalList

[]

In [751]:
list(recipeDF.url[1000:1001])

[u'http://paleomg.com/simple-sausage-and-bacon-butternut-squash-soup/']

In [752]:
instructions = list(recipeDF.instructions[1000:1001])

In [753]:
instructions[0][0].encode('ascii','ignore')

'Preheat oven to 400 degrees.'

In [754]:
[x[1] for x in cosineList[0:100]]

[1262,
 1256,
 1020,
 1289,
 1291,
 1158,
 1054,
 982,
 1297,
 1189,
 998,
 1295,
 678,
 1153,
 1070,
 1198,
 1287,
 1285,
 1129,
 1210,
 1112,
 1308,
 1283,
 1281,
 1323,
 1132,
 1212,
 1250,
 1222,
 1311,
 1215,
 1181,
 1175,
 1214,
 1078,
 1150,
 1002,
 1203,
 1037,
 1085,
 1141,
 1125,
 1202,
 1309,
 1188,
 1207,
 1313,
 1216,
 1316,
 1229,
 1230,
 1160,
 1145,
 729,
 1163,
 1127,
 1080,
 1033,
 1330,
 1115,
 1073,
 1047,
 1030,
 1304,
 1223,
 1018,
 1183,
 1321,
 1149,
 980,
 1288,
 1239,
 1028,
 1096,
 1074,
 1326,
 1068,
 611,
 1095,
 1318,
 1140,
 1089,
 1294,
 1259,
 1329,
 1197,
 707,
 1199,
 1048,
 504,
 1206,
 1113,
 249,
 1174,
 1286,
 1135,
 1122,
 1180,
 1128,
 1159]

## Visualize

In [755]:
def pyLDA(docs,lda, doc_topic_dists):
    get_normed = lambda data: pd.DataFrame(data).div(data.sum(axis = 1), axis = 0) 
    prepared = pyLDAvis.prepare(
            doc_lengths = docs.str.len(),
            vocab = vocab,
            term_frequency = np.asarray(tfidf_matrix.sum(axis = 0)).ravel().tolist(),
            topic_term_dists = get_normed(lda.components_), # topics x terms 
            doc_topic_dists = get_normed(doc_topic_dists)) # docs x topics 
    return prepared

In [756]:
dftest = pd.DataFrame(NMF_topics).div(NMF_topics.sum(axis = 1), axis = 0)

In [757]:
dftest['total'] = dftest.sum(axis=1)

In [758]:
dftest[dftest.total < .99]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,total


In [759]:
pyNMF = pyLDA(recipeDF.ingredientTokens,NMFmodel,NMF_topics)

In [760]:
pyLDAvis.save_html(pyNMF, 'pyldavisIngNMF.html')

## Personalized health-states logic