In [1]:
import pandas as pd
import numpy as np
import unicodedata as ucd
import json
from pprint import pprint

##text pre-processing
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity  




In [5]:
nyt = pd.read_csv("https://raw.githubusercontent.com/NYTimes/ingredient-phrase-tagger/master/nyt-ingredients-snapshot-2015.csv")

In [6]:
nyt.head(3)

Unnamed: 0,index,input,name,qty,range_end,unit,comment
0,0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"


In [59]:
pd.set_option('max_colwidth', 800)
nyt['input'][0:1]

0    1 1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted
Name: input, dtype: object

## concatenate json files and create dataframe

In [59]:
file_list = ["fullhelping_items.json", "iLoveVegan_items.json", "MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json"]
full = []
with open("allRecipes.json", "w") as outfile:
    for f in file_list:
        with open(f, 'rb') as infile:
            file_data = json.load(infile)
            full += file_data
    json.dump(full, outfile)


In [60]:
with open('allRecipes.json', 'rb') as infile:
    full = json.load(infile)

In [61]:
fullDF = pd.DataFrame(full)

In [62]:
fullDF1 = fullDF[(fullDF.ingredientsList.str.len() != 0)]

In [63]:
recipeDF = fullDF1.reset_index(drop=True)

In [64]:
recipeDF.to_pickle('recipeDF')

In [2]:
recipeDF = pd.read_pickle('recipeDF')

In [3]:
recipeDF.head(3)

Unnamed: 0.1,Unnamed: 0,category,ingredientsList,instructions,summary,title,url
0,0,[],"[u'1 pound purple asparagus, washed, woody end...","[u""Preheat the oven to 400F. Toss the asparagu...",[],[u'Purple Asparagus and Quinoa Salad with Peas...,http://www.thefullhelping.com/purple-asparagus...
1,1,[],[u'1 package DOLE\xae Power Up Greens\u2122 (o...,"[u'1. Place the greens, quinoa, blueberries, a...",[],"[u'Power Up Salad with Baby Kale, Quinoa, Blue...",http://www.thefullhelping.com/power-up-salad-w...
2,2,[],"[u""1\xbe cups gluten-free, all purpose flour o...",[u'Preheat your oven to 350F. Line a 12-muffin...,[],"[u'Vegan, Gluten Free Pumpkin Gingerbread Spic...",http://www.thefullhelping.com/vegan-gluten-fre...


## Write ingredients to file for NYT ingredient tagger

In [70]:
f = open("input.txt", "a")
ingredientList = []

for row in range(0,len(recipeDF)):
    ingredients = recipeDF.ingredientsList.iloc[row]
    for i in range(0,len(ingredients)):
        line = ingredients[i].encode('ascii','ignore')
        ingredientList.append([row,i,line])
        f.write(line + '\n')
f.close()
        

In [71]:
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [72]:
len(ingredientDF)

15654

## merge NYT ingredient tags with ingredientDF

In [73]:
with open('results.json', 'rb') as infile:
    ingredientTags = json.load(infile)

In [74]:
ingredientTagsDF = pd.DataFrame(ingredientTags)

In [75]:
ingredientDF2 = ingredientDF[ingredientDF['recipeText'].str.contains("\n|\t") == False]
ingredientDF2 = ingredientDF2.reset_index(drop=True)

In [76]:
len(ingredientDF2)

15296

In [77]:
ingredientTaggedDF = pd.concat([ingredientDF2,ingredientTagsDF], axis = 1)

In [78]:
ingredientTaggedDF.to_pickle('ingredientTaggedDF')

In [4]:
ingredientTaggedDF = pd.read_pickle('ingredientTaggedDF')

  interactivity=interactivity, compiler=compiler, result=result)


## create Ingredient Documents / Strings

In [79]:
len(ingredientTaggedDF.recipeIndex.unique())

1340

In [91]:
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [92]:
len(ingredientTaggedDF.recipeIndex.unique())

1340

In [93]:
recipeDF['ingredientTokens'] = 0

for i in range(0,len(ingredientTaggedDF.recipeIndex.unique())):
    ingredients = ingredientTaggedDF.name[ingredientTaggedDF.recipeIndex == i]
    recipeDF['ingredientTokens'][i] = str(' '.join(ingredients))   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [98]:
recipeDF['ingredientTokens'] = recipeDF['ingredientTokens'].astype(str)

In [99]:
def createTokens(document):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(document)
    
    tokens = [PorterStemmer().stem(x) for x in tokens]
    tokens = [WordNetLemmatizer().lemmatize(x, 'v') for x in tokens]
    
    stop = stopwords.words('english')
    tokens = [x for x in tokens if x not in stop] 
    return ' '.join(tokens)

In [100]:
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(createTokens)

In [101]:
recipeDFnull = recipeDF[recipeDF.ingredientTokens.isnull() == True]

In [102]:
recipeDFnull

Unnamed: 0,category,ingredientsList,instructions,summary,title,url,ingredientTokens


In [103]:
recipeDF.to_pickle('recipeDF_tokens')

In [106]:
recipeDF = pd.read_pickle('recipeDF_tokens')

In [108]:
recipeDF[1100:1101]

Unnamed: 0,category,ingredientsList,instructions,summary,title,url,ingredientTokens
1100,[],"[2 pounds boneless, skinless chicken (or whate...","[Preheat oven to 400 degres., Place chicken in...",[],[Enchilada Lasagna],http://paleomg.com/chicken-enchilada-lasagna/,chicken oliv oil nan yellow onion chili powder...


## TFIDF vectorizer

In [109]:
stop = ['nan']

In [110]:
tfidf_vectorizer = TfidfVectorizer(max_df=.99, max_features=1300, min_df=0.01, stop_words= stop, use_idf=True, ngram_range=(1,3))

In [111]:
tfidf_matrix = tfidf_vectorizer.fit_transform(recipeDF.ingredientTokens)

In [112]:
terms = sorted([(i,v) for v,i in tfidf_vectorizer.vocabulary_.items()])

In [113]:
vocab = [x[1] for x in terms]

## NMF

In [114]:
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=15,
                                      n_iter=5,
                                      random_state=None)

In [115]:
Sigma

array([ 10.54430843,   7.71903456,   5.72599525,   4.86166141,
         4.68451929,   4.51820347,   4.27080074,   4.1620107 ,
         4.09972479,   3.99463508,   3.98368869,   3.82097501,
         3.71322493,   3.62741917,   3.5718115 ])

In [116]:
NMFmodel = NMF(n_components=4, init='random', random_state=0)
NMF_topics = NMFmodel.fit_transform(tfidf_matrix)

In [117]:
NMF_topics.shape

(1342, 4)

In [382]:
clusters = [list(r).index(max(r)) for r in NMF_topics]
counts = [0,0,0,0]
for c in clusters:
    counts[c]+=1
counts
clusters[0:20]
recipeDF.ingredientTokens[6]

u'nan medjool date vanilla extract oat chia seed rhubarb water cup sugar coconut sugar'

## User input test

In [383]:
test = 'chicken tomato asparagus avocado vanilla milk'

In [384]:
testTokens = createTokens(test)

In [385]:
tf = tfidf_vectorizer.transform([testTokens])

In [386]:
tf

<1x552 sparse matrix of type '<type 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [387]:
test1 = NMFmodel.transform(tf)

In [388]:
cosine = cosine_similarity(test1, NMF_topics) 

In [389]:
NMF_topics[1001]

array([ 0.22909669,  0.00249022,  0.        ,  0.        ])

In [390]:
test1

array([[ 0.04650726,  0.01539358,  0.        ,  0.04269284]])

In [391]:
cosineList = sorted(((e,i) for i,e in enumerate(list(cosine[0]))), reverse = True)

In [393]:
cosineList[0:20]

[(0.99883351698871969, 1081),
 (0.99634304348762637, 1303),
 (0.98844048294962583, 1285),
 (0.98708266902136699, 1153),
 (0.98057730204906868, 139),
 (0.97591675163357738, 363),
 (0.97442666115079291, 1068),
 (0.97153351820114153, 1310),
 (0.96975395457217339, 10),
 (0.96972561893615261, 1044),
 (0.96762772055105328, 1279),
 (0.96690479434197352, 1024),
 (0.96675219781722732, 981),
 (0.96609710752746891, 1234),
 (0.96514368997559852, 138),
 (0.96489191414353037, 76),
 (0.96466934596948961, 1117),
 (0.96464871501720773, 1034),
 (0.96454616718593211, 158),
 (0.96426798896976607, 1188)]

<1x552 sparse matrix of type '<type 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [394]:
top10 = [x[1] for x in cosineList[0:100]]
top10ingredients = []

for i in top10:
    ing = list(recipeDF.ingredientTokens[i:i+1])
    top10ingredients.append(ing)
    
    #if any("chicken" in s for s in ing) and any("tomato" in s for s in ing):
        #print ing

In [395]:
top10ingredients

[[u'nan coconut milk nan Coconut Flour parsley garlic powder sage smoke paprika salt pepper'],
 [u'lb turkey last time yellow onion almond flour egg curri powder garam masala ginger cinnamon salt pepper coconut oil coconut milk chicken broth onion dice curri powder garam masala ginger'],
 [u'beef carrot scallion almond flour meal coconut milk coconut amino curri powder sesam oil ginger red pepper salt pepper coconut oil'],
 [u'nan butternut squash pecan nan nan bacon fat cinnamon salt'],
 [u'cantaloup nan arugula almond flax appl cider vinegar mapl syrup salt black pepper'],
 [u'nan salt pretzel nan'],
 [u'egg coconut milk coconut flour salt pork short rib mapl syrup garlic powder salt bacon nan sauc green onion'],
 [u'red grape nan bacon balsam vinegar coconut oil salt'],
 [u'coconut oil mustard seed onion carrot ginger ginger turmer cumin coriand salt black pepper yellow split pea water coconut milk lime juic brown rice scallion green onion cilantro coconut milk'],
 [u'nan nan leek a

## Personalized health-states logic