In [187]:
import pandas as pd
import numpy as np
import json

#html parsing
import urllib2
from lxml import etree

##text pre-processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity  

## Concatenate json files and create dataframe

In [2]:
def concatJson(file_list, out_name="allRecipes.json"):
    """Concatenate several JSON files into one JSON file.

    Parameters
    ----------
    file_list : list of str
        Paths of the JSON files to read. The parsed content of each file is
        appended (``+=``) to one running list, in the order given.
    out_name : str, optional
        Path of the combined file. Defaults to "allRecipes.json", the name
        the notebook reads back afterwards.

    Raises
    ------
    IOError, ValueError
        Propagated from ``open`` / ``json.load`` when an input is missing or
        is not valid JSON.
    """
    full = []
    # Read every input before touching the output: opening the output first
    # would truncate an existing combined file even when a later input fails.
    for path in file_list:
        with open(path, 'rb') as infile:
            full += json.load(infile)
    with open(out_name, "w") as outfile:
        json.dump(full, outfile)

In [12]:
def openJson(name):
    """Parse the JSON file at path `name` and return the decoded object."""
    infile = open(name, 'rb')
    try:
        return json.load(infile)
    finally:
        # Mirrors the context manager: the handle is closed on every exit path.
        infile.close()

In [28]:
# One JSON file of scraped items per food blog; these are combined by concatJson.
file_list = ["fullhelping_items.json", "iLoveVegan_items.json","MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json","roastedRoot_items.json","ohmyveggies_items.json",]

In [29]:
# Rewrite allRecipes.json from the per-blog files, then load the combined list.
concatJson(file_list)
full = openJson('allRecipes.json')

In [31]:
# One row per scraped item.
fullDF = pd.DataFrame(full)
# Keep only recipes that actually list ingredients. `> 0` (rather than `!= 0`)
# also drops rows whose ingredientsList is missing: .str.len() gives NaN for
# those, and NaN != 0 is True, which would let them through to code that
# calls len() on the value.
fullDF1 = fullDF[fullDF.ingredientsList.str.len() > 0]
# Renumber rows 0..n-1 so the row position can serve as the recipe index.
recipeDF = fullDF1.reset_index(drop=True)

In [33]:
# Cache the filtered recipe frame on disk.
recipeDF.to_pickle('recipeDF_v3')

In [4]:
# Reload the cached recipe frame (lets a session start here without re-reading the JSON).
recipeDF = pd.read_pickle('recipeDF_v3')

In [8]:
# Peek at the last three recipes.
recipeDF.tail(3)

Unnamed: 0,category,ingredientsList,instructions,summary,title,url
2432,[],"[Eggplant bacon:, 1/4 cup soy sauce, 1/4 cup a...","[Make the eggplant bacon:, In a medium bowl, s...",[],[],http://ohmyveggies.com/the-best-vegan-blt-with...
2433,[],"[8 oz. tempeh, cut into thin strips, 3/4 c. ba...",[Spread 1/4 cup of barbecue sauce on the botto...,[],[],http://ohmyveggies.com/recipe-bbq-tempeh-sandw...
2434,[],"[4 red bell peppers, 2 tablespoons safflower s...","[Preheat oven to 350ºF., Bring a large pot of ...",[],[],http://ohmyveggies.com/thai-stuffed-peppers/


## Write ingredients to a file in the input format of the NYT ingredient tagger

In [2]:
def nytIngredientInput(df):
    """Write every ingredient line to input.txt and return them as records.

    Each ingredient string in ``df.ingredientsList`` is reduced to ASCII
    (non-ASCII characters are dropped) and appended to "input.txt" as
    ``<text> /<row> `` followed by a newline, where ``<row>`` is the
    positional row of the recipe in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``ingredientsList`` column holding one sequence of
        ingredient strings per row.

    Returns
    -------
    list of [row, i, line]
        Recipe row position, position of the ingredient within the recipe,
        and the ASCII-reduced ingredient text.
    """
    ingredientList = []
    # NOTE(review): the file is opened in append mode, so running this twice
    # duplicates every line in input.txt - confirm that is intended.
    # `with` guarantees the handle is closed even if a row raises.
    with open("input.txt", "a") as f:
        for row in range(len(df)):
            ingredients = df.ingredientsList.iloc[row]
            for i, raw in enumerate(ingredients):
                line = raw.encode('ascii', 'ignore')
                ingredientList.append([row, i, line])
                f.write(line + ' /' + str(row) + ' ' + '\n')
    return ingredientList

In [5]:
# Write the tagger input file and keep the [recipe row, ingredient position, text] records.
ingredientList = nytIngredientInput(recipeDF)

In [6]:
# One row per ingredient line, keyed by the recipe it came from.
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [7]:
recipeDF.to_pickle('ingredientDF_v1')

In [3]:
recipeDF = pd.read_pickle('ingredientDF_v1')

In [9]:
# Inspect a single ingredient row by position.
ingredientDF[1534:1535]

Unnamed: 0,recipeIndex,ingredientIndex,recipeText
1534,120,15,or


In [10]:
# Total number of ingredient lines across all recipes.
len(ingredientDF)

29753

## Merge NYT ingredient tags with ingredientDF
<p>(see NYT_CRF_IngredientPhraseTagger.txt for tags) </p>

In [13]:
# Load the tagger output JSON into a DataFrame and cache it as a pickle.
ingredientTags = openJson('results_6-21.json')
ingredientTagsDF = pd.DataFrame(ingredientTags)
ingredientTagsDF.to_pickle('ingredientTagsDF')

In [15]:
# Put the tagger columns next to the ingredient rows (column-wise concat).
# NOTE(review): this pairs rows by index label, so it assumes the tagger
# returned exactly one record per input line, in the same order - confirm.
ingredientTaggedDF = pd.concat([ingredientDF,ingredientTagsDF], axis = 1)

In [24]:
def mergedIndex(df):
    """Split the trailing recipe index off every value in ``df['name']``.

    Each name is converted to ``str`` and split once on its last space. The
    part before the space replaces the name; the part after it, when it
    parses as an integer, is stored in a new ``mergedrecipeIndex`` column.
    Rows whose name has no space, or whose last word is not an integer, get
    the placeholder 'NA' in ``mergedrecipeIndex``.

    NOTE(review): as in the original loop, a name whose last word is not an
    integer still loses that last word - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``name`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Returning it lets
        ``frame = mergedIndex(frame)`` work instead of rebinding the frame
        to None.
    """
    names = []
    recipe_ids = []
    for raw in df['name']:
        parts = str(raw).rsplit(' ', 1)
        names.append(parts[0])
        try:
            recipe_ids.append(int(parts[1]))
        except (IndexError, ValueError):
            # No trailing token, or it is not a number: keep the placeholder.
            recipe_ids.append('NA')

    # Assign whole columns at once; per-cell chained `.loc` writes may land on
    # a temporary copy and be silently lost.
    df['name'] = names
    # Object dtype keeps integers and the 'NA' placeholder side by side.
    df['mergedrecipeIndex'] = pd.Series(recipe_ids, index=df.index, dtype=object)
    return df

In [None]:
# Separate the recipe index that trails each tagged ingredient name.
ingredientTaggedDF = mergedIndex(ingredientTaggedDF)

In [17]:
# Cache the tagged ingredient frame on disk.
ingredientTaggedDF.to_pickle('ingredientTaggedDF_v4')

In [4]:
# Reload a cached tagged ingredient frame.
# NOTE(review): this reads the _v3 pickle from an absolute, machine-specific
# path, whereas the preceding cell writes _v4 to the working directory -
# confirm which version is intended.
ingredientTaggedDF = pd.read_pickle('/home/rachaelrho/ds/metis/final_project/foodBlogs/ingredientTaggedDF_v3')

## Create Ingredient Documents

In [158]:
# Force every name to str: ingredientTokens joins the names with ' '.join,
# which only accepts strings.
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [159]:
def ingredientTokens(df,taggeddf):
    """Add an ``ingredientTokens`` column: one space-joined document per recipe.

    For every i in ``range(number of distinct recipeIndex values)`` the
    ``name`` values of the rows in ``taggeddf`` with ``recipeIndex == i`` are
    joined with single spaces and stored on the row of ``df`` labelled i.
    Rows of ``df`` that receive no document keep the string '0'.

    Parameters
    ----------
    df : pandas.DataFrame
        Recipe frame; receives the new column (modified in place).
    taggeddf : pandas.DataFrame
        Needs ``recipeIndex`` and ``name`` columns; ``name`` must hold strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Build the column as a standalone Series and attach it once. Writing
    # through df['ingredientTokens'][i] is chained assignment: it triggers
    # SettingWithCopyWarning and may update a temporary copy instead of df.
    tokens = pd.Series('0', index=df.index, dtype=object)
    for i in range(0, len(taggeddf.recipeIndex.unique())):
        if i not in tokens.index:
            # No recipe row carries this label; nothing to attach it to.
            continue
        ingredients = taggeddf.name[taggeddf.recipeIndex == i]
        tokens.loc[i] = str(' '.join(ingredients))
    df['ingredientTokens'] = tokens.astype(str)

    return df

In [46]:
def nlpTokens(document):
    """Normalise a text document into a comma-joined string of tokens.

    Steps, in order: split on runs of word characters, Porter-stem each
    token, lemmatize the stem as a verb, drop tokens that appear in NLTK's
    English stop-word list, and join the remainder with ','.

    NOTE(review): stop words are removed after stemming and tokens are not
    lower-cased here, so stop words whose stem differs from the listed form
    (and capitalised ones) may survive - confirm whether that is wanted.

    Parameters
    ----------
    document : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined with ','.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # Build the helpers once per call instead of once per token, and use a set
    # so each stop-word lookup does not scan the whole list.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))

    tokens = []
    for word in tokenizer.tokenize(document):
        token = lemmatizer.lemmatize(stemmer.stem(word), 'v')
        if token not in stop:
            tokens.append(token)
    return ','.join(tokens)

In [167]:
# Build one ingredient document per recipe, then stem/lemmatize/stop-filter it
# into a comma-joined token string.
recipeDF = ingredientTokens(recipeDF,ingredientTaggedDF)
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(nlpTokens)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [170]:
# Cache the recipe frame together with its token documents.
recipeDF.to_pickle('recipeDF_tokens_v3')

In [5]:
# Reload the tokenised recipe frame (lets a session start here without re-tokenising).
recipeDF = pd.read_pickle('recipeDF_tokens_v3')

## TFIDF vectorizer

In [6]:
def tfidfVectorizer(stopList,df):
    """Fit a TF-IDF model on ``df.ingredientTokens``.

    Parameters
    ----------
    stopList : list of str
        Extra stop words passed to the vectorizer.
    df : pandas.DataFrame
        Needs an ``ingredientTokens`` column of text documents.

    Returns
    -------
    tuple
        ``(vectorizer, matrix, terms, vocab)``: the fitted vectorizer, the
        document-term matrix, the ``(column, term)`` pairs sorted by column,
        and the terms alone in that same order.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stopList,
        ngram_range=(1, 3),
        max_features=1300,
        max_df=.2,
        min_df=0.01,
        use_idf=True,
    )
    matrix = vectorizer.fit_transform(df.ingredientTokens)

    # vocabulary_ maps term -> column; flip each pair so sorting orders by column.
    terms = sorted((column, term) for term, column in vectorizer.vocabulary_.items())
    vocab = [term for _, term in terms]

    return vectorizer, matrix, terms, vocab

In [7]:
# Fit TF-IDF on the token documents, additionally ignoring 'nan', the unit
# abbreviations 'tbsp'/'tsp', and the ',' that nlpTokens uses as separator.
tfidf_vectorizer,tfidf_matrix,terms,vocab = tfidfVectorizer(['nan','tbsp','tsp',','],recipeDF)

## Non-Negative Matrix Factorization

In [8]:
# Truncated SVD of the TF-IDF matrix; the 25 singular values in Sigma show how
# quickly the spectrum decays. The seed is fixed (0, the same value the NMF
# fits in this notebook use) so that re-running the cell reproduces the
# decomposition - random_state=None draws a new random projection each run.
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=25,
                                      n_iter=5,
                                      random_state=0)

In [9]:
# Show the singular values returned by randomized_svd.
Sigma

array([ 11.70401137,   9.22868088,   6.49565647,   6.27837486,
         6.09858484,   5.69471553,   5.6495889 ,   5.48005114,
         5.38532986,   5.31662413,   5.19239357,   5.03113223,
         4.98280373,   4.91815987,   4.85493618,   4.67701469,
         4.64942793,   4.61276374,   4.50970307,   4.41528737,
         4.39363677,   4.30951498,   4.23886397,   4.20516999,   4.14052437])

In [29]:
def NMFactorize(tfidf_matrix,n):
    """Fit an n-component NMF (random init, seed 0) to ``tfidf_matrix``.

    Returns
    -------
    tuple
        ``(model, weights)``: the fitted NMF estimator and the matrix that
        ``fit_transform`` returns for ``tfidf_matrix``.
    """
    model = NMF(random_state=0, init='random', n_components=n)
    weights = model.fit_transform(tfidf_matrix)
    return model, weights

In [30]:
# 15-component factorization; topCosine reads this NMFmodel as a global.
NMFmodel,NMF_matrix = NMFactorize(tfidf_matrix,15)

## Most frequent terms (across documents)

In [31]:
# Reduced 4-component factorization, used below to assign each recipe one group.
NMFmodelred,NMF_matrixred = NMFactorize(tfidf_matrix,4)

In [32]:
def NMFgroup(matrix):
    """Return, for every row of ``matrix``, the column holding its largest value.

    Parameters
    ----------
    matrix : indexable of row sequences
        Accessed as ``matrix[i]`` for ``i`` in ``range(len(matrix))``.

    Returns
    -------
    list of int
        One column position per row; ties resolve to the lowest column.
    """
    def strongest(values):
        # list.index returns the first occurrence of the maximum.
        return values.index(max(values))

    return [strongest(list(matrix[i])) for i in range(len(matrix))]

In [33]:
# Label every recipe with its dominant component of the 4-component model.
groupList = NMFgroup(NMF_matrixred)
recipeDF['groupReduced'] = groupList

In [34]:
# Peek at the first three recipes with their group labels.
recipeDF.head(3)

Unnamed: 0,category,ingredientsList,instructions,summary,title,url,ingredientTokens,group,groupReduced
0,[],"[1 heaping cup fresh blueberries, ¼ cup olive ...",[Place all ingredients in a high speed blender...,[],[Blueberry Chia Salad Dressing],http://www.thefullhelping.com/blueberry-chia-s...,"blueberri,oliv,oil,tablepsoon,appl,cider,vineg...",10,2
1,[soup],"[1 medium (3-4 lb) kabocha squash, opened, see...",[Preheat your oven to 400F. Spread the kabocha...,[],"[Roasted Kabocha Squash, Pear, and Ginger Soup]",http://www.thefullhelping.com/roasted-kabocha-...,"kabocha,pear,onion,safflow,oil,salt,pepper,gin...",13,2
2,[],"[2 cups new potatoes or fingerling potatoes, q...",[1. Preheat oven to 400. Toss the potatoes in ...,[],[Vegan Tuna-Less Nicoise],http://www.thefullhelping.com/vegan-tuna-less-...,"potato,oliv,oil,rosemari,salt,bean,cherri,mix,...",0,3


In [35]:
def mostCommonTerms(recipeDF):
    """List the 100 most frequent tokens for every ``groupReduced`` value.

    Parameters
    ----------
    recipeDF : pandas.DataFrame
        Needs ``ingredientTokens`` (text) and ``groupReduced`` (group label)
        columns.

    Returns
    -------
    list of [group, [(token, count), ...]]
        One entry per distinct group label, in ascending label order.
    """
    mostCommon = []

    # Take the groups from the frame that was passed in, not from the
    # notebook-level `groupList`, so the function works on any frame and
    # cannot drift out of sync with a stale global.
    groups = sorted(recipeDF.groupReduced.unique().tolist())

    for group in groups:
        docs = recipeDF.ingredientTokens[recipeDF.groupReduced == group]
        corpus = str(' '.join(docs))
        # NOTE(review): the documents are comma-joined, so word_tokenize
        # emits ',' as a token of its own and it appears in the counts.
        tokens = nltk.word_tokenize(corpus)
        fdist = FreqDist(tokens)
        mostCommon.append([group, fdist.most_common(100)])

    return mostCommon

In [36]:
# Top tokens for each reduced group.
mostCommon = mostCommonTerms(recipeDF)

In [41]:
# Show the first group's entry: [group label, [(token, count), ...]].
mostCommon[0]

[0,
 [(',', 7263),
  ('bake', 382),
  ('coconut', 342),
  ('salt', 335),
  ('vanilla', 286),
  ('flour', 277),
  ('nan', 271),
  ('sugar', 268),
  ('powder', 261),
  ('extract', 232),
  ('Tbsp', 187),
  ('egg', 183),
  ('tsp', 179),
  ('soda', 178),
  ('cinnamon', 168),
  ('almond', 150),
  ('milk', 124),
  ('brown', 101),
  ('oil', 101),
  ('butter', 99),
  ('chocol', 78),
  ('c', 75),
  ('pumpkin', 73),
  ('sea', 72),
  ('mapl', 68),
  ('wheat', 66),
  ('honey', 61),
  ('purpos', 59),
  ('cocoa', 55),
  ('oat', 54),
  ('water', 53),
  ('banana', 48),
  ('syrup', 45),
  ('pastri', 39),
  ('ginger', 39),
  ('appl', 37),
  ('walnut', 36),
  ('For', 36),
  ('pure', 35),
  ('lemon', 35),
  ('meal', 34),
  ('juic', 33),
  ('cane', 32),
  ('seed', 29),
  ('vegan', 28),
  ('flax', 28),
  ('unsalt', 28),
  ('cream', 27),
  ('orang', 26),
  ('nutmeg', 26),
  ('pie', 25),
  ('chip', 24),
  ('tbsp', 24),
  ('rice', 24),
  ('whole', 22),
  ('white', 21),
  ('spice', 21),
  ('strawberri', 21),
  (

## Personalization

In [None]:
## TODO: find the cosine similarity of each bookmarked recipe to the final list (adapt the topCosine list), then add it as a weight

In [192]:
# Read user 2's bookmarked recipe URLs, one per line.
# Lines read from a file keep their trailing newline, which would otherwise be
# passed on as part of each URL; strip it and skip blank lines.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
with open('/home/rachaelrho/ds/metis/final_project/User2_bookmarks.txt') as f:
    User2 = []
    for line in f:
        link = line.strip()
        if link:
            User2.append(link)

In [277]:
def userIng(userLinks):
    """Scrape the ingredient text of every recipe page in ``userLinks``.

    For each URL the page is fetched and parsed as HTML, and the text nodes
    of every element whose ``class`` attribute contains "name" are collected,
    trimmed of trailing spaces and surrounding newlines, and joined with
    single spaces.

    Parameters
    ----------
    userLinks : list of str
        Recipe page URLs. Surrounding whitespace (e.g. the newline kept when
        the URLs are read from a file) is stripped before fetching.

    Returns
    -------
    list of str
        One space-joined ingredient string per URL, in input order.

    Raises
    ------
    urllib2.URLError
        Propagated from ``urllib2.urlopen`` when a page cannot be fetched.
    """
    userIngsList = []

    for link in userLinks:
        url = link.strip()
        response = urllib2.urlopen(url)
        try:
            tree = etree.parse(response, etree.HTMLParser())
        finally:
            # Release the connection even if parsing fails.
            response.close()
        ings = tree.xpath('//*[contains(@class, "name")]/text()')

        ingsClean = [ing.rstrip(" ").rstrip("\n").lstrip("\n") for ing in ings]
        userIngsList.append(' '.join(ingsClean))

    # The accumulator is `userIngsList`; returning the misspelled
    # `userIngList` raised NameError on every call.
    return userIngsList

In [278]:
# Scrape the ingredient text for each of user 2's bookmarked URLs.
# NOTE(review): the author reported this call failing; check that userIng
# returns the list it builds (`userIngsList`).
user2IngList = userIng(User2) ##for some reason this function version doesn't work

In [287]:
# Inline variant of userIng that keeps each recipe's ingredients as a list
# (topCosine joins its input with ',') rather than as one joined string.
# It iterates over User2, the bookmark URLs read from file: `userLinks` exists
# only as userIng's parameter and is undefined at notebook level.
userIngsList = []

for link in range(0,len(User2)):
    url = User2[link].strip()  # drop the newline kept from the bookmarks file
    response = urllib2.urlopen(url)
    htmlparser = etree.HTMLParser()
    tree = etree.parse(response, htmlparser)
    ings = tree.xpath('//*[contains(@class, "name")]/text()')

    ingsClean = []

    for ing in range(0,len(ings)):
        ingClean = ings[ing].rstrip(" ").rstrip("\n").lstrip("\n")
        ingsClean.append(ingClean)

    userIngsList.append(ingsClean) ##' '.join(ingsClean)

In [327]:
# Rank all recipes by cosine similarity to the first bookmarked recipe.
# NOTE(review): the name `usertopCos` is also bound to a function in a later
# cell, so which object it refers to depends on execution order.
usertopCos = topCosine(userIngsList[0],NMF_matrix)

In [1]:
def usertopCos(userIngsList,NMF_matrix,nutrList):
    """Similarity scores of the first bookmarked recipe, limited to ``nutrList``.

    The first element of ``userIngsList`` is ranked against ``NMF_matrix``
    with ``topCosine``; only ``(score, index)`` pairs whose index is the
    first item of some ``nutrList`` entry are kept.

    Returns
    -------
    list of (score, index)
        The kept pairs in ascending order.
    """
    ranked = topCosine(userIngsList[0], NMF_matrix)

    kept = []
    for pair in ranked:
        allowed = (entry[0] for entry in nutrList)
        if pair[1] in allowed:
            kept.append(pair)

    kept.sort()
    return kept

In [None]:
# Similarity of the first bookmarked recipe to the recipes kept in nutrList.
usercosList = usertopCos(userIngsList,NMF_matrix,nutrList)

In [338]:
# Inline form of the filter: keep the (score, index) pairs whose index appears in nutrList.
# NOTE(review): this iterates `usertopCos` as a list, so it needs the list
# binding of that name, not the function of the same name.
sorted([x for x in usertopCos if x[1] in (x[0] for x in nutrList)]) ## TODO: apply this filter to every element of userIngsList, not only the first

[(0.50278111667637326, 1690),
 (0.54690665991919474, 1737),
 (0.56508670983178411, 1293),
 (0.56519245225065595, 1519)]

## Cosine similarity

In [113]:
# Example query: ingredients the returned recipes must contain (topIng reads this global).
ingInput = ['chicken','avocado','tomato']

In [308]:
def topCosine(ingInput, NMF_matrix):
    """Rank the rows of ``NMF_matrix`` by cosine similarity to a query.

    The ingredients are joined with ',', normalised with ``nlpTokens``, and
    projected with the notebook-level ``tfidf_vectorizer`` and ``NMFmodel``
    before being compared with every row of ``NMF_matrix``.

    Parameters
    ----------
    ingInput : iterable of str
        Query ingredients.
    NMF_matrix : array-like
        One row per recipe, as accepted by ``cosine_similarity``.

    Returns
    -------
    list of (similarity, row_index)
        Sorted in descending order.
    """
    testTokens = nlpTokens(','.join(ingInput))
    query_tfidf = tfidf_vectorizer.transform([testTokens])
    # Named query_topics so the imported NMF class is not shadowed.
    query_topics = NMFmodel.transform(query_tfidf)
    similarities = list(cosine_similarity(query_topics, NMF_matrix)[0])

    cosineList = [(score, position) for position, score in enumerate(similarities)]
    cosineList.sort(reverse=True)

    return cosineList

In [293]:
def topIng(cosineList,recipeDF,required=None):
    """Attach token documents to ranked recipes and keep the full matches.

    Parameters
    ----------
    cosineList : list of (similarity, row_position)
        Ranked recipes, e.g. as returned by ``topCosine``.
    recipeDF : pandas.DataFrame
        Needs an ``ingredientTokens`` column; rows are taken by position.
    required : iterable of str, optional
        Ingredients every kept recipe must contain. When omitted, the
        notebook-level ``ingInput`` is used, so existing two-argument calls
        behave as before.

    Returns
    -------
    tuple
        ``(recipes, finalList)``. ``recipes`` holds one
        ``[row_position, [token_string]]`` entry per ranked recipe, in ranking
        order; ``finalList`` is the subset whose token string contains every
        required ingredient.

    Notes
    -----
    The containment test is a plain substring check on the token string, so
    a required 'egg' also matches 'eggplant'.
    """
    if required is None:
        # Backward-compatible fallback to the global query.
        required = ingInput

    recipes = [[i, list(recipeDF.ingredientTokens[i:i+1])]
               for i in (pair[1] for pair in cosineList)]

    finalList = [recipe for recipe in recipes
                 if all(ing in recipe[1][0] for ing in required)]

    return recipes,finalList

In [309]:
# Rank every recipe against the example query.
cosineList = topCosine(ingInput,NMF_matrix)

In [295]:
# Keep the ranked recipes whose tokens contain every queried ingredient.
recipes, finalList = topIng(cosineList,recipeDF)

In [125]:
# URLs of the matching recipes, in ranking order.
list(recipeDF.url.loc[[x[0] for x in finalList]])

[u'http://www.theroastedroot.net/cobb-salad-green-goddess-dressing/',
 u'http://www.theroastedroot.net/grilled-chicken-kale-caesar-salad/',
 u'http://paleomg.com/enchilada-chicken-stew/',
 u'http://www.theroastedroot.net/grilled-tequila-lime-chicken-salad-tequila-lime-vinaigrette/']

## Nutritional preferences

In [126]:
# Terms that exclude a recipe under each dietary preference. nutrPref tests
# each term as a substring of the recipe's stemmed, comma-joined token string,
# so the shortest stem-compatible spelling is the safest choice.
# NOTE(review): multi-word terms ('pine nut', 'half & half', ...) cannot occur
# in a comma-joined token string as written - confirm how they should match.
nutrDict = {}

# 'triticale' was misspelled 'tricale'.
nutrDict['glutenFree'] = ['wheat','wheatberries','durum','emmer','semolina','spelt',
             'farina','farro','graham','rye','barley','triticale','malt']

# 'pecan' (singular) matches both 'pecan' and 'pecans'.
nutrDict['nutFree'] = ['almond','brazil nut','cashew','chestnut','filbert',
           'hazelnut','hickory nut','macadamia nut','pecan','pine nut',
           'pistachio','walnut']

# 'chees' matches the unstemmed 'cheese' as well as 'chees', the form Porter
# stemming is expected to produce (compare 'oliv', 'juic' in the token output).
nutrDict['dairyFree'] = ['milk','butter','cream','half & half','sour cream','ghee','yogurt','chees']

nutrDict['vegetarian'] = ['meat','lamb','beef','fish','shrimp','lobster','chicken']

# Vegan excludes everything dairy-free and vegetarian exclude, plus egg; the
# previous hand-written list omitted beef, fish, shrimp, lobster and chicken.
nutrDict['vegan'] = nutrDict['dairyFree'] + ['egg'] + nutrDict['vegetarian']

In [182]:
def nutrPref(pref,finalList):
    """Drop the recipes that conflict with a dietary preference.

    Parameters
    ----------
    pref : str
        Key into the notebook-level ``nutrDict``.
    finalList : list of [index, [token_string]]
        Candidate recipes, in the shape ``topIng`` returns.

    Returns
    -------
    list
        The entries of ``finalList``, in their original order, whose token
        string contains none of the terms listed under ``nutrDict[pref]``
        (substring test).
    """
    def conflicts(recipe):
        # True as soon as one excluded term occurs in the token string.
        return any(term in recipe[1][0] for term in nutrDict[pref])

    return [recipe for recipe in finalList if not conflicts(recipe)]

In [183]:
# Remove matches that contain any term on the gluten-free exclusion list.
nutrList = nutrPref('glutenFree',finalList)

In [311]:
# Show the remaining recipes as [row index, [token string]].
nutrList

[[1737,
  [u'chicken,bacon,spring,green,mix,tomato,egg,avocado,feta,Mayo,Free,Green,Goddess,Dress']],
 [1690,
  [u'Caesar,Marin,Grill,Chicken,chicken,1690,Tessama,Caesar,Dress,1690,Kale,Caesar,Salad,lacinato,avocado,tomato,goat,chevr,dress']],
 [1293,
  [u'chicken,breast,yellow,green,bell,jalapeno,nan,coconut,oil,tomato,tomato,sauc,garlic,cumin,chili,powder,oregano,salt,cilantro,avocado']],
 [1519,
  [u'Grill,Tequila,Lime,Chicken,chicken,breast,oliv,oil,tequila,lime,juic,chili,powder,onion,powder,garlic,powder,sea,salt,Vinaigrett,oliv,oil,tequila,lime,juic,honey,cilantro,For,Salad,mix,green,babi,kale,arugula,avocado,tomato,red,onion,carrot,fresno']]]