In [1]:
import pandas as pd
import numpy as np
import json

#html parsing
import urllib2
from lxml import etree

##text pre-processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist

##feature extraction + similarity
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity  




## Concatenate json files and create dataframe

In [2]:
def concatJson(file_list):
    full = []
    with open("allRecipes.json", "w") as outfile:
        for f in file_list:
            with open(f, 'rb') as infile:
                file_data = json.load(infile)
                full += file_data
        json.dump(full, outfile)

In [3]:
def openJson(name):
    with open(name, 'rb') as infile:
        full = json.load(infile)
    return full

In [4]:
file_list = ["fullhelping_items.json", "iLoveVegan_items.json","MinimalistBaker_items.json", "ohsheglows_items.json",
             "PaleOMG_items.json","roastedRoot_items.json","ohmyveggies_items.json",]

In [5]:
concatJson(file_list)
full = openJson('allRecipes.json')

In [6]:
fullDF = pd.DataFrame(full)
fullDF1 = fullDF[(fullDF.ingredientsList.str.len() != 0)]
recipeDF = fullDF1.reset_index(drop=True)

In [7]:
recipeDF.to_pickle('recipeDF_v3')

In [8]:
recipeDF = pd.read_pickle('recipeDF_v3')

##Write ingredients to file format for NYT ingredient tagger

In [9]:
def nytIngredientInput(df):
    f = open("input.txt", "a")
    ingredientList = []
    for row in range(0,len(df)):
        ingredients = df.ingredientsList.iloc[row]
        for i in range(0,len(ingredients)):
            line = ingredients[i].encode('ascii','ignore')
            ingredientList.append([row,i,line])
            f.write(line + ' /' + str(row) + ' ' + '\n')
    f.close()   
    return ingredientList

In [10]:
ingredientList = nytIngredientInput(recipeDF)

In [11]:
ingredientDF = pd.DataFrame(ingredientList, columns=['recipeIndex','ingredientIndex','recipeText'])

In [12]:
recipeDF.to_pickle('ingredientDF_v1')

In [13]:
recipeDF = pd.read_pickle('ingredientDF_v1')

## Merge NYT ingredient tags with ingredientDF
<p>(see NYT_CRF_IngredientPhraseTagger.txt for tags) </p>

In [13]:
ingredientTags = openJson('results_6-21.json')
ingredientTagsDF = pd.DataFrame(ingredientTags)
ingredientTagsDF.to_pickle('ingredientTagsDF')

In [15]:
ingredientTaggedDF = pd.concat([ingredientDF,ingredientTagsDF], axis = 1)

In [24]:
def mergedIndex(df):

    df['mergedrecipeIndex'] = 'NA'

    for i in range(0,len(df)): 
        try:
            text = str(df.name.loc[i])
            df.name.loc[i] = text.rsplit(' ', 1)[0]
            df.mergedrecipeIndex.loc[i] = int(text.rsplit(' ', 1)[1])
        except:
            pass        

In [None]:
ingredientTaggedDF = mergedIndex(ingredientTaggedDF)

In [17]:
ingredientTaggedDF.to_pickle('ingredientTaggedDF_v4')

In [4]:
ingredientTaggedDF = pd.read_pickle('ingredientTaggedDF_v3')

## Create Ingredient Documents

In [158]:
ingredientTaggedDF['name'] = ingredientTaggedDF['name'].astype(str)

In [159]:
def ingredientTokens(df,taggeddf):
    df['ingredientTokens'] = 0
    for i in range(0,len(taggeddf.recipeIndex.unique())):
        ingredients = taggeddf.name[taggeddf.recipeIndex == i]
        df['ingredientTokens'][i] = str(' '.join(ingredients))  
    df['ingredientTokens'] = df['ingredientTokens'].astype(str)
    
    return df

In [30]:
def nlpTokens(document):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(document)
    
    tokens = [PorterStemmer().stem(x) for x in tokens]
    tokens = [WordNetLemmatizer().lemmatize(x, 'v') for x in tokens]
    
    stop = stopwords.words('english')
    tokens = [x for x in tokens if x not in stop] 
    return ','.join(tokens)

In [167]:
recipeDF = ingredientTokens(recipeDF,ingredientTaggedDF)
recipeDF.loc[:, 'ingredientTokens'] = recipeDF.loc[:, 'ingredientTokens'].apply(nlpTokens)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [170]:
recipeDF.to_pickle('recipeDF_tokens_v3')

In [127]:
recipeDF = pd.read_pickle('recipeDF_tokens_v3')

## TFIDF vectorizer

In [71]:
stopList = ('tbsp','From')

In [72]:
def tfidfVectorizer(stopList,df):
    
    tfidf_vectorizer = TfidfVectorizer(max_df=.2, max_features=1300, min_df=0.01, stop_words= stopList, use_idf=True, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(df.ingredientTokens)
    terms = sorted([(i,v) for v,i in tfidf_vectorizer.vocabulary_.items()])
    vocab = [x[1] for x in terms]
    
    return tfidf_vectorizer,tfidf_matrix,terms,vocab

In [73]:
tfidf_vectorizer,tfidf_matrix,terms,vocab = tfidfVectorizer(['nan','tbsp','tsp',','],recipeDF)

##Non-Negative Matrix Factorization

In [74]:
U, Sigma, VT = randomized_svd(tfidf_matrix, n_components=25,
                                      n_iter=5,
                                      random_state=None)

In [75]:
Sigma

array([ 11.70401137,   9.2286809 ,   6.4956524 ,   6.27837369,
         6.09858166,   5.69478327,   5.64946554,   5.48057607,
         5.38515481,   5.3163961 ,   5.19272557,   5.03023903,
         4.98172928,   4.91637011,   4.85159163,   4.6862408 ,
         4.64935523,   4.62623189,   4.52034741,   4.42738316,
         4.38788085,   4.29596209,   4.26232211,   4.17663352,   4.12821984])

In [76]:
def NMFactorize(tfidf_matrix,n):
    NMFmodel = NMF(n_components=n, init='random', random_state=0)
    NMF_matrix = NMFmodel.fit_transform(tfidf_matrix)
    
    return NMFmodel,NMF_matrix

In [25]:
NMFmodel,NMF_matrix = NMFactorize(tfidf_matrix,15) ##chose 15 components since eigen vector largely falls under general food types

## Cosine similarity with example input ingredients

In [137]:
def topCosine(ingInput, NMF_matrix):
    
    document = ','.join(ingInput)
    testTokens = nlpTokens(document)
    tf = tfidf_vectorizer.transform([testTokens])
    NMF = NMFmodel.transform(tf)
    cosine = cosine_similarity(NMF, NMF_matrix) 
    cosineList = [(e,i) for i,e in enumerate(list(cosine[0]))] #, reverse = True) ##sorted
            
    return document,cosineList

In [248]:
def topIng(cosineList,recipeDF): ##may need to run this without defined function (was previously malfunctioning)
    
    recipes = []

    for i in range(0,len(cosineList)): 
        ing = [cosineList[i][0], cosineList[i][1], list(recipeDF.ingredientTokens[i:i+1])]
        recipes.append(ing)
    
    finalList = []

    for i in range(0,len(recipes)): 
        if all(ing in recipes[i][2][0] for ing in ingInput): 
            finalList.append(recipes[i])
    
    return recipes,finalList

In [249]:
ingInput = ['chicken','avocado','tomato']

In [250]:
document,cosineList = topCosine(ingInput,NMF_matrix)

In [251]:
recipes, finalList = topIng(cosineList,recipeDF)

In [254]:
finalList

[[0.91685352610868631,
  1293,
  [u'chicken,breast,yellow,green,bell,jalapeno,nan,coconut,oil,tomato,tomato,sauc,garlic,cumin,chili,powder,oregano,salt,cilantro,avocado']],
 [0.79922038806708151,
  1519,
  [u'Grill,Tequila,Lime,Chicken,chicken,breast,oliv,oil,tequila,lime,juic,chili,powder,onion,powder,garlic,powder,sea,salt,Vinaigrett,oliv,oil,tequila,lime,juic,honey,cilantro,For,Salad,mix,green,babi,kale,arugula,avocado,tomato,red,onion,carrot,fresno']],
 [0.92034505593664095,
  1690,
  [u'Caesar,Marin,Grill,Chicken,chicken,1690,Tessama,Caesar,Dress,1690,Kale,Caesar,Salad,lacinato,avocado,tomato,goat,chevr,dress']],
 [0.92339329182205332,
  1737,
  [u'chicken,bacon,spring,green,mix,tomato,egg,avocado,feta,Mayo,Free,Green,Goddess,Dress']]]

In [255]:
nutrDict = {}

nutrDict['glutenFree'] = ['wheat','wheatberries','durum','emmer','semolina','spelt',
             'farina','farro','graham','rye','barley','tricale','malt']

nutrDict['nutFree'] = ['almond','brazil nut','cashew','chestnut','filbert',
           'hazelnut','hickory nut','macadamia nut','pecans','pine nut',
           'pistachio','walnut']

nutrDict['dairyFree'] = ['milk','butter','cream','half & half','sour cream','ghee','yogurt','cheese']

nutrDict['vegetarian'] = ['meat','lamb','beef','fish','shrimp','lobster','chicken']

nutrDict['vegan'] = ['milk','butter','cream','half & half','sour cream','ghee','yogurt','cheese','egg','meat','lamb']

##Low Glycemic

In [256]:
def nutrPref(pref,finalList):
    
    delIndex = []
    
    for i in range(0,len(finalList)):
        if any(ing in finalList[i][2][0] for ing in nutrDict[pref]):
            delIndex.append(i) ##delIndex.append(i)
            
    nutrList = [x for i, x in enumerate(finalList) if i not in delIndex]
    return nutrList

In [257]:
nutrList = nutrPref('glutenFree',finalList)

## Personalization with example bookmarked recipes

In [198]:
with open('/home/rachaelrho/ds/metis/final_project/User2_bookmarks.txt') as f:
    User2 = []
    for line in f:
        User2.append(line) 

In [199]:
def userIng(userLinks):

    userIngsList = []
    
    for link in range(0,len(userLinks)):
        url = userLinks[link]
        response = urllib2.urlopen(url)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(response, htmlparser)
        ings = tree.xpath('//*[contains(@class, "name")]/text()')

        ingsClean = []

        for ing in range(0,len(ings)):
            ingClean = ings[ing].rstrip(" ").rstrip("\n").lstrip("\n")
            ingsClean.append(ingClean)

        userIngsList.append(ingsClean) 
        
    
    
    return userIngsList

In [200]:
user2IngList = userIng(User2)

In [201]:
def usertopCos(userIngsList,NMF_matrix,nutrList):
    
    userdocFull = []
    usercosListFull = []
    
    for i in range(0,len(userIngsList)):
        
        userdoc,usertopCos = topCosine(userIngsList[i],NMF_matrix)
        usercosList = sorted([x for x in usertopCos if x[1] in (x[0] for x in nutrList)])
        
        userdocFull.append([userdoc])
        usercosListFull.append(usercosList)
    
    return userdocFull,usercosListFull

In [202]:
userdocFull,usercosListFull = usertopCos(user2IngList,NMF_matrix,nutrList)

In [287]:
def userfinalList(nutrList,bookmarkList,nutrW,bookmarkW):
    
    nutrList = sorted([r[0:2] for r in nutrList], key =  lambda x: x[1])
    bookmarkList = [sorted(usercosListFull[r], key = lambda tup: tup[1]) for r in range(0,len(usercosListFull))]
    userAvg = np.mean(usercosListFull, dtype=np.float64, axis=0)
    
    userList = []
                                                                
    for i in range(0,len(nutrList)):
        finalW = nutrList[i][0] * nutrW  + userAvg[i][0] * bookmarkW
        userList.append((finalW,nutrList[i][1]))
        
    return userList

In [288]:
userList = userfinalList(nutrList,usercosListFull,0.8,0.2)

In [289]:
userList ##shows that recipe no. 1293 is best choice for user2 based on preferences, despite third best choice on pure cosine similarity

[(0.85620964053111936, 1293),
 (0.7495382741747022, 1519),
 (0.84037637700442991, 1690),
 (0.84747264382126775, 1737)]

In [None]:
##json data load of majority of ingredient combinations and recommended recipes in upcoming notebook