In [34]:
import pandas as pd
import numpy as np
import pickle
import json
import itertools

#html parsing
import urllib2
from lxml import etree

##text pre-processing (for input)
import nltk
from nltk import FreqDist
from extract_funcs import nlpTokens,NMFactorize
from sklearn.metrics.pairwise import cosine_similarity 

In [23]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only load artifacts
    # produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)

# Pre-computed artifacts; the file names carry the version of the run that wrote them.
recipeDF = pd.read_pickle('recipeDF_tokens_v3')
NMFmodel = _load_pickle("NMFmodel_v6.p")
NMF_matrix = _load_pickle("NMFmatrix_v6.p")
tfidf_vectorizer = _load_pickle("tfidf_vectorizer_v6.p")
tfidf_matrix = _load_pickle("tfidf_matrix_v6.p")

##Cosine similarity with example input ingredients

In [3]:
def topCosine(ingInput, NMF_matrix):
    """Score every row of NMF_matrix against a list of query ingredients.

    The ingredients are comma-joined, passed through nlpTokens, vectorised with
    the module-level tfidf_vectorizer, projected with the module-level NMFmodel
    and compared with each row of NMF_matrix by cosine similarity.

    Returns (document, cosineList): the comma-joined query string and a list of
    (similarity, row position) tuples in row order (not sorted by score).
    """
    document = ','.join(ingInput)
    queryTfidf = tfidf_vectorizer.transform([nlpTokens(document)])
    queryTopics = NMFmodel.transform(queryTfidf)
    # Only one query row is scored, so take row 0 of the similarity result.
    similarities = cosine_similarity(queryTopics, NMF_matrix)[0]

    cosineList = []
    for position, score in enumerate(list(similarities)):
        cosineList.append((score, position))

    return document,cosineList

In [4]:
def topIng(cosineList,recipeDF,ingInput=None):
    """Attach ingredient tokens to each score and keep recipes containing every query ingredient.

    Parameters
    ----------
    cosineList : list of (similarity, row position) tuples, as returned by topCosine.
    recipeDF   : frame with an `ingredientTokens` column; the n-th entry of
                 cosineList is paired with the n-th row of this column.
    ingInput   : iterable of ingredient strings to require. When omitted, the
                 module-level `ingInput` is used, which is what this function
                 always did before the parameter existed.

    Returns
    -------
    (recipes, finalList)
        recipes   : one [similarity, row position, [token string]] per score.
        finalList : the entries of `recipes` whose token string contains every
                    required ingredient.

    Raises NameError when `ingInput` is omitted and no module-level `ingInput` exists.
    """
    if ingInput is None:
        # Backward compatibility for two-argument callers. Passing the query
        # explicitly avoids silently filtering on a stale global.
        try:
            ingInput = globals()['ingInput']
        except KeyError:
            raise NameError("topIng: pass ingInput or define a module-level 'ingInput'")

    tokenColumn = recipeDF.ingredientTokens

    # The one-row slice (rather than a scalar lookup) keeps the historical
    # [score, idx, [tokens]] entry shape that downstream code indexes as [2][0].
    recipes = [[score, idx, list(tokenColumn.iloc[pos:pos+1])]
               for pos, (score, idx) in enumerate(cosineList)]

    # NOTE(review): `ing in tokenString` is a substring test on what appears to be
    # a comma-joined token string, so 'egg' would also match 'eggplant' -- confirm
    # whether exact token matching is wanted.
    finalList = [entry for entry in recipes
                 if entry[2] and all(ing in entry[2][0] for ing in ingInput)]

    return recipes,finalList

In [195]:
# Example query: ingredient names that every recommended recipe must contain.
# topIng(cosineList, recipeDF), as called below, filters against this module-level name.
ingInput = ['chicken','avocado','tomato']

In [196]:
# document is the comma-joined query; cosineList holds one (similarity, row position)
# tuple per row of NMF_matrix.
document,cosineList = topCosine(ingInput,NMF_matrix)

In [197]:
# recipes pairs every score with its ingredient tokens; finalList keeps only the
# recipes whose token string contains every query ingredient.
recipes, finalList = topIng(cosineList,recipeDF)

In [198]:
finalList

[[0.91685352610868631,
  1293,
  [u'chicken,breast,yellow,green,bell,jalapeno,nan,coconut,oil,tomato,tomato,sauc,garlic,cumin,chili,powder,oregano,salt,cilantro,avocado']],
 [0.79922038806708151,
  1519,
  [u'Grill,Tequila,Lime,Chicken,chicken,breast,oliv,oil,tequila,lime,juic,chili,powder,onion,powder,garlic,powder,sea,salt,Vinaigrett,oliv,oil,tequila,lime,juic,honey,cilantro,For,Salad,mix,green,babi,kale,arugula,avocado,tomato,red,onion,carrot,fresno']],
 [0.92034505593664095,
  1690,
  [u'Caesar,Marin,Grill,Chicken,chicken,1690,Tessama,Caesar,Dress,1690,Kale,Caesar,Salad,lacinato,avocado,tomato,goat,chevr,dress']],
 [0.92339329182205332,
  1737,
  [u'chicken,bacon,spring,green,mix,tomato,egg,avocado,feta,Mayo,Free,Green,Goddess,Dress']]]

In [9]:
# Maps a dietary preference to the ingredient terms that disqualify a recipe.
# nutrPref drops a recipe when any of these terms occurs as a substring of the
# recipe's token string.
# NOTE(review): the recipe tokens shown in this notebook look stemmed ('sauc',
# 'oliv'), so plural or multi-word terms here ('pecans', 'sour cream',
# 'half & half') may never match -- confirm against nlpTokens.
nutrDict = {}

nutrDict['glutenFree'] = ['wheat','wheatberries','durum','emmer','semolina','spelt',
             'farina','farro','graham','rye','barley','triticale','malt']

nutrDict['nutFree'] = ['almond','brazil nut','cashew','chestnut','filbert',
           'hazelnut','hickory nut','macadamia nut','pecans','pine nut',
           'pistachio','walnut']

nutrDict['dairyFree'] = ['milk','butter','cream','half & half','sour cream','ghee','yogurt','cheese']

nutrDict['vegetarian'] = ['meat','lamb','beef','fish','shrimp','lobster','chicken']

# Vegan excludes everything dairy-free and vegetarian exclude, plus egg. The list
# is derived from the other two so it cannot drift out of sync with them (it
# previously omitted beef, fish, shrimp, lobster and chicken).
nutrDict['vegan'] = list(nutrDict['dairyFree']) + ['egg'] + list(nutrDict['vegetarian'])

##Low Glycemic

In [10]:
def nutrPref(pref,finalList):
    """Return the recipes in finalList that satisfy a dietary preference.

    pref is a key of the module-level nutrDict. A recipe (an entry shaped
    [score, index, [token string]]) is dropped when any term listed under that
    key occurs as a substring of its token string. Input order is preserved and
    finalList itself is not modified.
    """
    def violates(recipe):
        tokenString = recipe[2][0]
        for term in nutrDict[pref]:
            if term in tokenString:
                return True
        return False

    nutrList = [recipe for recipe in finalList if not violates(recipe)]
    return nutrList

In [11]:
# Drop the recipes whose token string contains any term listed under nutrDict['glutenFree'].
nutrList = nutrPref('glutenFree',finalList)

##Personalization with example bookmarked recipes

In [12]:
# NOTE(review): absolute, user-specific path -- the notebook only runs on the
# author's machine until this is made relative or configurable.
with open('/home/rachaelrho/ds/metis/final_project/User2_bookmarks.txt') as f:
    # Each non-blank line is later passed to urlopen by userIng, so strip the
    # trailing newline and skip empty lines instead of keeping them as "URLs".
    User2 = [line.strip() for line in f if line.strip()]

In [13]:
def userIng(userLinks):
    """Fetch each URL and collect the text of elements whose class contains "name".

    Parameters
    ----------
    userLinks : iterable of URL strings; one HTTP request is made per URL.

    Returns
    -------
    list of lists: for each URL, the matched text nodes with trailing spaces and
    surrounding newlines trimmed. Presumably these are ingredient names, which
    depends on the markup of the bookmarked site.

    Any exception raised by urlopen or the parser propagates to the caller.
    """
    userIngsList = []

    for url in userLinks:
        response = urllib2.urlopen(url)
        try:
            tree = etree.parse(response, etree.HTMLParser())
        finally:
            # The response was previously left open; release the connection
            # even when parsing fails.
            response.close()

        ings = tree.xpath('//*[contains(@class, "name")]/text()')

        # NOTE(review): this chain leaves leading spaces and any spaces that
        # precede a trailing newline; kept as-is so downstream tokens do not
        # change. Consider .strip() if that is the intent.
        ingsClean = [ing.rstrip(" ").rstrip("\n").lstrip("\n") for ing in ings]

        userIngsList.append(ingsClean)

    return userIngsList

In [14]:
# Scrape the bookmarked pages: one list of extracted strings per URL in User2
# (performs one HTTP request per bookmark).
user2IngList = userIng(User2)

In [15]:
def usertopCos(userIngsList,NMF_matrix,nutrList):
    """Score each bookmarked recipe's ingredients against the candidate recipes.

    Parameters
    ----------
    userIngsList : list of ingredient lists, one per bookmarked recipe.
    NMF_matrix   : matrix forwarded to topCosine.
    nutrList     : candidate recipes shaped [score, index, ...]; only scores
                   whose index appears here are kept.

    Returns
    -------
    (userdocFull, usercosListFull)
        userdocFull     : one single-element list [document] per bookmark.
        usercosListFull : per bookmark, the kept (similarity, index) tuples
                          sorted in ascending tuple order.
    """
    # Built once: the original rebuilt this membership test for every score of
    # every bookmark.
    allowedIdx = set(entry[1] for entry in nutrList)

    userdocFull = []
    usercosListFull = []

    for ingredients in userIngsList:
        # `scored` replaces a local that shadowed this function's own name.
        userdoc, scored = topCosine(ingredients, NMF_matrix)
        usercosList = sorted(pair for pair in scored if pair[1] in allowedIdx)

        userdocFull.append([userdoc])
        usercosListFull.append(usercosList)

    return userdocFull,usercosListFull

In [16]:
# For each bookmarked recipe, similarity scores restricted to the recipes kept in nutrList.
userdocFull,usercosListFull = usertopCos(user2IngList,NMF_matrix,nutrList)

In [17]:
def userfinalList(nutrList,bookmarkList,nutrW,bookmarkW):
    """Blend query similarity with the mean bookmark similarity for each candidate recipe.

    Parameters
    ----------
    nutrList     : candidate recipes shaped [score, index, ...].
    bookmarkList : one list of (similarity, index) tuples per bookmarked recipe,
                   each covering the same indices as nutrList (as produced by
                   usertopCos).
    nutrW        : weight applied to the query similarity.
    bookmarkW    : weight applied to the mean bookmark similarity.

    Returns
    -------
    list of (blended score, index) tuples ordered by index. Empty when nutrList
    is empty. When bookmarkList is empty the bookmark term contributes 0.
    """
    # Sort both sides by recipe index so position i refers to the same recipe.
    queryScores = sorted([r[0:2] for r in nutrList], key=lambda pair: pair[1])
    if not queryScores:
        return []

    # Use the argument: the previous version overwrote it with the module-level
    # usercosListFull, so callers passing their own lists got stale scores.
    perBookmark = [sorted(scores, key=lambda pair: pair[1]) for scores in bookmarkList]

    userAvg = None
    if perBookmark:
        # Mean over bookmarks; column 0 of each row is the averaged similarity.
        userAvg = np.mean(perBookmark, dtype=np.float64, axis=0)

    userList = []

    for pos, (score, recipeIdx) in enumerate(queryScores):
        bookmarkScore = userAvg[pos][0] if userAvg is not None else 0.0
        finalW = score * nutrW + bookmarkScore * bookmarkW
        userList.append((finalW, recipeIdx))

    return userList

In [18]:
# Blend the scores: weight 0.8 on the query similarity, 0.2 on the mean bookmark similarity.
userList = userfinalList(nutrList,usercosListFull,0.8,0.2)

In [19]:
userList ## recipe 1293 gets the highest blended score for user2, although it ranks only third on pure cosine similarity

[(0.85620964053111936, 1293),
 (0.7495382741747022, 1519),
 (0.84037637700442991, 1690),
 (0.84747264382126775, 1737)]

## Most frequent ingredients + recommended recipes

In [26]:
# Refactorize the TF-IDF matrix; the second argument (10) is presumably the number
# of NMF components -- TODO confirm against extract_funcs.NMFactorize.
NMFmodelred,NMF_matrixred = NMFactorize(tfidf_matrix,10)

In [27]:
def NMFgroup(matrix):
    """Return, for each row of matrix, the position of its largest value.

    Ties resolve to the first occurrence. An empty matrix yields an empty
    list; an empty row raises ValueError.
    """
    def firstArgmax(values):
        # max() keeps the earliest element among equals, so this is the first maximum.
        return max(range(len(values)), key=values.__getitem__)

    return [firstArgmax(list(matrix[r])) for r in range(len(matrix))]

In [28]:
# Assign each recipe to the component with the largest value in its NMF_matrixred
# row, and store that label on the frame (this adds a column to recipeDF in place).
groupList = NMFgroup(NMF_matrixred)
recipeDF['groupReduced'] = groupList

In [30]:
def mostCommonTerms(recipeDF, topN=100):
    """List the most frequent ingredient tokens within each recipe group.

    Parameters
    ----------
    recipeDF : frame with `ingredientTokens` (strings) and `groupReduced` columns.
    topN     : number of most frequent tokens to keep per group (default 100,
               the value that used to be hard-coded).

    Returns
    -------
    list of [group, [(token, count), ...]] entries, one per distinct group, in
    ascending group order.
    """
    # Take the groups from the frame that was passed in rather than from the
    # module-level groupList, so the function works on any labelled frame.
    groups = sorted(set(recipeDF.groupReduced.tolist()))

    mostCommon = []

    for group in groups:
        groupTokens = recipeDF.ingredientTokens[recipeDF.groupReduced == group]
        # NOTE(review): under Python 2, str() raises on non-ASCII tokens; kept
        # to preserve the existing output type.
        corpus = str(' '.join(groupTokens))
        fdist = FreqDist(nltk.word_tokenize(corpus))
        mostCommon.append([group, fdist.most_common(topN)])

    return mostCommon

In [35]:
# Per-group token frequency tables: a list of [group, [(token, count), ...]] entries.
mostCommon = mostCommonTerms(recipeDF)

In [270]:
def combRecipe(combs,pref,user2IngList,nutrW,bookmarkW):
    """Pick the best-scoring recipe for each ingredient combination and save the mapping.

    Parameters
    ----------
    combs        : sequence of ingredient combinations (each an iterable of strings).
    pref         : dietary preference key of nutrDict, forwarded to nutrPref.
    user2IngList : ingredient lists of the user's bookmarked recipes.
    nutrW        : weight of the query similarity in the blended score.
    bookmarkW    : weight of the bookmark similarity in the blended score.

    Returns
    -------
    (d, rejInput)
        d        : maps the comma-joined combination to the chosen recipe name.
        rejInput : combinations for which no recipe could be selected.

    Side effects: prints each chosen recipe name or rejected combination, and
    writes `d` to 'recipes.json' in the working directory.
    """
    d = {}
    rejInput = []

    for ingInput in combs:
        document,cosineList = topCosine(ingInput,NMF_matrix)
        recipes, _ = topIng(cosineList,recipeDF)
        # Called with two arguments, topIng filters against the module-level
        # `ingInput` rather than this loop's combination, so redo the
        # containment filter here on the unfiltered `recipes`.
        finalList = [r for r in recipes
                     if r[2] and all(ing in r[2][0] for ing in ingInput)]
        nutrList = nutrPref(pref,finalList)
        userdocFull,usercosListFull = usertopCos(user2IngList,NMF_matrix,nutrList)
        userList = userfinalList(nutrList,usercosListFull,nutrW,bookmarkW)

        if not userList:
            # No recipe satisfies both the ingredients and the preference.
            rejInput.append(ingInput)
            print(ingInput)
            continue

        try:
            recipeMax = max(userList)[1]
            recipeName = str(recipeDF.title.loc[recipeMax][0])
            ingName = ','.join(ingInput)
            d[ingName] = recipeName
            print(recipeName)

        except Exception:
            # Still best-effort (a failed lookup rejects the combination), but a
            # bare `except:` would also swallow KeyboardInterrupt and SystemExit.
            rejInput.append(ingInput)
            print(ingInput)

    with open('recipes.json', 'w') as fp:
        json.dump(d, fp, sort_keys=True, indent=4)

    return d, rejInput

In [284]:
# NOTE(review): `combs` is not defined in any visible cell of this notebook; it
# must exist in the kernel before this cell runs.
recipeDict, rejected = combRecipe(combs,"glutenFree",user2IngList,0.8,0.2)