# Recommendations based on item similarity 
A simple recommender based on ingredient tokenized tfidf.  

In [3]:
import pymongo 
import cPickle
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
from sklearn.decomposition import NMF, PCA, TruncatedSVD 
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

client = pymongo.MongoClient("mongodb://localhost/27017")

ar_db = client.allrecipes

ar_db.members.find_one().keys()

[u'nutrition',
 u'followers_dict',
 u'reviews_dict',
 u'following_id_list',
 u'madeits_recipe_id_list',
 u'favorites_dict',
 u'reviews_recipe_id_list',
 u'nutrition_avg_vals',
 u'following_dict',
 u'member_ID',
 u'favorites_recipe_id_list',
 u'_id',
 u'aboutme',
 u'madeits_dict',
 u'followers_id_list']

In [29]:
# Previously tokenized recipe corpus
# dict of recipe_ID:list of preprocessed, tokenized ingredient phrases
with open('../data/allrecipes/recipe_tokens.pkl', 'rb') as f: 
    recipe_tokens = cPickle.load(f)

In [30]:
recipe_tokens.items()[0]

(u'73468',
 [[u'water', u',', u'divided'],
  [u'plain', u'yogurt', u'with', u'active', u'cultures'],
  [u'dry', u'milk', u'powder']])

In [32]:
import itertools
recipe_tokens = [(a[0], list(itertools.chain(*[b for b in a[1]]))) \
                 for a in recipe_tokens.items()]

In [33]:
recipe_tokens[0]

(u'73468',
 [u'water',
  u',',
  u'divided',
  u'plain',
  u'yogurt',
  u'with',
  u'active',
  u'cultures',
  u'dry',
  u'milk',
  u'powder'])

In [34]:
len(recipe_tokens)

11111

In [46]:
recipe_tokens[3][1]

u'chopped'

In [49]:
X = [" ".join(a[1]) for a in recipe_tokens]

In [50]:
X[0]

u'water , divided plain yogurt with active cultures dry milk powder'

In [94]:
# instantiate stemmer, analyzer 
stemmer = nltk.SnowballStemmer("english")
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))


# instantiate vectorizer for ingredients
vectorizer = TfidfVectorizer(stop_words='english', 
                             strip_accents='ascii',
                             analyzer=stemmed_words)


In [96]:
X_vectorized = vectorizer.fit_transform(X)

In [53]:
features = vectorizer.get_feature_names()

In [56]:
# Use sklearn's pairwise_distances
from sklearn.metrics.pairwise import pairwise_distances

In [57]:
# cosine similarity as metric
item_similarity_cosine = pairwise_distances(X_vectorized, metric='cosine')

In [58]:
item_similarity_cosine.shape

(11111, 11111)

In [59]:
# Get index of 5 most similar members
item_similarity_cosine[1].argsort()[:-1][:6]

array([    1,  5344,  9927,  7952, 10567, 10601])

In [77]:
# Get similar recipes of Coq au vin
recipe = 9927
print "For recipe {}:".format(ar_db.recipes.find_one({"recipe_ID":recipe_tokens[recipe][0]})['name'])
for i,a in enumerate(item_similarity_cosine[recipe].argsort()[:-1][1:6]):
    print "   Similar recipe #{} is {}".format(i+1,ar_db.recipes.find_one({"recipe_ID":recipe_tokens[a][0]})['name'])

For recipe [u'Coq Au Vin with Rosemary and Thyme']:
   Similar recipe #1 is [u'Buffalo Veggie Quinoa Meatloaf']
   Similar recipe #2 is [u'Skillet-Braised Brussels Sprouts']
   Similar recipe #3 is [u"Cary's Cast Iron Skillet Chicken Recipe"]
   Similar recipe #4 is [u'Eggs Poached in Tomato Sauce']
   Similar recipe #5 is [u"Chef John's Panzanella"]


In [78]:
recipe = 85
print "For recipe {}:".format(ar_db.recipes.find_one({"recipe_ID":recipe_tokens[recipe][0]})['name'])
for i,a in enumerate(item_similarity_cosine[recipe].argsort()[:-1][1:6]):
    print "   Similar recipe #{} is {}".format(i+1,ar_db.recipes.find_one({"recipe_ID":recipe_tokens[a][0]})['name'])

For recipe [u'BBQ NY Strip']:
   Similar recipe #1 is [u'New York Strip Chicago Style']
   Similar recipe #2 is [u'Thyme-Rubbed Steaks with Sauteed Mushrooms']
   Similar recipe #3 is [u'Xavier Steak']
   Similar recipe #4 is [u'Marinated Flank Steak']
   Similar recipe #5 is [u'Chimichurri Sauce']


In [79]:
recipe = 99
print "For recipe {}:".format(ar_db.recipes.find_one({"recipe_ID":recipe_tokens[recipe][0]})['name'])
for i,a in enumerate(item_similarity_cosine[recipe].argsort()[:-1][1:6]):
    print "   Similar recipe #{} is {}".format(i+1,ar_db.recipes.find_one({"recipe_ID":recipe_tokens[a][0]})['name'])

For recipe [u'Garlic Teriyaki Edamame']:
   Similar recipe #1 is [u'Simple Roasted Edamame ']
   Similar recipe #2 is [u'Sesame Broccoli Salad']
   Similar recipe #3 is [u'Sesame Noodles']
   Similar recipe #4 is [u'Easy Grilled Chicken Teriyaki']
   Similar recipe #5 is [u'Spicy Chinese Mustard Green Beans ']


They look ok. 

## Todo: 
1. Given a member's recipe history, recommend based on item similarity.  

In [None]:
recipe_tokens_dict = 

In [80]:
member_recipes = [(a['member_ID'], a['madeits_recipe_id_list']+a['favorites_recipe_id_list']) for a in ar_db.members.find()]

member_recipes[0]

(u'17117019', [u'26297', u'26297', u'10462', u'10462'])

In [82]:
with open('../data/allrecipes/recipe_tokens.pkl', 'rb') as f: 
    recipe_tokens_dict = cPickle.load(f) 

In [84]:
member_ingredients = [(a[0], [recipe_tokens_dict.get(b, "") for b in a[1]]) for a in member_recipes]

In [85]:
# stackoverflow solution for flattening lists of lists 
def flatten(items, seqtypes=(list, tuple)):
    for i, x in enumerate(items):
        while i < len(items) and isinstance(items[i], seqtypes):
            items[i:i+1] = items[i]
    return items

In [86]:
member_ingredient_tokens = [(a[0], " ".join(flatten(a[1]))) for a in member_ingredients]

In [87]:
member_ingredient_tokens[0]

(u'17117019',
 u') package rotini pasta vegetable oil fresh lime juice chili powder , or to taste ground cumin salt garlic , crushed whole kernel corn ) can black beans , drained and rinsed diced green bell pepper diced red bell pepper fresh cilantro leaves chopped roma tomatoes ) package rotini pasta vegetable oil fresh lime juice chili powder , or to taste ground cumin salt garlic , crushed whole kernel corn ) can black beans , drained and rinsed diced green bell pepper diced red bell pepper fresh cilantro leaves chopped roma tomatoes white sugar baking powder all-purpose flour shortening egg salt ground cinnamon fresh blueberries white sugar cornstarch white sugar baking powder all-purpose flour shortening egg salt ground cinnamon fresh blueberries white sugar cornstarch')

In [88]:
member_ingredients = [a[1] for a in member_ingredient_tokens]

In [97]:
# use previous vectorizer vocabularies 
member_ing_vectorized = vectorizer.transform(member_ingredients)

In [None]:
# cosine similarity as metric
item_similarity_cosine = pairwise_distances(member_ing_vectorized, metric='cosine')

In [98]:
member_ing_vectorized.shape

(1430, 2791)

In [146]:
from scipy.spatial.distance import cosine
def recommend_recipe(member_rec_tokenized):
    """Given a tokenized list of ingedients, return most similar item"""
    rec_vectorized = vectorizer.transform(member_rec_tokenized)
    distances = np.zeros(X_vectorized.shape[0])
    for i,v in enumerate(X_vectorized): 
        distances[i] = cosine(rec_vectorized.todense(), v.todense())
    return distances.argsort()[:-10:-1]

In [139]:
member_ing_vectorized[0]

<1x2791 sparse matrix of type '<type 'numpy.float64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [154]:
print "for memeber ", member_ingredient_tokens[1][0]
for i,a in enumerate(recommend_recipe([member_ingredient_tokens[1][0]])):
    print "   Recommended recipe #{} is {}".format(i+1,ar_db.recipes.find_one({"recipe_ID":recipe_tokens[a][0]})['name'])

for memeber  5049040
   Recommended recipe #1 is []
   Recommended recipe #2 is [u'California Chicken Spaghetti']
   Recommended recipe #3 is [u'Vegetarian Four Cheese Lasagna']
   Recommended recipe #4 is [u'Italian Stewed Tomatoes']
   Recommended recipe #5 is [u'Burgundy Pork Tenderloin']
   Recommended recipe #6 is [u'Tamale Casserole']
   Recommended recipe #7 is [u'Lavender Tea Bread']
   Recommended recipe #8 is [u"Quick Hoppin' John Soup-KT"]
   Recommended recipe #9 is [u'Comfort Breakfast Bake']


In [161]:
ar_db.members.find_one({'member_ID':'5049040'},['madeits_dict','favorites_dict'])

{u'_id': ObjectId('58d1a3f99d477a1e01c32ea0'),
 u'favorites_dict': {u'cashew-chicken-with-water-chestnuts': u'162392',
  u'cauliflower-mac-and-cheese-bake': u'246102',
  u'dads-caesar-salad': u'64559440',
  u'doctor-bird-cake': u'20906',
  u'dorm-room-chili-mac': u'139471',
  u'garlicky-tortellini-soup-with-sausage-tomatoes-and-spinach': u'62177657',
  u'goat-stew': u'228238',
  u'homemade-browning-sauce': u'64360001',
  u'irish-potato-nachos': u'235983',
  u'italian-spinach-sausage-pie': u'222705',
  u'mexican-shepherds-pie': u'68806',
  u'mommas-hot-dog-casserole': u'231624',
  u'nanny-129s-roasted-potatoes': u'64168799',
  u'nunus-and-hot-dogs': u'161522',
  u'parmesan-crusted-au-gratin-potatoes-and-onion': u'234645',
  u'pork-sausages-with-caramelized-onion-sauce': u'89948',
  u'shyams-goat-biryani': u'237347',
  u'vegan-and-gluten-free-broth-powder': u'240511',
  u'wet-meat': u'62937442',
  u'zimbabwean-chicken-and-vegetable-soup': u'219483'},
 u'madeits_dict': {u' BBQ NY Strip ':

In [162]:
# Looks good.  Make sure recommended recipes aren't ones the member alraedy made.  