In [77]:
# Based on https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/
import pandas as pd 
import numpy as np

In [78]:
# Load the (ingredient, title) pairs; the first CSV column becomes the row index.
df = pd.read_csv('ingredients.csv', index_col=0)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,ingredient,title
0,tomate,Galinha com quiabo
1,frango,Galinha com quiabo
2,cebola,Galinha com quiabo
3,alho,Galinha com quiabo
4,azeite,Galinha com quiabo


In [79]:
# Distinct recipe titles and ingredient names. A name's position in its list is
# reused by later cells as its row (recipe) or column (ingredient) index.
recipes = list(df['title'].unique())
ingredients = list(df['ingredient'].unique())
n_recipes, n_ingredients = len(recipes), len(ingredients)
print('{} recipes'.format(n_recipes))
print('{} ingredients'.format(n_ingredients))

60 recipes
88 ingredients


In [80]:
# Binary recipe x ingredient matrix: ratings[r, i] == 1 when recipe r lists
# ingredient i, 0 otherwise.
#
# Positions are resolved through dictionaries built once, instead of calling
# list.index() (a linear scan) for every row of the frame. The former
# `i_index <= n_ingredients` guard was always true, so it has been dropped.
ingredient_pos = {name: pos for pos, name in enumerate(ingredients)}
recipe_pos = {title: pos for pos, title in enumerate(recipes)}

ratings = np.zeros((n_recipes, n_ingredients))
for title, ingredient in zip(df['title'], df['ingredient']):
    ratings[recipe_pos[title], ingredient_pos[ingredient]] = 1
ratings

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [81]:
# Percentage of cells in `ratings` that are non-zero. The variable and the
# printed label say "sparsity", but the number is the filled fraction.
sparsity = np.count_nonzero(ratings) / ratings.size * 100
print('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 9.32%


In [82]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between the rows or the columns of ``ratings``.

    Parameters
    ----------
    ratings : 2-D array
        Matrix supporting ``.dot`` and ``.T`` (a NumPy array in this notebook).
    kind : {'user', 'item'}
        'user' compares rows with each other (``ratings . ratings.T``);
        'item' compares columns with each other (``ratings.T . ratings``).
    epsilon : float
        Small constant added to every dot product so that an all-zero row or
        column does not cause a division by zero during normalisation.

    Returns
    -------
    Square matrix whose entry (a, b) is the dot product of vectors a and b
    (plus epsilon) divided by the square roots of the two matching diagonal
    entries.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously fell through and failed later with an unbound `sim`.
        raise ValueError(
            "kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared length (plus epsilon); keep it
    # as a 1 x n row so it broadcasts across columns, and its transpose across rows.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return sim / norms / norms.T

# Time one call with the default kind='user'; the returned matrix is shown as the cell output.
%time fast_similarity(ratings)

CPU times: user 432 µs, sys: 778 µs, total: 1.21 ms
Wall time: 1.39 ms


array([[1.        , 0.42640143, 0.375     , ..., 0.47140452, 0.40089186,
        0.26726124],
       [0.42640143, 1.        , 0.42640143, ..., 0.30151134, 0.22792115,
        0.56980288],
       [0.375     , 0.42640143, 1.        , ..., 0.47140452, 0.13363062,
        0.26726124],
       ...,
       [0.47140452, 0.30151134, 0.47140452, ..., 1.        , 0.12598816,
        0.12598816],
       [0.40089186, 0.22792115, 0.13363062, ..., 0.12598816, 1.        ,
        0.28571429],
       [0.26726124, 0.56980288, 0.26726124, ..., 0.12598816, 0.28571429,
        1.        ]])

In [83]:
def predict_fast_simple(ratings, similarity, kind='user'):
    """Predict every cell of ``ratings`` as a similarity-weighted average.

    Parameters
    ----------
    ratings : 2-D array
        The matrix to predict; rows are 'users', columns are 'items'.
    similarity : 2-D square array
        Row-to-row similarities when ``kind='user'``, column-to-column
        similarities when ``kind='item'``.
    kind : {'user', 'item'}
        Which axis ``similarity`` refers to.

    Returns
    -------
    Array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'user' nor 'item' (this used to return None
        silently).
    """
    # Sum of absolute similarities per row of `similarity`, used as the
    # normaliser in both branches.
    weight_totals = np.abs(similarity).sum(axis=1)
    if kind == 'user':
        # Column vector: row u of the product is divided by weight_totals[u].
        return similarity.dot(ratings) / np.array([weight_totals]).T
    elif kind == 'item':
        # Row vector: column j of the product is divided by weight_totals[j].
        return ratings.dot(similarity) / np.array([weight_totals])
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))

# Row-wise ('user') similarity: compares the rows of `ratings` with each other.
user_similarity = fast_similarity(ratings, kind='user')
%time predict_fast_simple(ratings, user_similarity, kind='user')

# Column-wise ('item') similarity: compares the columns of `ratings` with each other.
# Only this last expression's value is displayed as the cell output.
item_similarity = fast_similarity(ratings, kind='item')
%time predict_fast_simple(ratings, item_similarity, kind='item')

CPU times: user 501 µs, sys: 514 µs, total: 1.01 ms
Wall time: 557 µs
CPU times: user 471 µs, sys: 256 µs, total: 727 µs
Wall time: 406 µs


array([[0.28900754, 0.3582252 , 0.21897371, ..., 0.24818596, 0.14704109,
        0.14704109],
       [0.20804584, 0.41210679, 0.2431682 , ..., 0.18202986, 0.08500485,
        0.08500485],
       [0.26087036, 0.17620441, 0.21932061, ..., 0.21770447, 0.03862996,
        0.03862996],
       ...,
       [0.27066903, 0.36131916, 0.21174399, ..., 0.99999998, 0.03862996,
        0.03862996],
       [0.13524304, 0.12416851, 0.11521412, ..., 0.04226558, 0.99999999,
        0.99999999],
       [0.1535271 , 0.1758594 , 0.1520718 , ..., 0.04226558, 0.08500485,
        0.08500485]])

In [84]:
# Reverse lookup: position in `ingredients` -> ingredient name.
idx_to_ingredients = dict(enumerate(ingredients))

idx_to_ingredients

{0: 'tomate',
 1: 'frango',
 2: 'cebola',
 3: 'alho',
 4: 'azeite',
 5: 'quiabo',
 6: 'milho',
 7: 'sal',
 8: 'maionese',
 9: 'manteiga',
 10: 'ketchup',
 11: 'mostarda',
 12: 'cogumelo',
 13: 'leite',
 14: 'batata',
 15: 'carne',
 16: 'palmito',
 17: 'oleo',
 18: 'salsinha',
 19: 'parmesao',
 20: 'mussarela',
 21: 'farinha',
 22: 'trigo',
 23: 'fermento',
 24: 'acucar',
 25: 'fuba',
 26: 'cenoura',
 27: 'ovo',
 28: 'pimento',
 29: 'pimenta',
 30: 'oregano',
 31: 'tomilho',
 32: 'cominho',
 33: 'louro',
 34: 'cabeca',
 35: 'pimentao',
 36: 'coentro',
 37: 'limao',
 38: 'mandioca',
 39: 'camarao',
 40: 'suco',
 41: 'cebolinha',
 42: 'merluza',
 43: 'picadinho',
 44: 'azeitona',
 45: 'shoyu',
 46: 'alcaparra',
 47: 'papel',
 48: 'espaguete',
 49: 'bacon',
 50: 'presunto',
 51: 'verde',
 52: 'manjericao',
 53: 'macarrao',
 54: 'linguica',
 55: 'margarina',
 56: 'requeijao',
 57: 'minas',
 58: 'lentilha',
 59: 'arroz',
 60: 'calabresa',
 61: 'salsa',
 62: 'preto',
 63: 'porco',
 64: 'coste

In [85]:
def top_k_ingredients(similarity, mapper, ingredient_idx, k=10):
    """Return the names of the k entries most similar to ``ingredient_idx``.

    Row ``ingredient_idx`` of ``similarity`` is ranked from highest to lowest
    score and the first ``k`` positions are translated through ``mapper``
    (position -> name). The queried entry itself is not excluded.
    """
    ascending = np.argsort(similarity[ingredient_idx, :])
    best_first = ascending[::-1][:k]
    names = []
    for position in best_first:
        names.append(mapper[position])
    return names

In [86]:
# Look up the position of "frango" and list the ingredients ranked closest to it
# by the item similarity matrix (the query ingredient itself is included in the ranking).
idx = ingredients.index("frango") 
predicted_ingredients = top_k_ingredients(item_similarity, idx_to_ingredients, idx)
predicted_ingredients

['frango',
 'cebola',
 'alho',
 'milho',
 'manteiga',
 'gergelim',
 'batata',
 'sal',
 'azeite',
 'maionese']

In [87]:
from sklearn.metrics import pairwise_distances

# pairwise_distances yields a correlation *distance* between the columns of
# `ratings` (rows of ratings.T); subtracting it from 1 turns it back into a similarity.
item_correlation = 1 - pairwise_distances(ratings.T, metric='correlation')
# Entries that came back as NaN are replaced by 0.
item_correlation = np.where(np.isnan(item_correlation), 0., item_correlation)

In [88]:
# Same query as the earlier "frango" lookup, this time ranked with the
# correlation-based similarity matrix instead of item_similarity.
idx = ingredients.index("frango") 
predicted_ingredients = top_k_ingredients(item_correlation, idx_to_ingredients, idx)
display(predicted_ingredients)

['frango',
 'milho',
 'gergelim',
 'manteiga',
 'farinha',
 'maionese',
 'batata',
 'azeitona',
 'requeijao',
 'shoyu']

In [93]:
# Append one extra row describing a new recipe and find the existing recipes
# whose ingredient sets are most similar to it.

# Name -> position lookups built once (list.index() is a linear scan per call).
ingredient_pos = {name: pos for pos, name in enumerate(ingredients)}
recipe_pos = {title: pos for pos, title in enumerate(recipes)}

# Rows 0 .. n_recipes-1 are the known recipes; row n_recipes is the new one.
ratings = np.zeros((n_recipes + 1, n_ingredients))
for title, ingredient in zip(df['title'], df['ingredient']):
    ratings[recipe_pos[title], ingredient_pos[ingredient]] = 1

new_recipe = ["badejo", "alcaparra"]
for ingredient in new_recipe:
    if ingredient not in ingredient_pos:
        # Same exception type list.index() raised, with an actionable message.
        raise ValueError(
            "unknown ingredient {!r}: it has no column in the ratings matrix".format(ingredient))
    ratings[n_recipes, ingredient_pos[ingredient]] = 1

user_similarity = fast_similarity(ratings, kind='user')

def get_recipe(idx):
    """Return the title stored at row ``idx``, or "New Recipe" for the appended row."""
    if idx >= len(recipes):
        return "New Recipe"
    else:
        return recipes[idx]

k = 3
idx = n_recipes # New recipe
print("Similar to", idx, new_recipe)

# Row positions ordered from most to least similar to the new recipe.
most_similar = np.flip(np.argsort(user_similarity[idx,:]))

# Drop the new recipe itself first and only then keep k entries, so exactly k
# neighbours are returned wherever the new row lands in the ranking.
similar_users = [get_recipe(i) for i in most_similar if i != idx][:k]
print(similar_users)
df[df['title'].isin(similar_users)].set_index('title').sort_values('title')

Similar to 60 ['badejo', 'alcaparra']
['Peixe à Belle Meunière', 'Peixe no papelote', 'Torresmo sequinho sem estouro']


Unnamed: 0_level_0,ingredient
title,Unnamed: 1_level_1
Peixe no papelote,merluza
Peixe no papelote,alho
Peixe no papelote,cebola
Peixe no papelote,alcaparra
Peixe no papelote,limao
Peixe no papelote,sal
Peixe no papelote,papel
Peixe à Belle Meunière,alho
Peixe à Belle Meunière,champignon
Peixe à Belle Meunière,alcaparra
