## Food Mama project: 

In [1]:
import pandas as pd
import numpy as np
import os, io, re
import mama
import matplotlib.pyplot as plt
%matplotlib inline

### Loading the Datasets into Pandas dataframe

In [2]:
path = "datasets/"
# os.listdir() gives no ordering guarantee; sort so the load order and the
# printed list are the same on every platform.
files = sorted(os.listdir(path))

# Read every "<name>_*.csv" file into a DataFrame keyed by <name>
# (the part of the file name before the first underscore).
datasets = {}
for file in files:
    if file.endswith('.csv'):
        name = file.split("_")[0]
        if not name.isidentifier():
            raise ValueError("Cannot derive a variable name from file: " + file)
        datasets[name] = pd.read_csv(os.path.join(path, file))

# Names of the dataframes that were loaded, in load order.
dfs = list(datasets)

# Later cells refer to the frames by bare name (food, item, recipe), so they
# are published as notebook-level variables. Binding them explicitly avoids
# exec() on a string assembled from file names, which would execute whatever
# text a file name happens to contain.
globals().update(datasets)

print("Name of the dataframes created: ", *dfs, sep="\n")

Name of the dataframes created: 
category
food
item
recipe


### Run `mama.preprocessing` to modify and clean the dataframes (drop unnecessary columns, rename columns, replace index with "id", fill in missing data, and change data types).

In [3]:
# Only the food, item and recipe frames are passed on; the category frame
# loaded above is not used here.
dataframes = [food, item, recipe]
# The result is unpacked into three frames, presumably in the same order as
# the inputs -- TODO confirm against mama.preprocessing.
foods, items, recipes = mama.preprocessing(dataframes)
recipes.head()

Unnamed: 0,recipe_id,title,servings,ingredients,instructions,created_at,updated_at,origin,link,rating
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",2017-12-14 14:56:37.166524,2018-10-05 09:16:42.390163,www.marmiton.org,http://www.marmiton.org/recettes/recette_salad...,limit
1,10,Ciabattina al pesto,1,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,2017-12-14 14:56:46.270433,2018-10-05 09:16:42.45289,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit
2,402,Tortilla aux champignons et salade,4,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,2017-12-15 16:41:37.056079,2018-10-05 09:16:42.505825,www.mangerbouger.fr,http://www.mangerbouger.fr/Manger-Mieux/Recett...,limit
3,2,Steak haché et pâtes,1,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",2017-12-13 16:17:24.125137,2018-10-05 09:16:42.55582,www.wecook.fr,https://www.wecook.fr/recette/steak-hache-et-p...,good
4,4,Beef Bagel,1,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,2017-12-13 17:13:29.069001,2018-10-05 09:16:42.597349,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit


## Text Cleaning

In [4]:
recipes_text = recipes.drop(['servings', 'created_at', 'updated_at', 'origin', 'rating'], axis=1)

def site_name(url):
    """Reduce a recipe URL to a bare site name.

    Drops the scheme, the path, the last dot-separated label (the TLD) and a
    leading "www.", then removes hyphens and lowercases the rest, e.g.
    "http://www.club-sandwich.net/mobile/fiche.php" -> "clubsandwich".
    """
    domain = url.split("//")[-1].split("/")[0].rsplit(".", 1)[0]
    if domain.startswith('www'):
        domain = domain.split("www.")[-1]
    return domain.replace('-', '').lower()

# Keep only web site name in link column.
# Series.map keeps every value on its own row whatever the frame's index is;
# writing with .loc[position] is only correct for a default 0..n-1 index.
recipes_text["link"] = recipes_text["link"].map(site_name)

recipes_text.head()

Unnamed: 0,recipe_id,title,ingredients,instructions,link
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",marmiton
1,10,Ciabattina al pesto,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,clubsandwich
2,402,Tortilla aux champignons et salade,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,mangerbouger
3,2,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",wecook
4,4,Beef Bagel,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,clubsandwich


- Functions to clean French text for tokenization

In [5]:
from bs4 import BeautifulSoup
from string import punctuation
import unicodedata
import nltk

# Read the list of all French stop words such as 'le', 'au'
filename = "french_stopwords.txt"
with io.open(filename, 'r', encoding='utf-8') as f:
    french = f.read()

# The file content is split on commas. Each entry is stripped so that a space
# after a comma or the trailing newline cannot leave a stop word that never
# equals a token produced by str.split(); empty entries are skipped.
fr_stopwords = [w.strip() for w in french.split(",") if w.strip()]
# NOTE(review): the cleaning functions strip accents before comparing tokens
# with this list, so accented entries here will not match -- confirm whether
# the file is stored accent-free.

#nltk_stopwords = nltk.corpus.stopwords.words('french')
stemmer = nltk.stem.SnowballStemmer('french')

def clean_text(text):
    """Normalise French free text into space-separated plain ASCII tokens.

    Steps, in order: drop HTML markup, lowercase, expand ligatures, strip
    accents, replace every character other than a-z and "_" with a space,
    then discard stop words and tokens of fewer than three characters.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # An explicit parser keeps the result independent of which optional
    # parsers are installed and avoids the "no parser specified" warning.
    txt = BeautifulSoup(text, "html.parser").get_text().lower()
    # special characters: expand ligatures BEFORE the ASCII round-trip below.
    # They have no Unicode decomposition, so encode('ascii', 'ignore') would
    # delete them outright ("œuf" -> "uf") and a later replace would never
    # see them. The text is already lowercase, so lowercase expansions are used.
    txt = txt.replace('œ','oe').replace('æ','ae').replace('ᴁ','ae')
    # remove accent
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore').decode("utf-8")
    # remove non alphanumeric char (or with number r'\W+')
    txt = re.sub(r'[^a-z_]', ' ', txt)
    # remove french stop words (set lookup instead of scanning the list per token)
    stop_words = set(fr_stopwords)
    tokens = [w for w in txt.split() if (w not in stop_words) and (len(w)>2)]
    # french stemming
    #tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

def clean_name(name):
    """Reduce a name to lowercase ASCII letters, digits and spaces.

    Drops HTML markup, expands ligatures, strips accents, deletes every
    character other than space, a-z and 0-9, and discards words of fewer than
    three characters. The input is not lowercased here: uppercase letters are
    outside the kept set and are deleted, so callers pass lowercase text.

    Parameters
    ----------
    name : str
        Name to clean.

    Returns
    -------
    str
        The kept words joined by single spaces.
    """
    # An explicit parser keeps the result independent of which optional
    # parsers are installed and avoids the "no parser specified" warning.
    n = BeautifulSoup(name, "html.parser").get_text()
    # Expand ligatures BEFORE the ASCII round-trip: they have no Unicode
    # decomposition, so encode('ascii', 'ignore') would delete them
    # ("œuf" -> "uf") and a later replace would never see them.
    n = n.replace('œ','oe').replace('æ','ae').replace('ᴁ','AE').replace('Œ','OE')
    # Strip accents: decompose, then drop everything that is not ASCII.
    n = unicodedata.normalize('NFD', n).encode('ascii', 'ignore').decode("utf-8")
    n = re.sub(r'[^ a-z0-9]', '', n)
    words = [w for w in n.split() if len(w) > 2]
    return ' '.join(words)

def clean_ingredient(val):
    """Collapse one ingredient name into a single lowercase ASCII token.

    Drops HTML markup, lowercases, expands ligatures, strips accents, replaces
    every character other than space and a-z with a space, removes stop words
    and concatenates the remaining words WITHOUT a separator, so a multi-word
    ingredient becomes one token (e.g. "tomate cerise" -> "tomatecerise").

    Parameters
    ----------
    val : str
        Ingredient name.

    Returns
    -------
    str
        The fused token; empty when no word is kept.
    """
    # An explicit parser keeps the result independent of which optional
    # parsers are installed and avoids the "no parser specified" warning.
    v = BeautifulSoup(val, "html.parser").get_text().lower()
    # Expand ligatures BEFORE the ASCII round-trip: they have no Unicode
    # decomposition, so encode('ascii', 'ignore') would delete them
    # ("œuf" -> "uf") and a later replace would never see them. The text is
    # already lowercase, so lowercase expansions are used.
    v = v.replace('œ','oe').replace('æ','ae').replace('ᴁ','ae')
    # Strip accents: decompose, then drop everything that is not ASCII.
    v = unicodedata.normalize('NFD', v).encode('ascii', 'ignore').decode("utf-8")
    v = re.sub(r'[^ a-z]', ' ', v)
    token = [w for w in v.split() if w not in fr_stopwords]
    return ''.join(token)

- Convert the recipe title to distinct names for better vectorization

In [6]:
# Linking words removed from titles. The keys are applied as regular
# expressions and include their surrounding spaces, so the neighbouring words
# are fused ("jambon de bayonne" -> "jambonbayonne").
dic_word = {' des ':'', ' de la ':'', " de l'":'', ' de ':'', ' du ':'', " d'":'',
             ' aux ':'', ' au ':'', ' al ':'', ' à la ':'', " à l'":'', ' à ':''}

def title_to_names(title):
    """Turn one recipe title into a space-separated string of distinct names.

    The title is split on commas and on " et "; in every part the linking
    words of ``dic_word`` are removed, the first remaining space is dropped
    (fusing the first two words) and the rest is split on spaces. The
    collected words are finally passed through ``clean_name``.
    """
    words = []
    for part in title.split(","):
        chunks = pd.Series(part.strip(" ").lower().split(" et ")).replace(dic_word, regex=True)
        for chunk in chunks:
            words.extend(chunk.replace(' ', '', 1).split(" "))
    return clean_name(" ".join(words))

# Series.map keeps every value on its own row whatever the frame's index is;
# writing with .loc[position] is only correct for a default 0..n-1 index.
recipes_text["title"] = recipes_text["title"].map(title_to_names)

recipes_text.head()

Unnamed: 0,recipe_id,title,ingredients,instructions,link
0,9,salademache jambonbayonne mozzarella,4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",marmiton
1,10,ciabattinapesto,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,clubsandwich
2,402,tortillachampignons salade,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,mangerbouger
3,2,steakhache pates,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",wecook
4,4,beefbagel,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,clubsandwich


- Replace `ingredients` column values with only ingredient names using `meta_data` table

In [8]:
tables = [recipes, items, foods]
meta_data = mama.merging(tables)

def join_ingredients(names):
    """Clean every ingredient name of one recipe and join them with single spaces."""
    return " ".join(clean_ingredient(name) for name in names)

# One cleaned ingredient string per recipe_id found in meta_data. Grouping
# keeps the names as separate values, so a name containing ";" is no longer
# split, and no array sized by the largest recipe_id is needed.
ingredient_text = meta_data.groupby("recipe_id")["name"].agg(join_ingredients)

# Look each recipe up by its recipe_id. Recipes without any meta_data row keep
# their original ingredients text, and a recipe_id that exists only in
# meta_data is simply ignored instead of raising IndexError.
cleaned = recipes_text["recipe_id"].map(ingredient_text)
recipes_text["ingredients"] = cleaned.fillna(recipes_text["ingredients"])

recipes_text.head()

Unnamed: 0,recipe_id,title,ingredients,instructions,link
0,9,salademache jambonbayonne mozzarella,mache tomatecerise mozzarella jamboncru crouto...,"Après avoir coupé en dés la mozzarella, couper...",marmiton
1,10,ciabattinapesto,painciabattina jambonparme tomateconfite pesto...,Ouvrir le pain Ciabattina en deux et tartiner ...,clubsandwich
2,402,tortillachampignons salade,champignonparis oeuf haricotsverts feuillebric...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,mangerbouger
3,2,steakhache pates,huileolive steakhache pates,"Dans une casserole, portez à ébullition un gra...",wecook
4,4,beefbagel,tomatecerise roquette sel huileolive bagel car...,Placer les tranches de carpaccio dans un plat ...,clubsandwich


- Extract keywords from the recipe instructions

In [9]:
from collections import Counter
# Frequent cooking verbs and utensils that carry no information about the dish.
stop_tokens = ['ajouter','ajoutez','assiette','couper','coupez','chauffer','chauffez','cuire','casserole','commencer','decoupez','deposer','disposer',
               'deposez','decouper','eplucher','epluchez','etaler','etalez','egouttez','feu','four','mettre','revenir','rechauffez','cuisson','rajouter',
               'laisser','laissez','lavez','laver','min','minutes','melangez','melanger','morceaux','nettoyer','nettoyez','portez','prechauffer',
               'ouvrir','ouvrez','porter','plat','poele','passez','passer','temps','tranche','tranches','temperature','verser','versez','saladier']

def top_keywords(cleaned, limit=10):
    """Return the ``limit`` most frequent words of ``cleaned`` that are not stop tokens.

    Words are ranked by descending count; words with equal counts keep their
    order of first appearance. The result is a space-separated string.
    """
    counts = Counter(cleaned.split())
    ranked = [word for word, _ in counts.most_common() if word not in stop_tokens]
    return " ".join(ranked[:limit])

# Cleaned instruction text, one entry per recipe. It is built as a fresh array
# instead of writing into recipes_text.instructions.values: that array aliases
# the frame's data (silent in-place mutation) and is read-only under pandas
# Copy-on-Write.
key_array = np.array([clean_text(text) for text in recipes_text.instructions], dtype="object")

# Assigning a list is positional, so every row gets its own keywords whatever
# the frame's index is.
recipes_text["instructions"] = [top_keywords(cleaned) for cleaned in key_array]

recipes_text.head()

Unnamed: 0,recipe_id,title,ingredients,instructions,link
0,9,salademache jambonbayonne mozzarella,mache tomatecerise mozzarella jamboncru crouto...,coupe mozzarella tomates cerises jambon bayonn...,marmiton
1,10,ciabattinapesto,painciabattina jambonparme tomateconfite pesto...,pain copeaux parmesan ciabattina tartiner pest...,clubsandwich
2,402,tortillachampignons salade,champignonparis oeuf haricotsverts feuillebric...,omelette feuille brick lamelles fines champign...,mangerbouger
3,2,steakhache pates,huileolive steakhache pates,pates steak hache filet huile olive ebullition...,wecook
4,4,beefbagel,tomatecerise roquette sel huileolive bagel car...,placer frais bagel moitie carpaccio arroser co...,clubsandwich


## `TfidfVectorizer` class that produces the TF-IDF matrix

- Compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each recipe

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(df, columns):
    """Build a TF-IDF matrix from the concatenated text of the given columns.

    For every row the values of ``columns`` are joined (each followed by a
    space), passed through ``clean_text`` and vectorized over uni- and
    bi-grams with sublinear term frequency, smoothed IDF and L2-normalised
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns.
    columns : iterable of str
        Names of the columns to concatenate, in order.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform``, one row per row
    of ``df``.
    """
    # Start from one empty string per row and append each column plus a space.
    documents = np.repeat("", len(df)).astype("object")
    for column in columns:
        documents = documents + (df[column].values + " ")

    # Clean every concatenated document in place.
    for position in range(len(documents)):
        documents[position] = clean_text(documents[position])

    # Define a TF-IDF Vectorizer Object (uni-grams and bi-grams)
    vectorizer_options = dict(
        min_df=1,
        stop_words=None,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=True,
        use_idf=True,
        ngram_range=(1,2),
    )
    tfidf_vec = TfidfVectorizer(**vectorizer_options)

    # Construct the required TF-IDF matrix by fitting and transforming the data
    return tfidf_vec.fit_transform(documents)

In [11]:
# Call function to generate TF-IDF matrix
# NOTE(review): `features` is not referenced again in the visible cells.
features = ['title', 'ingredients', 'instructions', 'link']
# Only these two columns are vectorized, and they are read from `recipes`
# (the preprocessed frame), not from the cleaned `recipes_text`.
columns = ('title', 'instructions')
tfidf_matrix = vectorize(recipes, columns)
print(tfidf_matrix.shape)

(1234, 20856)


- The TF-IDF vectors are L2-normalised, so their dot product (`linear_kernel`) directly gives the cosine similarity score

In [12]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# linear_kernel is a plain dot product; it equals the cosine similarity here
# because vectorize() builds the TF-IDF rows with norm='l2'.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map of indices and recipe titles
# Maps a recipe title to its index label in `recipes`; get_recommendations
# uses that value as a row position in the similarity matrices.
# NOTE(review): a title that occurs more than once makes the lookup return
# several rows -- confirm titles are unique.
indices = pd.Series(recipes.index, index = recipes['title'])

## Cosine Similarity matrix with `CountVectorizer`

In [13]:
# Create a new soup feature in dataframe
def create_soup(x):
    """Concatenate the title, ingredients, instructions and link of one row.

    ``x`` is any mapping-like row supporting ``x[field]``; the four fields are
    joined with single spaces, in that order.
    """
    soup = x['title']
    for field in ('ingredients', 'instructions', 'link'):
        soup = soup + " " + x[field]
    return soup

# One combined text document per recipe, built row by row with create_soup.
recipes_text['soup'] = recipes_text.apply(create_soup, axis=1)

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# stop_words=None: the vectorizer itself filters no stop words.
count_vec = CountVectorizer(stop_words=None)
count_matrix = count_vec.fit_transform(recipes_text['soup'])

In [14]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every recipe's soup with every other recipe's soup.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Construct a reverse map of indices and recipe titles
# This map is keyed by recipe_id rather than by title.
# NOTE(review): `indices2` is not referenced in the visible cells;
# get_recommendations looks titles up in `indices` for both models.
indices2 = pd.Series(recipes_text.index, index = recipes_text['recipe_id']) #title (.drop_duplicates())
# indices = pd.Series(df.index)  =>   idx = indices[indices == title].index[0]

In [15]:
recipes_text.head()

Unnamed: 0,recipe_id,title,ingredients,instructions,link,soup
0,9,salademache jambonbayonne mozzarella,mache tomatecerise mozzarella jamboncru crouto...,coupe mozzarella tomates cerises jambon bayonn...,marmiton,salademache jambonbayonne mozzarella mache tom...
1,10,ciabattinapesto,painciabattina jambonparme tomateconfite pesto...,pain copeaux parmesan ciabattina tartiner pest...,clubsandwich,ciabattinapesto painciabattina jambonparme tom...
2,402,tortillachampignons salade,champignonparis oeuf haricotsverts feuillebric...,omelette feuille brick lamelles fines champign...,mangerbouger,tortillachampignons salade champignonparis oeu...
3,2,steakhache pates,huileolive steakhache pates,pates steak hache filet huile olive ebullition...,wecook,steakhache pates huileolive steakhache pates p...
4,4,beefbagel,tomatecerise roquette sel huileolive bagel car...,placer frais bagel moitie carpaccio arroser co...,clubsandwich,beefbagel tomatecerise roquette sel huileolive...


## Recommendations

- Function that takes a recipe title as input and prints the most similar recipes

In [16]:
def get_recommendations(title, model=1, top_n=10):
    """Print the recipes most similar to the recipe with the given title.

    Parameters
    ----------
    title : str
        Recipe title exactly as stored in ``recipes['title']``.
    model : int, default 1
        1 ranks with ``cosine_sim``; 2 ranks with ``cosine_sim2``.
    top_n : int, default 10
        Number of recommendations to print.

    Returns
    -------
    None
        The ranked titles are printed, not returned.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``model`` is neither 1 nor 2.
    """
    # Get the index of the recipe that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # The title occurs more than once: use its first occurrence.
        idx = idx.iloc[0]

    # Pick the similarity matrix; fail clearly instead of hitting an
    # undefined local variable further down.
    if model == 1:
        similarity = cosine_sim
    elif model == 2:
        similarity = cosine_sim2
    else:
        raise ValueError("model must be 1 or 2, got {!r}".format(model))

    # Get the pairwise similarity scores of all recipes with that title.
    # The query recipe is excluded by position rather than by assuming it
    # ranks first, which does not hold when another recipe ties with it.
    sim_scores = [(i, score) for i, score in enumerate(similarity[idx]) if i != idx]

    # Sort the recipes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the recipe indices of the top_n most similar recipes
    top_indices = [i for i, _ in sim_scores[:top_n]]

    # Print the most similar recipes
    print("Similar recipes: \n----------------")
    for i, r in enumerate(recipes['title'].iloc[top_indices]):
        print('{:2d}) {}'.format(i+1, r))

    return None

# Demo:


In [17]:
# model=1 ranks with the TF-IDF similarity matrix (cosine_sim).
get_recommendations('Ciabattina al pesto', model=1)

Similar recipes: 
----------------
 1) Sandwich Antipasti
 2) Pesto Burger
 3) Croque-monsieur Roma
 4) Spaghetti, sauce pesto maison, jambon de parme
 5) Pâtes, sauce pesto, tomates séchées, jambon de parme, mozzarella
 6) Croque-monsieur à la Roquette
 7) Croque-monsieur Italia
 8) Pâtes, sauce pesto, parmesan, basilic
 9) Sandwich Grand Frais Roux
10) Bruschetta Mâche/Parmesan


In [18]:
# model=2 ranks with the count-based similarity matrix (cosine_sim2).
get_recommendations('Ciabattina al pesto', model=2)

Similar recipes: 
----------------
 1) Sandwich Antipasti
 2) Gnocchis au pesto
 3) Croque-monsieur Roma
 4) Pâtes, sauce pesto, parmesan, basilic
 5) Bruschetta Mâche/Parmesan
 6) Pâtes, sauce pesto, tomates séchées, jambon de parme, mozzarella
 7) Pâtes, sauce pesto, parmesan, salade verte, vinaigrette
 8) Pâtes, sauce pesto maison, parmesan, mozzarella, brocoli, basilic
 9) Pâtes, sauce pesto maison
10) Pâtes, sauce tomate maison, parmesan, basilic


In [19]:
# model=1 ranks with the TF-IDF similarity matrix (cosine_sim).
get_recommendations('Pizza Margherita', model=1)

Similar recipes: 
----------------
 1) Pizza jambon mozzarella et asperges
 2) Lasagnes
 3) Pizza sauce tomate, mozzarella, jambon blanc, basilic
 4) Poivrons marinés
 5) Pizza sauce tomate maison, mozzarella
 6) Pizza sauce tomate, mozzarella, origan
 7) Pizza sauce tomate, mozzarella, jambon blanc
 8) Pizza sauce tomate, mozzarella, jambon de parme, basilic
 9) Spaghettis à la bolognaise
10) Potimarron au four


In [20]:
# model=2 ranks with the count-based similarity matrix (cosine_sim2).
get_recommendations('Pizza Margherita', model=2)

Similar recipes: 
----------------
 1) Pizza jambon mozzarella et asperges
 2) Bruschetta tomates mozzarella
 3) Salade de tomates mozzarella au vinaigre balsamic
 4) Lasagnes
 5) Croque-monsieur à la mozzarella
 6) Pizza au jambon de Parme
 7) Roulé de jambon à la tomate
 8) Sandwich Romain
 9) Pizza sauce tomate, mozzarella, origan
10) Pommes de terre à l'italienne
