## Food Mama project: 

In [1]:
import pandas as pd
import numpy as np
import os, io, re
import mama
import matplotlib.pyplot as plt
%matplotlib inline

### Loading the Datasets into Pandas dataframe

In [2]:
path = "datasets/"
files = os.listdir(path)

dfs = []
for file in files:
    if file.endswith('.csv'):
        filename = file.split("_")
        dfs.append(filename[0])
        exec(dfs[-1] + '= pd.read_csv(path + file)')

print("Name of the dataframes created: ", *dfs, sep="\n")

Name of the dataframes created: 
category
food
item
recipe


### Run `preprocessing` module to modify and to clean the dataframes (drop unnecessary columns, rename columns, replace index with "id", fill the missing data, and change data type).

In [3]:
dataframes = [food, item, recipe]
foods, items, recipes = mama.preprocessing(dataframes)
recipes.head()

Unnamed: 0,recipe_id,title,servings,ingredients,instructions,created_at,updated_at,origin,link,rating
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",2017-12-14 14:56:37.166524,2018-10-05 09:16:42.390163,www.marmiton.org,http://www.marmiton.org/recettes/recette_salad...,limit
1,10,Ciabattina al pesto,1,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,2017-12-14 14:56:46.270433,2018-10-05 09:16:42.45289,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit
2,402,Tortilla aux champignons et salade,4,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,2017-12-15 16:41:37.056079,2018-10-05 09:16:42.505825,www.mangerbouger.fr,http://www.mangerbouger.fr/Manger-Mieux/Recett...,limit
3,2,Steak haché et pâtes,1,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",2017-12-13 16:17:24.125137,2018-10-05 09:16:42.55582,www.wecook.fr,https://www.wecook.fr/recette/steak-hache-et-p...,good
4,4,Beef Bagel,1,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,2017-12-13 17:13:29.069001,2018-10-05 09:16:42.597349,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit


## Text Cleaning

In [4]:
recipes_text = recipes.drop(['servings', 'created_at', 'updated_at', 'link', 'rating'], axis=1)

In [5]:
recipes_text.instructions.head()

0    Après avoir coupé en dés la mozzarella, couper...
1    Ouvrir le pain Ciabattina en deux et tartiner ...
2    Préchauffer le four à 180 °C (th. 6).\r\nCoupe...
3    Dans une casserole, portez à ébullition un gra...
4    Placer les tranches de carpaccio dans un plat ...
Name: instructions, dtype: object

In [6]:
filename = "lucene_stopwords.txt"
with io.open(filename, 'r', encoding='utf-8') as f:
    lucene = f.read()

import nltk
# List of all French stop words such as 'le', 'au'
nltk_stopwords = nltk.corpus.stopwords.words('french')
lucene_stopwords = [w for w in lucene.split(",")]
stop_words = nltk_stopwords + lucene_stopwords 

stemmer = nltk.stem.SnowballStemmer('french')

In [7]:
from bs4 import BeautifulSoup
import unicodedata

def clean_text(text):
    # remove html stuff and to lower case (from_encoding='utf-8')
    txt = BeautifulSoup(text).get_text().lower() 
    # special characters
    txt = txt.replace('œ','oe').replace('æ','ae').replace('ᴁ','AE').replace('Œ','OE')
    # remove accent 
    txt = unicodedata.normalize('NFD', txt).encode('ascii', 'ignore').decode("utf-8")
    # remove non alphanumeric char (or without number r'[^a-z_]')
    txt = re.sub(r'\W+', ' ', txt)
    # remove french stop words (extra: and (len(w)>2))
    tokens = [w for w in txt.split() if (w not in stop_words)] 
    # french stemming
    #tokens = [stemmer.stem(token) for token in tokens]
    
    return ' '.join(tokens)

#import html
#print(html.unescape('&pound;682m'))

## `TfIdfVectorizer` class that produces the TF-IDF Matrix

- Compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each recipe

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(df, columns):
    text_array = np.repeat("", len(df)).astype("object")
    for c in columns:
        text_array += df[c].values + " "
    
    for index, item in enumerate(text_array):
        text_array[index] = clean_text(item)
        
    # Define a TF-IDF Vectorizer Object  
    #tfidf_vec = TfidfVectorizer(stop_words=None)
    ## (bi-grams)
    tfidf_vec = TfidfVectorizer(min_df=1,stop_words=None,smooth_idf=True,norm='l2',sublinear_tf=True,use_idf=True,ngram_range=(1,2)) 
    
    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf_vec.fit_transform(text_array)
    
    return tfidf_matrix

In [9]:
# Call functions to generate TF-IDF matrix
columns = ('title', 'instructions')
tfidf_matrix = vectorize(recipes_text, columns)
print(tfidf_matrix.shape)

# Construct a reverse map of indices and recipe titles
indices = pd.Series(recipes_text.index, index = recipes_text['title'])

(1234, 21991)


- TF-IDF vectorizer calculate the dot product that will directly give the cosine similarity score

In [10]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

## Recommendations

In [11]:
# Function that takes in recipe title as input and outputs most similar recipes
def get_recommendations(title, model=cosine_sim):
    # Get the index of the recipe that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all recipes with that title
    sim_scores = list(enumerate(model[idx]))

    # Sort the recipes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar recipes
    top_10 = sim_scores[1:11]

    # Get the recipe indices
    top_indices = [i[0] for i in top_10]

    # Return the top 10 most similar recipes
    return recipes_text['title'].iloc[top_indices]

In [12]:
get_recommendations('Pizza Margherita')

429                   Pizza jambon mozzarella et asperges
404                            Spaghettis à la bolognaise
431                                  Quiche multi légumes
861                                              Lasagnes
274     Pizza sauce tomate, mozzarella, jambon blanc, ...
631                 Pizza sauce tomate maison, mozzarella
161                                    Potimarron au four
1010               Pizza sauce tomate, mozzarella, origan
503                                     Gratin dauphinois
290     Pizza sauce tomate, mozzarella, jambon de parm...
Name: title, dtype: object