## Create a `metadata` dataframe for the Food Mama project

In [1]:
import pandas as pd
import mama
import os
import matplotlib.pyplot as plt
%matplotlib inline

### Loading the datasets into pandas dataframes

In [2]:
# Load every file of the dataset directory into its own notebook-level
# DataFrame, named after the part of the file name before the first "_".
path = "datasets/"

# os.listdir() returns entries in arbitrary order and also lists directories
# and hidden files (e.g. ".ipynb_checkpoints", ".DS_Store"); keep only regular,
# non-hidden files and sort them so the load order is reproducible.
files = sorted(
    entry for entry in os.listdir(path)
    if not entry.startswith(".") and os.path.isfile(os.path.join(path, entry))
)

dfs = []
for file in files:
    # Drop the extension first so a file without "_" still yields a clean name.
    name = os.path.splitext(file)[0].split("_")[0]
    if not name.isidentifier():
        raise ValueError("Cannot derive a variable name from file: " + file)
    dfs.append(name)
    # Bind the frame directly in the notebook namespace instead of building a
    # code string for exec().
    globals()[name] = pd.read_csv(os.path.join(path, file))

print("Name of the dataframes created: ", *dfs, sep="\n")

Name of the dataframes created: 
category
food
item
recipe


In [3]:
# Preview the first rows of the raw `recipe` frame created by the loading cell.
recipe.head()

Unnamed: 0,id,title,servings,ingredients,instructions,created_at,updated_at,recommendable,status,origin,link,rating
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",2017-12-14 14:56:37.166524,2018-10-05 09:16:42.390163,,dismissed,www.marmiton.org,http://www.marmiton.org/recettes/recette_salad...,limit
1,10,Ciabattina al pesto,1,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,2017-12-14 14:56:46.270433,2018-10-05 09:16:42.45289,,dismissed,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit
2,402,Tortilla aux champignons et salade,4,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,2017-12-15 16:41:37.056079,2018-10-05 09:16:42.505825,,dismissed,www.mangerbouger.fr,http://www.mangerbouger.fr/Manger-Mieux/Recett...,limit
3,2,Steak haché et pâtes,1,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",2017-12-13 16:17:24.125137,2018-10-05 09:16:42.55582,,dismissed,www.wecook.fr,https://www.wecook.fr/recette/steak-hache-et-p...,good
4,4,Beef Bagel,1,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,2017-12-13 17:13:29.069001,2018-10-05 09:16:42.597349,,dismissed,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit


### Run the `mama.preprocessing` function to clean the dataframes (drop unnecessary columns, rename columns, replace the index with `id`, fill missing data, and change data types).

In [4]:
# Clean the raw frames with the project-local `mama.preprocessing` helper.
# Only food, item and recipe are passed in; the `category` frame is not used here.
# The helper's result is unpacked into three frames
# (presumably the cleaned food, item and recipe tables, in input order -- confirm in mama).
dataframes = [food, item, recipe]
foods, items, recipes = mama.preprocessing(dataframes)
# Show the first rows of the processed item table.
items.head()

Unnamed: 0_level_0,recipe-ingredient,name_food,title_recipe,ingredients_recipe,origin_recipe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.0 filets huile d'olive,huile d'olive,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
2,1.0 steak haché,steak haché,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
3,70.0 grammes pâtes,pâtes,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
4,500 g de semoule moyenne,semoule,Taboulé ultra-facile,500 g de semoule moyenne \r\n500 g de tomate e...,www.marmiton.org
5,500 g de tomate environ (ébouillantées pour ot...,tomate,Taboulé ultra-facile,500 g de semoule moyenne \r\n500 g de tomate e...,www.marmiton.org


### Merging tables to create a meta_data table

In [5]:
# Attach every item (ingredient line) to its recipe by matching the recipe
# title, then keep only the columns needed downstream.
# NOTE(review): the join key is the title, not an id -- recipes sharing a title
# would be cross-joined; confirm titles are unique.
recipe_item_columns = ['recipe_id', 'title', 'servings', 'origin', 'rating', 'name_food']
merge_table = recipes.merge(
    items,
    left_on='title',
    right_on='title_recipe',
)[recipe_item_columns]
merge_table.head(10)

Unnamed: 0,recipe_id,title,servings,origin,rating,name_food
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,mâche
1,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,tomate cerise
2,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,mozzarella
3,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,jambon cru
4,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,croutons
5,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,vinaigrette
6,10,Ciabattina al pesto,1,www.club-sandwich.net,limit,pain ciabattina
7,10,Ciabattina al pesto,1,www.club-sandwich.net,limit,jambon de Parme
8,10,Ciabattina al pesto,1,www.club-sandwich.net,limit,tomate confite
9,10,Ciabattina al pesto,1,www.club-sandwich.net,limit,pesto


In [6]:
# Enrich each recipe/ingredient row with the food attributes by matching the
# ingredient name against the food name (inner join, the pandas default).
meta_data = merge_table.merge(
    foods,
    left_on='name_food',
    right_on='name',
)
meta_data.head()

Unnamed: 0,recipe_id,title,servings,origin,rating,name_food,food_id,name,availability,category_id,ancestry
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,limit,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86
1,566,Sandwich Spécial Magy,1,www.club-sandwich.net,limit,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86
2,397,Salade de ravioles,2,www.mangerbouger.fr,good,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86
3,508,Sandwich Marcilhacy's,1,www.club-sandwich.net,limit,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86
4,564,Sandwich Rösti Burger,1,www.club-sandwich.net,limit,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86


### Convert string features to numeric categorical variables

#### Approach #1 - Find and Replace

In [7]:
# Map each rating label to an integer code; "unknown" becomes 0.
rating_cat = {
    "rating": {
        "unknown": 0,
        "avoid": 1,
        "limit": 2,
        "good": 3,
        "excellent": 4,
    }
}
# The nested dict restricts the replacement to the `rating` column.
meta_data = meta_data.replace(rating_cat)

#### Approach #2 - Use Scikit-Learn

In [8]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the recipe origins, then store the integer label of each
# row in a new `origin_id` column. `lb_make` keeps the fitted classes.
lb_make = LabelEncoder()
lb_make.fit(meta_data["origin"])
meta_data["origin_id"] = lb_make.transform(meta_data["origin"])
meta_data.head()

Unnamed: 0,recipe_id,title,servings,origin,rating,name_food,food_id,name,availability,category_id,ancestry,origin_id
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,2,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86,70
1,566,Sandwich Spécial Magy,1,www.club-sandwich.net,2,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86,67
2,397,Salade de ravioles,2,www.mangerbouger.fr,3,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86,69
3,508,Sandwich Marcilhacy's,1,www.club-sandwich.net,2,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86,67
4,564,Sandwich Rösti Burger,1,www.club-sandwich.net,2,mâche,42,mâche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14,86,67


#### Approach #3 - Ordered Label Encoding

- First, convert the comma-separated month *string* in the `availability` column into a *list* of integers

In [9]:
# Parse every availability string (e.g. "01, 02, 03") into a list of ints
# ([1, 2, 3]) and store it in a new `availability_lst` column.
meta_data["availability_lst"] = [
    [int(token) for token in entry.replace(',', '').split(" ")]
    for entry in meta_data["availability"]
]

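- Second, find the months in which all ingredients of a recipe are available (the intersection of their availability lists), write the result back to the `availability` column, and store a compact key for it in a new `Cat` column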

In [10]:
def _recipe_availability(month_lists):
    """Return the months shared by all ingredients of one recipe.

    month_lists: iterable of integer month lists, one per ingredient row.
    Returns the common months rendered as "5, 6, 7", or '0' when the
    ingredients have no month in common.
    """
    common = set(range(1, 13))
    for ingredient_months in month_lists:
        common &= set(ingredient_months)
    # Set iteration order is an implementation detail; sort explicitly so the
    # rendered string is deterministic.
    return ", ".join(str(month) for month in sorted(common)) if common else '0'


# Compute the recipe-level availability once per recipe_id group instead of
# re-masking the whole frame for every recipe.
availability_by_recipe = {
    recipe_id: _recipe_availability(month_lists)
    for recipe_id, month_lists in meta_data.groupby("recipe_id")["availability_lst"]
}

# Overwrite the per-ingredient availability with the recipe-level one; rows
# whose recipe_id has no group keep their previous value.
meta_data["availability"] = (
    meta_data["recipe_id"].map(availability_by_recipe).fillna(meta_data["availability"])
)

# Compact key used by the following cells to build the ordered encoding.
# NOTE(review): removing the separators is ambiguous -- "1, 2" and "12" both
# become "12" -- so distinct month sets can share a category. Left unchanged
# here because fixing it would renumber `availability_cat`.
meta_data['Cat'] = [months.replace(',', '').replace(' ', '') for months in meta_data["availability"]]

- Then, sort the values and create the ordered encoding in a separate dataframe

In [11]:
# Distinct `Cat` keys ordered by their integer value.
c = sorted(set(meta_data["Cat"]), key=int)
# Two-column lookup table: tag_0 is the rank of the key, tag_1 the key itself.
encode_df = pd.DataFrame(list(enumerate(c))).add_prefix('tag_')

- And finally merge the tables and rename/delete extra columns

In [12]:
# Look up the rank of each row's `Cat` key, expose it as `availability_cat`,
# and drop the helper columns used for the join.
meta_data = (
    meta_data
    .merge(encode_df, left_on='Cat', right_on='tag_1')
    .rename(columns={'tag_0': 'availability_cat'})
    .drop(['Cat', 'tag_1'], axis=1)
)
meta_data.head()

Unnamed: 0,recipe_id,title,servings,origin,rating,name_food,food_id,name,availability,category_id,ancestry,origin_id,availability_lst,availability_cat
0,9,"Salade mâche, jambon de Bayonne, mozzarella",4,www.marmiton.org,2,mâche,42,mâche,"5, 6, 7, 8, 9",14,86,70,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",18
1,566,Sandwich Spécial Magy,1,www.club-sandwich.net,2,mâche,42,mâche,"5, 6, 7, 8, 9",14,86,67,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",18
2,551,Bruschetta Mâche/Parmesan,1,www.club-sandwich.net,3,mâche,42,mâche,"5, 6, 7, 8, 9",14,86,67,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",18
3,1154,"Frites au four maison, guacamole, concombre, t...",1,simplyfitsociety,4,mâche,42,mâche,"5, 6, 7, 8, 9",14,86,58,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",18
4,1198,"Mâche, avocat, tomate cerise, graines de sésam...",1,faismoifondre,2,mâche,42,mâche,"5, 6, 7, 8, 9",14,86,27,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]",18


### Preparing a dataframe `cluster_data` for Unsupervised learning models

- Remove seasoning ingredients by their category_ids (sauces, épices, sel, sucres, beurre, jus, alcools, levure, eau).

In [13]:
# category_id values treated as seasonings and excluded from clustering.
seasoning_categories = [18, 34, 37, 38, 39, 41, 42, 44, 45]
# Vectorised membership test instead of a per-row Python lambda: keep the rows
# whose category_id is NOT one of the seasoning categories.
cluster_data = meta_data[~meta_data['category_id'].isin(seasoning_categories)]

- Rearrange dataframe columns

In [14]:
# Keep the modelling columns in the desired order, then give the recipe and
# ingredient columns more explicit names.
cluster_columns = [
    'title', 'recipe_id', 'origin_id', 'servings', 'rating',
    'food_id', 'category_id', 'availability', 'availability_cat', 'ancestry',
]
cluster_data = cluster_data[cluster_columns].rename(
    columns={
        'title': 'recipe_title',
        'food_id': 'ingredient_id',
        'category_id': 'ingredient_cat',
    }
)
cluster_data.head()

Unnamed: 0,recipe_title,recipe_id,origin_id,servings,rating,ingredient_id,ingredient_cat,availability,availability_cat,ancestry
0,"Salade mâche, jambon de Bayonne, mozzarella",9,70,4,2,42,14,"5, 6, 7, 8, 9",18,86
1,Sandwich Spécial Magy,566,67,1,2,42,14,"5, 6, 7, 8, 9",18,86
2,Bruschetta Mâche/Parmesan,551,67,1,3,42,14,"5, 6, 7, 8, 9",18,86
3,"Frites au four maison, guacamole, concombre, t...",1154,58,1,4,42,14,"5, 6, 7, 8, 9",18,86
4,"Mâche, avocat, tomate cerise, graines de sésam...",1198,27,1,2,42,14,"5, 6, 7, 8, 9",18,86


In [15]:
# Check the dtypes and non-null counts of the clustering frame.
cluster_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6101 entries, 0 to 8456
Data columns (total 10 columns):
recipe_title        6101 non-null object
recipe_id           6101 non-null int64
origin_id           6101 non-null int64
servings            6101 non-null int64
rating              6101 non-null int64
ingredient_id       6101 non-null int64
ingredient_cat      6101 non-null int64
availability        6101 non-null object
availability_cat    6101 non-null int64
ancestry            6101 non-null int64
dtypes: int64(8), object(2)
memory usage: 524.3+ KB


In [16]:
# Summary statistics for the numeric columns of the clustering frame.
cluster_data.describe()

Unnamed: 0,recipe_id,origin_id,servings,rating,ingredient_id,ingredient_cat,availability_cat,ancestry
count,6101.0,6101.0,6101.0,6101.0,6101.0,6101.0,6101.0,6101.0
mean,662.455827,54.679725,1.913457,2.54696,114.584658,20.985084,28.292083,46.61285
std,356.578337,16.05314,1.479162,0.819616,132.915569,8.027122,13.497987,124.062566
min,2.0,0.0,1.0,0.0,1.0,11.0,0.0,0.0
25%,370.0,42.0,1.0,2.0,21.0,14.0,18.0,0.0
50%,672.0,61.0,1.0,2.0,64.0,17.0,28.0,1.0
75%,986.0,69.0,4.0,3.0,161.0,27.0,43.0,1.0
max,1243.0,73.0,8.0,4.0,597.0,47.0,43.0,519.0
