## Data Preprocessing for the Food Mama project

In [1]:
import pandas as pd
import os

### Loading the datasets into pandas DataFrames

In [12]:
# Load every dataset file found in `path` into its own dataframe.
# File names are expected to look like "<table>_<suffix>"; the part before the
# first "_" becomes the name of the notebook-level variable (foods, items, ...).
path = "datasets/"
# Sort for a deterministic load order (os.listdir() order is arbitrary) and skip
# hidden entries and sub-directories such as ".ipynb_checkpoints" or ".DS_Store".
files = sorted(
    entry
    for entry in os.listdir(path)
    if not entry.startswith(".") and os.path.isfile(os.path.join(path, entry))
)

dfs = []
for file in files:
    table_name = file.split("_")[0]
    if not table_name.isidentifier():
        raise ValueError("Cannot derive a variable name from file: " + file)
    dfs.append(table_name)
    # Bind the dataframe to a notebook-level name without building and exec-ing code strings.
    globals()[table_name] = pd.read_csv(os.path.join(path, file))

print("Name of the dataframes created: ", *dfs, sep="\n")

Name of the dataframes created: 
foods
items
recipes


### 1) *foods* table: 

In [13]:
# Overview of the raw foods table: column dtypes and non-null counts
foods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 7 columns):
id              564 non-null int64
name            564 non-null object
created_at      564 non-null object
updated_at      564 non-null object
availability    563 non-null object
category_id     563 non-null float64
ancestry        159 non-null object
dtypes: float64(1), int64(1), object(5)
memory usage: 30.9+ KB


- Create a copy of the table, then drop the unnecessary columns, use "id" as the index, and try to fill in the missing data.

In [14]:
# Keep an untouched copy of the raw table, then work on a trimmed, id-indexed version
foods_copy = foods.copy()
foods = foods.drop(["created_at", "updated_at"], axis=1).set_index("id")
foods.sort_index().head()

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,huile d'olive,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",36.0,
2,steak haché,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",27.0,509.0
3,pâtes,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",20.0,
4,semoule,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",21.0,
5,tomate,"05, 06, 07, 08, 09",14.0,


In [15]:
# Rows of the foods table that have no availability information
foods.loc[foods["availability"].isnull()]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
53,pomme de terre,,14.0,


In [19]:
# Look at the children of the food with missing availability
# (children reference their parent id, as a string, in the "ancestry" column)
ancestry_id = foods.index[foods["availability"].isnull()]
foods.loc[foods["ancestry"] == str(ancestry_id[0])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
180,purée de pommes de terre,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",47.0,53
243,pomme de terre nouvelle,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,53
274,pomme de terre rouge,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,53
157,pommes de terre rissolées,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",47.0,53
521,chips,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",19.0,53


In [24]:
# Borrow the availability of the first child for the parent that has none
child_ids = foods.index[foods["ancestry"] == str(ancestry_id[0])]
foods.loc[ancestry_id[0], "availability"] = foods.loc[child_ids[0], "availability"]
foods.loc[ancestry_id[0]]

name                                            pomme de terre
availability    01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12
category_id                                                 14
ancestry                                                   NaN
Name: 53, dtype: object

In [25]:
# Rows of the foods table that have no category_id
foods.loc[foods["category_id"].isnull()]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
288,couscous,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",,


In [26]:
# Look for other foods whose name contains "couscous" (plain substring match)
foods[foods["name"].str.contains("couscous", regex=False)]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
288,couscous,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",,


In [27]:
# As no similar food was found in the table, use a category_id of 0 as the "no category" marker.
# Assign through the whole index of missing rows (not id_num[0]) so the cell is a
# no-op instead of an IndexError when it is re-run after the value has been filled.
id_num = foods[foods.category_id.isnull()].index
foods.loc[id_num, "category_id"] = 0

- Check the different values of the *ancestry* column and try to minimize the number of "NaN" values.

In [29]:
# Frequency of every ancestry value, missing values included
foods["ancestry"].value_counts(dropna=False)

NaN        405
3           14
5           12
509         10
327          9
508          8
96           8
510          8
86           8
237          7
21           6
53           5
510/404      5
317          5
308          5
511          4
349          4
269          3
7            3
519          3
516          3
198          2
194          2
487          2
51           2
334          2
84           2
510/512      2
36           2
409          1
508/422      1
247          1
259          1
43           1
508/277      1
503          1
19           1
152          1
262          1
506          1
174          1
66           1
Name: ancestry, dtype: int64

In [30]:
# Foods whose ancestry is the two-level path "510/404"
foods.loc[foods["ancestry"] == "510/404"]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
187,saucisse fumée,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/404
402,saucisse de Strasbourg,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/404
407,chipolata,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/404
200,saucisse de Francfort,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/404
186,saucisse de porc,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/404


In [31]:
# Inspect the two rows referenced by the "510/404" path
foods[foods.index.isin([510, 404])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
510,porc,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",27.0,
404,saucisse,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510.0


In [47]:
# Replace the "parent/child" path with its second id, i.e. the subcategory id
is_path_510_404 = foods["ancestry"] == "510/404"
sub_cat = foods.loc[is_path_510_404, "ancestry"]
foods.loc[sub_cat.index, "ancestry"] = sub_cat.iloc[0].split("/")[1]

- Repeat the same procedure for the other values.

In [51]:
# Foods whose ancestry is the two-level path "510/512"
foods.loc[foods["ancestry"] == "510/512"]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
211,saucisse sèche,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/512
361,salami,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510/512


In [52]:
# Inspect the two rows referenced by the "510/512" path
foods[foods.index.isin([510, 512])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
510,porc,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",27.0,
512,saucisson,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",23.0,510.0


In [53]:
# Replace the "510/512" path with the subcategory id 512.
# Rows are selected by the value being fixed rather than by hard-coded row ids,
# so the cell cannot silently edit unrelated rows if the dataset changes.
foods.loc[foods["ancestry"] == "510/512", "ancestry"] = "512"

In [54]:
# Foods whose ancestry is one of the two-level paths starting at 508
foods[foods["ancestry"].isin(["508/277", "508/422"])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
560,chou frisé,"01, 02, 03, 10, 11, 12",14.0,508/277
526,chou vert,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,508/422


In [55]:
# Inspect the three rows referenced by those paths
foods[foods.index.isin([508, 277, 422])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
422,chou pommé,"01, 02, 03, 04, 10, 11, 12",14.0,508.0
277,chou kale,"01, 02, 03, 10, 11, 12",14.0,508.0
508,chou,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,


In [56]:
# Foods that directly reference 508, 277 or 422 as their ancestry
foods[foods["ancestry"].isin(["508", "277", "422"])]

Unnamed: 0_level_0,name,availability,category_id,ancestry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
156,chou-fleur,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,508
315,chou chinois,"01, 02, 05, 06, 07, 08, 09, 10, 11, 12",14.0,508
287,chou blanc,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,508
422,chou pommé,"01, 02, 03, 04, 10, 11, 12",14.0,508
277,chou kale,"01, 02, 03, 10, 11, 12",14.0,508
420,chou rouge,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12",14.0,508
419,chou de Bruxelles,"01, 02, 03, 09, 10, 11, 12",14.0,508
421,chou romanesco,"06, 07, 08, 09",14.0,508


In [57]:
# Attach the rows with a "508/277" or "508/422" path directly to 508.
# Rows are selected by the value being fixed rather than by hard-coded row ids.
# NOTE(review): for "510/404" and "510/512" the second id of the path was kept,
# whereas here the first id (508) is kept; confirm this difference is intended.
foods.loc[foods["ancestry"].isin(["508/277", "508/422"]), "ancestry"] = "508"

- Replace the "NaN" value in the *ancestry* column with "0" (marking a root position) for every food that has at least one child in the table.

In [58]:
# A food with no ancestry that is referenced as the ancestry of at least one
# other row is the root of a hierarchy: mark it with "0".
id_list = foods[foods.ancestry.isnull()].index
# Collect the referenced parent ids once instead of rescanning the whole
# column for every candidate row.
parent_ids = set(foods.ancestry.dropna())
# Ancestry values are strings, so compare against the string form of the ids.
is_root = foods.ancestry.isnull() & foods.index.astype(str).isin(parent_ids)
foods.loc[is_root, "ancestry"] = "0"

foods.ancestry.value_counts(normalize=True, dropna=False)

NaN    0.652482
0      0.065603
3      0.024823
5      0.021277
509    0.017730
508    0.017730
327    0.015957
86     0.014184
510    0.014184
96     0.014184
237    0.012411
21     0.010638
317    0.008865
404    0.008865
308    0.008865
53     0.008865
349    0.007092
511    0.007092
269    0.005319
7      0.005319
519    0.005319
516    0.005319
84     0.003546
487    0.003546
194    0.003546
51     0.003546
334    0.003546
198    0.003546
512    0.003546
36     0.003546
409    0.001773
247    0.001773
259    0.001773
43     0.001773
66     0.001773
19     0.001773
152    0.001773
262    0.001773
174    0.001773
506    0.001773
503    0.001773
Name: ancestry, dtype: float64

### 2) *recipes* table: 

In [59]:
# Overview of the raw recipes table: column dtypes and non-null counts
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 12 columns):
id               1234 non-null int64
title            1234 non-null object
servings         1234 non-null int64
ingredients      1234 non-null object
instructions     1234 non-null object
created_at       1234 non-null object
updated_at       1234 non-null object
recommendable    23 non-null object
status           1234 non-null object
origin           1234 non-null object
link             753 non-null object
rating           1214 non-null object
dtypes: int64(2), object(10)
memory usage: 115.8+ KB


- Create a copy of the table, then drop the unnecessary columns, use "id" as the index, and try to fill in the missing data.

In [60]:
# Keep an untouched copy of the raw table, then work on a trimmed, id-indexed version
recipes_copy = recipes.copy()
recipes = recipes.drop(["recommendable", "status"], axis=1).set_index("id")
recipes.head()

Unnamed: 0_level_0,title,servings,ingredients,instructions,created_at,updated_at,origin,link,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9,"Salade mâche, jambon de Bayonne, mozzarella",4,4 bonne poignée de mâche \r\n40 tomate cerise ...,"Après avoir coupé en dés la mozzarella, couper...",2017-12-14 14:56:37.166524,2018-10-05 09:16:42.390163,www.marmiton.org,http://www.marmiton.org/recettes/recette_salad...,limit
10,Ciabattina al pesto,1,Ciabattina\r\nJambon de Parme\r\nTomates confi...,Ouvrir le pain Ciabattina en deux et tartiner ...,2017-12-14 14:56:46.270433,2018-10-05 09:16:42.45289,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit
402,Tortilla aux champignons et salade,4,250 g de champignons de Paris\r\r\n4 oeufs\r\r...,Préchauffer le four à 180 °C (th. 6).\r\nCoupe...,2017-12-15 16:41:37.056079,2018-10-05 09:16:42.505825,www.mangerbouger.fr,http://www.mangerbouger.fr/Manger-Mieux/Recett...,limit
2,Steak haché et pâtes,1,2.0 filets huile d'olive\r\n1.0 steak haché\...,"Dans une casserole, portez à ébullition un gra...",2017-12-13 16:17:24.125137,2018-10-05 09:16:42.55582,www.wecook.fr,https://www.wecook.fr/recette/steak-hache-et-p...,good
4,Beef Bagel,1,Pain Bagel\r\nCarpaccio de boeuf\r\nFromage fr...,Placer les tranches de carpaccio dans un plat ...,2017-12-13 17:13:29.069001,2018-10-05 09:16:42.597349,www.club-sandwich.net,http://www.club-sandwich.net/mobile/fiche.php?...,limit


In [61]:
# Which origins do the recipes without a link come from?
origin_list = recipes.loc[recipes["link"].isnull(), "origin"]
origin_list.value_counts()

mama    481
Name: origin, dtype: int64

In [62]:
# Add the Mama web link for the recipes whose "link" is missing:
# the URL is the site's recipes path followed by the recipe id (the index).
foodmama_path = "https://www.foodmama.fr/recipes/"
index_list = recipes[recipes.link.isnull()].index
# Build the candidate link for every row and let fillna pick it up only where
# the link is missing: one aligned assignment instead of one .loc write per row.
mama_links = pd.Series(
    [foodmama_path + str(recipe_id) for recipe_id in recipes.index],
    index=recipes.index,
)
recipes["link"] = recipes["link"].fillna(mama_links)

# Check the first 10 lines
recipes.loc[index_list[:10], ["link"]]

Unnamed: 0_level_0,link
id,Unnamed: 1_level_1
905,https://www.foodmama.fr/recipes/905
1063,https://www.foodmama.fr/recipes/1063
1103,https://www.foodmama.fr/recipes/1103
1118,https://www.foodmama.fr/recipes/1118
1099,https://www.foodmama.fr/recipes/1099
906,https://www.foodmama.fr/recipes/906
774,https://www.foodmama.fr/recipes/774
908,https://www.foodmama.fr/recipes/908
776,https://www.foodmama.fr/recipes/776
777,https://www.foodmama.fr/recipes/777


In [63]:
# Frequency of every rating category, missing values included
recipes["rating"].value_counts(dropna=False)

limit        506
good         487
excellent    171
avoid         50
NaN           20
Name: rating, dtype: int64

In [64]:
# Label the recipes that have no rating as "unknown"
rating_list = recipes.index[recipes["rating"].isnull()]
recipes.loc[rating_list, "rating"] = "unknown"
recipes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1234 entries, 9 to 1241
Data columns (total 9 columns):
title           1234 non-null object
servings        1234 non-null int64
ingredients     1234 non-null object
instructions    1234 non-null object
created_at      1234 non-null object
updated_at      1234 non-null object
origin          1234 non-null object
link            1234 non-null object
rating          1234 non-null object
dtypes: int64(1), object(8)
memory usage: 136.4+ KB


### 3) *items* table: 

In [65]:
# Overview of the raw items table: column dtypes and non-null counts
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8464 entries, 0 to 8463
Data columns (total 7 columns):
Id                      8464 non-null int64
Quantity                8464 non-null object
Recipe ingredient       8464 non-null object
Name [Food]             8464 non-null object
Title [Recipe]          8464 non-null object
Ingredients [Recipe]    8464 non-null object
Origin [Recipe]         8464 non-null object
dtypes: int64(1), object(6)
memory usage: 463.0+ KB


- Create a copy of the table, then drop the unnecessary columns, rename the remaining columns, and use "id" as the index.

In [67]:
# Keep an untouched copy of the raw table, then drop "Quantity", rename the
# remaining columns and index the table by "id".
items_copy = items.copy()
# Rename through an explicit old -> new mapping rather than by position, so a
# change in the CSV column order cannot silently mislabel the columns.
column_names = {
    "Id": "id",
    "Recipe ingredient": "recipe_ingredient",
    "Name [Food]": "name-foods",
    "Title [Recipe]": "title-recipes",
    "Ingredients [Recipe]": "ingredients-recipes",
    "Origin [Recipe]": "origin-recipes",
}
# rename() ignores unknown keys, so fail loudly if an expected column is absent.
missing_columns = [col for col in column_names if col not in items.columns]
if missing_columns:
    raise KeyError("Columns missing from the items table: {}".format(missing_columns))
items = (
    items.drop(["Quantity"], axis=1)
    .rename(columns=column_names)
    .set_index("id")
)
items.head()

Unnamed: 0_level_0,recipe_ingredient,name-foods,title-recipes,ingredients-recipes,origin-recipes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.0 filets huile d'olive,huile d'olive,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
2,1.0 steak haché,steak haché,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
3,70.0 grammes pâtes,pâtes,Steak haché et pâtes,2.0 filets huile d'olive\r\n1.0 steak haché\...,www.wecook.fr
4,500 g de semoule moyenne,semoule,Taboulé ultra-facile,500 g de semoule moyenne \r\n500 g de tomate e...,www.marmiton.org
5,500 g de tomate environ (ébouillantées pour ot...,tomate,Taboulé ultra-facile,500 g de semoule moyenne \r\n500 g de tomate e...,www.marmiton.org
