In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
DATA_FOLDER = 'data'

In [3]:
dinner_nutrients_df = pd.read_csv('dinner_nutrient.csv')
dinner_ingredients_df = pd.read_csv('dinner_ingredient.csv')

## What we need to do

**Cleaning:**
1. Make quantities of nutrients into integers (while respecting units)
2. Create vegetarian dataset by removing all dishes containing meat or fish-related ingredients (list of words found in the corresponding text documents under /data). If a meat/fish is in ingredients, delete the corresponding recipe from the nutrient dataset.

**Analysis:**
1. Plot histograms of the quantity of a given nutrient thoughout the vegetarian recipes
2. Calculate median and mode (= most common value)
3. Compare these with the FDA recommendation. NOTE this needs to be per serving - verify
4. Calculate median and mode from omnivorous (original) dataset. For large differences with the vegetarian data, also view histogram. Compare together and with FDA.
5. For vegetarian histograms with large discrepencies (some recipes with high levels of nutrient, others with low levels), cross-reference ingredients to see what may be responsible for providing the nutrient.

**Visualization of results:**
1. It would be good to plot the FDA recommendation on the histogram, also median and mode

### Lets explore the data and do some cleaning

In [4]:
dinner_nutrients_df.head(20)

Unnamed: 0.1,Unnamed: 0,URL,amount,nutrient,recipe
0,0,https://www.allrecipes.com/recipe/222607/smoth...,20.1g,Total Fat,Smothered Chicken Breasts
1,1,https://www.allrecipes.com/recipe/222607/smoth...,8.0g,Saturated Fat,Smothered Chicken Breasts
2,2,https://www.allrecipes.com/recipe/222607/smoth...,124mg,Cholesterol,Smothered Chicken Breasts
3,3,https://www.allrecipes.com/recipe/222607/smoth...,809mg,Sodium,Smothered Chicken Breasts
4,4,https://www.allrecipes.com/recipe/222607/smoth...,377mg,Potassium,Smothered Chicken Breasts
5,5,https://www.allrecipes.com/recipe/222607/smoth...,16.9g,Total Carbohydrates,Smothered Chicken Breasts
6,6,https://www.allrecipes.com/recipe/222607/smoth...,0.5g,Dietary Fiber,Smothered Chicken Breasts
7,7,https://www.allrecipes.com/recipe/222607/smoth...,44g,Protein,Smothered Chicken Breasts
8,8,https://www.allrecipes.com/recipe/222607/smoth...,15g,Sugars,Smothered Chicken Breasts
9,9,https://www.allrecipes.com/recipe/222607/smoth...,29IU,Vitamin A,Smothered Chicken Breasts


In [5]:
dinner_ingredients_df.head(20)

Unnamed: 0.1,Unnamed: 0,URL,ingredient
0,0,https://www.allrecipes.com/recipe/222607/smoth...,"4 (6 ounce) skinless, boneless chicken breast ..."
1,1,https://www.allrecipes.com/recipe/222607/smoth...,1/4 teaspoon salt
2,2,https://www.allrecipes.com/recipe/222607/smoth...,1/4 teaspoon lemon pepper seasoning
3,3,https://www.allrecipes.com/recipe/222607/smoth...,1 tablespoon vegetable oil
4,4,https://www.allrecipes.com/recipe/222607/smoth...,8 strips bacon
5,5,https://www.allrecipes.com/recipe/222607/smoth...,"1 onion, sliced"
6,6,https://www.allrecipes.com/recipe/222607/smoth...,1/4 cup packed brown sugar
7,7,https://www.allrecipes.com/recipe/222607/smoth...,1/2 cup shredded Colby-Monterey Jack cheese
8,0,https://www.allrecipes.com/recipe/15679/asian-...,3 tablespoons soy sauce
9,1,https://www.allrecipes.com/recipe/15679/asian-...,2 tablespoons rice wine


In [6]:
# Can use the ID of the recipes
recipe_id = dinner_nutrients_df["URL"].str.extract(r'https://www.allrecipes.com/recipe/(\d+)/.+')
dinner_nutrients_df["recipe_id"] = recipe_id
dinner_nutrients_df = dinner_nutrients_df.drop(["Unnamed: 0", "URL"], axis=1)
dinner_nutrients_df.head()

Unnamed: 0,amount,nutrient,recipe,recipe_id
0,20.1g,Total Fat,Smothered Chicken Breasts,222607
1,8.0g,Saturated Fat,Smothered Chicken Breasts,222607
2,124mg,Cholesterol,Smothered Chicken Breasts,222607
3,809mg,Sodium,Smothered Chicken Breasts,222607
4,377mg,Potassium,Smothered Chicken Breasts,222607


In [7]:
# Can use the ID of the recipes
recipe_id = dinner_ingredients_df["URL"].str.extract(r'https://www.allrecipes.com/recipe/(\d+)/.+')
dinner_ingredients_df["recipe_id"] = recipe_id
dinner_ingredients_df = dinner_ingredients_df.drop(["Unnamed: 0", "URL"], axis=1)
dinner_ingredients_df.head()

Unnamed: 0,ingredient,recipe_id
0,"4 (6 ounce) skinless, boneless chicken breast ...",222607
1,1/4 teaspoon salt,222607
2,1/4 teaspoon lemon pepper seasoning,222607
3,1 tablespoon vegetable oil,222607
4,8 strips bacon,222607


In [8]:
# let's create a dataframe that contain all the different recipie for the dinner with the corresponding name
dinner_recipe_df = dinner_nutrients_df[["recipe_id", "recipe"]].drop_duplicates()
dinner_recipe_df = dinner_recipe_df.set_index("recipe_id")
print("We have in total: " + str(dinner_recipe_df["recipe"].count()) + " different recipes for dinner")
dinner_recipe_df.head()

We have in total: 1128 different recipes for dinner


Unnamed: 0_level_0,recipe
recipe_id,Unnamed: 1_level_1
222607,Smothered Chicken Breasts
15679,Asian Beef with Snow Peas
23847,Pasta Pomodoro
50435,Fry Bread Tacos II
140829,Pork Marsala


In [9]:
print("We have: " + str(len(dinner_ingredients_df["recipe_id"].unique())) + " recipe in the ingredient dataframe.")

We have: 1152 recipe in the ingredient dataframe.


We can see that there are recipe that did not give the nutrient values. 

In [10]:
sodium = dinner_nutrients_df.where(dinner_nutrients_df['nutrient'] == "Sodium")

In [11]:
sodium = sodium.dropna()
sodium = sodium.reset_index(drop=True)
sodium.head()

Unnamed: 0,amount,nutrient,recipe,recipe_id
0,809mg,Sodium,Smothered Chicken Breasts,222607
1,711mg,Sodium,Asian Beef with Snow Peas,15679
2,350mg,Sodium,Pasta Pomodoro,23847
3,2255mg,Sodium,Fry Bread Tacos II,50435
4,356mg,Sodium,Pork Marsala,140829


The unit is the same for each value of a given nutrient, therefore we can drop this value with impunity (verification using Sodium since this value can vary greatly)

Vitamin A will be completely ignored since it is given in IU. This unit will no longer be valid from 2021. In addition, it cannot be converted into micrograms since this depends on its origin: retinol (pre-formed vitamin A) or beta-carotene (a precursor).

In [12]:
dinner_nutrients_df = dinner_nutrients_df[dinner_nutrients_df.nutrient != "Vitamin A"]

In [13]:
dinner_nutrients_df.head()

Unnamed: 0,amount,nutrient,recipe,recipe_id
0,20.1g,Total Fat,Smothered Chicken Breasts,222607
1,8.0g,Saturated Fat,Smothered Chicken Breasts,222607
2,124mg,Cholesterol,Smothered Chicken Breasts,222607
3,809mg,Sodium,Smothered Chicken Breasts,222607
4,377mg,Potassium,Smothered Chicken Breasts,222607


In [14]:
#Remove the letters m, c, g, I, U from the string nutrient and convert it to a float

dinner_nutrients_df.amount = dinner_nutrients_df.amount.apply(lambda x: re.sub('[mcgIU]','', x))
dinner_nutrients_df.amount = dinner_nutrients_df.amount.replace(to_replace='< 1', value='0')
dinner_nutrients_df.amount = dinner_nutrients_df.amount.astype(float)
dinner_nutrients_df.dtypes

amount       float64
nutrient      object
recipe        object
recipe_id     object
dtype: object

**Now our data is clean, we can create a vegetarian dataframe: **

In [15]:
meat_list = list(pd.read_csv('data/meats', header=None)[0])
fish_list = list(pd.read_csv('data/fish', header=None)[0])

In [46]:
# We have a list of different meat and fish that the vege do not eat
meat_list = list(pd.read_csv('data/meats', header=None)[0])
fish_list = list(pd.read_csv('data/fish', header=None)[0])

vege_recipe_df = dinner_recipe_df
for meat_fish in meat_list or fish_list:
    # TODO: DO NOT WORK !!!
    parse = '\w*'+meat_fish+'\w*'
    # find the meat in the ingredients and give the recipe id that have meat
    recipe_id_with_meat = dinner_ingredients_df[dinner_ingredients_df["ingredient"].str.contains(parse)]["recipe_id"].unique()
    # ignore errors because of the recipe that have ingredient but no nutrient value
    vege_recipe_df = vege_recipe_df.drop(recipe_id_with_meat, errors='ignore')

In [47]:
vege_recipe_df

Unnamed: 0_level_0,recipe
recipe_id,Unnamed: 1_level_1
87648,Fresh Tomato Shrimp Pasta
16066,Awesome Slow Cooker Pot Roast
51283,Maple Salmon
26257,"Bow Ties with Sausage, Tomatoes and Cream"
21694,Marinated Grilled Shrimp
25874,Shrimp Scampi Bake
16427,Slow Cooker Italian Beef for Sandwiches
39748,Actually Delicious Turkey Burgers
13941,Zucchini Patties
213211,Turkey and Quinoa Meatloaf
