In [3]:
from wrangle import Wrangle
import pandas as pd
# Let pandas display up to 110 columns before truncating wide frames.
pd.options.display.max_columns = 110

In [4]:
# Load the food data through the project's Wrangle helper. The cells below
# treat the result as a pandas DataFrame (they call dropna/drop/fillna on it).
nutrition_facts = Wrangle().get_food_data()

In [None]:
# Keep only the rows that have a value in "Food Group".
nutrition_facts = nutrition_facts.dropna(subset=["Food Group"])

In [None]:
# Columns the original author noted as having every value missing
# (that claim is not re-checked in this cell).
cols_drop = [
    "Added Sugar g",
    "Soluble Fiber g",
    "Insoluble Fiber g",
    "Total sugar alcohols g",
    "Molybdenum mcg",
    "Chlorine mg",
    "Biotin B7 mcg",
    "NetCarbs g",
]
nutrition_facts = nutrition_facts.drop(columns=cols_drop)

In [None]:
# Columns the original author judged uninformative for this analysis:
# identifiers, individual omega-3 fatty acids, and serving/calorie weights.
more_drop = [
    "ID",
    "Name",
    "183 n3 ccc ALA mg",
    "205 n3 EPA mg",
    "225 n3 DPA mg",
    "226 n3 DHA mg",
    "Serving Weight 1 g",
    "Serving Weight 2 g",
    "Serving Weight 3 g",
    "Serving Weight 4 g",
    "Serving Weight 5 g",
    "Serving Weight 6 g",
    "Serving Weight 7 g",
    "Serving Weight 8 g",
    "Serving Weight 9 g",
    "200 Calorie Weight g",
]
nutrition_facts = nutrition_facts.drop(columns=more_drop)

In [None]:
def remove_missing_values(prop_required_column=.70, prop_required_row=.70, df=None):
    """Drop sparse columns, then sparse rows, and return the result as a new frame.

    Parameters
    ----------
    prop_required_column : float
        Share of rows that must be non-null for a column to be kept. The
        cut-off is ``int(prop_required_column * number_of_rows)`` non-null values.
    prop_required_row : float
        Share of the *remaining* columns that must be non-null for a row to be
        kept. The cut-off is ``int(prop_required_row * number_of_columns)``.
    df : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``nutrition_facts`` is
        looked up at call time, so the current frame is used rather than
        whichever object the name pointed to when this cell was executed.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The input frame is not modified.
    """
    if df is None:
        df = nutrition_facts

    # `thresh` is the minimum number of NON-null values needed to survive,
    # so with 0.70 a column is kept when at least 70% of its rows are filled.
    min_filled_rows = int(prop_required_column * len(df))
    dense_columns = df.dropna(axis=1, thresh=min_filled_rows)

    # The row cut-off is based on the columns that survived the step above.
    min_filled_cols = int(prop_required_row * len(dense_columns.columns))
    return dense_columns.dropna(axis=0, thresh=min_filled_cols)

In [None]:
def fill_nutrition(df, col, fill_value):
    """Replace missing values in ``col`` of ``df`` with ``fill_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the filled column is assigned back onto this object.
    col : label or list of labels
        Column (or columns) whose missing values are filled.
    fill_value : scalar
        Value passed to ``fillna``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    # Assign the filled result back instead of calling fillna(inplace=True) on
    # the `df[col]` selection: that selection can be a temporary copy, in which
    # case the in-place fill never reaches `df`.
    df[col] = df[col].fillna(fill_value)

    return df

In [None]:
# Keep a column only when at least this share of its rows is non-null
# (`thresh` is the minimum count of non-null values a column needs).
prop_required_column = 0.70
threshold = int(prop_required_column * nutrition_facts.shape[0])
nutrition_facts = nutrition_facts.dropna(axis="columns", thresh=threshold)

In [None]:
# Replace every remaining missing value with 0.
nutrition_facts = nutrition_facts.fillna(value=0)

In [None]:
# nutrition_facts.isnull().sum().to_dict()

In [None]:
# Share of missing values per column, as a {column: fraction} dict.
(nutrition_facts.isnull().sum() / nutrition_facts.shape[0]).to_dict()

In [None]:
# Preview the first five rows (the default for head()).
nutrition_facts.head()

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
nutrition_facts.info()

In [None]:
# Lower-case every column label on the existing frame; the rename mapping in a
# later cell is keyed on these lower-case labels.
nutrition_facts.columns = nutrition_facts.columns.str.lower()
# Display the resulting labels.
nutrition_facts.columns

### Column Units
- unitless: food group, calories, pral score
- gram (g): fat, protein, carbohydrate, sugars, fiber, saturated fats, water, alcohol
- milligram (mg): cholesterol, calcium, iron, potassium, magnesium, vitamin c, vitamin e alphatocopherol, omega 3s, omega 6s, phosphorus, sodium, zinc, copper, thiamin b1, riboflavin b2, niacin b3, vitamin b6, choline, fatty acids total monounsaturated, fatty acids total polyunsaturated, caffeine, theobromine
- microgram (mcg): vitamin a, vitamin b12, vitamin d, selenium, folate b9, folic acid, food folate, folate dfe, retinol, carotene beta, carotene alpha, lycopene, lutein + zeaxanthin, vitamin k

In [None]:
# Map the lower-cased source labels to snake_case names without unit suffixes.
# Keys that are not present in the frame are skipped by rename() by default.
cols_rename = {
    "food group": "food_group",
    "saturated fats g": "saturated_fats",
    "pral score": "pral_score",
    "fat g": "fat",
    "protein g": "protein",
    "carbohydrate g": "carbohydrate",
    "sugars g": "sugars",
    "fiber g": "fiber",
    "cholesterol mg": "cholesterol",
    "calcium mg": "calcium",
    "iron fe mg": "iron",
    "potassium k mg": "potassium",
    "magnesium mg": "magnesium",
    # Previously "'vitamin_a": the stray leading apostrophe ended up in the column name.
    "vitamin a rae mcg": "vitamin_a",
    "vitamin c mg": "vitamin_c",
    "vitamin b12 mcg": "vitamin_b12",
    "vitamin d mcg": "vitamin_d",
    "vitamin e alphatocopherol mg": "vitamin_e_alphatocopherol",
    "water g": "water",
    "omega 3s mg": "omega_3s",
    "omega 6s mg": "omega_6s",
    "phosphorus p mg": "phosphorus",
    "sodium mg": "sodium",
    "zinc zn mg": "zinc",
    "copper cu mg": "copper",
    "selenium se mcg": "selenium",
    "thiamin b1 mg": "thiamin_b1",
    "riboflavin b2 mg": "riboflavin_b2",
    "niacin b3 mg": "niacin_b3",
    "vitamin b6 mg": "vitamin_b6",
    "folate b9 mcg": "folate_b9",
    "folic acid mcg": "folic_acid",
    "food folate mcg": "food_folate",
    "folate dfe mcg": "folate_dfe",
    "choline mg": "choline",
    "retinol mcg": "retinol",
    "carotene beta mcg": "carotene_beta",
    "carotene alpha mcg": "carotene_alpha",
    "lycopene mcg": "lycopene",
    "lutein + zeaxanthin mcg": "lutein_plus_zeaxanthin",
    "vitamin k mcg": "vitamin_k",
    "fatty acids total monounsaturated mg": "fatty_acids_total_monounsaturated",
    "fatty acids total polyunsaturated mg": "fatty_acids_total_polyunsaturated",
    "alcohol g": "alcohol",
    "caffeine mg": "caffeine",
    "theobromine mg": "theobromine",
}
nutrition_facts = nutrition_facts.rename(columns=cols_rename)
# Display the renamed labels.
nutrition_facts.columns

In [None]:
# Preview the first three rows of the current frame.
nutrition_facts.head(3)

In [None]:
from prepare import Prepare

# Rebind nutrition_facts to the frame returned by the project's Prepare helper;
# this replaces the frame built step by step in the cells above.
# NOTE(review): presumably Prepare packages the same cleaning steps — confirm in prepare.py.
nutrition_facts = Prepare().get_food_prep()
# Preview the first three rows of the prepared frame.
nutrition_facts.head(3)