In [None]:
%matplotlib inline
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from requests import get
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from IPython.core.display import SVG

In [None]:
import sys
sys.path.insert(1, './utilities/')

from health_functions import *

In [None]:
# Folder (relative to this notebook) that is walked for the dunnhumby CSV files.
DUNNHUMBY_PATH = '../data/dunnhumby - The Complete Journey CSV/'

# Import Data

In [None]:
# Load every CSV found under the dunnhumby folder, keyed by bare file name.
dfList = {}
for root, _dirs, filenames in os.walk(DUNNHUMBY_PATH):
    for filename in filenames:
        # Match on the extension only: a substring test would also accept
        # names such as "product.csv.bak" and hand them to read_csv.
        if filename.endswith('.csv'):
            print(filename)
            dfList[filename] = pd.read_csv(os.path.join(root, filename))

# The three tables used in the rest of the notebook.
products_df = dfList['product.csv']
transaction_data_df = dfList['transaction_data.csv']
hh_demographic_df = dfList['hh_demographic.csv']

In [None]:
products_df.head(3)

## Product Data

_We keep only the food-related departments, which we selected manually from the list of departments._

In [None]:
# Number of products per department, largest first.
products_sorted = products_df.groupby('DEPARTMENT').count().sort_values(by='PRODUCT_ID', ascending=False)
# NB: a few food items also sit in the MISC. TRANS department, which is not kept here
food_related = np.array(['NUTRITION','GROCERY','PASTRY','MEAT-PCKGD','SEAFOOD-PCKGD','PRODUCE','DELI','MEAT','SALAD BAR','GRO BAKERY','FROZEN GROCERY','SPIRITS','RESTAURANT',''])

# .copy() makes the filtered table an independent frame, so adding a column
# below does not write into a slice of the original (SettingWithCopyWarning).
products_df = products_df[products_df.DEPARTMENT.isin(food_related)].copy()

# concatenate both description fields into a single "ingredients" column
products_df['ingredients'] = products_df.COMMODITY_DESC + " " + products_df.SUB_COMMODITY_DESC
# keep only the columns still needed; reassigning instead of dropping in place
products_df = products_df.drop(columns=["MANUFACTURER","DEPARTMENT","BRAND","COMMODITY_DESC","SUB_COMMODITY_DESC"])

In [None]:
# Run parse_words over every ingredients entry and store the result back in the same column.
products_df["ingredients"] = products_df["ingredients"].apply(parse_words)

In [None]:
products_df.ingredients.head()

We see that we now have an easily parseable format

## Downloaded food nutrients data

In [None]:
# Load every CSV of the nutrition database, keyed by bare file name.
dfList = {}
for root, _dirs, filenames in os.walk('../data/health'):
    for filename in filenames:
        # Match on the extension only: a substring test would also accept
        # names such as "food.csv.bak" and hand them to read_csv.
        if filename.endswith('.csv'):
            dfList[filename] = pd.read_csv(os.path.join(root, filename))

branded_food_df = dfList['branded_food.csv']

# links each nutrient id with its name
nutrient_df = dfList['nutrient.csv']

# contains the food articles' names and their ids
food_df = dfList['food.csv']

# contains the nutrients for each food article
food_nutrients_df = dfList['food_nutrient.csv']

# links the food articles' ids to their potential category
food_category_df = dfList['food_category.csv']

_We drop useless columns_

In [None]:
# Drop the columns we do not use and rename the rest to be more understandable.
# Every frame is rebuilt rather than modified in place, so the raw tables
# still referenced from dfList are left untouched.
food_nutrients_df = food_nutrients_df.drop(
    columns=["data_points","min","max","median","footnote","min_year_acquired","derivation_id"]
)

nutrient_df = nutrient_df.drop(columns=["nutrient_nbr","rank"])

food_category_df = (
    food_category_df
    .drop(columns=["code"])
    .rename(columns={'id':'food_category_id','description':'category'})
)

food_df = food_df.drop(columns=["publication_date"])

In [None]:
# Keep only the nutrients needed to judge whether a food is healthy: the table
# has 227 of them, a lot of which aren't necessary for that purpose.
# (No backslash continuations needed: the brackets already continue the line.)
list_relevant_nutrients = [
    "Protein", "Total Carbohydrate", "Total lipid (fat)", "Sucrose",
    "Glucose (dextrose)", "Sugars, total including NLEA", "Fatty acids, total monounsaturated",
    "Fatty acids, total polyunsaturated", "Fatty acids, total trans", "Fatty acids, total saturated", "Cholesterol",
    "Vitamin E, added", "Vitamin K (phylloquinone)", "Vitamin B-12", "Vitamin B-6",
    "Vitamin E (label entry primarily)", "Vitamin E (alpha-tocopherol)", "Vitamin D", "Vitamin A, RAE", "Sodium, Na",
    "Total fat (NLEA)", "Fiber, total dietary", "Energy", "Carbohydrate, by summation", "Fructose",
]

# .copy() makes the filtered table an independent frame, so the name column can
# be rewritten afterwards without assigning into a slice (SettingWithCopyWarning).
nutrient_df = nutrient_df[nutrient_df.name.isin(list_relevant_nutrients)].copy()

In [None]:
# Pass every nutrient name through trim_nutrient_name and store it back in the same column.
nutrient_df["name"] = nutrient_df["name"].apply(trim_nutrient_name)

In [None]:
nutrient_df.head(4)

In [None]:
food_nutrients_df.head()

_Add the names of the nutrients to the nutrients per food_

In [None]:
# Attach the nutrient names, index the rows by (fdc_id, nutrient name) and
# discard the id columns that are no longer needed.
food_nutrients_df = (
    food_nutrients_df
    .join(nutrient_df.set_index('id'), on='nutrient_id', how='inner')
    # set_index moves both key columns into the MultiIndex and removes them from the columns
    .set_index(['fdc_id', 'name'])
    .drop(["id", "nutrient_id"], axis=1)
)

In [None]:
food_nutrients_df.shape

In [None]:
#here is the result
# Scratch check (flagged TOTRASH by the author): looks up the rows indexed by
# (fdc_id 336079, nutrient "energy") and shows the first value of the "amount" column.
# NOTE(review): the lowercase label "energy" presumably comes from trim_nutrient_name - confirm.
food_nutrients_df.loc[336079,"energy"]["amount"].values[0] #TOTRASH

As an example, we show the food contents of corned beef, the format matches our needs

_We add the food category to food_df_

In [None]:
# Look up each food's category label, then drop the numeric category id.
food_df = (
    food_df
    .join(food_category_df.set_index("food_category_id"), on="food_category_id", how="left")
    .drop(columns=["food_category_id"])
)

In [None]:
# Combine the nutrient rows with the columns of food_df, which is re-indexed by fdc_id for the join.
all_information_df = food_nutrients_df.join(food_df.set_index("fdc_id"))

We see that a lot of categories are unfortunately missing from the government database

In [None]:
# Pass every food description through normalize_text and store it back in the same column.
food_df["description"] = food_df["description"].apply(normalize_text)

Still, we create the dataframe that will allow us to link the test values to the one of the supermarket

In [None]:
# Copy of food_df whose description column holds the output of parse_words
# instead of the raw string; food_df itself is left unchanged.
food_name_df = food_df.assign(description=food_df["description"].apply(parse_words))

_At this stage we have 4 dataframes from our additional dataset for nutrition:_
- food_df = fdc_id vs name of food item (string)
- food_name_df = fdc_id vs parsed food title (**list of string**)
- food_nutrients_df = fdc_id vs nutrients contained (multiindex)
- all_information_df = fdc_id, nutrients, data type, description and food category

## Word Importance

_We filter the words according to their importance: a word is more important the more often it appears in both datasets (e.g. 'orange' is more important than 'artificial'). Words occurring in only one dataset have no importance. The rest of the algorithm follows this pipeline:_

In [None]:
SVG(filename='graphs/allwords.svg')

In [None]:
# all words present in the nutrition dataset (built from the parsed food descriptions)
all_words_nutrition = get_allwords(food_name_df.description)

# all words present in the product dataset (built from the parsed ingredients column)
all_words_supermarket = get_allwords(products_df.ingredients)

#### Inner merge between the 2 sets of words:

_we check which words occur in both dataframes: only these words will have importance in determining the type of food article we are dealing with. Of course, if no words are known from the nutrition dataset, the sample is not taken into account._


In [None]:
# Inner merge on the shared 'name' column: only words present in both tables
# survive, and clashing column names get a per-dataset suffix.
common_words = pd.merge(
    all_words_supermarket,
    all_words_nutrition,
    on='name',
    suffixes=('_supermarket', '_nutrition'),
)

In [None]:
# NOTE(review): assuming common_words is a DataFrame, .size is rows x columns (the
# total number of cells), not the number of common words; len(common_words) gives the row count.
print(common_words.size)
# Preview the first ten merged rows.
common_words.head(10)

## Assemble them together (and pray your god)

In [None]:
SVG(filename='graphs/Word_importance.svg')

In [None]:
# Hand-written word lists used to try out the matching by hand.
# Every token is written without surrounding whitespace: a padded token such as
# " rice" is a different string from "rice" and could never equal a parsed word.
test0 = ['duck', 'creamy', 'swiss', 'miss', 'pudding', '24', 'oz']
test1 = ["penguin", "afdadf"]
test2 = ["peanuts", "orange", "crisp"]
test3 = ["sandwich", "lettuce", "cheese"]
test4 = ["indian", "lamb", "josh"]
test5 = ['vanilla', 'creamy', 'swiss', 'miss', 'pudding', '24', 'oz']
test6 = ['libbys']
test7 = ["hispanic", "oriental", "noodles", "rice"]
test8 = ["vegetables", "others"]
test9 = ["frozen", "ice", "cream", "bars"]
test10 = ["wolf", "chili", "without", "beans"]

In [None]:
# Score lookup built from the words common to both datasets; it is passed to find_food below.
DIC_SCORE = construct_dic_score(common_words)

## Application of the function

In [None]:
# Five-product sample used to try the matching before running it on the full table.
products_df_short = products_df.head(5)

In [None]:
products_df_short

In [None]:
# First test on the sample
def findfoodshort(list_words):
    """Run find_food on one parsed word list against the nutrition names, in verbose mode."""
    return find_food(list_words, food_name_df, DIC_SCORE, verb = True)


estimated_df = products_df_short.copy()
# keep only the fdc_id part of what find_food returns for each product
estimated_df["ref_fdc_id"] = products_df_short["ingredients"].apply(findfoodshort).fdc_id
estimated_df.head()

In [None]:
#full df
# NOTE(review): estimated_total_df is created here, but in the cells visible in this
# notebook nothing fills it or reads it afterwards.
estimated_total_df = products_df.copy()
findfoodtotal = lambda list_words: find_food(list_words,food_name_df,DIC_SCORE,verb = True)
# The matching over the whole product table is disabled because of its runtime (see the last comment).
# NOTE(review): as written, the disabled line would assign into estimated_df (the sample
# frame) rather than estimated_total_df - confirm the intended target before re-enabling.
#estimated_df["ref_fdc_id"] = products_df.ingredients.apply(findfoodtotal).fdc_id
# Shows the sample result, since the full run above did not execute.
estimated_df.head()
#Problem: takes more than 45 min

In [None]:
#saves results of this lengthy computation
# NOTE(review): while the full-table matching stays commented out, estimated_df is the
# five-product sample, so that sample is what gets written to this file - confirm.
estimated_df.to_pickle("../data/results/products_with_link_to_nutrients_df.pickle")

We have now established the link between the dunnhumby supermarket dataset and the food database from the U.S. Department of Agriculture (USDA), which provides the nutrient information. We can now begin our analysis.

# Analysis

In [None]:
def calculate_nutrient_per_day_per_person(key1, nutrient, transactions=None,
                                          demographics=None, nutrient_lookup=None):
    """
    Average amount of a nutrient bought (and presumably consumed) per day and
    per person in one household.

    key1 = household key
    nutrient = nutrient for which we want to calculate the amount bought
    transactions = table with household_key, PRODUCT_ID and DAY columns
                   (defaults to the notebook's transaction_data_df)
    demographics = table with household_key and HOUSEHOLD_SIZE_DESC columns
                   (defaults to the notebook's hh_demographic_df)
    nutrient_lookup = callable (product_id, nutrient) -> amount; defaults to
                      get_nutrient_amount called with estimated_df and
                      food_nutrients_df

    returns a float, or NaN when the household has no transactions, has no
    demographic entry, or its size description contains no number
    """
    if transactions is None:
        transactions = transaction_data_df
    if demographics is None:
        demographics = hh_demographic_df
    if nutrient_lookup is None:
        # Pass the two lookup tables explicitly, exactly as the direct call of
        # get_nutrient_amount elsewhere in this notebook does.
        def nutrient_lookup(product_id, nutrient_name):
            return get_nutrient_amount(product_id, nutrient_name, estimated_df, food_nutrients_df)

    household_transactions = transactions[transactions.household_key == key1]
    household_demographics = demographics[demographics.household_key == key1]
    if household_transactions.empty or household_demographics.empty:
        return float("nan")

    # Keep the per-transaction amounts in their own Series instead of writing a
    # new column into the filtered slice (avoids SettingWithCopyWarning).
    # NOTE(review): every transaction row counts once; if rows carry a quantity,
    # confirm whether the amounts should be scaled by it.
    amounts = household_transactions["PRODUCT_ID"].apply(nutrient_lookup, args=(nutrient,))

    # Number of days covered by the purchases, both end days included.
    time_interval = household_transactions["DAY"].max() - household_transactions["DAY"].min() + 1
    nutrient_per_household = amounts.sum() / time_interval

    # The size is a description, not necessarily a number: take the first run of
    # digits so that a value such as "5+" is read as 5.
    size_desc = str(household_demographics["HOUSEHOLD_SIZE_DESC"].iloc[0])
    size_match = re.search(r"\d+", size_desc)
    if size_match is None or int(size_match.group()) == 0:
        return float("nan")
    return nutrient_per_household / int(size_match.group())

def parse_sex_homeowner(key, demographics=None):
    """
    Derive the sex of a household's homeowner from its HH_COMP_DESC text.

    key = household key
    demographics = table with household_key and HH_COMP_DESC columns
                   (defaults to the notebook's hh_demographic_df)

    returns "f" if the description mentions "female", "m" if it mentions
    "male", and "unknown" otherwise or when the household is not in the table
    """
    if demographics is None:
        demographics = hh_demographic_df
    composition = demographics.loc[demographics.household_key == key, "HH_COMP_DESC"]
    if composition.empty:
        return "unknown"
    description = str(composition.iloc[0]).lower()
    # Python strings have no .contains(); membership is tested with `in`.
    # "female" has to be checked first because it contains "male".
    if "female" in description:
        return "f"
    if "male" in description:
        return "m"
    return "unknown"
    
#TO DO
#caloric needs adult = 2000 kcal , kid = 1000 kcal -> take it into account

In [None]:
get_nutrient_amount(25671,"energy",estimated_df,food_nutrients_df)

In [None]:
transaction_data_df.head()

In [None]:
hh_demographic_df.sample(4)

In [None]:
# Does not work yet: the fdc_id link is missing for most products because the
# full matching computation takes too long to run.
calculate_nutrient_per_day_per_person(255,"energy")

In [None]:
# Same word list as the test3 defined with the other test lists earlier in the notebook.
test3 = ["sandwich","lettuce","cheese"]

# Look up the nutrition entry matched to these words, without verbose output.
find_food(test3, food_name_df, DIC_SCORE, verb = False)