In [1]:
%matplotlib inline
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from requests import get
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from IPython.core.display import SVG

In [2]:
DUNNHUMBY_PATH = '../data/dunnhumby - The Complete Journey CSV/'

In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords 
STOP_WORDS = list(set(stopwords.words('english')))
STOP_WORDS.append('NFS')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerome/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Manual addition of words that we want to ignore to the Stopwords list

In [4]:
to_delete = ["added","ns","made","eaten","type","all"]
STOP_WORDS.append('NFS')

# Import Data

In [5]:
products_df =  pd.read_csv(os.path.join(DUNNHUMBY_PATH,"product.csv"))

In [6]:
products_df.head(10)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
5,26426,69,GROCERY,Private,SPICES & EXTRACTS,SPICES & SEASONINGS,2.5 OZ
6,26540,69,GROCERY,Private,COOKIES/CONES,TRAY PACK/CHOC CHIP COOKIES,16 OZ
7,26601,69,DRUG GM,Private,VITAMINS,VITAMIN - MINERALS,300CT(1)
8,26636,69,PASTRY,Private,BREAKFAST SWEETS,SW GDS: SW ROLLS/DAN,
9,26691,16,GROCERY,Private,PNT BTR/JELLY/JAMS,HONEY,12 OZ


## Product Data

_We only take the categories which are food related, sorted manually the different departments_

In [7]:
products_sorted = products_df.groupby('DEPARTMENT').count().sort_values(by = 'PRODUCT_ID',ascending = False)
#NB: there are a few food in MISC. TRANS
food_related = np.array(['NUTRITION','GROCERY','PASTRY','MEAT-PCKGD','SEAFOOD-PCKGD','PRODUCE','DELI','MEAT','SALAD BAR','GRO BAKERY','FROZEN GROCERY','SPIRITS','RESTAURANT',''])

products_df = products_df[products_df.DEPARTMENT.isin(food_related)]

#we put all the description in a ingredients column
products_df['ingredients'] = products_df.COMMODITY_DESC + " " + products_df.SUB_COMMODITY_DESC
products_df.drop(["MANUFACTURER","DEPARTMENT","BRAND","COMMODITY_DESC","SUB_COMMODITY_DESC"],axis = 1, inplace = True)

In [8]:
def parse(str1): 
    """
    parses the string in a list of string (words) with all type of separators thanks to regexes
    """
    #matches any separator and any whitespace and transforms to mathc to lower case
    temp = list(filter(None,re.split("[\s;&@\/:,\*\.\(\)\{\}\\-%\"\'0-9]",str1)))
    #remove duplicate word, as there are many
    temp = list(dict.fromkeys(temp))
    temp = [i.lower() for i in temp if not i in STOP_WORDS]
    
    return temp

In [9]:
products_df.ingredients = products_df.ingredients.apply(parse)

## Downloaded food nutrients data

In [10]:
dfList = {}
for r, d, f in os.walk('../data/health'):
    for file in f:
        if '.csv' in file:
            #print(file)
            dfList[file] = pd.read_csv(os.path.join(r, file))
            
branded_food_df = dfList['branded_food.csv']

#link the nutrient id with its name
nutrient_df = dfList['nutrient.csv']

#contains the food articles name and their id test commit
food_df = dfList['food.csv']

#contains the nutrients for each food article
food_nutrients_df = dfList['food_nutrient.csv']

# linke the food articles ids to their potential category
food_category_df = dfList['food_category.csv']

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


_We drop useless columns_

In [11]:
#drop unnecessary columns and rename to be more understandable
food_nutrients_df = food_nutrients_df.drop(["data_points","min","max","median","footnote","min_year_acquired","derivation_id"],axis=1)

nutrient_df = nutrient_df.drop(["nutrient_nbr","rank"],axis=1)

food_category_df.drop(["code"],axis=1,inplace=True)
food_category_df.rename(columns={'id':'food_category_id','description':'category'},inplace= True)

food_df.drop(["publication_date"],axis=1,inplace=True)

In [12]:
#filter out only the necessary food nutrients since we have 227, a lot of which aren't necessary to determine if a food is healthy
list_relevant_nutrients = ["Protein", "Total Carbohydrate","Total lipid (fat)","Sucrose",\
                            "Glucose (dextrose)","Sugars, total including NLEA","Fatty acids, total monounsaturated",\
                            "Fatty acids, total polyunsaturated","Fatty acids, total trans","Fatty acids, total saturated","Cholesterol",\
                            "Vitamin E, added","Vitamin K (phylloquinone)","Vitamin B-12","Vitamin B-6",\
                            "Vitamin E (label entry primarily)","Vitamin E (alpha-tocopherol)","Vitamin D","Vitamin A, RAE","Sodium, Na",\
                            "Total fat (NLEA)","Fiber, total dietary","Energy","Carbohydrate, by summation","Fructose"]

nutrient_df = nutrient_df[nutrient_df.name.isin(list_relevant_nutrients)]
nutrient_df.head(40)

Unnamed: 0,id,name,unit_name
1,1003,Protein,G
2,1004,Total lipid (fat),G
5,1008,Energy,KCAL
7,1010,Sucrose,G
8,1011,Glucose (dextrose),G
9,1012,Fructose,G
16,1050,"Carbohydrate, by summation",G
21,1062,Energy,kJ
26,1079,"Fiber, total dietary",G
30,1085,Total fat (NLEA),G


In [13]:
def trim_nutrient_name(str1):
    """
    simplifies the names of the nutrients for easier access afterwards
    """
    #matches any separator and any whitespace and transforms to mathc to lower case
    temp = list(filter(None,re.split("[;&@\/:,\*\.\(\)\{\}\\%\"\']",str1)))
    #remove duplicate word, as there are many
    temp = [i.lower() for i in temp if not i in STOP_WORDS]
    if(temp[0] == "fatty acids"):
        return str.strip(temp[0] + temp[1])
    else:
        return str.strip(temp[0])

nutrient_df.name = nutrient_df.name.apply(trim_nutrient_name)

In [14]:
nutrient_df.head(40)

Unnamed: 0,id,name,unit_name
1,1003,protein,G
2,1004,total lipid,G
5,1008,energy,KCAL
7,1010,sucrose,G
8,1011,glucose,G
9,1012,fructose,G
16,1050,carbohydrate,G
21,1062,energy,kJ
26,1079,fiber,G
30,1085,total fat,G


In [15]:
food_nutrients_df.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,4178832,346049,1079,0.0
1,4178833,346049,1087,0.0
2,4178834,346049,1089,2.57
3,4178835,346049,1104,0.0
4,4178836,346049,1162,0.0


_Add the names of the nutrients to the nutrients per food_

In [17]:
food_nutrients_df = food_nutrients_df.join(nutrient_df.set_index('id'),on='nutrient_id',how='inner')

#index the resulting table by multiindex: product id -> name of nutrients
food_nutrients_df = food_nutrients_df.set_index(pd.MultiIndex.from_frame(food_nutrients_df[['fdc_id','name']]))
#drop unnecessary columns 
food_nutrients_df = food_nutrients_df.drop(["id","fdc_id","nutrient_id","name"],axis=1)

In [19]:
#here is the result
food_nutrients_df.loc[346049]

Unnamed: 0_level_0,amount,unit_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1
fiber,0.0,G
protein,25.0,G
total lipid,14.29,G
sodium,911.0,MG
cholesterol,71.0,MG
fatty acids total trans,0.0,G
fatty acids total saturated,6.25,G
fatty acids total monounsaturated,0.89,G
fatty acids total polyunsaturated,0.0,G
sugars,1.79,G


As an example, we show the food contents of corned beef, the format matches our needs

In [44]:
def get_amount(to_convert):
    """Returns the amount of a nutrient by taking into account the specified unit
    """
    if(to_convert.unit_name == "UG"):
        return to_convert.amount * 1e-6
    elif(to_convert.unit_name == "MG"):
        return to_convert.amount * 1e-3
    else:
        return to_convert.amount

_We add the food category to food_df_

In [20]:
food_df = food_df.join(food_category_df.set_index("food_category_id"),on="food_category_id",how="left")
food_df.drop(["food_category_id"],axis=1,inplace=True)
food_df.head()

Unnamed: 0,fdc_id,data_type,description,category
0,346049,branded_food,"LIBBYS Corned Beef With Onion, 12 OZ",
1,346050,branded_food,"LIBBYS Corned Beef With Chili, 12 OZ",
2,346464,branded_food,WOLF Chili Without Beans,
3,346466,branded_food,"WOLF Turkey Chili No Beans, 15 OZ",
4,346468,branded_food,"WOLF BRAND Chili With Beans, 24 oz., 24 OZ",


We see that a lot of categories are unfortunately missing from the governement database

_At this stage we have 3 dataframes from our additional dataset for nutrition:_
- food_df = fdc_id vs name of food item (string)
- food_name_df = fdc_id vs parsed food title (list of string)
- food_nutrients_df = fdc_id vs nutrients contained (multiindex)

In [21]:
food_names_to_parse_df = food_df.copy()
food_names_to_parse_df.description = food_names_to_parse_df.description.apply(parse)

## Word Importance

_We filter the words according to their importance: that is, a word is more important as it apears many times in both datasets: (Ex: 'orange' is more important than 'artificial'). The words occuring in only one dataset are of no importance. The rest of the algorithm follows the following pipeline:_

In [22]:
SVG(filename='untitled2.svg')

FileNotFoundError: [Errno 2] No such file or directory: 'untitled2.svg'

In [33]:
def get_allwords(serie):
    """
    serie: serie containing lists of words
    return a dataframe containing
      - column name: name of the unique articles found in the lists of the serie
      - column count: how many times they appear in the serie
    """
    allwords = np.concatenate(serie.ravel())
    allwords = pd.Series(allwords)
    allwords = pd.DataFrame(allwords,columns= ["name"])
    allwords.reset_index(inplace = True)
    allwords.rename(columns = {'index':'number'},inplace = True)
    allwords = allwords.groupby('name').count().sort_values(by = 'number',ascending = False)
    return allwords.reset_index()

#all words present in the nutrition dataset
all_words_nutrition = get_allwords(food_names_to_parse_df.description)

#all words present in the product dataset
all_words_supermarket = get_allwords(products_df.ingredients)

#### TODO
Manual updates of STOPWORDS: _the idea would be to create a list of the words to update/ modify in the STOPWORDS list._

In [34]:
#to transform 
#SNKS/CKYS/CRKR/CNDY
to_transform = dict({"frzn":"frozen","refrgratd":"refrigerated","brkfst":"breakfast","whlsm":"wholesome"})

#### Inner join between the 2 sets of words:

_we check which words occur in both dataframes: only these words will have importance in determining the type of food article we are dealing with. Of course, if no words are known from the nutrition dataset, the sample is not taken into account._


In [35]:
common_words = pd.merge(all_words_supermarket,all_words_nutrition,left_on = 'name',right_on = 'name',suffixes=('_supermarket', '_nutrition'))

In [37]:
print(common_words.size)
common_words.head(100)

3141


Unnamed: 0,name,number_supermarket,number_nutrition
0,frzn,3341,1
1,snacks,3178,1913
2,meat,3090,2771
3,dry,2991,2533
4,premium,2423,4906
...,...,...,...
95,prepared,709,1051
96,packs,705,77
97,pet,682,21
98,sald,655,1


In [28]:
#Manual check to see which words occur in which dataset
print('chocolate' in all_words_nut.name.values)
print('chocolate' in all_words_art.name.values)

True
True


## Assemble them together (and pray your god)

In [38]:
def get_matches(test,food_list):
    """
    test = list of strings to test
    food_list: pandas dataframe linking the food article/id to the lists of words of its name
    return all the articles whose words contain all of the words of test
    """
    raise NotImplemented
    
def get_importance(word):
    """
    word: string for which we want to know the  importance
    return importance of word
    """
    raise NotImplemented
    
def find_food(test,food_list):
    """
    implementation of the graphic above
    test = list of strings to test
    food_list: pandas dataframe linking the food article/id to the lists of words of its name
    return the best article
    """
    if len(k) == 0:
        #give up the sample
        return 0 #dummy
    
    matches = get_matches(test,food_list)
    if len(matches) == 0:
        importance = [get_importance(i) for i in test]
        mino = np.min(importance)
        test = [i for i in test if i != mino]
        return find_food(test,food_list)
    elif len(matches) == 1:
        return matches[0]
    else:
        sizes = [len(i) for i in matches]
        minsize = np.min(sizes)
        minsizes = [i for i in matches if len(i) == minsize]
        if len(minsizes) == 1:
            return minsizes[0]
        else:
            importances = [np.sum([get_importance(j) for j in trial]) for trial in minsizes]
            armin_imp = np.argmin(importances)
            return importances[armin_imp]
                

def find_food_naive(test,food_list):
    """
    food_list: pandas dataframe linking the food article/id id to the lists of words of its name
    test: list of strings you want to have an id for
    return the corresponding food indx
    """
    #TODO: improve the non unique max
    scores = [get_score(test,i) for i in food_list.description]
    maxo = np.max(scores)
    if len([1 for x in scores if x == maxo]) > 1:
        print("Multiple maximums!")
    armax = np.argmax(scores)
    print('result: ',food_list.description[armax])
    return food_list.fdc_id[armax]

def get_score(test,trial):
    """
    test: the list of strings you're trying to classify
    trial: the list you want the score for
    return the score of matching
    """
    return np.sum([1 for i in test if i in trial])

In [39]:
test1 = ['chili']
find_food_naive(test1,food_df)

Multiple maximums!
result:  Dip, cheese with chili pepper (chili con queso)


336788

In [41]:
food_df.head()
food_nutrients_df.head()
food_nutrients_df.loc[336788]

Unnamed: 0_level_0,amount,unit_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1
fiber,0.7,G
protein,3.14,G
total lipid,9.51,G
sodium,796.0,MG
cholesterol,9.0,MG
fatty acids total saturated,2.491,G
fatty acids total monounsaturated,2.762,G
fatty acids total polyunsaturated,3.643,G
sugars,1.95,G
energy,143.0,KCAL


In [None]:
test1 = ['chocolate bar']
find_food_naive(test1,food_df)
#Problem: if no specific match -> returns corned beef w/ onion

In [None]:
all_words_art.size

In [None]:
all_words_nut.size