In [1]:
%matplotlib inline
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from requests import get
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from IPython.core.display import SVG

In [2]:
DUNNHUMBY_PATH = '../data/dunnhumby - The Complete Journey CSV/'

In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords 
STOP_WORDS = list(set(stopwords.words('english')))
STOP_WORDS.append('NFS')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jerome/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Import Data

In [65]:
products_df =  pd.read_csv(os.path.join(DUNNHUMBY_PATH,"product.csv"))

In [66]:
products_df.sample(30)

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
52682,5566545,407,GROCERY,National,VEGETABLES - SHELF STABLE,TOMATOES: STEWED/DICED/CRMD,14.5 OZ
53240,5570325,1282,DRUG GM,National,BABY FOODS,BABY FOOD - BEGINNER,
20357,946166,2496,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,3 LTR
44585,1384886,181,GROCERY,National,DRY BN/VEG/POTATO/RICE,RICE SIDE DISH MIXES DRY,4.9 OZ
45470,1496715,71,GROCERY,National,MARGARINES,MARGARINE: TUBS AND BOWLS,15 OZ
29636,1029616,317,GROCERY,National,SALD DRSNG/SNDWCH SPRD,POURABLE SALAD DRESSINGS,16 OZ
85040,13940278,69,PRODUCE,Private,NUTS,PECANS SHELLED,16OZ
75072,12263466,6301,DRUG GM,National,CANDLES/ACCESSORIES,CANDLES,16 OUNCE
28198,1016592,4951,DRUG GM,National,ANALGESICS,ADULT ANALGESICS,
42550,1194630,227,GROCERY,National,LAUNDRY ADDITIVES,DRY & SPRAY STARCH,10 OZ


## Product Data

_We only take the categories which are food related, sorted manually the different departments_

In [67]:
products_sorted = products_df.groupby('DEPARTMENT').count().sort_values(by = 'PRODUCT_ID',ascending = False)
#NB: there are a few food in MISC. TRANS
food_related = np.array(['NUTRITION','GROCERY','PASTRY','MEAT-PCKGD','SEAFOOD-PCKGD','PRODUCE','DELI','MEAT','SALAD BAR','GRO BAKERY','FROZEN GROCERY','SPIRITS','RESTAURANT',''])

products_df = products_df[products_df.DEPARTMENT.isin(food_related)]

#we put all the description in a ingredients column
products_df['ingredients'] = products_df.COMMODITY_DESC + " " + products_df.SUB_COMMODITY_DESC
products_df.drop(["MANUFACTURER","DEPARTMENT","BRAND","COMMODITY_DESC","SUB_COMMODITY_DESC"],axis = 1, inplace = True)

In [75]:
def parse(str1): 
    """
    pars the string in a list of string (words) with all type of separators thx to regex
    """
    #matches any separator and any whitespace and transforms to mathc to lower case
    temp = list(filter(None,re.split("[\s;&@\/:,\*\.\(\)\{\}\\-%\"\'0-9]",str1)))
    #remove duplicate word, as there are many
    temp = list(dict.fromkeys(temp))
    temp = [i.lower() for i in temp if not i in STOP_WORDS]
    
    return temp

In [76]:
products_df.ingredients = products_df.ingredients.apply(parse)

## Downloaded food nutrients data

In [77]:
dfList = {}
for r, d, f in os.walk('../data/health'):
    for file in f:
        if '.csv' in file:
            #print(file)
            dfList[file] = pd.read_csv(os.path.join(r, file))
            
branded_food_df = dfList['branded_food.csv']

#link the nutrient id with its name
nutrient_df = dfList['nutrient.csv']

#contains the food articles name and their id
food_df = dfList['food.csv']

#contains the nutrients for each food article
food_nutrients_df = dfList['food_nutrient.csv']

# linke the food articles ids to their potential category
food_category_df = dfList['food_category.csv']

_We drop useless columns_

In [78]:
#drop unnecessary columns and rename to be more understandable
food_nutrients_df = food_nutrients_df.drop(["data_points","min","max","median","footnote","min_year_acquired","derivation_id"],axis=1)

nutrient_df = nutrient_df.drop(["nutrient_nbr","rank"],axis=1)

food_category_df.drop(["code"],axis=1,inplace=True)
food_category_df.rename(columns={'id':'food_category_id','description':'category'},inplace= True)

food_df.drop(["publication_date"],axis=1,inplace=True)

_Add the names of the nutrients to the nutrients per food_

In [79]:
food_nutrients_df = food_nutrients_df.join(nutrient_df.set_index('id'),on='nutrient_id',how='left')

#index the resulting table by multiindex: product id -> name of nutrients
food_nutrients_df = food_nutrients_df.set_index(pd.MultiIndex.from_frame(food_nutrients_df[['fdc_id','name']]))
#drop unnecessary columns 
food_nutrients_df = food_nutrients_df.drop(["id","fdc_id","nutrient_id","name"],axis=1)

_We add the food category to food_df_

In [80]:
food_df = food_df.join(food_category_df.set_index("food_category_id"),on="food_category_id",how="left")
food_df.drop(["food_category_id"],axis=1,inplace=True)

In [81]:
food_name = food_df.copy()
food_name.description = food_name.description.apply(parse)

_At this stage we have 3 dataframes for nutrition:_
- food_df = id of food articel vs food title (string)
- food_name_df = id of food vs parsed food title (list of string)
- food_nutrients_df = id of food article vs food nutrients

## Word Importance

_We filter the words according to their importance: that is, a word is more important as it apears many times in both datasets: (Ex: 'orange' is more important than 'artificial'). The words occuring in only one dataset are of no importance. The rest of the algorithm follows the following pipeline:_

In [82]:
SVG(filename='untitled2.svg')

FileNotFoundError: [Errno 2] No such file or directory: 'untitled2.svg'

In [83]:
def get_allwords(serie):
    """
    serie: serie containing lists of words
    return a dataframe containing
      - column name: name of the unique articles found in the lists of the serie
      - column count: how many times they appear in the serie
    """
    allwords = np.concatenate(serie.ravel())
    allwords = pd.Series(allwords)
    allwords = pd.DataFrame(allwords,columns= ["name"])
    allwords.reset_index(inplace = True)
    allwords.rename(columns = {'index':'number'},inplace = True)
    allwords = allwords.groupby('name').count().sort_values(by = 'number',ascending = False).head(50)
    return allwords.reset_index()

#all words present in the nutrition dataset
all_words_nut = get_allwords(food_name.description)

#all words present in the product dataset
all_words_art = get_allwords(products_df.ingredients)



#### TODO
Manual updates of STOPWORDS: _the idea would be to create a list of the words to update/ modify in the STOPWORDS list._

In [84]:
#to delete
to_delete = ["added","ns","made","eaten","type","all"]

In [92]:
#to transform
#SNKS/CKYS/CRKR/CNDY 	
to_transform = dict({"frzn":"frozen","refrgratd":"refrigerated","brkfst":"breakfast","whlsm":"wholesome"})

#### Inner join between the 2 sets of words:

_we check which words occur in both dataframes: only these words will have importance in determining the type of food article we are dealing with. Of course, if no words are known from the nutrition dataset, the sample is not taken into account._


In [93]:
common_words = pd.merge(all_words_art,all_words_nut,left_on = 'name',right_on = 'name')

In [94]:
common_words.head(100)

Unnamed: 0,name,number_x,number_y
0,premium,2423,4906
1,cheese,2373,17424
2,bread,1858,4688
3,fruit,1767,6148
4,milk,1687,9276
5,cookies,1641,6060
6,ice,1337,5892
7,juice,1314,6341
8,potato,1271,4225
9,beef,1262,6226


PROBLEM: the 2 dataframes don't share much words!
possible solutions:
- use another/more nutrition datset
- parse better the articles dataset (some words are badly parsed)

In [95]:
#Manual check to see which words occur in which dataset
print('chocolate' in all_words_nut.name.values)
print('chocolate' in all_words_art.name.values)

True
False


## Assemble them together (and pray your god)

In [96]:


def get_matches(test,food_list):
    """
    test = list of strings to test
    food_list: pandas dataframe linking the food article/id to the lists of words of its name
    return all the articles whose words contain all of the words of test
    """
    raise NotImplemented
    
def get_importance(word):
    """
    word: string for which we want to know the  importance
    return importance of word
    """
    raise NotImplemented
    
def find_food(test,food_list):
    """
    implementation of the graphic above
    test = list of strings to test
    food_list: pandas dataframe linking the food article/id to the lists of words of its name
    return the best article
    """
    if len(k) == 0:
        #give up the sample
        return 0 #dummy
    
    matches = get_matches(test,food_list)
    if len(matches) == 0:
        importance = [get_importance(i) for i in test]
        mino = np.min(importance)
        test = [i for i in test if i != mino]
        return find_food(test,food_list)
    elif len(matches) == 1:
        return matches[0]
    else:
        sizes = [len(i) for i in matches]
        minsize = np.min(sizes)
        minsizes = [i for i in matches if len(i) == minsize]
        if len(minsizes) == 1:
            return minsizes[0]
        else:
            importances = [np.sum([get_importance(j) for j in trial]) for trial in minsizes]
            armin_imp = np.argmin(importances)
            return importances[armin_imp]
                

def find_food_naive(test,food_list):
    """
    food_list: pandas dataframe linking the food article/id id to the lists of words of its name
    test: list of strings you want to have an id for
    return the corresponding food indx
    """
    #TODO: improve the non unique max
    scores = [get_score(test,i) for i in food_list.description]
    maxo = np.max(scores)
    if len([1 for x in scores if x == maxo]) > 1:
        print("Multiple maximums!")
    armax = np.argmax(scores)
    print('result: ',food_list.description[armax])
    return food_list.fdc_id[armax]

def get_score(test,trial):
    """
    test: the list of strings you're trying to classify
    trial: the list you want the score for
    return the score of matching
    """
    return np.sum([1 for i in test if i in trial])
        

In [97]:
test1 = ['seafood']
find_food_naive(test1,food_df)

Multiple maximums!
result:  Paella with seafood


338359

In [98]:
all_words_art.size

100

In [99]:
all_words_nut.size

100