In [1]:
import os, sys

# Absolute path of the directory above the notebook's working directory.
# It is put on the import path (presumably so the `food_database` package
# imported in the next cell can be found).
parent_dir = os.path.abspath('..')

# Re-running this cell in a live kernel must not add the same entry twice.
already_on_path = parent_dir in sys.path
if not already_on_path:
    sys.path.append(parent_dir)

In [2]:
# import sys
# from pathlib import Path # if you haven't already done so
# file = Path('..').resolve()
# sys.path.append(str(file.parents[1]))

from food_database.utils.utils import *
from food_database.food_df_join import *
from food_database.utils.parallel import *

import pandas as pd
import re
import dask as dd

from pathlib import Path

In [3]:
# Load the three feather tables used throughout the notebook.
expanded_ingredients_df = pd.read_feather(
    '../data/local/full/expanded_ingredients/6.feather',
    dtype_backend='pyarrow',
)
# Copy of the expanded table with every cell equal to 'na >' replaced by a null.
# NOTE(review): `expanded_` is not referenced by any later cell visible here —
# confirm whether the masked frame was meant to replace `expanded_ingredients_df`.
expanded_ = expanded_ingredients_df.mask(expanded_ingredients_df == 'na >')
food_df = pd.read_feather('../data/local/full/food/1.feather')
ingredients_df = pd.read_feather('../data/local/full/ingredients/8.feather')

### Food DF

In [4]:
# separating comma separated description into its own element
# One row per element of `description_list`; the food's index value is
# repeated for every element, and the single column is named 'description'.
description_parts = food_df['description_list'].explode()
exploded_food_df = description_parts.to_frame('description')
exploded_food_df.head()

Unnamed: 0_level_0,description
fdc_id,Unnamed: 1_level_1
167549,snack
167549,popcorn
167549,oil-popped
167549,microwave
167549,regular flavor


# Test Search

In [5]:
# Take one expanded ingredient row (position 18) as a test case and show
# only the fields that actually hold a value.
ingredient = expanded_ingredients_df.iloc[18]
ingredient.loc[ingredient.notna()]

name.name.nouns.0    milk
Name: (5, 3), dtype: string[pyarrow]

In [6]:
# Run the matcher on the single test ingredient, then show the food row it
# selected. `match_ingredient` comes from the star-imported food_database
# modules; its return value is used directly as a `food_df.loc` key.
matched_food_key = match_ingredient(ingredient, food_df, exploded_food_df)
food_df.loc[matched_food_key]

data_type                  survey_fndds_food
description                        Milk, NFS
density_exists                          True
description_length                         9
description_list                 [milk, nfs]
description_list_length                    2
default_word_count                         1
exclusion_word_count                       0
Name: 2340761, dtype: object

In [7]:
import dask.dataframe as dd

In [8]:
# Show the dtype of every column in the food table.
food_df.dtypes

data_type                  category
description                  object
density_exists                 bool
description_length            int64
description_list             object
description_list_length       int64
default_word_count            int64
exclusion_word_count          int64
dtype: object

In [9]:
# Preview the exploded food table as a dask DataFrame with 10 partitions.
# The result is only displayed; it is not bound to a name.
dd.from_pandas(
    exploded_food_df,
    name='exploded_food_df',
    npartitions=10,
)

Unnamed: 0_level_0,description
npartitions=10,Unnamed: 1_level_1
167549,string
168943,...
...,...
2345397,...
2346355,...


In [10]:
# Fixed-seed random sample of 100,000 expanded ingredient rows, so the
# matching experiment below works on the same rows on every run.
sampled_df = expanded_ingredients_df.sample(n=100_000, random_state=1337)
sampled_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name.nouns.5,name.name.nouns.4,name.name.nouns.3,name.name.nouns.2,name.name.nouns.1,name.name.nouns.0,name.name.others.0,name.name.others.1,name.name.others.2,name.name.others.3,...,name.description.nouns.3,name.description.nouns.2,name.description.nouns.1,name.description.nouns.0,name.description.others.0,name.description.others.1,name.description.others.2,name.description.others.3,name.description.others.4,name.description.others.5
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
97847,2,,,,,mix,biscuit,,,,,...,,,,,,,,,,
2143660,2,,,,,,salt,,,,,...,,,,,,,,,,
1178017,3,,,,,,celery,,,,,...,,,,,,,,,,
424939,3,,,,,,egg,,,,,...,,,,,,,,,,
1927364,3,,,,,vodka,fifth,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143130,2,,,,jam,raspberry,seedless,,,,,...,,,,,,,,,,
162356,0,,,,,cheese,cream,,,,,...,,,,pkg,,,,,,
1776407,6,,,,,,egg,,,,,...,,,,,,,,,,
1459987,16,,,,,,soda,baking,,,,...,,,,,,,,,,


In [11]:
# Apply `match_ingredient` to the sampled ingredient rows via the project's
# `parallel_apply` helper, passing the two food tables as extra arguments.
# NOTE(review): `meta` and `npartitions` look like dask-style arguments
# (declared int64 result, 100 partitions) — confirm against parallel_apply,
# whose definition is not visible in this notebook.
results = parallel_apply(
    sampled_df, 
    match_ingredient, 
    meta=pd.Series(dtype='int64'), 
    args=(food_df, exploded_food_df),
    npartitions=100
)



Running parallel apply on DF with size: Index                         13196
name.description.nouns.0     464789
name.description.nouns.1     446593
name.description.nouns.2     424154
name.description.nouns.3     417131
name.description.nouns.4     414258
name.description.nouns.5     413117
name.description.others.0    482442
name.description.others.1    422245
name.description.others.2    414718
name.description.others.3    412955
name.description.others.4    412679
name.description.others.5    412552
name.name.nouns.0            971907
name.name.nouns.1            544377
name.name.nouns.2            420212
name.name.nouns.3            412974
name.name.nouns.4            412538
name.name.nouns.5            412500
name.name.others.0           526278
name.name.others.1           418650
name.name.others.2           412645
name.name.others.3           412500
name.name.others.4           412500
name.name.others.5           412500
dtype: int64)


In [13]:
# Scale a per-sample figure of 3.5 up to the full table and divide by 60.
# NOTE(review): 3.5 is presumably the sample's run time in minutes, which
# would make the result an estimate in hours — confirm.
full_to_sample_ratio = len(expanded_ingredients_df) / len(sampled_df)
full_to_sample_ratio * 3.5 / 60

7.887706166666666

In [14]:
# Let pandas pick the best nullable dtype for the match results, so missing
# matches can be represented as <NA> alongside integer ids.
results = results.convert_dtypes()
results

recipe   ingredient
97847    2              174902
2143660  2              173468
1178017  3             2345298
424939   3              171287
1927364  3                <NA>
                        ...   
1143130  2             2345859
162356   0             2341163
1776407  6              171287
1459987  16             175040
799949   1              169599
Length: 100000, dtype: Int64

In [15]:
# Re-read the ingredient table and keep only the rows that were matched
# above, in the same order as `results`.
ingredients_df = (
    pd.read_feather('../data/local/full/ingredients/8.feather')
    .loc[results.index]
)
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,comment,quantity,unit,full_string,unit_tags,unit_remainders
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
97847,2,biscuit mix,biscuit mix,,0.5,cup,1/2 c. biscuit mix,[cup],[]
2143660,2,salt,salt,,2,teaspoons,2 teaspoons salt,[teaspoon],[]
1178017,3,celery,celery,minced,0.5,stalks,1/2 stalk celery (minced),[],[stalk]
424939,3,egg,egg,,2,,"2 eggs, beaten",[],[]
1927364,3,fifth vodka,fifth vodka,,1,,1 Fifth vodka,[],[]
...,...,...,...,...,...,...,...,...,...
1143130,2,seedless raspberry jam,seedless raspberry jam,,0.5,cups,1/2 cup seedless raspberry jam,[cup],[]
162356,0,cream cheese,pkg cream cheese,,1,,1 (8 oz.) pkg. cream cheese,[],[]
1776407,6,egg,egg,,2,large,2 large eggs,[whole],[]
1459987,16,baking soda,baking soda,,0.5,teaspoons,1/2 teaspoon baking soda,[teaspoon],[]


In [16]:
# Attach the match results as a `food_id` column (aligned on the index).
ingredients_df = ingredients_df.assign(food_id=results)

In [17]:
# Save the current ingredient table to a temporary feather file.
ingredients_df.to_feather('../data/local/tmp/food_join.feather')

In [18]:
# Look up each ingredient's food row through `food_id` and keep only the
# columns needed to compare the ingredient text with the matched description.
comparison_columns = ['name.name', 'name.description', 'comment', 'description']
joined_df = ingredients_df.join(food_df, on='food_id')[comparison_columns]
# NOTE(review): unlike the other outputs, this path has no `.feather` suffix.
joined_df.to_feather('../data/local/joined_df')

In [19]:
# Remove the row limit on DataFrame display so long previews render in full.
pd.options.display.max_rows = None

In [20]:
# Show the first 500 joined rows for manual inspection of the matches.
joined_df.head(500)

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,comment,description
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97847,2,biscuit mix,biscuit mix,,"Biscuits, plain or buttermilk, dry mix"
2143660,2,salt,salt,,"Salt, table"
1178017,3,celery,celery,minced,"Celery, raw"
424939,3,egg,egg,,"Egg, whole, raw, fresh"
1927364,3,fifth vodka,fifth vodka,,
1701239,3,green onion,green onion,,"Onions, green, raw"
1782258,7,salt,salt,,"Salt, table"
659826,0,chicken breast,boneless skinless chicken breast,,"Chicken breast, fried, coated, prepared skinle..."
858262,8,light cream,light cream,,"Cream, NS as to light, heavy, or half and half"
46382,3,egg,egg,,"Egg, whole, raw, fresh"


# Full Join First Look

In [4]:
food_join_path = Path('../data/local/full/food_ids')

# Chunk files are read by index (0.feather, 1.feather, ...), so the number
# of entries in the chunks directory gives the range of indices to load.
chunk_dir = food_join_path / 'chunks'
n_chunks = sum(1 for _ in chunk_dir.iterdir())

# Read every chunk first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration, which is
# quadratic in the total number of rows.
chunks = [pd.read_feather(str(chunk_dir / f"{i}.feather")) for i in range(n_chunks)]

# pd.concat raises on an empty list, so fall back to the empty frame the
# loop version produced when there are no chunks. The index of the result
# is taken directly from the chunks.
ingredient_food_ids = pd.concat(chunks, axis=0) if chunks else pd.DataFrame(dtype='int64')

ingredient_food_ids.shape

(13521782, 1)

In [5]:
# Give the single result column (labelled 0) its proper name.
ingredient_food_ids = ingredient_food_ids.rename(columns={0: 'food_id'})

In [6]:
# Persist the combined food ids as a single feather file.
ingredient_food_ids.to_feather('../data/local/full/food_ids/0_matched.feather')

## NA IDs

In [7]:
# Load the food ids from `1_na_filled.feather`.
# NOTE(review): this file is not written anywhere in this notebook —
# presumably it is produced by a separate NA-filling step; confirm.
food_ids = pd.read_feather('../data/local/full/food_ids/1_na_filled.feather')
food_ids

Unnamed: 0_level_0,Unnamed: 1_level_0,food_id
recipe,ingredient,Unnamed: 2_level_1
1,0,2341263.0
1,1,2341359.0
1,2,171175.0
1,3,171257.0
2,0,169217.0
...,...,...
2231141,6,172183.0
2231141,7,2345869.0
2231141,8,2341107.0
2231141,10,171825.0


In [9]:
# Number of ingredients without a food id in the combined chunk results.
ingredient_food_ids['food_id'].isna().sum()

159524

In [10]:
# Number of ingredients without a food id in the `1_na_filled` table.
food_ids['food_id'].isna().sum()

158662

In [13]:
# Share of rows in `food_ids` that have no food id, as a percentage.
n_missing = food_ids['food_id'].isna().sum()
n_missing / len(food_ids) * 100

1.1733808458086368

In [15]:
# Attach the loaded food ids to the ingredient table (aligned on the index).
# NOTE(review): `ingredients_df` is whatever an earlier cell left in the
# kernel (the full table or the sampled subset) — confirm which one is
# intended when running top to bottom.
ingredients_df['food_id'] = food_ids['food_id']

In [17]:
# First 40 ingredients that still have no food id.
missing_food_id = ingredients_df['food_id'].isna()
ingredients_df.loc[missing_food_id].head(40)

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,comment,quantity,unit,full_string,unit_tags,unit_remainders,food_id
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11,3,paraffin,paraffin,,,,paraffin,[],[],
75,0,bisquick,bisquick,,2,cup,2 c. Bisquick,[cup],[],
87,4,velveeta,velveeta,,1,lb,1 lb. Velveeta,[pound],[],
108,5,brandy,brandy,,0.25,cup,1/4 c. brandy,[cup],[],
168,0,bisquick,bisquick,,,,Bisquick,[],[],
215,0,veg-all,veg-all,,1,can,"1 can Veg-All, drained",[can],[],
261,2,bisquick,bisquick,,0.5,cup,1/2 c. Bisquick,[cup],[],
279,6,paraffin,paraffin,,,,paraffin,[],[],
310,2,crisco,crisco,,0.25,cup,1/4 c. Crisco,[cup],[],
320,3,crisco,crisco,,0.5,cup,1/2 c. Crisco,[cup],[],


In [12]:
# Load and display the synonyms table stored in `na_synonyms.feather`.
# NOTE(review): judging by the file name, these are synonyms for the
# unmatched (NA) ingredients — confirm with the step that writes the file.
na_synonyms_df = pd.read_feather('../data/local/full/food_ids/na_synonyms.feather')
na_synonyms_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name.nouns.5,name.name.nouns.4,name.name.nouns.3,name.name.nouns.2,name.name.nouns.1,name.name.nouns.0,name.name.others.0,name.name.others.1,name.name.others.2,name.name.others.3,...,name.description.nouns.1.5.0,name.description.nouns.1.6.0,name.description.nouns.0.0.0,name.description.nouns.0.1.0,name.description.nouns.0.2.0,name.description.nouns.0.3.0,name.description.others.0.0.0,name.description.others.0.1.0,name.description.others.1.0.0,name.description.others.1.1.0
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
11,3,[],[],[],[],[],[wax],[],[],[],[],...,,,,,,,,,,
21,6,[],[],[],[],[],[],[],[],[],[],...,,,herb,,,,,,,
75,0,[],[],[],[],[],[],[],[],[],[],...,,,,,,,,,,
87,4,[],[],[],[],[],"[Velveeta, cheese]",[],[],[],[],...,,,,,,,,,,
108,5,[],[],[],[],[],[liquor],[],[],[],[],...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166710,0,[],[],[],[],[],[berry],[],[],[],[],...,,,,,,,,,,
166732,2,[],[],[],[],[],[],[],[],[],[],...,,,,,,,,,,
166748,1,[],[],[],[],[],[],[],[],[],[],...,,,,,,,,,,
166749,3,[],[],[],[],[],[material],[],[],[],[],...,,,,,,,,,,


## Issues

Full match only for default words.

In [None]:
# Joined row for recipe 228767, ingredient 2.
joined_df.loc[(228767, 2)]

NaN ingredients

In [23]:
# Joined row for recipe 1656535, ingredient 0 ("strawberries", no match).
joined_df.loc[(1656535, 0)]

name.name           strawberries
name.description                
comment                         
description                  NaN
Name: (1656535, 0), dtype: object

In [27]:
# Joined row for recipe 1289653, ingredient 16 ("lime zest", no match).
joined_df.loc[(1289653, 16)]

name.name           lime zest
name.description             
comment                      
description               NaN
Name: (1289653, 16), dtype: object

In [39]:
# Joined row for recipe 1549997, ingredient 15 ("Vanilla", no match).
joined_df.loc[(1549997, 15)]

name.name           Vanilla
name.description           
comment                    
description             NaN
Name: (1549997, 15), dtype: object

In [41]:
# Joined row for recipe 2176199, ingredient 1 ("Water", no match).
joined_df.loc[(2176199, 1)]

name.name           Water
name.description         
comment                  
description           NaN
Name: (2176199, 1), dtype: object

In [47]:
# Joined row for recipe 2101587, ingredient 2 ("romano cheese", no match).
joined_df.loc[(2101587, 2)]

name.name           romano cheese
name.description                 
comment                          
description                   NaN
Name: (2101587, 2), dtype: object

In [48]:
# Joined row for recipe 2024512, ingredient 7 ("Parmigiano-Reggiano cheese", no match).
joined_df.loc[(2024512, 7)]

name.name           Parmigiano-Reggiano cheese
name.description                              
comment                                       
description                                NaN
Name: (2024512, 7), dtype: object

Basic ingredient defaults 

In [57]:
# Joined row for recipe 1399227, ingredient 8 (plain "Pasta" matched to a whole-grain pasta).
joined_df.loc[(1399227, 8)]

name.name                                                       Pasta
name.description                                               Penne 
comment                                or your favorite kind of pasta
description         Pasta, whole grain, 51% whole wheat, remaining...
Name: (1399227, 8), dtype: object

In [56]:
# Joined row for recipe 2072101, ingredient 7 ("coffee" matched to "Coffee, Cuban").
joined_df.loc[(2072101, 7)]

name.name                   coffee
name.description    strong brewed 
comment                           
description          Coffee, Cuban
Name: (2072101, 7), dtype: object

In [None]:
# Joined row for recipe 1406364, ingredient 5 ("flour" matched to sorghum flour).
joined_df.loc[(1406364, 5)]

name.name                                flour
name.description                              
comment                                       
description         Sorghum flour, whole-grain
Name: (1406364, 5), dtype: object

In [28]:
# Joined row for recipe 340141, ingredient 3 ("flour" matched to soy flour).
joined_df.loc[(340141, 3)]

name.name                              flour
name.description                            
comment                                     
description         Soy flour, full-fat, raw
Name: (340141, 3), dtype: object

In [32]:
# Joined row for recipe 200173, ingredient 0 (all-purpose "flour" matched to soy flour).
joined_df.loc[(200173, 0)]

name.name                              flour
name.description                all-purpose 
comment                                     
description         Soy flour, full-fat, raw
Name: (200173, 0), dtype: object

In [25]:
# Joined row for recipe 655648, ingredient 0 ("chicken" matched to a meatless product).
joined_df.loc[(655648, 0)]

name.name                          chicken
name.description                    pieces
comment                                   
description         Chicken, meatless, NFS
Name: (655648, 0), dtype: object

In [29]:
# Joined row for recipe 1026275, ingredient 4 ("bacon" matched to meatless bacon).
joined_df.loc[(1026275, 4)]

name.name                     bacon
name.description                   
comment                            
description         Bacon, meatless
Name: (1026275, 4), dtype: object

In [33]:
# Joined row for recipe 824600, ingredient 8 ("walnuts" matched to honey roasted walnuts).
joined_df.loc[(824600, 8)]

name.name                          walnuts
name.description                   broken 
comment                                   
description         Walnuts, honey roasted
Name: (824600, 8), dtype: object

In [34]:
# Joined row for recipe 813086, ingredient 1 ("Parmesan cheese" matched to a fat-free topping).
joined_df.loc[(813086, 1)]

name.name                             Parmesan cheese
name.description                                     
comment                                              
description         Parmesan cheese topping, fat free
Name: (813086, 1), dtype: object

In [49]:
# Joined row for recipe 1176998, ingredient 5 ("cooking oil" matched to a branded oil).
joined_df.loc[(1176998, 5)]

name.name                                               cooking oil
name.description                                                   
comment                                                (for frying)
description         Oil, cooking and salad, ENOVA, 80% diglycerides
Name: (1176998, 5), dtype: object

Spices

In [30]:
# Joined row for recipe 1323426, ingredient 14 ("cayenne pepper" matched to raw pepper).
joined_df.loc[(1323426, 14)]

name.name             cayenne pepper
name.description                    
comment                             
description         Pepper, raw, NFS
Name: (1323426, 14), dtype: object

Can deal with these misc ones

In [31]:
# Joined row for recipe 1531866, ingredient 11 ("milk chocolate chips" matched to a cookie).
joined_df.loc[(1531866, 11)]

name.name             milk chocolate chips
name.description                          
comment                                   
description         Cookie, chocolate chip
Name: (1531866, 11), dtype: object

Full word match only for NER match?

In [35]:
# Joined row for recipe 2015172, ingredient 5 ("salt" matched to salted sunflower seeds).
joined_df.loc[(2015172, 5)]

name.name                                     salt
name.description                       uned butter
comment                                           
description         Sunflower seeds, plain, salted
Name: (2015172, 5), dtype: object

Issue with word order swap (very wrong)

In [36]:
# Joined row for recipe 1317618, ingredient 1 ("tomato paste" matched to chicken liver pate).
joined_df.loc[(1317618, 1)]

name.name                          tomato paste
name.description                               
comment                                        
description         Pate, chicken liver, canned
Name: (1317618, 1), dtype: object

Very wrong

In [55]:
# Joined row for recipe 156076, ingredient 4 (Russian "dressing" matched to honey mustard dressing).
joined_df.loc[(156076, 4)]

name.name                                    dressing
name.description                             Russian 
comment                              (recipe follows)
description         Dressing, honey mustard, fat-free
Name: (156076, 4), dtype: object

In [54]:
# Joined row for recipe 85307, ingredient 1 ("Bisquick" matched to a beverage).
joined_df.loc[(85307, 1)]

name.name                                                    Bisquick
name.description                                                     
comment                                                              
description         Beverages, The COCA-COLA company, Hi-C Flashin...
Name: (85307, 1), dtype: object

In [53]:
# Joined row for recipe 2006319, ingredient 4 ("blueberries" matched to a fruit yogurt).
joined_df.loc[(2006319, 4)]

name.name                                                 blueberries
name.description                                                     
comment                                                              
description         Yogurt, fruit, low fat, 10 grams protein per 8...
Name: (2006319, 4), dtype: object

In [51]:
# Joined row for recipe 1730165, ingredient 7 ("italian seasoning" matched to taco seasoning mix).
joined_df.loc[(1730165, 7)]

name.name                            italian seasoning
name.description                                      
comment                                               
description         Seasoning mix, dry, taco, original
Name: (1730165, 7), dtype: object

In [52]:
# Joined row for recipe 1554582, ingredient 10 ("Chives" matched to a cereal).
joined_df.loc[(1554582, 10)]

name.name                                                      Chives
name.description                                                     
comment                                                              
description         Cereals, CREAM OF WHEAT, 2 1/2 minute cook tim...
Name: (1554582, 10), dtype: object

In [40]:
# Joined row for recipe 2063189, ingredient 3 ("Ketchup" matched to a fish).
joined_df.loc[(2063189, 3)]

name.name                   Ketchup
name.description                   
comment                            
description         Fish, scup, raw
Name: (2063189, 3), dtype: object

In [45]:
# Joined row for recipe 1840312, ingredient 1 ("Onion" matched to low fat milk).
joined_df.loc[(1840312, 1)]

name.name                        Onion
name.description                      
comment                               
description         Milk, low fat (1%)
Name: (1840312, 1), dtype: object

In [46]:
# Joined row for recipe 706307, ingredient 5 ("vanilla" matched to vanilla soymilk).
joined_df.loc[(706307, 5)]

name.name                         vanilla
name.description                         
comment                                  
description         SILK Vanilla, soymilk
Name: (706307, 5), dtype: object

Seasoning unfiltered

In [37]:
# Joined row for recipe 1435100, ingredient 5 ("freshly ground black pepper").
joined_df.loc[(1435100, 5)]

name.name           freshly ground black pepper
name.description                               
comment                                        
description               Spices, pepper, black
Name: (1435100, 5), dtype: object

Fuzzy Matching

In [38]:
# Joined row for recipe 1162952, ingredient 1 ("lean ground beef" matched to round steak).
joined_df.loc[(1162952, 1)]

name.name             lean ground beef
name.description                      
comment                               
description         Beef, steak, round
Name: (1162952, 1), dtype: object

Missed specifics

In [42]:
# Joined row for recipe 1624505, ingredient 5 ("chia seeds" matched to sesame seeds).
joined_df.loc[(1624505, 5)]

name.name                                  chia seeds
name.description                                     
comment                                              
description         Seeds, sesame seeds, whole, dried
Name: (1624505, 5), dtype: object

In [43]:
# Joined row for recipe 1747196, ingredient 2 ("rice powder" matched to curry powder).
joined_df.loc[(1747196, 2)]

name.name                    rice powder
name.description                        
comment                     (misu gallu)
description         Spices, curry powder
Name: (1747196, 2), dtype: object

Slightly wrong

In [44]:
# Joined row for recipe 902667, ingredient 4 ("vanilla" extract matched to imitation extract).
joined_df.loc[(902667, 4)]

name.name                                       vanilla
name.description                                extract
comment                                                
description         Vanilla extract, imitation, alcohol
Name: (902667, 4), dtype: object

### Questions

- Are there tests to find wrong matches?
- Is there a way of making this manual process easier?
    - Designating a certain time of day
    - Excel sheet with tick columns / Copy/paste into JSON
- 