# Introduction

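This notebook produces the final per-ingredient information for each recipe in the dataset: the gram weight of every ingredient and its share of the recipe's total weight.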

Now that we have the densities of all ingredients, we can use them to get each ingredient's concentration in its recipe. This is done by first obtaining the weight of each ingredient, and then using these weights to compute each ingredient's weight ratio within the full recipe.

# Setup

In [1]:
#|default_exp density.finalise

In [2]:
#| export
from pyprojroot import here
root = here()
import sys
sys.path.append(str(root))

In [3]:
#| export
import pandas as pd
import numpy as np

import json

In [4]:
from tqdm import tqdm
tqdm.pandas()

In [5]:
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
# Load the parsed recipe ingredients, the food / food-portion lookup tables, and the
# per-ingredient food_id / food_portion_id matches that are joined onto the ingredients in the next cell.
# NOTE(review): these paths are relative to the working directory — confirm the notebook is run from its own folder.
ingredients_df = pd.read_feather('../../data/local/recipe/partial/ingredients/0.feather')
food_df = pd.read_feather('../../data/local/density/full/food/0.feather')
food_portion_df = pd.read_feather('../../data/local/density/full/food_portion/0.feather')
food_ids = pd.read_feather('../../data/local/density/partial/food_ids/0.feather')
food_portion_ids = pd.read_feather('../../data/local/density/partial/food_portion_ids/0.feather')

In [7]:
# Attach the matched food and food-portion ids to every ingredient row
# (DataFrame.join aligns the frames on their index).
ingredients_df = ingredients_df.join(food_ids).join(food_portion_ids)
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type,food_id,food_portion_id
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1746116,0,butter,land lake butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened",[cup],[],volume,2345703.0,287267.0
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar,[cup],[],volume,2345817.0,287772.0
1746116,2,egg,land lake egg,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only),[],[],portion,171287.0,88378.0
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla,[teaspoon],[],volume,172236.0,90134.0
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour,[cup],[],volume,169761.0,85466.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
931097,9,red onion,red onion,0.25,cups,,finely chopped,"1/4 cup red onion, finely chopped",[cup],[],volume,2345315.0,286030.0
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper,[cup],[],volume,2345321.0,286059.0
931097,11,jasmine rice,jasmine rice,1.0,cup,,,1 cup Jasmine rice,[cup],[],volume,169756.0,85461.0
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth,[cup],[],volume,2341341.0,270046.0


In [8]:
# Fill missing entries with pd.NA so they are represented uniformly
# (the sample row printed further down shows <NA> for its empty text fields).
ingredients_df = ingredients_df.fillna(pd.NA)

In [9]:
#| export
# Unit conversion table, keyed by unit type ('volume', 'weight', 'portion') and then by unit name.
# The encoding is pinned so the JSON is read identically regardless of the platform's default locale.
with open(f'{root}/config/unit_conversions.json', encoding='utf-8') as f:
    unit_list = json.load(f)

# Weight Calculation

Using this information we want to calculate the gram weight of each ingredient.

In [10]:
unit_list

{'volume': {'milliliter': {'matches': ['ml'], 'conversion': 1.0},
  'cup': {'matches': ['c'], 'conversion': 236.588},
  'quart': {'matches': ['qrt', 'q'], 'conversion': 946.353},
  'gallon': {'matches': [], 'conversion': 3785.41},
  'pint': {'matches': ['p'], 'conversion': 473},
  'fluid_ounce': {'matches': ['floz', 'fl oz'], 'conversion': 30},
  'tablespoon': {'matches': ['tbsp', 'tbsps'], 'conversion': 15},
  'teaspoon': {'matches': ['tsp', 'tsps'], 'conversion': 5},
  'can': {'matches': ['tin'], 'conversion': 450},
  'cubic_inch': {'matches': ['in^3, inch^3'], 'conversion': 16.3871},
  'pinch': {'matches': [], 'conversion': 0.3},
  'dash': {'matches': [], 'conversion': 0.6}},
 'weight': {'gram': {'matches': ['g', 'gr'], 'conversion': 1.0},
  'pound': {'matches': ['lb'], 'conversion': 453.592},
  'killogram': {'matches': ['kg'], 'conversion': 1000},
  'ounce': {'matches': ['oz'], 'conversion': 28.3495}},
 'portion': {'whole': {'matches': ['full',
    'large',
    'medium',
    'small

In [11]:
ingredient = ingredients_df.iloc[0]
ingredient

name.name                                           butter
name.description                          land lake butter
quantity                                             0.667
unit                                                  cups
comment                                               <NA>
preparation                                       softened
ingredient_string    2/3 cup Land O Lakes Butter, softened
unit_tags                                            [cup]
unit_remainders                                         []
unit_type                                           volume
food_id                                          2345703.0
food_portion_id                                   287267.0
Name: (1746116, 0), dtype: object

The ingredient's weight is calculated according to the ingredient's `unit_type`, i.e. weight, volume, or portion.

In [12]:
ingredients_df

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type,food_id,food_portion_id
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1746116,0,butter,land lake butter,0.667,cups,,softened,"2/3 cup Land O Lakes Butter, softened",[cup],[],volume,2345703.0,287267.0
1746116,1,sugar,sugar,0.5,cups,,,1/2 cup sugar,[cup],[],volume,2345817.0,287772.0
1746116,2,egg,land lake egg,2.0,,(yolks only),,2 Land O Lakes Eggs (yolks only),[],[],portion,171287.0,88378.0
1746116,3,vanilla,vanilla,1.0,teaspoon,,,1 teaspoon vanilla,[teaspoon],[],volume,172236.0,90134.0
1746116,4,flour,all-purpose flour,1.5,cups,,,1 1/2 cups all-purpose flour,[cup],[],volume,169761.0,85466.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
931097,9,red onion,red onion,0.25,cups,,finely chopped,"1/4 cup red onion, finely chopped",[cup],[],volume,2345315.0,286030.0
931097,10,red bell pepper,red bell pepper,0.25,cups,,chopped,1/4 cup chopped red bell pepper,[cup],[],volume,2345321.0,286059.0
931097,11,jasmine rice,jasmine rice,1.0,cup,,,1 cup Jasmine rice,[cup],[],volume,169756.0,85461.0
931097,12,chicken,reduced-sodium chicken broth,1.5,cups,,,1 1/2 cups reduced-sodium chicken broth,[cup],[],volume,2341341.0,270046.0


In [13]:
#| export
def _first_known_unit(unit_tags, known_units):
    """Return the first tag in `unit_tags` that is a key of `known_units`, or None if there is none."""
    if unit_tags is None:
        return None
    return next((tag for tag in unit_tags if tag in known_units), None)


def get_gram_weight(ingredient, food_portion_df, unit_list):
    """Convert one ingredient row into a weight in grams.

    Parameters
    ----------
    ingredient : mapping or pandas.Series
        Must provide 'unit_type', 'unit_tags', 'quantity', 'food_id' and
        'food_portion_id'.
    food_portion_df : pandas.DataFrame
        Portion table whose rows are addressed by the pair
        (food_id, food_portion_id) and which provides the columns 'unit_type',
        'unit_tags', 'gram_weight', 'amount' and 'portion_amount'.
    unit_list : dict
        Conversion table with 'weight' and 'volume' sections, each mapping a
        unit name to a dict holding a 'conversion' factor.

    Returns
    -------
    float or pandas.NA
        The gram weight, or ``pd.NA`` when it cannot be derived: the food or
        portion id is missing for a non-weight ingredient, no recognised unit
        is present in the relevant 'unit_tags', or a volume ingredient was
        matched to a portion that is not measured by volume.
    """
    unit_type = ingredient['unit_type']

    # Weight units convert directly; no food/portion lookup is needed.
    if unit_type == 'weight':
        weight_unit = _first_known_unit(ingredient['unit_tags'], unit_list['weight'])
        if weight_unit is None:
            # No recognised weight unit among the tags: unresolvable rather than a crash.
            return pd.NA
        return unit_list['weight'][weight_unit]['conversion'] * ingredient['quantity']

    # Everything else needs a matched portion to supply a gram weight.
    if pd.isnull(ingredient['food_id']) or pd.isnull(ingredient['food_portion_id']):
        return pd.NA

    # Explicit (row_key), : form so the pair is always read as a MultiIndex row key.
    portion = food_portion_df.loc[(ingredient['food_id'], ingredient['food_portion_id']), :]

    if unit_type != 'volume':
        # Whole/portion measurements: the portion's gram weight scales with the quantity.
        return portion['gram_weight'] * ingredient['quantity']

    ingredient_volume_unit = _first_known_unit(ingredient['unit_tags'], unit_list['volume'])
    if ingredient_volume_unit is None:
        return pd.NA

    if portion['unit_type'] != 'volume':
        # Volume measurement not given -> must be portion (set to NA for now).
        return pd.NA

    portion_volume_unit = _first_known_unit(portion['unit_tags'], unit_list['volume'])
    if portion_volume_unit is None:
        return pd.NA

    portion_unit_conversion = unit_list['volume'][portion_volume_unit]['conversion']
    if pd.isnull(portion['portion_amount']):
        # Simple case: the portion's volume is just amount * unit conversion.
        portion_volume = portion['amount'] * portion_unit_conversion
    else:
        # TODO: portion_amount could default to 1 or be folded into amount when the dataframe is built.
        portion_volume = portion['amount'] * portion['portion_amount'] * portion_unit_conversion

    # Density of the food as grams per converted volume unit, then scale to the requested volume.
    density = portion['gram_weight'] / portion_volume
    return unit_list['volume'][ingredient_volume_unit]['conversion'] * density * ingredient['quantity']

In [14]:
assert get_gram_weight(ingredient, food_portion_df, unit_list) > 100

In [15]:
ingredients_df.iloc[2]

name.name                                         egg
name.description                        land lake egg
quantity                                          2.0
unit                                             <NA>
comment                                  (yolks only)
preparation                                      <NA>
ingredient_string    2 Land O Lakes Eggs (yolks only)
unit_tags                                          []
unit_remainders                                    []
unit_type                                     portion
food_id                                      171287.0
food_portion_id                               88378.0
Name: (1746116, 2), dtype: object

In [16]:
# Compute the gram weight row by row; progress_apply is the progress-bar variant of
# DataFrame.apply that tqdm.pandas() registered in the setup cells.
ingredients_df['gram_weight'] = ingredients_df.progress_apply(get_gram_weight, axis=1, args=(food_portion_df, unit_list,))

  0%|                                                                                                                                                                                                                                                            | 0/2450 [00:00<?, ?it/s]

 63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 1544/2450 [00:00<00:00, 15436.52it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2450/2450 [00:00<00:00, 15559.25it/s]




In [17]:
assert ingredients_df[(ingredients_df['quantity'] != 0) & (ingredients_df['gram_weight'] == 0)].empty

In [18]:
ingredients_df[ingredients_df['gram_weight'].isna() & ingredients_df['quantity'].notna() & ingredients_df['food_portion_id'].notna()].join(food_portion_df['description'], on=['food_id', 'food_portion_id'])

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type,food_id,food_portion_id,gram_weight,description
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
574304,0,brandy,brandy,0.333,cups,,,1/3 cup brandy,[cup],[],volume,171907.0,89543.0,,bottle
1781719,3,light molasses,light molasses,1.0,teaspoon,,,1 teaspoon light molasses,[teaspoon],[],volume,2343424.0,278155.0,,medium
828558,4,health valley,health valley spicy,1.0,can,,,1 (15 oz.) can Health Valley spicy,[can],[],volume,2343412.0,278110.0,,cookie
1379076,5,liquor,peach flavored liquor,2.5,tablespoons,,,2-3 tablespoons Peach Flavored liquor,[tablespoon],[],volume,171907.0,89543.0,,bottle
165596,5,dark molasses,dark molasses,0.667,cups,,,2/3 cup dark molasses,[cup],[],volume,2343424.0,278155.0,,medium


In [19]:
food_portion_df.loc[2343424.0]

Unnamed: 0_level_0,seq_num,amount,gram_weight,description,unit_tags,unit_remainders,unit_type,portion_amount,portion_unit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
278153,1,1.0,5.0,miniature/bite size,[],"[miniature, bite, size]",portion,,
278154,2,1.0,20.0,small,[whole],[],portion,,
278155,3,1.0,30.0,medium,[whole],[],portion,,
278156,4,1.0,45.0,large,[whole],[],portion,,
278157,5,1.0,20.0,little debbie cookie,[],"[little, debbie, cookie]",portion,,


-> This occurs when the matched portion options don't carry the same `unit_type` as the ingredient, i.e. the ingredient requires a volume measure but the matched food only has portion entries.

In [20]:
food_df.loc[171287]

data_type                            sr_legacy_food
description                  Egg, whole, raw, fresh
description_list           [egg, whole, raw, fresh]
description_length                               22
description_list_length                           4
default_word_count                                3
exclusion_word_count                              0
volume_exists                                  True
portion_exists                                 True
Name: 171287, dtype: object

In [21]:
food_portion_df[food_portion_df.index.get_level_values(1) == 88378.0]

Unnamed: 0_level_0,Unnamed: 1_level_0,seq_num,amount,gram_weight,description,unit_tags,unit_remainders,unit_type,portion_amount,portion_unit
fdc_id,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
171287,88378,6,1.0,44.0,medium,[whole],[],portion,,


#### Handling NA's

In [22]:
# Impute missing gram weights per recipe: each NA is replaced by the mean of the
# weights in the same recipe, and the column is cast to the nullable 'double[pyarrow]' dtype.
# The mean is taken on the group before the cast (x.mean()), so the order matters here.
# Recipes without any known weight have nothing to average; the next cell lists the rows that are still NA.
ingredients_df['gram_weight'] = ingredients_df['gram_weight'].groupby('recipe').transform(lambda x: x.astype('double[pyarrow]').fillna(x.mean()))

In [23]:
ingredients_df[ingredients_df['gram_weight'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type,food_id,food_portion_id,gram_weight
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
824709,0,bread,bread,,,,,bread,[],[],portion,2343328.0,277706.0,
824709,1,ground cinnamon,ground cinnamon,,,,,ground cinnamon,[],[],portion,167940.0,82252.0,
824709,2,sugar,sugar,,,,,sugar,[],[],portion,2345817.0,287776.0,
824709,3,margarine,squeeze margarine,,,,,squeeze margarine,[],[],portion,171018.0,87784.0,


In [24]:
# Drop the rows whose weight could not be computed or imputed.
# The explicit copy makes the result an independent frame, so the column added later
# ('weight_ratio') is written to it directly instead of to a slice of the unfiltered frame.
ingredients_df = ingredients_df[ingredients_df['gram_weight'].notna()].copy()

# Weight Concentration

Ultimately we'll want to measure the concentration of ingredients in each recipe.

In [25]:
ingredient = ingredients_df.iloc[0]
ingredient

name.name                                           butter
name.description                          land lake butter
quantity                                             0.667
unit                                                  cups
comment                                               <NA>
preparation                                       softened
ingredient_string    2/3 cup Land O Lakes Butter, softened
unit_tags                                            [cup]
unit_remainders                                         []
unit_type                                           volume
food_id                                          2345703.0
food_portion_id                                   287267.0
gram_weight                                        149.408
Name: (1746116, 0), dtype: object

In [26]:
# Total gram weight of each recipe, broadcast back onto every ingredient row.
# transform('sum') returns a series with exactly the same (recipe, ingredient) index as the
# numerator, so the division does not depend on implicit MultiIndex level alignment.
recipe_total_weight = ingredients_df.groupby('recipe')['gram_weight'].transform('sum')
weight_ratios = ingredients_df['gram_weight'] / recipe_total_weight
weight_ratios

recipe   ingredient
1746116  0             0.072355
         1             0.048428
         2             0.042616
         3             0.002034
         4             0.090802
                         ...   
931097   9             0.021104
         10            0.019785
         11            0.097605
         12            0.106838
         13            0.001398
Name: gram_weight, Length: 2446, dtype: double[pyarrow]

In [27]:
ingredients_df['weight_ratio'] = weight_ratios

### Evaluating

In [28]:
ingredients_df['weight_ratio'].groupby('recipe').sum()

recipe
222        1.0
1703       1.0
8981       1.0
13596      1.0
17929      1.0
          ... 
2185822    1.0
2189489    1.0
2195357    1.0
2196831    1.0
2201833    1.0
Name: weight_ratio, Length: 301, dtype: double[pyarrow]

In [29]:
# Sanity check: the weight ratios of every recipe must add up to 1 (within floating-point tolerance).
# A descriptive name is used instead of `_`, which IPython reserves for the last cell output.
recipe_ratio_totals = ingredients_df['weight_ratio'].groupby('recipe').sum().astype('float')
assert np.isclose(recipe_ratio_totals, 1.0).all(), 'weight ratios do not sum to 1 for every recipe'

In [30]:
ingredients_df[ingredients_df['weight_ratio'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,name.name,name.description,quantity,unit,comment,preparation,ingredient_string,unit_tags,unit_remainders,unit_type,food_id,food_portion_id,gram_weight,weight_ratio
recipe,ingredient,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


# Saving

In [31]:
# Persist only the weight-ratio column (as a one-column frame that keeps the ingredient index).
ingredients_df[['weight_ratio']].to_feather('../../data/local/density/partial/weights/0.feather')

In [32]:
from nbdev import nbdev_export; nbdev_export()