## USDA food databases (foundation food database)

+ Data are available in CSV and JSON formats. The current JSON files do not provide the desired format.
+ JSON will be the chosen file format, since we'll work with MongoDB for faster API responses.
+ We will, however, use the CSVs as the base to rebuild a relevant JSON format across all 3 DBs.

### We might have access to market products (Barcodes)

---
### Desired format (v1) 

```json

{
    "foundation": [
        {
            "_id" : "r00000000001",
            "type" : 2 , #define food type (1-food, 2-branded , 3-recipes)
            "title": "Banana com aveia e mel",
            "ingredients" : #optional
                {
                    "f0000000001": {
                        "amount" : 100,
                            "unit" : "g",
                            "name" : "Can be food"},
                    "f0000000002": 
                        {"amount" : 50,
                        "unit" : "g",
                        "name": "Could also be a branded product"},
                    "f0000000003": 
                        {"amount" : 10,
                        "unit" : "g",
                        "name" : "Even another recipe- like home made cheese"}                       
                },
            "nutrients" : 
                {
                    "f0000000001": {
                        "amount" : 100,
                         "unit" : "g",
                         "name" : "carbs"},
                    "f0000000002": 
                        {"amount" : 50,
                        "unit" : "kcal",
                        "name" : "fat"},
                    "f0000000003": 
                        {"amount" : 10,
                        "unit" : "g"}                       
                }
        }
    ]
}

```

In [46]:
import pandas as pd


In [47]:
# USDA foundation database: read the selected CSV extracts and support tables.

# Food records; every missing value is replaced with 0 at load time.
food = pd.read_csv(r"./foundation/food.csv").fillna(0)

# Nutrient measurements recorded per food.
nutrients = pd.read_csv(r"./foundation/food_nutrient.csv")

# Nutrient lookup table, with its generic column names given a nutrient_ prefix.
nutrient_detail = pd.read_csv(r"./support/nutrient.csv").rename(
    columns={
        'id': 'nutrient_id',
        'name': 'nutrient_name',
        'unit_name': 'nutrient_unit',
    }
)

# Food category lookup table, with its generic column names made category-specific.
categories = pd.read_csv(r"./support/food_category.csv").rename(
    columns={
        'id': 'food_category_id',
        'code': 'category_code',
        'description': 'category_description',
    }
)

# Portion table; its `modifier` column is exposed as `portion`.
portion = pd.read_csv(r"./foundation/food_portion.csv").rename(
    columns={'modifier': 'portion'}
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [48]:
# There is a lot of what seems to be duplicated data.
# This could be a sample-analysis DB holding info collected from different tests.
# Let's leave all so-called duplicates for now and decide on an aggregation metric later.

In [49]:
# Attach the nutrient measurements (columns fdc_id through amount) to each food.
food = (
    food
    .merge(nutrients.loc[:, 'fdc_id':'amount'], on='fdc_id', how='left')
    .rename(columns={'amount': 'nutrient_amount'})
)

# Keep only the first row per (data type, description, category, nutrient).
food = food.drop_duplicates(
    subset=['data_type', 'description', 'food_category_id', 'nutrient_id'],
    keep='first',
)

# Add nutrient name and unit. Clashing right-hand columns get a _DROP suffix
# and are then filtered out by the regex.
food = (
    food
    .merge(
        nutrient_detail[['nutrient_id', 'nutrient_name', 'nutrient_unit']],
        on='nutrient_id',
        how='left',
        suffixes=('', '_DROP'),
    )
    .filter(regex='^(?!.*_DROP)')
    # .astype({'nutrient_id':'int32'})
)

# Add the category code and description.
food = (
    food
    .merge(categories, on='food_category_id', how='left', suffixes=('', '_DROP'))
    .filter(regex='^(?!.*_DROP)')
)

# Add the portion label and its gram weight.
food = (
    food
    .merge(
        portion[['fdc_id', 'portion', 'gram_weight']],
        on='fdc_id',
        how='left',
        suffixes=('', '_DROP'),
    )
    .filter(regex='^(?!.*_DROP)')
)

In [50]:
# Preview the first five rows of the merged frame.
food.head(5)

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date,nutrient_id,nutrient_amount,nutrient_name,nutrient_unit,category_code,category_description,portion,gram_weight
0,319874,sample_food,"HUMMUS, SABRA CLASSIC",16.0,2019-04-01,,,,,1600.0,Legumes and Legume Products,,
1,319875,market_acquisition,"HUMMUS, SABRA CLASSIC",16.0,2019-04-01,,,,,1600.0,Legumes and Legume Products,,35.8
2,319877,sub_sample_food,Hummus,16.0,2019-04-01,1051.0,56.3,Water,G,1600.0,Legumes and Legume Products,,
3,319877,sub_sample_food,Hummus,16.0,2019-04-01,1002.0,1.28,Nitrogen,G,1600.0,Legumes and Legume Products,,
4,319877,sub_sample_food,Hummus,16.0,2019-04-01,1004.0,19.0,Total lipid (fat),G,1600.0,Legumes and Legume Products,,


In [51]:
# Observed problems:
#   - market acquisition rows miss nutrition information
#   - sample foods come without gram_weight
#
# Keep only rows that have both a nutrient and a portion, drop the data_type
# column, and cast the id/code columns to integers.
mask = food['nutrient_id'].notna() & food['portion'].notna()
food = (
    food.loc[mask]
    .drop(columns='data_type')
    .astype(
        {
            'nutrient_id': 'int',
            'food_category_id': 'int',
            'category_code': 'int',
        }
    )
)

food.head(5)

Unnamed: 0,fdc_id,description,food_category_id,publication_date,nutrient_id,nutrient_amount,nutrient_name,nutrient_unit,category_code,category_description,portion,gram_weight
1588,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1014,0.0,Maltose,G,1100,Vegetables and Vegetable Products,drained,129.0
1589,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1051,93.6,Water,G,1100,Vegetables and Vegetable Products,drained,129.0
1590,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1002,0.17,Nitrogen,G,1100,Vegetables and Vegetable Products,drained,129.0
1591,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1095,0.19,"Zinc, Zn",MG,1100,Vegetables and Vegetable Products,drained,129.0
1592,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1011,0.65,Glucose,G,1100,Vegetables and Vegetable Products,drained,129.0


In [52]:
#prepare grouping for nested json output
# make it better? Abstract / Faster

def inlist(df, aggcol:list):
    """Collapse `aggcol` into per-group lists, grouping by every other column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to collapse; all columns not named in `aggcol` become group keys.
    aggcol : list
        Names of the columns to aggregate into lists.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the group-key columns. Each
        aggregated column holds a list. Items of id columns stay as bare
        values; items of every other aggregated column are wrapped as
        single-key dicts ``{column_name: value}``.
    """
    groupbycol = [col for col in df.columns if col not in aggcol]

    # A column counts as an id only when "id" is a whole underscore-separated
    # token of its name (e.g. "nutrient_id"). A plain substring test would also
    # match names such as "width" or "liquid" and leave them unwrapped.
    groupid = {col for col in aggcol if 'id' in str(col).split('_')}

    df = df.groupby(groupbycol).agg({col: list for col in aggcol}).reset_index()

    # Add the column name as a key to each aggregated item.
    for col in aggcol:
        if col not in groupid:
            df[col] = df[col].apply(lambda items: [{col: item} for item in items])
    return df

# Nutrient columns to fold into per-group lists; every other column is a group key.
cols = [
    'nutrient_id',
    'nutrient_amount',
    'nutrient_name',
    'nutrient_unit',
]
# cols = ['fdc_id', 'description','category_description', 'category_code','food_category_id','publication_date']
grupo = inlist(df=food, aggcol=cols)

In [53]:
from itertools import chain

def izipit(x, head=False):
    """Combine the list-valued entries of one row positionally.

    Parameters
    ----------
    x : pandas.Series
        A row whose values are iterables; expected to be equal-length lists
        (zip stops at the shortest).
    head : bool, default False
        If True, the row is zipped into (key, value) pairs and returned as a
        dict, so with two entries the first supplies keys and the second values.
        If False, each entry is a list of dicts, and the dicts found at the
        same position are merged into one dict.

    Returns
    -------
    dict or list of dict
        A dict when `head` is True, otherwise one merged dict per position.
    """
    columns = x[x.keys().unique()]
    if head:
        return dict(zip(*columns))
    return [
        {key: value for fragment in fragments for key, value in fragment.items()}
        for fragments in zip(*columns)
    ]

# Merge the per-nutrient attribute lists into one list of dicts per row.
cols = ['nutrient_amount', 'nutrient_name', 'nutrient_unit']
grupo['nutrients'] = grupo[cols].apply(izipit, axis=1)
grupo = grupo.drop(columns=cols)

# Key each nutrient dict by its nutrient id, then drop the id list itself.
grupo['nutrients'] = grupo[['nutrient_id', 'nutrients']].apply(izipit, axis=1, head=True)
grupo = grupo.drop(columns='nutrient_id')

In [54]:
# Preview the nested result; each row carries a `nutrients` dict keyed by nutrient id.
grupo.head(3)

Unnamed: 0,fdc_id,description,food_category_id,publication_date,category_code,category_description,portion,gram_weight,nutrients
0,321611,"Beans, snap, green, canned, regular pack, drai...",11,2019-04-01,1100,Vegetables and Vegetable Products,drained,129.0,"{1014: {'nutrient_amount': 0.0, 'nutrient_name..."
1,321900,"Broccoli, raw",11,2019-04-01,1100,Vegetables and Vegetable Products,chopped,76.0,"{1128: {'nutrient_amount': 0.01, 'nutrient_nam..."
2,323294,"Nuts, almonds, dry roasted, with salt added",12,2019-04-01,1200,Nut and Seed Products,whole,135.0,"{1005: {'nutrient_amount': 16.2, 'nutrient_nam..."


In [56]:
# Export to JSON: tag every record with its source, then write one object per row.
grupo = grupo.assign(source='USDA')
grupo.to_json('foundation.json', orient="records", indent=2)
#TODO - add tables to sql db for future OLAP enrichment

In [57]:
# Release the frames built above so they no longer occupy kernel memory.
del food, nutrient_detail, nutrients, categories, grupo

In [None]:
# remove = foundation['description'].str.contains('^([a-z A-Z]+-\snfy\w+)').fillna(False)
# foundation = foundation[~remove]
