## USDA food databases (foundation - Survey - Branded)

+ Data are available in CSV and JSON formats. The current JSON files do not provide the desired format.
+ JSON will be the chosen file format since we'll work with MongoDB for faster API responses.
+ We will, however, use the CSVs as the base to rebuild a relevant JSON format across all 3 DBs.

---
### Desired format (v1) 

```json

{
    "foundation": [
        {
            "_id" : "r00000000001",
            "type" : 2 , #define food type (1-food, 2-branded , 3-recipes)
            "title": "Banana com aveia e mel",
            "ingredients" : #optional
                {
                    "f0000000001": {
                        "amount" : 100,
                            "unit" : "g",
                            "name" : "Can be food"},
                    "f0000000002": 
                        {"amount" : 50,
                        "unit" : "g",
                        "name": "Could also be a branded product"},
                    "f0000000003": 
                        {"amount" : 10,
                        "unit" : "g",
                        "name" : "Even another recipe- like home made cheese"}                       
                },
            "nutrients" : 
                {
                    "f0000000001": {
                        "amount" : 100,
                         "unit" : "g",
                         "name" : "carbs"},
                    "f0000000002": 
                        {"amount" : 50,
                        "unit" : "kcal",
                        "name" : "fat"},
                    "f0000000003": 
                        {"amount" : 10,
                        "unit" : "g"}                       
                }
        }
    ]
}

```

In [1]:
import pandas as pd

In [2]:
# Load the USDA legacy tables: food entries, per-food nutrient amounts,
# and the nutrient lookup table.

food = pd.read_csv(r"./legacy/food_update_log_entry.csv")
nutrients = pd.read_csv(r"./legacy/food_nutrient.csv")
nutrient_detail = pd.read_csv(r"./legacy/support/nutrient.csv")

food.head(5)

Unnamed: 0,id,description,last_updated
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",2019-04-01
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",2019-04-01
3,167515,"George Weston Bakeries, Thomas English Muffins",2019-04-01
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",2019-04-01


In [3]:
# Attach the nutrient measurements to each food. Only the `fdc_id`..`amount`
# column slice of `nutrients` is joined; the outer join keeps unmatched rows
# from both sides. The join key `fdc_id` duplicates `id`, so it is dropped.
food = (
    food
    .merge(nutrients.loc[:, 'fdc_id':'amount'], left_on='id', right_on='fdc_id', how='outer')
    .drop(columns='fdc_id')
    .rename(columns={'amount': 'nutrient_amount'})
)

# Look up each nutrient's name and unit. The lookup table's own `id` column
# collides with the food `id`, so it receives the `_DROP` suffix and is then
# filtered out by the negative-lookahead regex.
food = (
    food
    .merge(
        nutrient_detail[['id', 'name', 'unit_name']],
        left_on='nutrient_id',
        right_on='id',
        how='left',
        suffixes=('', '_DROP'),
    )
    .filter(regex='^(?!.*_DROP)')
    .rename(columns={'name': 'nutrient_name', 'unit_name': 'nutrient_unit'})
)

food.head(10)

Unnamed: 0,id,description,last_updated,nutrient_id,nutrient_amount,nutrient_name,nutrient_unit
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1003,5.88,Protein,G
1,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1007,3.5,Ash,G
2,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1062,1286.0,Energy,kJ
3,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1079,1.2,"Fiber, total dietary",G
4,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1089,2.12,"Iron, Fe",MG
5,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1093,1059.0,"Sodium, Na",MG
6,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1253,0.0,Cholesterol,MG
7,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1257,4.412,"Fatty acids, total trans",G
8,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1258,2.941,"Fatty acids, total saturated",G
9,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,1004,13.24,Total lipid (fat),G


In [4]:
# Show the column labels of `food` (here: after the nutrient merges above).
food.columns

Index(['id', 'description', 'last_updated', 'nutrient_id', 'nutrient_amount',
       'nutrient_name', 'nutrient_unit'],
      dtype='object')

In [5]:
# Prepare the grouping for the nested JSON output.
# TODO: make this better -- more abstract / faster.

groupcol = [
    'id',
    'description',
    'last_updated',
]

def inlist(df, groupbycol:list):
    """Collapse `df` to one row per `groupbycol` key, gathering other columns into lists.

    Every column not listed in `groupbycol` is aggregated into a list holding
    that group's values. For aggregated columns whose name does not contain the
    substring 'id', each list item is additionally wrapped in a one-entry dict
    ``{column_name: value}``; columns whose name contains 'id' keep plain values.

    Parameters
    ----------
    df : DataFrame
        Long-format table (one row per key/value combination). Not modified.
    groupbycol : list
        Column labels that identify a group.

    Returns
    -------
    DataFrame
        One row per distinct key, with the key columns first, followed by the
        aggregated list columns.
    """
    # Columns to aggregate: everything that is not part of the grouping key.
    aggregtcol = [col for col in df.columns if col not in groupbycol]
    # Substring match on 'id': these columns are left as plain lists of values.
    groupid = [col for col in aggregtcol if 'id' in col]

    # Group by the caller-supplied key columns (not a notebook-level global),
    # so the function behaves the same for any key list passed in.
    grouped = (
        df.groupby(groupbycol)
        .agg({col: list for col in aggregtcol})
        .reset_index()
    )

    # Tag each aggregated item with its column name, e.g. 5.88 -> {'amount': 5.88}.
    for col in aggregtcol:
        if col not in groupid:
            # `key=col` binds the current column name to this lambda explicitly.
            grouped[col] = grouped[col].apply(
                lambda values, key=col: [{key: v} for v in values]
            )
    return grouped


# Build one row per food, with the remaining columns collapsed into lists.
cols = ['id', 'description', 'last_updated']
grupo = inlist(df=food, groupbycol=cols)

In [6]:
# Display the grouped frame returned by `inlist`.
grupo

Unnamed: 0,id,description,last_updated,nutrient_id,nutrient_amount,nutrient_name,nutrient_unit
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",2019-04-01,"[1003, 1007, 1062, 1079, 1089, 1093, 1253, 125...","[{'nutrient_amount': 5.88}, {'nutrient_amount'...","[{'nutrient_name': 'Protein'}, {'nutrient_name...","[{'nutrient_unit': 'G'}, {'nutrient_unit': 'G'..."
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",2019-04-01,"[1003, 1007, 1062, 1079, 1089, 1093, 1104, 110...","[{'nutrient_amount': 4.34}, {'nutrient_amount'...","[{'nutrient_name': 'Protein'}, {'nutrient_name...","[{'nutrient_unit': 'G'}, {'nutrient_unit': 'G'..."
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",2019-04-01,"[1062, 1093, 1004, 1005, 1008, 1051, 1003, 1007]","[{'nutrient_amount': 1577.0}, {'nutrient_amoun...","[{'nutrient_name': 'Energy'}, {'nutrient_name'...","[{'nutrient_unit': 'kJ'}, {'nutrient_unit': 'M..."
3,167515,"George Weston Bakeries, Thomas English Muffins",2019-04-01,"[1105, 1007, 1051, 1062, 1089, 1093, 1104, 110...","[{'nutrient_amount': 0.0}, {'nutrient_amount':...","[{'nutrient_name': 'Retinol'}, {'nutrient_name...","[{'nutrient_unit': 'UG'}, {'nutrient_unit': 'G..."
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",2019-04-01,"[1258, 1104, 1106, 1293, 1190, 1166, 1177, 117...","[{'nutrient_amount': 1.898}, {'nutrient_amount...","[{'nutrient_name': 'Fatty acids, total saturat...","[{'nutrient_unit': 'G'}, {'nutrient_unit': 'IU..."
...,...,...,...,...,...,...,...
7788,175300,"Game meat, buffalo, water, cooked, roasted",2019-04-01,"[1062, 1186, 1190, 1004, 1005, 1008, 1051, 108...","[{'nutrient_amount': 548.0}, {'nutrient_amount...","[{'nutrient_name': 'Energy'}, {'nutrient_name'...","[{'nutrient_unit': 'kJ'}, {'nutrient_unit': 'U..."
7789,175301,"Game meat, elk, raw",2019-04-01,"[1062, 1222, 1224, 1225, 1226, 1253, 1258, 126...","[{'nutrient_amount': 464.0}, {'nutrient_amount...","[{'nutrient_name': 'Energy'}, {'nutrient_name'...","[{'nutrient_unit': 'kJ'}, {'nutrient_unit': 'G..."
7790,175302,"Game meat, elk, cooked, roasted",2019-04-01,"[1062, 1004, 1005, 1008, 1051, 1087, 1092, 109...","[{'nutrient_amount': 611.0}, {'nutrient_amount...","[{'nutrient_name': 'Energy'}, {'nutrient_name'...","[{'nutrient_unit': 'kJ'}, {'nutrient_unit': 'G..."
7791,175303,"Game meat, goat, raw",2019-04-01,"[1062, 1186, 1190, 1003, 1007, 1079, 1089, 109...","[{'nutrient_amount': 456.0}, {'nutrient_amount...","[{'nutrient_name': 'Energy'}, {'nutrient_name'...","[{'nutrient_unit': 'kJ'}, {'nutrient_unit': 'U..."


In [7]:
from itertools import chain

def izipit(x):
    """Merge the parallel lists stored in one row into a list of combined dicts.

    `x` is a single row (a Series) in which every cell holds a list of dicts.
    The lists are walked in lockstep and the dicts found at the same position
    are merged into one dict, so N parallel lists of length M become one list
    of M dicts. Positions beyond the shortest list are ignored (`zip`).
    """
    fields = x.keys().unique()
    merged = []
    # zip(*...) transposes the cells: one tuple per position, holding the dict
    # found at that position in every field.
    for parts in zip(*x[fields]):
        # dict.items(part) (rather than part.items()) raises TypeError for
        # entries that are not dicts.
        record = {key: value for part in parts for key, value in dict.items(part)}
        merged.append(record)
    return merged

# Zip the three per-nutrient list columns into a single `nutrients` column
# (one dict per nutrient), then discard the now-redundant source columns.
# `grupo` is rebound to a new frame instead of being mutated with
# `inplace=True`, so the previously built frame object is left unmodified.
cols = ['nutrient_amount', 'nutrient_name', 'nutrient_unit']
grupo = grupo.assign(nutrients=grupo[cols].apply(izipit, axis=1)).drop(columns=cols)


In [8]:
# Show the `nutrients` entry stored under index label 0.
grupo['nutrients'][0]

[{'nutrient_amount': 5.88, 'nutrient_name': 'Protein', 'nutrient_unit': 'G'},
 {'nutrient_amount': 3.5, 'nutrient_name': 'Ash', 'nutrient_unit': 'G'},
 {'nutrient_amount': 1286.0, 'nutrient_name': 'Energy', 'nutrient_unit': 'kJ'},
 {'nutrient_amount': 1.2,
  'nutrient_name': 'Fiber, total dietary',
  'nutrient_unit': 'G'},
 {'nutrient_amount': 2.12, 'nutrient_name': 'Iron, Fe', 'nutrient_unit': 'MG'},
 {'nutrient_amount': 1059.0,
  'nutrient_name': 'Sodium, Na',
  'nutrient_unit': 'MG'},
 {'nutrient_amount': 0.0,
  'nutrient_name': 'Cholesterol',
  'nutrient_unit': 'MG'},
 {'nutrient_amount': 4.412,
  'nutrient_name': 'Fatty acids, total trans',
  'nutrient_unit': 'G'},
 {'nutrient_amount': 2.941,
  'nutrient_name': 'Fatty acids, total saturated',
  'nutrient_unit': 'G'},
 {'nutrient_amount': 13.24,
  'nutrient_name': 'Total lipid (fat)',
  'nutrient_unit': 'G'},
 {'nutrient_amount': 41.18,
  'nutrient_name': 'Carbohydrate, by difference',
  'nutrient_unit': 'G'},
 {'nutrient_amount': 