# Prepare Data for food and nutrients

## Imports and Data

In [530]:
import pandas as pd
import numpy as np

import sqlite3
import os

plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

The data is from this website: https://fdc.nal.usda.gov/download-datasets

In [188]:
food_df = pd.read_csv('../data/FoodData/food.csv')

In [87]:
food_df.shape

(78026, 5)

In [60]:
food_df.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,"HUMMUS, SABRA CLASSIC",16.0,2019-04-01
1,319875,market_acquisition,"HUMMUS, SABRA CLASSIC",16.0,2019-04-01
2,319876,market_acquisition,"HUMMUS, SABRA CLASSIC",16.0,2019-04-01
3,319877,sub_sample_food,Hummus,16.0,2019-04-01
4,319878,sub_sample_food,Hummus,16.0,2019-04-01


# Let's clean the food.csv dataset

To get rid of the duplicated rows, first we must normalize the names so that we can see if names are duplicated. To start, lets make it all lowercase.

In [189]:
import re

def normalize_description(s):
    if pd.isna(s):
        return s

    # lowercase
    s = s.lower()

    # keep text before dash
    s = s.split('-')[0]

    # remove punctuation
    s = re.sub(r'[^\w\s]', '', s)

    # normalize whitespace
    s = re.sub(r'\s+', ' ', s).strip()

    return s

In [190]:
food_df_normal = food_df.copy()

food_df_normal['description'] = food_df['description'].apply(normalize_description)

In [112]:
food_df_normal.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319875,market_acquisition,hummus sabra classic,16.0,2019-04-01
2,319876,market_acquisition,hummus sabra classic,16.0,2019-04-01
3,319877,sub_sample_food,hummus,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01


### Remove duplicated rows

We have a lot of duplicated rows. The majority of rows are duplicated.

In [191]:
food_df_normal.loc[food_df_normal['description'].duplicated()]

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
1,319875,market_acquisition,hummus sabra classic,16.0,2019-04-01
2,319876,market_acquisition,hummus sabra classic,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01
5,319879,sample_food,hummus sabra classic,16.0,2019-04-01
6,319880,market_acquisition,hummus sabra classic,16.0,2019-04-01
...,...,...,...,...,...
78021,2751499,sub_sample_food,shallots bulb peeled root removed raw,,2025-07-03
78022,2751500,sub_sample_food,shallots bulb peeled root removed raw,,2025-07-03
78023,2751501,sub_sample_food,shallots bulb peeled root removed raw,,2025-07-03
78024,2751502,sub_sample_food,shallots bulb peeled root removed raw,,2025-07-03


Ok, so now we need to remove the duplicated rows, but we need to be strategic about which to throw away. My initial attempt:  
<code> food_df_clean = food_df_normal.drop_duplicates(subset=['description'], keep='last') </code>  
Worked, however this issue was that a lot of the nutritional info I wanted was not provided.  

To deal with this, I will first search thru the duplicated rows to see which one provides the most nutritional info and keep that one. 

In [192]:
# Let's make a dataframe of just duplicated data

food_df_duplicated_total = food_df_normal.loc[food_df_normal['description'].duplicated()]

In [225]:
# Now we will make a dataframe of one entry from each item that was duplicated. 
# The duplicated item will later be replaced by the one with the most nutrients. 

# Find all duplicated descriptions
duplicated_names = food_df_normal[food_df_normal.duplicated(subset=['description'], keep=False)]

# Keep only one entry per duplicated description (first occurrence for now)
food_df_duplicated = duplicated_names.drop_duplicates(subset=['description'], keep='first')

In [57]:
food_df_duplicated_total.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
1,319875,market_acquisition,hummus sabra classic,16.0,2019-04-01
2,319876,market_acquisition,hummus sabra classic,16.0,2019-04-01
4,319878,sub_sample_food,hummus,16.0,2019-04-01
5,319879,sample_food,hummus sabra classic,16.0,2019-04-01
6,319880,market_acquisition,hummus sabra classic,16.0,2019-04-01


In [69]:
food_df_duplicated_total.shape

(70536, 5)

In [67]:
food_df_duplicated.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
3,319877,sub_sample_food,hummus,16.0,2019-04-01
20,319894,sample_food,hummus other,16.0,2019-04-01
84,319958,sample_food,hummus tribe classic,16.0,2019-04-01
150,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


In [70]:
food_df_duplicated.shape

(1417, 5)

Ok, now we have 2 dataframes, food_df_duplicated_total has every duplicated row, and food_df_duplicated has one row for each duplicated row.  

Now, for each row in food_df_duplicated, we will search through the rowsn in food_df_duplicated_total to find the fdc_id that has the most nutritional information and replace that entry in food_df_duplicated.

In [194]:
# To do this, we first need nutrient_df. Let's import it. 

nutrient_df = pd.read_csv("FoodData/nutrient.csv")

In [17]:
nutrient_df.shape

(477, 5)

In [18]:
nutrient_df.head()

Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
0,2047,Energy (Atwater General Factors),KCAL,957.0,280.0
1,2048,Energy (Atwater Specific Factors),KCAL,958.0,290.0
2,1001,Solids,G,201.0,200.0
3,1002,Nitrogen,G,202.0,500.0
4,1003,Protein,G,203.0,600.0


In [195]:
# We also need food_nutrient_df to convert the values.

food_nutrient_df = pd.read_csv("FoodData/food_nutrient.csv", usecols=[0,1,2,3])

In [27]:
food_nutrient_df.shape

(159285, 4)

In [28]:
food_nutrient_df.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,2201847,319877,1051,56.3
1,2201845,319877,1002,1.28
2,2201846,319877,1004,19.0
3,2201844,319877,1007,1.98
4,2201852,319878,1091,188.0


### Now that we have our dataframes, lets go over the logic

lets look up the nutrient info of the first entry in <code>food_df_duplicated</code> to get a feel for how to do this.

In [196]:
# First find the descritption and fdc_id of the fist item.

d0 = food_df_duplicated.loc[0, 'description']
print(d0)

fdc0 = food_df_duplicated.loc[0, 'fdc_id']
print(fdc0)

hummus sabra classic
319874


In [197]:
# Now we use that fdc_id to get all nutrient id's associated with it
# The dataframe is empty, meaning this humman sabra classic has no nutritional information 

fn0 = food_nutrient_df[food_nutrient_df['fdc_id']==fdc0] 
fn0.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount


In [198]:
# Let's check for another item to see what we get if there is nutritional info

d3 = food_df_duplicated.loc[3, 'description']
print(d3)

fdc3 = food_df_duplicated.loc[3, 'fdc_id']
print(fdc3)

hummus
319877


In [171]:
# Now we use that fdc_id to get all nutrient id's associated with it

fn3 = food_nutrient_df[food_nutrient_df['fdc_id']==fdc3] 
fn3.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,2201847,319877,1051,56.3
1,2201845,319877,1002,1.28
2,2201846,319877,1004,19.0
3,2201844,319877,1007,1.98


When we have this dataframe, we can check if it lists the desired nutritient_id's, which are listed below. 

In [199]:
# These are the nutrients we want to show for each food and recipe

nutri = {'calories': 1008, 'protein_g': 1003, 'carbs_g': 1005, 'fat_g': 1004, 'fiber_g': 1079, 'sodium_mg': 1093}
nutri_iter = ['calories', 'protein_g', 'carbs_g', 'fat_g', 'fiber_g', 'sodium_mg']

In [200]:
present_nutrients = set(fn3['nutrient_id'])

In [201]:
score = sum(nid in present_nutrients for nid in nutri.values())
print("Nutrient completeness score:", score)

Nutrient completeness score: 1


Now we want to do this for every duplicated item until we get a score of 6, or keep the highest score if none have a score of 6.

In [202]:
duplicated_hummas = food_df_duplicated_total[food_df_duplicated_total['description']==d3]
duplicated_hummas.shape

(87, 5)

In [143]:
duplicated_hummas.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
4,319878,sub_sample_food,hummus,16.0,2019-04-01
8,319882,sub_sample_food,hummus,16.0,2019-04-01
9,319883,sub_sample_food,hummus,16.0,2019-04-01
10,319884,sub_sample_food,hummus,16.0,2019-04-01
18,319892,sub_sample_food,hummus,16.0,2019-04-01


In [203]:
# Let's make this a functionso we can better understand the data

def find_nutrients(df):
    max_nutri = 0
    best_fdc_id = None
    i = 0
    nutri_id_to_name = {v: k for k, v in nutri.items()}
    
    while max_nutri < 6 and i < df.shape[0]:
        fdc = df.iloc[i]['fdc_id']
    
        # Get nutrients for this fdc_id
        fn = food_nutrient_df[food_nutrient_df['fdc_id'] == fdc]
    
        present_nutrients = set(fn['nutrient_id'])
        core_score = sum(nid in present_nutrients for nid in nutri.values())
        total_score = len(present_nutrients)
    
        present_nutrients = sorted(set(fn['nutrient_id']))
    
        name = ""
        
        for nid in present_nutrients:
            curr_name = nutri_id_to_name.get(nid, str(nid))
            name += f"{curr_name}, "

        # Get nutrients for this fdc_id
        fn = food_nutrient_df[food_nutrient_df['fdc_id'] == fdc]
        nutrient_dict = dict(zip(fn['nutrient_id'], fn['amount']))
        # Gives a dictionary of {nutrient_id: amount, ...}
        
        print(f"Row {i} | fdc_id={fdc} | core_score={core_score} | total_score={total_score} | nutrients: {name}")

        # print(f"Row {i} | fdc_id={fdc} | core_score={core_score} | total_score={total_score} | nutrients: amounts - {nutrient_dict}")
    
        # Update best score
        if score > max_nutri:
            max_nutri = score
            best_fdc_id = fdc
    
        # Early exit if perfect match
        if score == 6:
            break
    
        i += 1

In [27]:
find_nutrients(duplicated_hummas)

Row 0 | fdc_id=319878 | core_score=1 | total_score=9 | nutrients: 1087, 1089, 1090, 1091, 1092, sodium_mg, 1095, 1098, 1101, 
Row 1 | fdc_id=319882 | core_score=1 | total_score=4 | nutrients: 1002, fat_g, 1007, 1051, 
Row 2 | fdc_id=319883 | core_score=0 | total_score=1 | nutrients: 1170, 
Row 3 | fdc_id=319884 | core_score=1 | total_score=9 | nutrients: 1087, 1089, 1090, 1091, 1092, sodium_mg, 1095, 1098, 1101, 
Row 4 | fdc_id=319892 | core_score=1 | total_score=4 | nutrients: 1002, fat_g, 1007, 1051, 
Row 5 | fdc_id=319893 | core_score=1 | total_score=9 | nutrients: 1087, 1089, 1090, 1091, 1092, sodium_mg, 1095, 1098, 1101, 
Row 6 | fdc_id=319899 | core_score=1 | total_score=4 | nutrients: 1002, fat_g, 1007, 1051, 
Row 7 | fdc_id=319900 | core_score=1 | total_score=9 | nutrients: 1087, 1089, 1090, 1091, 1092, sodium_mg, 1095, 1098, 1101, 
Row 8 | fdc_id=319906 | core_score=0 | total_score=42 | nutrients: 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1271, 1272, 1273, 1276, 12

### Next Strategy

Ok, so now it looks like there are a lot of different nutrients present. What I will try to do now is I will make a new id for each item in food_df_duplicated that will point to a new food_nutrient_df that will then use the id from food_df_duplicated to point to each nutrient. This way each food will point to as many nutrients as possible.

Next, I want the id's in food_df_duplicated to be in ascending order from 0

In [229]:
food_df_duplicated_d = food_df_duplicated.reset_index(drop=True)

# food_df_duplicated_d = food_df_duplicated_d.rename(columns={"fdc_id": "fdc_id_old"})
# food_df_duplicated_d.insert(0, "fdc_id", range(len(food_df_duplicated_d)))

Ok, so food_df_duplicated is now the food dataframe we are using, so let's name it as such.

In [230]:
food_df_new = food_df_duplicated_d

In [231]:
food_df_new.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319877,sub_sample_food,hummus,16.0,2019-04-01
2,319894,sample_food,hummus other,16.0,2019-04-01
3,319958,sample_food,hummus tribe classic,16.0,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


Our nutrients df will of course stay the same. The last df to worry about is food_nutrients. We will make a new food_nutrients_df_true that we will add rows that will contain as much nutrient info as possible. 

The fdc_id in food_df_new will correspond to the food in food_nutrient_df_new

In [207]:
# This function will return a dictionary of nutrient to amount

def get_nutrients(df):
    nutrient_dictionary = {} 
    N = df.shape[0]
    
    for i in range (N):
        # find the fdc_id of the current item 
        fdc = df.iloc[i]['fdc_id']
    
        # Get nutrients for this fdc_id
        fn = food_nutrient_df[food_nutrient_df['fdc_id'] == fdc]
        nutrient_dict = dict(zip(fn['nutrient_id'], fn['amount']))
        # Gives a dictionary of {nutrient_id: amount, ...}

        # Now add this dictionary to nutrient_dictionary
        for nutrient_id, amount in nutrient_dict.items():
            if nutrient_id not in nutrient_dictionary:
                nutrient_dictionary[nutrient_id] = [amount]
            else:
                nutrient_dictionary[nutrient_id].append(amount)

    nutrient_dictionary_ret = {}
    for nutrient_id, amounts in nutrient_dictionary.items():
        avr = round(sum(amounts) / len(amounts), 4)
        nutrient_dictionary_ret[nutrient_id] = avr
    
    # print(f"nutrient_dictionary: {nutrient_dictionary}\n\nnutrient_dictionary_ret: {nutrient_dictionary_ret}\n\n")
    return nutrient_dictionary_ret

In [30]:
hummas_nutrients = get_nutrients(duplicated_hummas)
print(hummas_nutrients)

{1091: 166.0909, 1101: 1.0591, 1092: 289.2727, 1087: 40.7273, 1093: 438.3636, 1090: 71.1091, 1089: 2.4109, 1098: 0.3478, 1095: 1.3791, 1051: 58.92, 1007: 1.968, 1002: 1.166, 1004: 16.95, 1170: 0.3175, 1264: 0.0088, 1301: 0.0272, 1335: 0.0, 1266: 0.6335, 1272: 0.0, 1260: 0.0, 1411: 0.0, 1299: 0.0035, 1278: 0.0, 1311: 0.0025, 1271: 0.0048, 1259: 0.0, 1265: 1.4117, 1303: 0.0, 1276: 0.0, 1273: 0.0443, 1280: 0.0, 1304: 0.006, 1414: 0.0, 1333: 0.0, 1315: 6.2517, 2012: 0.0838, 1404: 0.6367, 1267: 0.0787, 1305: 0.0, 1334: 0.0, 1313: 0.0047, 2014: 0.0013, 1262: 0.0, 1300: 0.0103, 1263: 0.0, 1261: 0.0, 1306: 0.012, 1314: 0.0208, 1406: 0.0, 2019: 0.0, 1321: 0.0195, 1316: 6.805, 2009: 0.0, 1323: 0.0073, 1312: 0.0048, 1405: 0.0, 1009: 8.1167, 1177: 36.3333, 1103: 16.15, 1166: 0.115, 1167: 0.9483, 1079: 5.45, 1165: 0.15, 1175: 0.1433, 1123: 258.0, 1107: 12.0, 1120: 3.0, 1122: 0.0, 1108: 0.0, 1127: 1.3017, 1125: 0.305, 1126: 9.4667, 1109: 1.745, 1128: 0.0, 1129: 0.0, 1130: 0.0, 1131: 0.0, 1013: 0.0, 

In [232]:
def update_fn(i, df):
    # i is going to be the row id in food_df_new
    # We will get the name of the item in food_df_new[i, 'description']
    # Next we will go thru 

    fdc_i = food_df_new.loc[i, 'fdc_id']
    desc_i = food_df_new.loc[i, 'description']
    # This is an array of every item that shares a name 
    duplicated_df = food_df_duplicated_total[food_df_duplicated_total['description']==desc_i]
    # Gives a dictionary of {nutrient_id: amount}
    working_nutrients = get_nutrients(duplicated_df)

    # Next up we want to update food_nutrient_df_new to have a row for each item.

    j = 0
    for nutrient_id_i, amount in working_nutrients.items():
        new_row = pd.DataFrame([{'id': i*1000+j, 'fdc_id': fdc_i, 'nutrient_id': nutrient_id_i, 'amount': amount}])
        df = pd.concat([df, new_row], ignore_index=True)
        j+=1
    
    return df

In [233]:
def create_fn():
    df = food_nutrient_df.iloc[0:0].copy()
    
    num = len(food_df_new)
    
    for i in range(num):
        df = update_fn(i, df)

    return df

In [234]:
food_nutrient_df_new = create_fn()
print("Done")

Done


In [235]:
food_nutrient_df_new.shape

(18471, 4)

In [236]:
food_nutrient_df_new.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


### Save the dataframes to files

We want to save <code>food_df_new</code> and <code>food_nutrient_df_new</code>.

In [241]:
food_df_new.to_csv("FoodData/food_new.csv", index=False)

In [242]:
food_nutrient_df_new.to_csv("FoodData/food_nutrient_new.csv", index=False)

After looking at the files, I've noticed that some entries have very few or no nutrients associated, these may be foods that have few entries, I could possibly get rid of them since they may confuse the system but like thats a later consideration for now its ok.

### Initial Attempt

This was the first thing I tried, we can ignore it.

In [88]:
food_df_clean = food_df_normal.drop_duplicates(subset=['description'], keep='last')

In [94]:
food_df_clean.head(30)

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
83,319957,sub_sample_food,hummus sabra classic,16.0,2019-04-01
130,320004,sub_sample_food,hummus tribe classic,16.0,2019-04-01
148,320022,sub_sample_food,hummus,16.0,2019-04-01
149,320023,sub_sample_food,hummus other,16.0,2019-04-01
474,320349,sub_sample_food,milk 2,1.0,2019-04-01
476,320351,sub_sample_food,milk 2 wave 22e,1.0,2019-04-01
477,320353,sample_food,beef eye of round roaststeak lean only raw ani...,13.0,2019-04-01
480,320356,sub_sample_food,proximates beef eye of round roaststeak lean o...,13.0,2019-04-01
481,320357,sample_food,beef eye of round roaststeak select raw comp13...,13.0,2019-04-01
492,320368,market_acquisition,beef eye of round roast raw er37,13.0,2019-04-01


In [95]:
food_df_clean.shape

(7490, 5)

In [113]:
# save the cleaned df to a csv file
food_df_clean.to_csv("FoodData/food_clean.csv", index=False)

# Create Database

Now that we have the tables that we want to work with, we can make a database file using SQLight to help us reference the information.

In [243]:
conn = sqlite3.connect("food_data.db")
cur = conn.cursor()

### Create dataframes for the tables we want to use

In [244]:
food_df = pd.read_csv("FoodData/food_new.csv")

In [245]:
nutrient_df = pd.read_csv("FoodData/nutrient.csv")

nutrient_df = nutrient_df.rename(columns={"id": "nutrient_id"})

In [246]:
food_nutrient_df = pd.read_csv("FoodData/food_nutrient_new.csv", usecols=[0,1,2,3])

food_nutrient_df = food_nutrient_df.rename(columns={"id": "fdc_nutrient_id"})

In [247]:
print("Food table:", food_df.shape)
print("Nutrient table:", nutrient_df.shape)
print("Food-Nutrient table:", food_nutrient_df.shape)

Food table: (1417, 5)
Nutrient table: (477, 5)
Food-Nutrient table: (18471, 4)


### Let's add the food table to our database

In [61]:
food_df.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319877,sub_sample_food,hummus,16.0,2019-04-01
2,319894,sample_food,hummus other,16.0,2019-04-01
3,319958,sample_food,hummus tribe classic,16.0,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


In [249]:
# conn.execute("DROP TABLE food;")

In [250]:
conn.execute("""
CREATE TABLE food (
    fdc_id INTEGER PRIMARY KEY,
    data_type TEXT,
    description TEXT,
    food_category_id INTEGER,
    publication_date TEXT
);
""")

<sqlite3.Cursor at 0x29213c0c740>

In [251]:
food_df.to_sql("food", conn, if_exists="append", index=False)

1417

In [252]:
pd.read_sql("SELECT * FROM food LIMIT 5;", conn)

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16,2019-04-01
1,319877,sub_sample_food,hummus,16,2019-04-01
2,319894,sample_food,hummus other,16,2019-04-01
3,319958,sample_food,hummus tribe classic,16,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1,2019-04-01


### Let's add the nutrient table to our database

In [253]:
nutrient_df.head()

Unnamed: 0,nutrient_id,name,unit_name,nutrient_nbr,rank
0,2047,Energy (Atwater General Factors),KCAL,957.0,280.0
1,2048,Energy (Atwater Specific Factors),KCAL,958.0,290.0
2,1001,Solids,G,201.0,200.0
3,1002,Nitrogen,G,202.0,500.0
4,1003,Protein,G,203.0,600.0


In [255]:
# conn.execute("DROP TABLE nutrient")

In [256]:
conn.execute("""
CREATE TABLE nutrient (
    nutrient_id INTEGER PRIMARY KEY,
    name TEXT,
    unit_name TEXT,
    nutrient_nbr INTEGER,
    rank INTEGER
);
""")

<sqlite3.Cursor at 0x29213c62040>

In [257]:
nutrient_df.to_sql("nutrient", conn, if_exists="append", index=False)

477

In [258]:
pd.read_sql("SELECT * FROM nutrient LIMIT 5;", conn)

Unnamed: 0,nutrient_id,name,unit_name,nutrient_nbr,rank
0,1001,Solids,G,201,200
1,1002,Nitrogen,G,202,500
2,1003,Protein,G,203,600
3,1004,Total lipid (fat),G,204,800
4,1005,"Carbohydrate, by difference",G,205,1110


### Finally we will add the food_nutrient table to our dataframe

In [259]:
food_nutrient_df.head()

Unnamed: 0,fdc_nutrient_id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


In [264]:
# conn.execute("DROP TABLE food_nutrient;")

<sqlite3.Cursor at 0x29213d715c0>

In [265]:
conn.execute("""
CREATE TABLE food_nutrient (
    fdc_nutrient_id INTEGER PRIMARY KEY,
    fdc_id INTEGER,
    nutrient_id INTEGER,
    amount REAL,
    FOREIGN KEY (fdc_id) REFERENCES food(fdc_id),
    FOREIGN KEY (nutrient_id) REFERENCES nutrient(nutrient_id)
);
""")

<sqlite3.Cursor at 0x29213d70fc0>

In [266]:
food_nutrient_df.to_sql("food_nutrient", conn, if_exists="append", index=False)

18471

In [267]:
pd.read_sql("SELECT * FROM food_nutrient LIMIT 5;", conn)

Unnamed: 0,fdc_nutrient_id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


# Working with the data

### Now that we have our tables we are ready to start working with the data

In [268]:
conn = sqlite3.connect("food_data.db")
cur = conn.cursor()

In [269]:
# These are the nutrients we want to show for each food and recipe

nutri = {'calories': 1008, 'protein_g': 1003, 'carbs_g': 1005, 'fat_g': 1004, 'fiber_g': 1079, 'sodium_mg': 1093}
nutri_iter = ['calories', 'protein_g', 'carbs_g', 'fat_g', 'fiber_g', 'sodium_mg']

In [270]:
print([f"{nutrien}: {nutri[nutrien]}" for nutrien in nutri_iter])

['calories: 1008', 'protein_g: 1003', 'carbs_g: 1005', 'fat_g: 1004', 'fiber_g: 1079', 'sodium_mg: 1093']


In [271]:
# LEFT JOIN

food_to_calories_query = f"""
SELECT f.description AS "Food Item", 
calories.amount AS Calories, 
protein_g.amount AS "Protein in Grams", 
carbs_g.amount AS "Carbs in Grams", 
fat_g.amount AS "Fat in Grams", 
fiber_g.amount AS "Fiber in Grams", 
sodium_mg.amount AS "Sodium in Milligrams"

FROM food f

LEFT JOIN food_nutrient calories
    ON f.fdc_id = calories.fdc_id
    AND calories.nutrient_id = {nutri['calories']}

LEFT JOIN food_nutrient protein_g
    ON f.fdc_id = protein_g.fdc_id
    AND protein_g.nutrient_id = {nutri['protein_g']}

LEFT JOIN food_nutrient carbs_g
    ON f.fdc_id = carbs_g.fdc_id
    AND carbs_g.nutrient_id = {nutri['carbs_g']}

LEFT JOIN food_nutrient fat_g
    ON f.fdc_id = fat_g.fdc_id
    AND fat_g.nutrient_id = {nutri['fat_g']}

LEFT JOIN food_nutrient fiber_g
    ON f.fdc_id = fiber_g.fdc_id
    AND fiber_g.nutrient_id = {nutri['fiber_g']}

LEFT JOIN food_nutrient sodium_mg
    ON f.fdc_id = sodium_mg.fdc_id
    AND sodium_mg.nutrient_id = {nutri['sodium_mg']}

"""

In [275]:
food_to_calories_df = pd.read_sql(food_to_calories_query, conn)

In [276]:
food_to_calories_df.head()

Unnamed: 0,Food Item,Calories,Protein in Grams,Carbs in Grams,Fat in Grams,Fiber in Grams,Sodium in Milligrams
0,hummus sabra classic,,,,,,
1,hummus,,,,16.95,5.45,438.3636
2,hummus other,,,,,,
3,hummus tribe classic,,,,,,
4,milk 2 wave 22e,,,,,,


In [280]:
# Export this query. We will work on filling in the missing information. 

# food_to_calories_df.to_csv("FoodData/NutrientCharts/Table1.csv", index=False)

At this point, something ive noticed is that a lot of nutrition info is missing and the likely reason for this is because i choose the last item if items had the same name and instead I need a better aproach in order to select the version that has the most nutrition info and that has info for most of my nutritional info.  

My idea for an apporach is to first create an array of duplicated items based on identified names. Then from that list I will search thru and compare each entrie's food_nutrition_id and see how many of the desired nutritional info it has and give it a number based on that. If that number is higher than the previous entry ill keep that in my final entries array. Once I get all the nutritional info I need or I run out of entrues, I will keep that entry in my final entries array. My final entries array will keep one row signified by its fdc_id from all the duplicated arrays. then finally I will update my food_df_clean to first delete all duplicated rows, keeping none, then adding the rows from my final entries array. This way I will have as much info as possible. Then I will add this new df back to my database and run my query again, hopefully having full rows of health info-data. 

# Conversions

The next issue to tackle, now that we've seen what nutrients are included, is to find the best estimates for each nutrient for every food item. For this, we need conversions and I'm going to relly on chat gpt to set up the logic and fact check it after, since I don't know anything really about nutrient conversion.

## Data

Let's start by getting our databases we are using.

In [285]:
food_df = pd.read_csv("FoodData/food_new.csv")

nutrient_df = pd.read_csv("FoodData/nutrient.csv")
nutrient_df = nutrient_df.rename(columns={"id": "nutrient_id"})

food_nutrient_df = pd.read_csv("FoodData/food_nutrient_new.csv", usecols=[0,1,2,3])
food_nutrient_df = food_nutrient_df.rename(columns={"id": "fdc_nutrient_id"})

In [288]:
print("Food table:", food_df.shape)
print("Nutrient table:", nutrient_df.shape)
print("Food-Nutrient table:", food_nutrient_df.shape)

Food table: (1417, 5)
Nutrient table: (477, 5)
Food-Nutrient table: (18471, 4)


In [290]:
food_df.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319877,sub_sample_food,hummus,16.0,2019-04-01
2,319894,sample_food,hummus other,16.0,2019-04-01
3,319958,sample_food,hummus tribe classic,16.0,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


In [287]:
nutrient_df.head()

Unnamed: 0,nutrient_id,name,unit_name,nutrient_nbr,rank
0,2047,Energy (Atwater General Factors),KCAL,957.0,280.0
1,2048,Energy (Atwater Specific Factors),KCAL,958.0,290.0
2,1001,Solids,G,201.0,200.0
3,1002,Nitrogen,G,202.0,500.0
4,1003,Protein,G,203.0,600.0


In [286]:
food_nutrient_df.head()

Unnamed: 0,fdc_nutrient_id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


Let's make a new df which will include the food and the nutrients: Calories, Protein_G, Carbs_G, Fat_G, Fiber_G, Sodium_MG

df is called: <code>food_nutrient_core_df</code>

We will include the following collumns: <code>food_nutrient_core_id, fdc_id, description, Calories, Protein_G, Carbs_G, Fat_G, Fiber_G, Sodium_MG</code>

The nutrient collumns will be null for now. 

In [298]:
food_nutrient_core_df = pd.DataFrame({
    "food_nutrient_core_id": range(len(food_df)),
    "fdc_id": food_df["fdc_id"],
    "description": food_df["description"],
    "Calories": np.nan,
    "Protein_G": np.nan,
    "Carbs_G": np.nan,
    "Fat_G": np.nan,
    "Fiber_G": np.nan,
    "Sodium_MG": np.nan
})

In [299]:
food_nutrient_core_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG
0,0,319874,hummus sabra classic,,,,,,
1,1,319877,hummus,,,,,,
2,2,319894,hummus other,,,,,,
3,3,319958,hummus tribe classic,,,,,,
4,4,320025,milk 2 wave 22e,,,,,,


## Conversions

I asked chatgpt for some help with conversions, here is the prompt to share my thought process. 

ok, so I've gotten the table made but like for all of the different food items they don't have all the nutrient info I want and hardly any have calorie information. Did u say before theres a way to find calorie info from other info? I'm thinking now the best option might be to calculate the desired nutrients but I don't know all the conversions. At the core I want to have the following nutrients: Calories	Protein in Grams	Carbs in Grams	Fat in Grams	Fiber in Grams	Sodium in Milligrams. So From the following nutrients that may or may not be provided for each food item, I want to include as many possible conversions as possible, from most accurate to least, the logic of which I will implement in an if statement for each food to try to find as much info for each categore so I will have the info. [list of nutrients included]

#### Protein_G

In [398]:
def find_protein_g(id_amount):
    protein = np.nan
    
    # Try 1003 (Protein)
    if 1003 in id_amount:
        return id_amount[1003]
    
    # Try 1053 (Adjusted protein)
    elif 1053 in id_amount:
        return id_amount[1053]
        
    # Try Protein=NitrogenÃ—6.25
    # Nitrogen = 1002 or 1052
    elif 1002 in id_amount:
        protein = id_amount[1002] * 6.25
    elif 1052 in id_amount:
        protein = id_amount[1052] * 6.25

    return protein

#### Carbs_G

In [454]:
def find_carbs_g(id_amount):
    carbs = np.nan
    
    # Try 1005 (Carbohydrate, by difference)
    if 1005 in id_amount:
        carbs = id_amount[1005]

    # Try 2039 (Carbohydrates)
    elif 2039 in id_amount:
        carbs = id_amount[2039]

    # Try 1050 (Carbohydrate, by summation)
    elif 1050 in id_amount:
        carbs = id_amount[1050]

    # Try formula: Carbs=Sugars+Starch+Fiber+Sugar alcohols
    # sugars = 1063 or 2000; starch = 1009; fiber = 1079 or 2033; Sugar alcohols = 1086
    else:
        # Get sugars (1063 or 2000)
        sugars = id_amount.get(1063, id_amount.get(2000, 0))
        
        # Get starch (1009)
        starch = id_amount.get(1009, 0)

        # Get fiber (1079 or 2033)
        fiber = id_amount.get(1079, id_amount.get(2033, 0))

        # Get Sugar alcohols (1086)
        sugar_alcohols = id_amount.get(1086, 0)

        # Add them up 
        if any(v != 0 for v in [sugars, starch, fiber, sugar_alcohols]):
            carbs = sugars + starch + fiber + sugar_alcohols

    return carbs

#### Fat_G

In [None]:
def find_fat_g(id_amount):
    fat = np.nan
    
    # Try 1004 (Total lipid (fat))
    if 1004 in id_amount:
        fat = id_amount[1004]

    # Try 1085 (Total fat (NLEA))
    elif 1085 in id_amount:
        fat = id_amount[1085]

    # Try Fat=Saturated+Monounsaturated+Polyunsaturated+Trans
    # Saturated = 1258 or 1326; Monounsaturated = 1292 or 1327; Polyunsaturated = 1293 or 1328; Trans = 1257 or 1329â€“1331
    else:
        # Get saturated (1258 or 1326)
        if 1258 in id_amount:
            saturated = id_amount[1258]
        elif 1326 in id_amount:
            saturated = id_amount[1326]
        else:
            return fat

        # Get monounsaturated (1292 or 1327)
        if 1292 in id_amount:
            monounsaturated = id_amount[1292]
        elif 1327 in id_amount:
            monounsaturated = id_amount[1327]
        else:
            return fat

        # Get polyunsaturated (1293 or 1328)
        if 1293 in id_amount:
            polyunsaturated = id_amount[1293]
        elif 1328 in id_amount:
            polyunsaturated = id_amount[1328]
        else:
            return fat

        # Get trans (1257 or 1329â€“1331)
        if 1257 in id_amount:
            trans = id_amount[1257]
        elif 1329 in id_amount:
            trans = id_amount[1329]
        elif 1330 in id_amount:
            trans = id_amount[1330]
        elif 1331 in id_amount:
            trans = id_amount[1331]
        else:
            return fat

        # Add them up
        fat = saturated + monounsaturated + polyunsaturated + trans

    return fat

#### Fiber_G

In [401]:
def find_fiber_g(id_amount):
    fiber = np.nan
    
    # Try 1079 (Fiber, total dietary)
    if 1079 in id_amount:
        fiber = id_amount[1079]
    
    # Try 2033 (Total dietary fiber (AOAC))
    elif 2033 in id_amount:
        fiber = id_amount[2033]

    # Try Fiber=Soluble+Insoluble
    # Soluble = 1082, 2035â€“2037; Insoluble = 1084, 2034
    else:
        # Get soluble (1082, 2035â€“2037)
        if 1082 in id_amount:
            soluble = id_amount[1082]
        elif 2035 in id_amount:
            soluble = id_amount[2035]
        elif 2036 in id_amount:
            soluble = id_amount[2036]
        elif 2037 in id_amount:
            soluble = id_amount[2037]
        else:
            return fiber

        # Get insoluble (1084, 2034)
        if 1084 in id_amount:
            insoluble = id_amount[1084]
        elif 2034 in id_amount:
            insoluble = id_amount[2034]
        else:
            return fiber

        # Add them up
        fiber = soluble + insoluble
    
    return fiber

#### Sodium_MG

In [402]:
def find_sodium_mg(id_amount):
    sodium = np.nan
    
    # Try 1093 (Sodium, Na)
    if 1093 in id_amount:
        sodium = id_amount[1093]

    # Try Sodium=SaltÃ—0.393
    # salt = (NaCl 1149)
    elif 1149 in id_amount:
        sodium = id_amount[1149] * 0.393

    return sodium

#### Calories

In [434]:
def find_calories(id_amount, protein, carbs, fat, fiber):
    cal = np.nan
    
    # Try 1008 (Energy (kcal))
    if 1008 in id_amount:
        cal = id_amount[1008]

    # Try 2048 (Energy (Atwater Specific Factors))
    elif 2048 in id_amount:
        cal = id_amount[2048]

    # Try 2047 (Energy (Atwater General Factors))
    elif 2047 in id_amount:
        cal = id_amount[2047]

    # Try kcal=kJ/4.184
    # Energy (kJ) = 1062
    elif 1062 in id_amount:
        cal = id_amount[1062] / 4.184

    # kcal=4(protein)+4(digestible carbs)+9(fat)+7(alcohol)+2(fiber)
    # Alcohol = 1018; Sugar alcohols = 1086; Digestible carbs = Total carbs âˆ’ fiber âˆ’ sugar alcohols
    else:
        # Treat missing values as 0, but only for calculation
        protein_val = 0 if np.isnan(protein) else protein
        carbs_val = 0 if np.isnan(carbs) else carbs
        fat_val = 0 if np.isnan(fat) else fat
        fiber_val = 0 if np.isnan(fiber) else fiber
        
        alcohol = id_amount.get(1018, 0)
        sugar_alcohols = id_amount.get(1086, 0)
        
        digestible_carbs = carbs_val - fiber_val - sugar_alcohols
        digestible_carbs = max(digestible_carbs, 0)  # avoid negative
        
        # Only calculate if at least one component exists
        if protein_val + carbs_val + fat_val + fiber_val + alcohol > 0:
            cal = 4*protein_val + 4*digestible_carbs + 9*fat_val + 7*alcohol + 2*fiber_val
        else:
            cal = np.nan  # truly no info

    return cal

#### find_nutrients

In [429]:
def find_nutrients(fdc_id):
    df = food_nutrient_df[food_nutrient_df['fdc_id']==fdc_id]
    id_amount = dict(zip(df["nutrient_id"], df["amount"]))
    
    # Protein_G
    protein = find_protein_g(id_amount)
    
    # Carbs_G
    carbs = find_carbs_g(id_amount)
    
    # Fat_G
    fat = find_fat_g(id_amount)
    
    # Fiber_G
    fiber = find_fiber_g(id_amount)
    
    # Sodium_MG
    sodium = find_sodium_mg(id_amount)

    # Calories
    calories = find_calories(id_amount, protein, carbs, fat, fiber)

    return {"Calories": calories, "Protein_G": protein, "Carbs_G": carbs, "Fat_G": fat, "Fiber_G": fiber, "Sodium_MG": sodium}

#### Update food_nutrient_core_df

In [435]:
def update_fn():
    N = food_df.shape[0]
    for i in range (N):
        fdc_id = food_df.loc[i, 'fdc_id']
        nutrient_dict = find_nutrients(fdc_id)
        for item, amount in nutrient_dict.items():
            food_nutrient_core_df.loc[i, item] = amount

update_fn()

## Working with it

Now that we've gotten as much nutrient info as we can, we can work with the data to remove null rows.

In [436]:
food_nutrient_core_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG
0,0,319874,hummus sabra classic,,,,,,
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636
2,2,319894,hummus other,,,,,,
3,3,319958,hummus tribe classic,,,,,,
4,4,320025,milk 2 wave 22e,,,,,,


In [437]:
food_nutrient_core_df.to_csv("FoodData/NutrientCharts/Table2.csv", index=False)

Now let's also make a df which includes only the rows with food info

In [444]:
# Check for nulls in ID and description columns
food_nutrient_core_df[['food_nutrient_core_id', 'fdc_id', 'description']].isnull().sum()

food_nutrient_core_id    0
fdc_id                   0
description              1
dtype: int64

In [449]:
food_nutrients_core_notnull_df = food_nutrient_core_df.dropna()

In [440]:
food_nutrients_core_notnull_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636
13,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333
41,41,321900,broccoli raw,31.0,2.57,6.27,0.34,2.4,36.0
57,57,323127,almonds dry roasted salted,639.0553,24.6825,11.0,57.5917,11.0,255.4615
70,70,323448,kale,31.714,2.90625,4.1,1.321,4.1,32.4545


In [441]:
food_nutrients_core_notnull_df.to_csv("FoodData/NutrientCharts/Table3.csv", index=False)

In [445]:
food_nutrients_core_notnull_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636
13,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333
41,41,321900,broccoli raw,31.0,2.57,6.27,0.34,2.4,36.0
57,57,323127,almonds dry roasted salted,639.0553,24.6825,11.0,57.5917,11.0,255.4615
70,70,323448,kale,31.714,2.90625,4.1,1.321,4.1,32.4545


In [462]:
nutrient_cols = ['Calories', 'Protein_G', 'Carbs_G', 'Fat_G', 'Fiber_G', 'Sodium_MG']

In [450]:
# Keep rows where at least one nutrient is not null
food_nutrients_core_notnull_df = food_nutrient_core_df.dropna(
    subset=nutrient_cols, how='all'
)

In [451]:
food_nutrients_core_notnull_df.to_csv("FoodData/NutrientCharts/Table4.csv", index=False)

In [452]:
food_nutrients_core_notnull_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636
5,5,320027,milk 2,30.2813,3.2825,,1.9057,,38.9167
9,9,320381,minerals,,,,,,53.8571
13,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333
14,14,321359,milk reduced fat fluid 2 milkfat with added vi...,50.0,3.36,4.9,1.9,,39.0


In [453]:
food_nutrients_core_notnull_df.shape

(570, 9)

Next thing I want to do is add a penalty collumn which will count how many nutrition collumns are null. then I will use this df as my final food_nutrient_df in the recipes so i will save it to the database and that way I can easily make queries with it later when I want to apply it to recipes. I may also go through this manyally and add info since there are probably repeat rows and info i can find on my own but for now on a large scale i think i will leave it at this. 

Next session i will add the penalty collumn, save it to my database, then make sure I am ready to move on to recipes!

In [468]:
food_nutrients_core_notnull_df["penalty"] = (
    food_nutrients_core_notnull_df[nutrient_cols]
    .isna()
    .sum(axis=1)
)

In [467]:
# food_nutrients_core_notnull_df = food_nutrients_core_notnull_df.drop(columns=["penalty"])

In [469]:
food_nutrients_core_notnull_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG,penalty
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636,0
5,5,320027,milk 2,30.2813,3.2825,,1.9057,,38.9167,2
9,9,320381,minerals,,,,,,53.8571,5
13,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333,0
14,14,321359,milk reduced fat fluid 2 milkfat with added vi...,50.0,3.36,4.9,1.9,,39.0,1


In [470]:
food_nutrients_core_notnull_df.to_csv("FoodData/NutrientCharts/Table5.csv", index=False)

In [475]:
avr = food_nutrients_core_notnull_df['penalty'].count() / len(food_nutrients_core_notnull_df['penalty'])
print(avr)

1.0


In [476]:
food_nutrients_core_notnull_df.shape

(570, 10)

In [479]:
food_nutrients_core_notnull_df.to_csv("FoodData/food_nutrient_table", index=False)

## Add to database

In [488]:
conn = sqlite3.connect("food_data.db")
cur = conn.cursor()

### First food_core

In [489]:
food_df_new.head()

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319877,sub_sample_food,hummus,16.0,2019-04-01
2,319894,sample_food,hummus other,16.0,2019-04-01
3,319958,sample_food,hummus tribe classic,16.0,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


In [510]:
# conn.execute("DROP TABLE food_core;")

In [511]:
conn.execute("""
CREATE TABLE food_core (
    fdc_id INTEGER PRIMARY KEY,
    data_type TEXT,
    description TEXT,
    food_category_id INTEGER,
    publication_date TEXT
);
""")

<sqlite3.Cursor at 0x292184977c0>

In [512]:
food_df_new.to_sql("food_core", conn, if_exists="append", index=False)

1417

In [500]:
pd.read_sql("SELECT * FROM food_core LIMIT 5;", conn)

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,319874,sample_food,hummus sabra classic,16.0,2019-04-01
1,319877,sub_sample_food,hummus,16.0,2019-04-01
2,319894,sample_food,hummus other,16.0,2019-04-01
3,319958,sample_food,hummus tribe classic,16.0,2019-04-01
4,320025,sample_food,milk 2 wave 22e,1.0,2019-04-01


### Next food_nutrient_core

In [502]:
food_nutrient_df_new.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


In [515]:
# conn.execute("DROP TABLE food_nutrient_core;")

<sqlite3.Cursor at 0x292184e9bc0>

In [516]:
conn.execute("""
CREATE TABLE food_nutrient_core (
    id INTEGER PRIMARY KEY,
    fdc_id INTEGER,
    nutrient_id INTEGER,
    amount REAL,
    FOREIGN KEY (fdc_id) REFERENCES food_core(fdc_id)
    FOREIGN KEY (nutrient_id) REFERENCES nutrient(nutrient_id)
);
""")

<sqlite3.Cursor at 0x292184e9cc0>

In [517]:
food_nutrient_df_new.to_sql("food_nutrient_core", conn, if_exists="append", index=False)

18471

In [518]:
pd.read_sql("SELECT * FROM food_nutrient_core LIMIT 5;", conn)

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,0,319874,1162,0.0
1,1000,319877,1091,166.0909
2,1001,319877,1101,1.0591
3,1002,319877,1092,289.2727
4,1003,319877,1087,40.7273


### Finaly food_table

In [519]:
food_nutrients_core_notnull_df.head()

Unnamed: 0,food_nutrient_core_id,fdc_id,description,Calories,Protein_G,Carbs_G,Fat_G,Fiber_G,Sodium_MG,penalty
1,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636,0
5,5,320027,milk 2,30.2813,3.2825,,1.9057,,38.9167,2
9,9,320381,minerals,,,,,,53.8571,5
13,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333,0
14,14,321359,milk reduced fat fluid 2 milkfat with added vi...,50.0,3.36,4.9,1.9,,39.0,1


In [526]:
# conn.execute("DROP TABLE food_table;")

In [527]:
conn.execute("""
CREATE TABLE food_table (
    food_nutrient_core_id INTEGER PRIMARY KEY,
    fdc_id INTEGER,
    description TEXT,
    calories REAL,
    protein_g REAL,
    carbs_g REAL,
    fat_g REAL,
    fiber_g REAL,
    sodium_mg REAL,
    penalty INTEGER,
    FOREIGN KEY (fdc_id) REFERENCES food_core(fdc_id)
);
""")

<sqlite3.Cursor at 0x292187006c0>

In [528]:
food_nutrients_core_notnull_df.to_sql("food_table", conn, if_exists="append", index=False)

570

In [529]:
pd.read_sql("SELECT * FROM food_table LIMIT 5;", conn)

Unnamed: 0,food_nutrient_core_id,fdc_id,description,calories,protein_g,carbs_g,fat_g,fiber_g,sodium_mg,penalty
0,1,319877,hummus,225.0668,7.2875,13.5667,16.95,5.45,438.3636,0
1,5,320027,milk 2,30.2813,3.2825,,1.9057,,38.9167,2
2,9,320381,minerals,,,,,,53.8571,5
3,13,320413,tomatoes grape,13.1872,0.833125,2.1,0.6283,2.1,5.8333,0
4,14,321359,milk reduced fat fluid 2 milkfat with added vi...,50.0,3.36,4.9,1.9,,39.0,1


And now my food_data is on my database, with food_table as the main table with all of the food and nutritional information.  

Now that we have our final table in our database, our work here is done on this nutritional stuff. Next up, recipes! 

Actually, the next thing I want to do is make a python script that will give you nutritional info about any food by searching the most similar item in the table and getting that info. I want to also add in a chatbot that can tell if there are multiple ingredients to give a better guess and estimate the price.