---

### USDA FoodData Central API Integration

To enhance the dataset with **nutritional information**, we integrated the **USDA FoodData Central API**, a public database providing nutrient composition data for thousands of food items.

The following Python code queries the API once for each Food-101 label and saves the returned entries to a CSV file (`usda_food_data.csv`):

In [5]:
# Category labels used throughout the notebook, as one alphabetically sorted list.
# Writing them as a set literal means a label typed twice would be kept only once.
food_101_labels = sorted({
    # a - b
    "apple_pie",
    "baby_back_ribs", "baklava", "beef_carpaccio", "beef_tartare", "beet_salad",
    "beignets", "bibimbap", "bread_pudding", "breakfast_burrito", "bruschetta",
    # c
    "caesar_salad", "cannoli", "caprese_salad", "carrot_cake", "ceviche",
    "cheesecake", "cheese_plate", "chicken_curry", "chicken_quesadilla",
    "chicken_wings", "chocolate_cake", "chocolate_mousse", "churros",
    "clam_chowder", "club_sandwich", "crab_cakes", "creme_brulee",
    "croque_madame", "cup_cakes",
    # d - e
    "deviled_eggs", "donuts", "dumplings",
    "edamame", "eggs_benedict", "escargots",
    # f
    "falafel", "filet_mignon", "fish_and_chips", "foie_gras", "french_fries",
    "french_onion_soup", "french_toast", "fried_calamari", "fried_rice",
    "frozen_yogurt",
    # g - i
    "garlic_bread", "gnocchi", "greek_salad", "grilled_cheese_sandwich",
    "grilled_salmon", "guacamole", "gyoza",
    "hamburger", "hot_and_sour_soup", "hot_dog", "huevos_rancheros", "hummus",
    "ice_cream",
    # l - o
    "lasagna", "lobster_bisque", "lobster_roll_sandwich",
    "macaroni_and_cheese", "macarons", "miso_soup", "mussels",
    "nachos",
    "omelette", "onion_rings", "oysters",
    # p - r
    "pad_thai", "paella", "pancakes", "panna_cotta", "peking_duck", "pho",
    "pizza", "pork_chop", "poutine", "prime_rib", "pulled_pork_sandwich",
    "ramen", "ravioli", "red_velvet_cake", "risotto",
    # s
    "samosa", "sashimi", "scallops", "seaweed_salad", "shrimp_and_grits",
    "spaghetti_bolognese", "spaghetti_carbonara", "spring_rolls", "steak",
    "strawberry_shortcake", "sushi",
    # t - w
    "tacos", "takoyaki", "tiramisu", "tuna_tartare",
    "waffles",
})

In [6]:
# Maps a category label to an alternative, more descriptive search phrase.
# Written with keyword arguments; keys and insertion order are the same as before.
synonyms = dict(
    # --- short phrases for labels that are unusual as plain search terms ---
    beef_carpaccio="raw beef slices",
    beef_tartare="steak tartare",
    bibimbap="korean rice bowl",
    club_sandwich="turkey sandwich",
    croque_madame="ham and cheese sandwich",
    fish_and_chips="fried fish with fries",
    foie_gras="duck liver pate",
    gyoza="dumplings",
    lobster_roll_sandwich="lobster sandwich",
    omelette="egg omelet",
    oysters="raw oysters",
    pho="vietnamese soup",
    pork_chop="grilled pork chop",
    poutine="french fries with gravy",
    prime_rib="roast beef",
    spaghetti_bolognese="meat sauce spaghetti",
    spring_rolls="vegetable rolls",
    sushi="japanese sushi",
    tacos="mexican tacos",
    takoyaki="octopus balls",
    tuna_tartare="raw tuna",
    # --- "<label>_alt" variants ---
    # NOTE(review): none of these keys is itself a category label, so a lookup
    # keyed by label (synonyms.get(label, ...)) never returns them - confirm
    # whether anything else reads them before removing.
    beef_carpaccio_alt="thinly sliced raw beef appetizer",
    bibimbap_alt="korean mixed rice bowl with vegetables and egg",
    club_sandwich_alt="multi layer chicken bacon sandwich",
    croque_madame_alt="grilled ham and cheese sandwich with fried egg",
    fish_and_chips_alt="battered fried fish served with potato fries",
    foie_gras_alt="goose liver pate french delicacy",
    omelette_alt="fluffy egg omelet with cheese",
    oysters_alt="steamed oysters shellfish",
    pho_alt="beef noodle soup vietnamese",
    pork_chop_alt="fried pork meat with bone",
    poutine_alt="canadian french fries with cheese curds and gravy",
    prime_rib_alt="slow roasted beef rib cut",
    spring_rolls_alt="crispy fried vegetable rolls",
    sushi_alt="japanese rice rolls with fish",
    tacos_alt="mexican corn tortilla with beef and vegetables",
    takoyaki_alt="japanese octopus dough balls",
    tuna_tartare_alt="minced raw tuna appetizer",
    # --- descriptive phrases for further labels ---
    ramen="japanese noodle soup",
    paella="spanish seafood rice",
    samosa="indian fried pastry with potato filling",
    ceviche="raw fish marinated in lemon juice",
    panna_cotta="italian milk dessert",
    beignets="fried french dough pastry",
    bruschetta="grilled bread with tomato topping",
    guacamole="avocado dip",
    falafel="fried chickpea balls",
    lasagna="layered pasta with cheese and meat sauce",
    fried_rice="stir fried rice with egg and vegetables",
)

In [11]:
# Second round of search phrases, keyed by category label.
extra_synonyms = dict(
    beef_carpaccio="thinly sliced raw beef appetizer",
    beignets="fried dough pastry french",
    bibimbap="korean rice bowl with vegetables and egg",
    bruschetta="grilled bread with tomato topping",
    ceviche="raw fish marinated in lemon juice",
    club_sandwich="layered chicken bacon sandwich",
    croque_madame="grilled ham and cheese sandwich with fried egg",
    falafel="fried chickpea balls",
    fish_and_chips="battered fried fish with french fries",
    foie_gras="duck liver pate",
    fried_rice="stir fried rice with egg and vegetables",
    guacamole="avocado dip",
    lasagna="layered pasta with cheese and tomato sauce",
    omelette="fluffy egg omelet with cheese",
    oysters="fresh raw oysters shellfish",
    paella="spanish seafood rice dish",
    panna_cotta="italian creamy milk dessert",
    pho="vietnamese beef noodle soup",
    pork_chop="grilled pork meat with bone",
    poutine="french fries with cheese curds and gravy",
    prime_rib="slow roasted beef rib",
    ramen="japanese noodle soup",
    samosa="indian fried pastry with potato filling",
    spring_rolls="crispy fried vegetable rolls",
    sushi="japanese rice rolls with fish",
    tacos="mexican tortilla with meat and vegetables",
    takoyaki="japanese octopus dough balls",
    tuna_tartare="raw minced tuna appetizer",
)

# Merge into the existing `synonyms` mapping in place: where a label is present
# in both, the phrase from `extra_synonyms` replaces the earlier one.
synonyms.update(extra_synonyms)

In [12]:
import os
import time

import pandas as pd
import requests

# The API key is read from the environment so it is never stored in the notebook.
# NOTE: a key was previously hard-coded in this cell; since it has been saved
# with the notebook it should be treated as leaked and revoked.
api_key = os.environ.get("USDA_API_KEY")
if not api_key:
    raise RuntimeError(
        "USDA_API_KEY is not set; export a FoodData Central API key before running this cell."
    )

base_url = "https://api.nal.usda.gov/fdc/v1/foods/search"
output_path = "../data/raw/usda_food_data.csv"
record_columns = [
    "query", "fdcId", "description", "dataType",
    "calories", "protein", "fat", "carbohydrates",
]


def search_usda(query, page_size=5):
    """Search FoodData Central for `query`.

    Returns the list found under the "foods" key of the JSON reply (an empty
    list when that key is absent), or None when the request itself failed
    (network error, timeout, non-2xx status, or a body that is not valid JSON).
    """
    params = {"query": query, "pageSize": page_size, "api_key": api_key}
    try:
        reply = requests.get(base_url, params=params, timeout=30)
        reply.raise_for_status()
        return reply.json().get("foods", [])
    except (requests.RequestException, ValueError) as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, and the URL carries the API key as a query parameter.
        print(f"  request for {query!r} failed ({type(exc).__name__})")
        return None


def extract_macros(food):
    """Pull calories, protein, fat and carbohydrates out of one search result.

    `food` is one element of the "foods" list. Nutrients without a name or a
    value are skipped instead of raising KeyError. "Energy" can be listed once
    per unit; entries whose unitName is kJ are skipped so that `calories` is
    not overwritten by a kilojoule figure.
    """
    nutrients = {}
    for nutr in food.get("foodNutrients", []):
        name = nutr.get("nutrientName")
        value = nutr.get("value")
        if name is None or value is None:
            continue
        if name == "Energy" and str(nutr.get("unitName", "")).upper() == "KJ":
            continue
        nutrients[name] = value
    return {
        "calories": nutrients.get("Energy"),
        "protein": nutrients.get("Protein"),
        "fat": nutrients.get("Total lipid (fat)"),
        "carbohydrates": nutrients.get("Carbohydrate, by difference"),
    }


foods = []
missing = []  # labels whose search failed or returned nothing

for item in food_101_labels:
    query = item.replace("_", " ")
    print(f"Fetching {query} ...")
    results = search_usda(query)
    if not results:
        missing.append(item)
    for food in results or []:
        foods.append({
            "query": query,
            "fdcId": food.get("fdcId"),
            "description": food.get("description"),
            "dataType": food.get("dataType"),
            **extract_macros(food),
        })
    time.sleep(0.3)  # small pause between consecutive requests

# Passing `columns` keeps the frame well-formed even when nothing was collected.
# De-duplicating on description alone also drops a row when a different query
# already returned a food with the same description.
usda_df = pd.DataFrame(foods, columns=record_columns).drop_duplicates(subset=["description"])

os.makedirs(os.path.dirname(output_path), exist_ok=True)
usda_df.to_csv(output_path, index=False)

print(
    f"Saved {len(usda_df)} food entries covering {usda_df['query'].nunique()} of "
    f"{len(food_101_labels)} categories to {output_path}"
)
if missing:
    print(f"No results stored for {len(missing)} categories: {missing}")

Fetching apple pie ...
Fetching baby back ribs ...
Fetching baklava ...
Fetching beef carpaccio ...
Fetching beef tartare ...
Fetching beet salad ...
Fetching beignets ...
Fetching bibimbap ...
Fetching bread pudding ...
Fetching breakfast burrito ...
Fetching bruschetta ...
Fetching caesar salad ...
Fetching cannoli ...
Fetching caprese salad ...
Fetching carrot cake ...
Fetching ceviche ...
Fetching cheese plate ...
Fetching cheesecake ...
Fetching chicken curry ...
Fetching chicken quesadilla ...
Fetching chicken wings ...
Fetching chocolate cake ...
Fetching chocolate mousse ...
Fetching churros ...
Fetching clam chowder ...
Fetching club sandwich ...
Fetching crab cakes ...
Fetching creme brulee ...
Fetching croque madame ...
Fetching cup cakes ...
Fetching deviled eggs ...
Fetching donuts ...
Fetching dumplings ...
Fetching edamame ...
Fetching eggs benedict ...
Fetching escargots ...
Fetching falafel ...
Fetching filet mignon ...
Fetching fish and chips ...
Fetching foie gras ..

In [13]:
# Read the saved file back and report how many rows and distinct queries it has.
df = pd.read_csv("../data/raw/usda_food_data.csv")
row_count = df.shape[0]
distinct_queries = df["query"].nunique()
print("Total entries:", row_count)
print("Unique foods:", distinct_queries)
df.head()

Total entries: 327
Unique foods: 99


Unnamed: 0,query,fdcId,description,dataType,calories,protein,fat,carbohydrates
0,apple pie,2288447,APPLE PIE,Branded,354.0,1.77,15.0,54.0
1,baby back ribs,1457876,BABY BACK RIBS,Branded,170.0,18.8,9.82,0.0
2,baby back ribs,2134922,"BABY BACK RIBS POTATO CHIPS, BABY BACK RIBS",Branded,536.0,7.14,32.1,57.1
3,baklava,2708044,Baklava,Survey (FNDDS),440.0,6.58,29.34,37.55
4,baklava,2218273,BAKLAVA,Branded,514.0,5.41,40.5,37.8


In [19]:
import pandas as pd

# Load the existing USDA data
usda_df = pd.read_csv("../data/raw/usda_food_data.csv")

# Hand-written rows for the categories that were left unmatched.
# NOTE(review): the nutrient figures are hand-entered (the first one was marked
# "example value" in the original cell); presumably they use the same units as
# the USDA rows - confirm against a trusted source before relying on them.
manual_rows = [
    # (label, description, calories, protein, fat, carbohydrates)
    ("beef_carpaccio", "Beef carpaccio (thinly sliced raw beef appetizer)", 235, 13.0, 15.0, 2.0),
    ("guacamole", "Guacamole (avocado dip)", 160, 2.0, 14.0, 9.0),
    ("prime_rib", "Prime rib (roast beef rib cut)", 318, 27.0, 24.0, 0.0),
    ("samosa", "Samosa (Indian fried pastry with potato filling)", 262, 5.0, 14.0, 28.0),
    ("tacos", "Tacos (Mexican tortilla with meat and vegetables)", 211, 10.0, 9.0, 23.0),
    ("takoyaki", "Takoyaki (Japanese octopus dough balls)", 180, 6.0, 8.0, 18.0),
]

manual_entries = [
    {
        # The rows written by the API step store the query with spaces instead
        # of underscores; use the same form so `query` groups stay consistent.
        "query": label.replace("_", " "),
        "description": description,
        "dataType": "Manual",
        "calories": calories,
        "protein": protein,
        "fat": fat,
        "carbohydrates": carbohydrates,
    }
    for label, description, calories, protein, fat, carbohydrates in manual_rows
]
manual_df = pd.DataFrame(manual_entries)

# This cell reads and rewrites the same file, so re-running it used to append
# the manual rows again. Drop any copies added by an earlier run first.
already_added = (usda_df["dataType"] == "Manual") & usda_df["description"].isin(
    manual_df["description"]
)
usda_df = pd.concat([usda_df[~already_added], manual_df], ignore_index=True)

# Save back
usda_df.to_csv("../data/raw/usda_food_data.csv", index=False)

print("Manual entries added for unmatched categories.")

Manual entries added for unmatched categories.


In [20]:
import pandas as pd
from rapidfuzz import process, fuzz
import re

# Re-read the USDA CSV from disk so `usda_df` reflects whatever was last saved
# to that file (including any manually added rows written back to it).
usda_df = pd.read_csv("../data/raw/usda_food_data.csv")

# Normalize text
def normalize_text(text):
    """Return a lowercase, letters-and-single-spaces version of `text`.

    Steps:
      1. Convert to `str` and fold accented letters to their base letter
         ("Crème Brûlée" -> "creme brulee") instead of deleting them.
      2. Drop apostrophes so possessives stay one word ("mcdonald's" -> "mcdonalds").
      3. Turn every other non-letter (digits, punctuation, hyphens, slashes)
         into a space so adjoining words are not glued together
         ("stir-fried" -> "stir fried").
      4. Collapse runs of whitespace and strip the ends.

    Any value is accepted; non-strings go through `str()` first.
    """
    import unicodedata  # local import keeps this cell runnable on its own

    decomposed = unicodedata.normalize("NFKD", str(text))
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch)).lower()
    text = re.sub(r"['\u2019]", '', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Store a normalized copy of every description as a new column, and also keep
# those normalized strings as a plain Python list.
normalized_descriptions = usda_df["description"].astype(str).map(normalize_text)
usda_df["normalized_description"] = normalized_descriptions
usda_names = normalized_descriptions.tolist()

In [21]:
from rapidfuzz import process, fuzz

matches, unmatched = [], []

for label in food_101_labels:
    # When a synonym phrase exists it is used in place of the label; otherwise
    # the label itself is searched, with underscores turned into spaces.
    search_term = synonyms[label] if label in synonyms else label.replace("_", " ")

    # Strict pass first (token_sort_ratio, cutoff 85). Only when that finds
    # nothing is the more lenient pass tried (partial_ratio, cutoff 70).
    best = (
        process.extractOne(search_term, usda_names, scorer=fuzz.token_sort_ratio, score_cutoff=85)
        or process.extractOne(search_term, usda_names, scorer=fuzz.partial_ratio, score_cutoff=70)
    )

    if best is None:
        unmatched.append(label)
        continue

    # The first two items of the result are the matched string and its score.
    matched_description, score = best[0], best[1]
    matches.append((label, matched_description, score))

print(f"\nMatched {len(matches)} categories out of {len(food_101_labels)}.")
print(f"{len(unmatched)} categories have no match:\n", unmatched)


Matched 101 categories out of 101.
0 categories have no match:
 []


In [22]:
# One row per matched label: the label, the matched description, and the score.
match_columns = ["Food_101_Label", "Matched_USDA_Description", "Match_Score"]
matches_df = pd.DataFrame(matches, columns=match_columns)

# Written without the row index so the file holds only the three columns above.
matches_df.to_csv("../data/raw/usda_food_matches.csv", index=False)

print("\nSaved USDA–Food101 matching file with scores to ../data/raw/usda_food_matches.csv")


Saved USDA–Food101 matching file with scores to ../data/raw/usda_food_matches.csv
