In [None]:
!pip install -q rapidfuzz faiss-cpu streamlit


In [None]:
import pandas as pd
import openai
import numpy as np
import warnings
import ast
import streamlit as st
warnings.filterwarnings('ignore')

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import AutoModelForSeq2SeqLM
import torch
from sentence_transformers import SentenceTransformer, util
import faiss
from rapidfuzz import process, fuzz



new stuff

In [None]:
# USDA table; `clean_description` holds the lower-cased, trimmed 'Description'
# text that get_top_candidate() matches ingredients against.
usda = pd.read_csv('USDA.csv').assign(
    clean_description=lambda frame: frame['Description'].str.lower().str.strip()
)

# Intake table: the first two CSV rows are read as a two-level column header.
intake = pd.read_csv('intake.csv', header=[0, 1])
# remove commas
def remove_commas(x):
    """Return ``x`` with every comma removed when it is a string; any other value is returned unchanged."""
    return x.replace(',', '') if isinstance(x, str) else x
# Strip commas from every cell in all columns after the first two.
# Column-wise Series.map is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; this form works on old and new versions.
intake.iloc[:, 2:] = intake.iloc[:, 2:].apply(lambda col: col.map(remove_commas))


recipes = pd.read_csv('world_recipes.csv')
recipes.head()

Unnamed: 0,recipe,ingredients
0,Spicy Kimchi Fried Rice,"['Cooked rice', 'Kimchi', 'Gochujang', 'Soy sa..."
1,Classic Margherita Pizza,"['Pizza dough', 'Tomato sauce', 'Fresh mozzare..."
2,Coconut Chickpea Curry,"['Chickpeas', 'Coconut milk', 'Onion', 'Ginger..."
3,Pad See Ew with Tofu,"['Wide rice noodles', 'Tofu', 'Chinese broccol..."
4,Beef Bourguignon,"['Beef chuck', 'Red wine', 'Bacon', 'Onion', '..."


In [None]:
# mapping between intake and usda datasets:
# USDA column name -> nutrient name used when looking nutrients up later

match_dict = {
    "TotalFat": "Fat",
    "VitaminC": "Vitamin C",
    "VitaminE": "Vitamin E",
    "VitaminD": "Vitamin D"
}

# Rebind instead of mutating in place (avoids inplace=True).
usda = usda.rename(columns=match_dict)

# clean the intake df 'Life Stage' column: turn em-spaces (U+2003) and
# non-breaking spaces (U+00A0) into plain spaces, then trim.
# The vectorised .str methods leave missing values as NaN, whereas calling
# str.replace through apply() raised AttributeError on them.
life_stage_col = ('Life Stage', 'units')
intake[life_stage_col] = (
    intake[life_stage_col]
    .str.replace('\u2003', ' ', regex=False)
    .str.replace('\xa0', ' ', regex=False)
    .str.strip()
)



In [None]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# create embeddings for recipes
recipe_names = recipes['recipe'].astype(str).tolist()
# Request a NumPy array rather than a torch tensor: the normalisation below
# uses NumPy, the vectors are added to a FAISS index, and get_recipe()
# encodes its queries with convert_to_numpy=True as well.
embeddings = embedder.encode(recipe_names, convert_to_numpy=True)

# normalize every row to unit length; a zero-norm row stays all-zero instead of becoming NaN
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings = (embeddings / norms).astype(np.float32)

In [None]:
# FAISS vector search: exact (flat) L2 index over the recipe-name embeddings

dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
# FAISS works on C-contiguous float32 arrays; coerce explicitly so the cell
# does not depend on the exact type produced by the embedding step.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))


In [None]:
# #model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_name = "google/flan-t5-base"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# #model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# # create inference pipeline
# flan = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

Device set to use cpu


prompt = (f"The ingredient is '{ingredient}', "
            f" get the best match from this list: {cand_names}. "
            "Give one closest matching ingredient name only.")

In [None]:
# this function gets the top-k (default 2) recipes using world recipes dataset
def get_recipe(query, top_k = 2):
  """
  Return the recipes whose names are closest to `query`.

  The query is embedded with the module-level `embedder`, scaled to unit
  length and searched in the module-level FAISS `index`.

  Parameters:
    query: free-text food / recipe name.
    top_k: maximum number of recipes to return.

  Returns:
    A list of dicts with keys 'item' (recipe name), 'ingredients' (the raw
    value of the 'ingredients' column) and 'score' (the distance reported by
    `index.search`). The list is empty when the index is empty or `top_k`
    is not positive.
  """
  if top_k <= 0 or index.ntotal == 0:
    return []

  query_embedding = embedder.encode([query], convert_to_numpy=True)
  query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

  # never ask for more neighbours than there are indexed vectors
  k = min(top_k, index.ntotal)
  dist, ind = index.search(query_embedding, k)

  results = []
  for idx, score in zip(ind[0], dist[0]):
    # FAISS fills missing neighbours with label -1; iloc[-1] would silently
    # return the last recipe, so skip those slots.
    if idx < 0:
      continue
    row = recipes.iloc[idx]
    results.append({'item': row['recipe'], 'ingredients': row['ingredients'], 'score': score})

  return results

# this function combines ingredients from chosen top recipes
def combine_ingredients(results):
  """
  Merge the ingredient lists of the given recipes into one de-duplicated list.

  Parameters:
    results: iterable of recipe dicts as returned by get_recipe(); each has
      an 'ingredients' entry that is either a list or the string form of a
      Python list literal (e.g. "['Eggs', 'Salt']").

  Returns:
    A list of lower-cased, stripped ingredient names, without duplicates, in
    order of first appearance.
  """
  combined = {}  # dict keys give de-duplication with a stable order
  for recipe in results:
    raw = recipe['ingredients']
    # parse first, lower-case afterwards, so the literal itself is never altered
    items = ast.literal_eval(raw) if isinstance(raw, str) else raw
    for item in items:
      combined.setdefault(str(item).strip().lower(), None)

  return list(combined)

In [None]:
# This function matches a single recipe ingredient with the best USDA entry match.

def get_top_candidate(ingredient, top_k = 5, threshold = 70):
  """
  Fuzzy-match one ingredient name against `usda['clean_description']`.

  Parameters:
    ingredient: ingredient name to look up.
    top_k: number of candidates requested from rapidfuzz.
    threshold: minimum score the best candidate must reach.

  Returns:
    The best-matching description, or None when the ingredient is blank,
    there are no candidates, or the best score is below `threshold`.
  """
  if not isinstance(ingredient, str) or not ingredient.strip():
    return None

  # drop missing descriptions so only real strings are offered as choices
  usda_list = usda['clean_description'].dropna().tolist()
  candidates = process.extract(ingredient, usda_list, scorer = fuzz.partial_ratio, limit = top_k)

  # max() on an empty list would raise ValueError
  if not candidates:
    return None

  best_cand = max(candidates, key = lambda x: x[1])
  return best_cand[0] if best_cand[1] >= threshold else None


In [None]:
# # If there is not a direct match for an ingredient, use LLM to generate new names for ingredient.
# def llm_suggestion(ingredient, flan):

#   usda_list = usda['clean_description'].tolist()

#   # prompt = (f"The given ingredient name is '{ingredient}.' "
#   # 'Provide exactly 3 different alternative synonyms for this ingredient.'
#   # 'Do NOT repeat the original ingredient name.')

#   prompt = (f"What food is {ingredient}")
#   print(prompt)
#   response = flan(prompt)[0]['generated_text'].strip()

#   print(response)

In [None]:
# ho = get_top_candidate('pork')
# ho

'soup,bean w/ pork,cnd,cond'

In [None]:
# # testing llm_suggestion prompt
# ingredient = 'guanciale (or pancetta/bacon)'
# hey = llm_suggestion(ingredient, flan)
# hey

What food is guanciale (or pancetta/bacon)
pork


In [None]:
def calculate_nutrition(matched_df, intake, group, life_stage, nutrient_cols):
  """
  Calculate percent of daily nutrient requirements which are met.

  Parameters:
    matched_df: DataFrame with one column per nutrient name in `nutrient_cols`.
    intake: DataFrame with two-level columns; rows are selected through
      ('Group', 'Unnamed: 0_level_1') and ('Life Stage', 'units').
    group: value to match in the group column.
    life_stage: value to match in the life-stage column.
    nutrient_cols: list of nutrient names (first level of the intake columns).

  Returns:
    (totals, coverage): two Series indexed by the intake column tuples.
    `totals` is the column sum from `matched_df`; `coverage` is
    totals / requirement * 100. Where the requirement is missing, zero or
    one of 'ND' / 'NDc' / 'NaN', coverage is NaN rather than inf.

  Raises:
    ValueError: if no intake row matches `group` and `life_stage`.
  """
  group_col = ('Group', 'Unnamed: 0_level_1')
  stage_col = ('Life Stage', 'units')

  # get relevant rows
  selected = intake[(intake[group_col] == group) & (intake[stage_col] == life_stage)]

  # if no relevant rows raise error
  if selected.empty:
    raise ValueError(" No matching intake data")
  row = selected.iloc[0]

  full_nutrient_cols = [idx for idx in row.index if idx[0] in nutrient_cols]

  # calculate total nutrients
  total_nutrients = matched_df[nutrient_cols].sum()

  # Placeholder strings become NaN (not 0): dividing by 0 reported "inf %".
  raw_req = row[full_nutrient_cols]
  req_values = raw_req.mask(raw_req.isin(['ND', 'NDc', 'NaN'])).astype(float)
  # a zero requirement cannot be expressed as a percentage either
  req_values = req_values.mask(req_values == 0)

  totals = pd.Series({idx: total_nutrients.get(idx[0], 0) for idx in full_nutrient_cols})

  coverage = (totals / req_values) * 100
  return totals, coverage

In [None]:
if __name__ == "__main__":

  user_input = input("What food would you like to look at today? ").strip().lower()

  # find the closest recipes and merge their ingredient lists
  ingredients = get_recipe(user_input)

  all_ingredients = combine_ingredients(ingredients)
  print(all_ingredients)

  # let the user edit the ingredient list until they answer 'no'
  while True:
    action = input("Would you like to add, remove, or no? (choose one)").strip().lower()

    if action == 'no':
      break

    elif action == 'add':
      adding = input("Enter ingredient to add: ").strip().lower()
      if adding and adding not in all_ingredients:
        all_ingredients.append(adding)
        print(all_ingredients)

    elif action == 'remove':
      rv = input("Enter ingredient to remove: ").strip().lower()
      if rv in all_ingredients:
        all_ingredients.remove(rv)
        print(all_ingredients)

  # look up every ingredient in the USDA table
  results = []
  for i in all_ingredients:
    ing = get_top_candidate(i)
    if ing is not None:
      # keep one row per ingredient so a description that occurs several
      # times in the USDA table is not counted more than once
      row = usda[usda['clean_description'] == ing].head(1)
      results.append(row)
    else:
      print(f"No match for {i}.")

  if results:
    matched_df = pd.concat(results, ignore_index=True)
  else:
    # define matched_df in both branches; it is checked before any calculation
    matched_df = None
    print('\nNo Matches')

  matched_cols =  ["Protein", "Fat", "Carbohydrate", "Sodium", "Calcium", "Iron", "Potassium", "Vitamin C", "Vitamin E", "Vitamin D"]

  group_options = intake[('Group', 'Unnamed: 0_level_1')].unique()

  group = input(f"Which group are you part of? Select the one which best fits you. : {list(group_options)} ").strip().title()
  life_stage_options = intake.loc[intake[('Group', 'Unnamed: 0_level_1')] == group, ('Life Stage', 'units')].unique()

  cleaned_life_stage = [s.replace('\u2003', ' ').replace('\xa0', ' ').strip() for s in life_stage_options]
  life_stage = input(f"Which stage of life are you part of? Select one: {list(cleaned_life_stage)}").strip()
  # a typed hyphen is converted to the en dash (U+2013) before the lookup
  life_stage = life_stage.replace('-', '–')

  nutrient_input = input("Would you like to calculate nutrition? (yes or no) ").strip().lower()

  if nutrient_input == 'yes':

    answer = input(f"Which nutrients would you like to consider? If you would like all of them, please answer 'all.' \nHere are your options: {matched_cols}\n" ).strip().lower()

    if answer == 'all':
      nutrient_info = matched_cols
    else:
      requested = [n.strip().title() for n in answer.split(',')]
      # an unknown name would raise KeyError inside calculate_nutrition
      unknown = [n for n in requested if n not in matched_cols]
      if unknown:
        print(f"Ignoring unknown nutrients: {unknown}")
      nutrient_info = [n for n in requested if n in matched_cols]

  else:
    print('Great, have a great rest of your day!')
    nutrient_info = None

  if nutrient_info is not None:
    if matched_df is None:
      print("No ingredients were matched, so nutrition cannot be calculated.")
    else:
      totals, coverage = calculate_nutrition(matched_df, intake, group, life_stage, nutrient_info)
      print("\nPercentage of Daily Nutrient Requirements: ")
      print(coverage)
      print("\nTotal Amounts in Food: ")
      print(totals)




What food would you like to look at today? spaghetti
['spaghetti', 'allspice', 'bay leaf', 'eggs', 'guanciale (or pancetta/bacon)', 'black pepper', 'pepper', 'tomato paste', 'pasta (penne or ziti)', 'salt', 'bechamel sauce (milk, butter, flour, nutmeg, cheese)', 'olive oil', 'ground beef or lamb', 'onion', 'red wine', 'pecorino romano cheese', 'cinnamon', 'garlic']
Would you like to add, remove, or no? (choose one)add
Enter ingredient to add: shrimp
['spaghetti', 'allspice', 'bay leaf', 'eggs', 'guanciale (or pancetta/bacon)', 'black pepper', 'pepper', 'tomato paste', 'pasta (penne or ziti)', 'salt', 'bechamel sauce (milk, butter, flour, nutmeg, cheese)', 'olive oil', 'ground beef or lamb', 'onion', 'red wine', 'pecorino romano cheese', 'cinnamon', 'garlic', 'shrimp']
Would you like to add, remove, or no? (choose one)no
No match for guanciale (or pancetta/bacon).
No match for pasta (penne or ziti).
No match for bechamel sauce (milk, butter, flour, nutmeg, cheese).
Which group are you p

In [None]:
# make coverage_df: one row per nutrient with its unit and percent coverage
coverage_df = (
    coverage.to_frame('Percent(%) Daily')
    .reset_index()
    .rename(columns={'level_0': 'Nutrient', 'level_1': 'Unit'})
)

# get info: one sentence per nutrient, used as the retrieval corpus.
# A non-finite percentage (inf / NaN) means there was no usable requirement
# value to divide by, so say that instead of emitting "inf %" as a fact.
docs = [
    f"{nutrient} {unit}: {pct:.1f} % of daily needs."
    if np.isfinite(pct)
    else f"{nutrient} {unit}: no daily requirement value available."
    for nutrient, unit, pct in coverage_df.itertuples(index=False, name=None)
]

docs

['Vitamin C (mg/d): 134.5 % of daily needs.',
 'Vitamin D (μg/d): 20.7 % of daily needs.',
 'Vitamin E (mg/d): 60.1 % of daily needs.',
 'Calcium (mg/d): 455.7 % of daily needs.',
 'Iron (mg/d): 1117.8 % of daily needs.',
 'Potassium (mg/d): 283.8 % of daily needs.',
 'Sodium (mg/d): 356.3 % of daily needs.',
 'Carbohydrate (g/d): 258.4 % of daily needs.',
 'Fat (g/d): inf % of daily needs.',
 'Protein (g/d): 161.9 % of daily needs.']

In [None]:
# build keyword retriever
def retrieve(query, docs, stopwords=None):
  """
  Return the documents that share at least one keyword with the query.

  Matching is done on whole, lower-cased words with surrounding punctuation
  stripped, so a short word such as 'in' no longer matches inside
  'Vitamin', 'Protein' or 'inf'.

  Parameters:
    query: the user's question.
    docs: list of document strings to search.
    stopwords: optional iterable of words to ignore in the query; defaults
      to a small set of common English function words.

  Returns:
    The matching documents in their original order, or
    ["No relevant information found."] when nothing matches.
  """
  import string  # local import: only needed here

  if stopwords is None:
    stopwords = {
        "a", "about", "an", "and", "are", "do", "does", "for", "how", "i",
        "in", "is", "it", "many", "me", "much", "my", "of", "on", "the",
        "there", "this", "to", "what", "with",
    }

  def tokens(text):
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return {word for word in stripped if word}

  keywords = tokens(query) - set(stopwords)
  results = [doc for doc in docs if keywords & tokens(doc)]
  return results if results else ["No relevant information found."]

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

# token: read it from the HF_TOKEN environment variable instead of hard-coding
# it in the notebook. When the variable is unset, token is None and login()
# falls back to its interactive prompt.
login(token = os.environ.get("HF_TOKEN"))

# initialize model
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

def generate_response(user_input, docs, ask=True):
  """
  Answer a nutrition question with the language model, using retrieved docs as context.

  Parameters:
    user_input: the question to answer. Only used when `ask` is False.
    docs: list of nutrition fact strings searched by retrieve().
    ask: when True (default, the original behaviour) the question is read
      interactively with input() and replaces `user_input`; pass False to
      answer the `user_input` argument directly.

  Returns:
    The generated answer text only (the prompt is not echoed back).
  """
  if ask:
    user_input = input("What would you like to know? \n")

  ret_info = retrieve(user_input, docs)
  context = "\n".join(ret_info)
  prompt = f"Here is some nutrition info: \n{context}\n\nAnswer the question\n{user_input} in a friendly way."

  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
  outputs = model.generate(
      **inputs,
      max_new_tokens=150,
      do_sample=True,
      temperature=0.7,
      top_p=0.9,
      pad_token_id = tokenizer.eos_token_id
  )

  # For a causal LM, generate() returns the prompt tokens followed by the
  # continuation; decode only the newly generated part.
  prompt_len = inputs["input_ids"].shape[-1]
  response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
  return response.strip()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# NOTE: generate_response() reads the question itself via input(); the
# `user_input` value passed here is replaced inside the function.
quest_answer = generate_response(user_input, docs)
print(quest_answer)

What would you like to know? 
how much calcium in food
Here is some nutrition info: 
Vitamin C (mg/d): 134.5 % of daily needs.
Vitamin D (μg/d): 20.7 % of daily needs.
Vitamin E (mg/d): 60.1 % of daily needs.
Calcium (mg/d): 455.7 % of daily needs.
Fat (g/d): inf % of daily needs.
Protein (g/d): 161.9 % of daily needs.

Answer the question
how much calcium in food in a friendly way. 
There is a lot of calcium in this food. 
The amount of calcium in this food is 455.7 % of daily needs. This is much more than what we need for the day. 
We get 455.7 % of daily needs of calcium from this food. 
This food contains a lot of calcium, 455.7 % of daily needs. 
There is 455.7 % of daily needs of calcium in this food. 
This food has 455.7 % of daily needs of calcium. 
There is a lot of calcium in this food: 455.7 % of daily needs. 
The food has 455.7 % of daily needs of calcium. 
There is a lot of calcium


In [None]:
# user_input = 'how much calcium '

# ret_info = retrieve(user_input, docs)
# context = "\n".join(ret_info)
# prompt = f"Here is some nutrition info: \n{context}\n\nAnswer the question\n{user_input} and add some more info"

# flan_response = flan(prompt, do_sample = True, temperature = 0.7, top_p = 0.9)
# print(flan_response[0]['generated_text'])

# Produced Output: Calcium (mg/d): 631.3 % of daily needs.


### Observations

I used the text2text-generation pipeline with flan-t5-base, but it keeps returning short outputs that are exact or near-exact copies of the provided context.

I tried several prompts like:
prompt = f"You are a friendly nutrition expert. Based on some nutrition info: \n{context}\n\nExplain the content in a friendly, helpful way\n{user_input}"


prompt = f"Here is some nutrition info: \n{context}\n\nAnswer the question\n{user_input} and add some more info"
