# Load Libraries and packages

In [None]:
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,  BloomForCausalLM, GenerationConfig

In [2]:
df = pd.read_excel('./nutrition_data.xlsx',index_col = 0)

# We prefer not to drop the rows with missing values since just one column (`saturated_fact) contians missing values.
# Instead we will fill that missing vlaues with -1 which to flag the value
# At the end, we can modify the LLM model, not to include features in response with -1 values.
df["saturated_fat"].fillna(-1,  inplace = True) 


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["saturated_fat"].fillna(-1,  inplace = True)


In [3]:
result = df[df['name'].str.contains('eggplant', case=False, na=False)]
result

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,choline,folate,folic_acid,...,fat,saturated_fatty_acids,monounsaturated_fatty_acids,polyunsaturated_fatty_acids,fatty_acids_total_trans,alcohol,ash,caffeine,theobromine,water
2,"Eggplant, raw",100 g,25,0.2g,-1,0,2.00 mg,6.9 mg,22.00 mcg,0.00 mcg,...,0.18 g,0.034 g,0.016 g,0.076 g,0.00 mg,0.0 g,0.66 g,0.00 mg,0.00 mg,92.30 g
382,"Eggplant, pickled",100 g,49,0.7g,0.1g,0,1674.00 mg,11.9 mg,20.00 mcg,0.00 mcg,...,0.70 g,0.140 g,0.063 g,0.294 g,0.00 mg,0.0 g,1.73 g,0.00 mg,0.00 mg,86.90 g
2777,"Eggplant, with salt, drained, boiled, cooked",100 g,33,0.2g,-1,0,239.00 mg,9.4 mg,14.00 mcg,0.00 mcg,...,0.23 g,0.044 g,0.020 g,0.093 g,0.00 mg,0,1.13 g,0,0,89.67 g
3642,"Eggplant, without salt, drained, boiled, cooked",100 g,35,0.2g,-1,0,1.00 mg,9.4 mg,14.00 mcg,0.00 mcg,...,0.23 g,0.044 g,0.020 g,0.093 g,0.00 mg,0.0 g,0.54 g,0.00 mg,0.00 mg,89.67 g


In [None]:
data = df.to_dict('records') ## convert dataframe to dictionary since the vector database needs data in this format


# Create Vecotor database and insert data 

In [7]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings
print("Size of vectors in this encoder:", encoder.get_sentence_embedding_dimension()) # print size of vectors for this encoder

Size of vectors in this encoder: 384


In [6]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [8]:
# Create collection to store nutrition facts
qdrant.recreate_collection(
    collection_name="NutritionFacts",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

  qdrant.recreate_collection(


True

In [9]:
# vectorize data while insterting to the database
qdrant.upload_points(
    collection_name="NutritionFacts",
    points=[
        models.PointStruct(
            id=idx,
            vector=encoder.encode(doc["name"]).tolist(),#converts each food's name into a vector representation that is then stored in the Qdrant vector database
            payload=doc,
        ) for idx, doc in enumerate(data)
    ]
)

In [10]:
# Retrieve three first vectors

result = qdrant.retrieve(
    collection_name="NutritionFacts",
    ids=[0, 1, 2],
    with_vectors = True,  
)

# Print the retrieved points
for point in result:
    print(point.vector)  
    print(point.payload)

[-0.061841584742069244, -0.029764438048005104, -0.016321543604135513, -0.0395667590200901, -0.020090274512767792, -0.04370187968015671, 0.0702875629067421, -0.0009381874115206301, -0.047837626188993454, -0.00741471815854311, -0.001942173927091062, -0.03020583651959896, -0.040940556675195694, -0.0921119675040245, -0.0927346870303154, 0.01581023633480072, 0.07192113995552063, 0.00581180676817894, 0.03096054308116436, -0.05723259225487709, -0.10090691596269608, -0.0001583121920702979, 0.020948324352502823, 0.02622985653579235, 0.01878185011446476, 0.07416931539773941, -0.04164836183190346, -0.0007163010886870325, -0.0417761355638504, -0.1384485512971878, 0.07779998332262039, 0.02203291282057762, -0.07269972562789917, 0.04466301575303078, 0.020892152562737465, -0.0260228980332613, -0.030750274658203125, 0.0015605385415256023, -0.003917152062058449, 0.016807004809379578, 0.028147447854280472, -0.09563429653644562, -0.03698631748557091, -0.021277140825986862, 0.01177098136395216, 0.057510856

# Query the database with user propmt

In [11]:
# Search time for finding nutrition facts of some food!
user_prompt = "How many calories does eggplant have?"
hits = qdrant.search(
    collection_name="NutritionFacts",
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'name': 'Eggplant, with salt, drained, boiled, cooked', 'serving_size': '100 g', 'calories': 33, 'total_fat': '0.2g', 'saturated_fat': -1, 'cholesterol': 0, 'sodium': '239.00 mg', 'choline': '9.4 mg', 'folate': '14.00 mcg', 'folic_acid': '0.00 mcg', 'niacin': '0.600 mg', 'pantothenic_acid': '0.075 mg', 'riboflavin': '0.020 mg', 'thiamin': '0.076 mg', 'vitamin_a': '37.00 IU', 'vitamin_a_rae': '2.00 mcg', 'carotene_alpha': '0.00 mcg', 'carotene_beta': '22.00 mcg', 'cryptoxanthin_beta': '0.00 mcg', 'lutein_zeaxanthin': '0.00 mcg', 'lucopene': 0, 'vitamin_b12': '0.00 mcg', 'vitamin_b6': '0.086 mg', 'vitamin_c': '1.3 mg', 'vitamin_d': '0.00 IU', 'vitamin_e': '0.41 mg', 'tocopherol_alpha': '0.41 mg', 'vitamin_k': '2.9 mcg', 'calcium': '6.00 mg', 'copper': '0.059 mg', 'irom': '0.25 mg', 'magnesium': '11.00 mg', 'manganese': '0.113 mg', 'phosphorous': '15.00 mg', 'potassium': '123.00 mg', 'selenium': '0.1 mcg', 'zink': '0.12 mg', 'protein': '0.83 g', 'alanine': '0.042 g', 'arginine': '0.046 g

In [12]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]
search_results

[{'name': 'Eggplant, with salt, drained, boiled, cooked',
  'serving_size': '100 g',
  'calories': 33,
  'total_fat': '0.2g',
  'saturated_fat': -1,
  'cholesterol': 0,
  'sodium': '239.00 mg',
  'choline': '9.4 mg',
  'folate': '14.00 mcg',
  'folic_acid': '0.00 mcg',
  'niacin': '0.600 mg',
  'pantothenic_acid': '0.075 mg',
  'riboflavin': '0.020 mg',
  'thiamin': '0.076 mg',
  'vitamin_a': '37.00 IU',
  'vitamin_a_rae': '2.00 mcg',
  'carotene_alpha': '0.00 mcg',
  'carotene_beta': '22.00 mcg',
  'cryptoxanthin_beta': '0.00 mcg',
  'lutein_zeaxanthin': '0.00 mcg',
  'lucopene': 0,
  'vitamin_b12': '0.00 mcg',
  'vitamin_b6': '0.086 mg',
  'vitamin_c': '1.3 mg',
  'vitamin_d': '0.00 IU',
  'vitamin_e': '0.41 mg',
  'tocopherol_alpha': '0.41 mg',
  'vitamin_k': '2.9 mcg',
  'calcium': '6.00 mg',
  'copper': '0.059 mg',
  'irom': '0.25 mg',
  'magnesium': '11.00 mg',
  'manganese': '0.113 mg',
  'phosphorous': '15.00 mg',
  'potassium': '123.00 mg',
  'selenium': '0.1 mcg',
  'zink': '

# Integrate LLMs with retrieved data


In the following, we connect the retrieved data from vector database to three large language models: BLOOM, FLAN-T5 and BART.

These models are not large enough to utilize the retrieved data to produce a good reposne. But, at least they show how we can utilize an LLM to provide a good response to uset.

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,  BloomForCausalLM, GenerationConfig


# Load model and tokenizer
## BLOOM
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
# model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m")

# BART
model_name = "facebook/bart-large-cnn" 
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

## FLAN-T5
# model_name = 'google/flan-t5-base'
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

## Make retrieved text ready
features = list(df.columns)
features_preprocessed= [i.replace("_", " ") for i in features]

retrieved_texts = []
for doc in search_results:
    output = ""
    for ind in range(len(features)):
        output += f"{features_preprocessed[ind]} : {doc[features[ind]]}\n "
    retrieved_texts.append(output)

augmented_input = "\n\n".join(retrieved_texts) + "Summarize the key nutritional highlights from the data above related to user prompt.\n\nUser_prompt:" + user_prompt

# Tokenize the input text
inputs_encoded = tokenizer.encode(augmented_input, return_tensors="pt", max_length=1024, truncation=True)

# Generate the summary using the BART model
output_encoded = model.generate(inputs_encoded,max_length=1024, min_length=50, length_penalty=3.0, num_beams=5, early_stopping=True)

# Decode the generated summary back to text
output = tokenizer.decode(output_encoded[0], skip_special_tokens=True)

print(output)

name : Eggplant, raw Carbuncle serving size : 100 g calories : 25 Carbuncle total fat : 0.2g, saturated fat : -1, cholesterol : 2.00 mg, choline : 6.9 mg folate : 22.00 mcg, niacin: 0.649 mg, riboflavin:0.037 mg, thiamin: 1.039 mg, vitamin a: 23.00 IU, vitamin b: 2.2 mg, Vitamin b6: 3.5 mg, and vitamin d: 4.0 mg.
