<a href="https://colab.research.google.com/github/ratul41907/Food-and-Nutrition-Based-RAG-Chatbot/blob/main/CraveBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing libraries

In [1]:
!pip install scikit-learn sentence-transformers transformers faiss-cpu pandas pyarrow


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [5]:
import pandas as pd


input_path = "/content/sample_data/food_sample_10k.parquet"


df = pd.read_parquet(input_path)

# Inspect the data
print(df.shape)
print(df.columns)
df.head(3)


(10000, 110)
Index(['additives_n', 'additives_tags', 'allergens_tags', 'brands_tags',
       'brands', 'categories', 'categories_tags', 'categories_properties',
       'checkers_tags', 'ciqual_food_name_tags',
       ...
       'states_tags', 'stores_tags', 'stores', 'traces_tags', 'unique_scans_n',
       'unknown_ingredients_n', 'unknown_nutrients_tags', 'vitamins_tags',
       'with_non_nutritive_sweeteners', 'with_sweeteners'],
      dtype='object', length=110)


Unnamed: 0,additives_n,additives_tags,allergens_tags,brands_tags,brands,categories,categories_tags,categories_properties,checkers_tags,ciqual_food_name_tags,...,states_tags,stores_tags,stores,traces_tags,unique_scans_n,unknown_ingredients_n,unknown_nutrients_tags,vitamins_tags,with_non_nutritive_sweeteners,with_sweeteners
26085,0.0,[],[en:fish],[orca-bay-seafoods-inc],Orca Bay Seafoods Inc.,undefined,[en:undefined],"{'agribalyse_food_code': None, 'agribalyse_pro...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,0.0,[],[],,
16614,2.0,"[en:e330, en:e440]",[],[fischer-wieser-specialty-foods-inc],Fischer & Wieser Specialty Foods Inc.,"Condiments, Sauces, Groceries","[en:condiments, en:sauces, en:groceries]","{'agribalyse_food_code': None, 'agribalyse_pro...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,1.0,[],[],,
9037,6.0,"[en:e110, en:e129, en:e322, en:e322i, en:e407,...",[en:soybeans],[over-the-top],Over The Top,undefined,[en:undefined],"{'agribalyse_food_code': None, 'agribalyse_pro...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,1.0,[],[],,


# Data Cleaning

In [8]:
df_cleaned = df.dropna(subset=["product_name"])


In [9]:
#replacing values
df_cleaned["ingredients_text"] = df_cleaned["ingredients_text"].fillna("")
df_cleaned["categories"] = df_cleaned["categories"].fillna("")
df_cleaned["brands"] = df_cleaned["brands"].fillna("Unknown")
df_cleaned["nutriscore_grade"] = df_cleaned["nutriscore_grade"].fillna("Not Rated")


In [10]:
#display first few rows
df_cleaned[['ingredients_text', 'categories', 'brands', 'nutriscore_grade']].head()

Unnamed: 0,ingredients_text,categories,brands,nutriscore_grade
26085,"[{'lang': 'main', 'text': 'PACIFIC COD (GADUS ...",undefined,Orca Bay Seafoods Inc.,a
16614,"[{'lang': 'main', 'text': 'Organic sugar, orga...","Condiments, Sauces, Groceries",Fischer & Wieser Specialty Foods Inc.,d
9037,"[{'lang': 'main', 'text': 'SUGAR, RICE FLOUR, ...",undefined,Over The Top,unknown
3155,"[{'lang': 'main', 'text': 'ERYTHRITOL, REB A*,...",undefined,Kroger,unknown
10917,"[{'lang': 'main', 'text': 'HIGH FRUCTOSE CORN ...",en:barbecue-sauce,Sweet Baby Ray's,e


In [11]:
# Function to safely extract nutrient values
def safe_get(d, key):
    try:
        return d.get(key, 0) if isinstance(d, dict) else 0
    except:
        return 0

# Extract key nutrients
df_cleaned["energy_kcal"] = df_cleaned["nutriments"].apply(lambda x: safe_get(x, "energy-kcal_100g"))
df_cleaned["proteins"] = df_cleaned["nutriments"].apply(lambda x: safe_get(x, "proteins_100g"))
df_cleaned["fat"] = df_cleaned["nutriments"].apply(lambda x: safe_get(x, "fat_100g"))
df_cleaned["sugar"] = df_cleaned["nutriments"].apply(lambda x: safe_get(x, "sugars_100g"))

# Display the extracted nutritional columns
df_cleaned[['product_name', 'energy_kcal', 'proteins', 'fat', 'sugar']].head()


Unnamed: 0,product_name,energy_kcal,proteins,fat,sugar
26085,"[{'lang': 'main', 'text': 'Wild Cod Fillet'}, ...",0,0,0,0
16614,"[{'lang': 'main', 'text': 'Fischer & Wieser, O...",0,0,0,0
9037,"[{'lang': 'main', 'text': 'Decorating'}, {'lan...",0,0,0,0
3155,"[{'lang': 'main', 'text': 'Zero Calorie Sweete...",0,0,0,0
10917,"[{'lang': 'main', 'text': 'Barbecue Sauce'}, {...",0,0,0,0


In [12]:
# Display the first few values
df_cleaned['nutriments'].head()


Unnamed: 0,nutriments
26085,"[{'100g': 192.0, 'name': 'energy', 'prepared_1..."
16614,"[{'100g': 0.0, 'name': 'fiber', 'prepared_100g..."
9037,"[{'100g': None, 'name': 'saturated-fat', 'prep..."
3155,"[{'100g': None, 'name': 'energy-kcal', 'prepar..."
10917,"[{'100g': 50.0, 'name': 'carbohydrates', 'prep..."


In [13]:
# Limit the ingredients_text to the first 300 characters
df_cleaned["ingredients_text"] = df_cleaned["ingredients_text"].astype(str).str[:300]

# Display the first few rows to verify
df_cleaned[['ingredients_text']].head()


Unnamed: 0,ingredients_text
26085,"[{'lang': 'main', 'text': 'PACIFIC COD (GADUS ..."
16614,"[{'lang': 'main', 'text': 'Organic sugar, orga..."
9037,"[{'lang': 'main', 'text': ""SUGAR, RICE FLOUR, ..."
3155,"[{'lang': 'main', 'text': 'ERYTHRITOL, REB A*,..."
10917,"[{'lang': 'main', 'text': 'HIGH FRUCTOSE CORN ..."


In [15]:
#drop rows without nutrients

df_cleaned = df_cleaned.dropna(subset=["nutriments"])
print('After dropping shape')
print( {df_cleaned.shape})


After dropping shape
{(9901, 114)}


In [16]:
# Save the cleaned dataset to Google Drive
df_cleaned.to_parquet("/content/drive/MyDrive/Colab Notebooks/food_cleaned_sample.parquet")

# Confirm the file has been saved
print("Cleaned data saved")


Cleaned data saved


# **Text Procesisng**

# Text Tokenization

In [None]:
#tokenization splits text into smaller units tokens, one of the step of NLP
#transfers longe sequence text into manageable pieces

In [None]:
#natural language toolkit will be used

In [20]:
import nltk
from nltk.tokenize import word_tokenize

#  download punkt_tab toknizer model
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [21]:


# Tokenize the ingredients_text column
df_cleaned['ingredients_tokens'] = df_cleaned['ingredients_text'].apply(word_tokenize)

# Display the first few rows to verify
df_cleaned[['ingredients_text', 'ingredients_tokens']].head()


Unnamed: 0,ingredients_text,ingredients_tokens
26085,"[{'lang': 'main', 'text': 'PACIFIC COD (GADUS ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
16614,"[{'lang': 'main', 'text': 'Organic sugar, orga...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
9037,"[{'lang': 'main', 'text': ""SUGAR, RICE FLOUR, ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
3155,"[{'lang': 'main', 'text': 'ERYTHRITOL, REB A*,...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
10917,"[{'lang': 'main', 'text': 'HIGH FRUCTOSE CORN ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."


In [None]:
#remove stopwards a common practice in text processing
#stopwards are high frequency words that carry high frequency models

In [22]:
from nltk.corpus import stopwords

#Download stopwords
nltk.download('stopwords')

#Define stopwords
stop_words = set(stopwords.words('english'))

#remove stopwords from tokenized text
df_cleaned['ingredients_tokens'] = df_cleaned['ingredients_tokens'].apply(
    lambda tokens: [word for word in tokens if word.lower() not in stop_words]
)

#Display the first few rows to verify
df_cleaned[['ingredients_text', 'ingredients_tokens']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,ingredients_text,ingredients_tokens
26085,"[{'lang': 'main', 'text': 'PACIFIC COD (GADUS ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
16614,"[{'lang': 'main', 'text': 'Organic sugar, orga...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
9037,"[{'lang': 'main', 'text': ""SUGAR, RICE FLOUR, ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
3155,"[{'lang': 'main', 'text': 'ERYTHRITOL, REB A*,...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
10917,"[{'lang': 'main', 'text': 'HIGH FRUCTOSE CORN ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."


In [23]:
#Stemming
#reducing words to threir base form, running-run
#improves efficiency in ML model
#we use porterstemmer for it
from nltk.stem import PorterStemmer

#initialize the stemmer
stemmer = PorterStemmer()

#apply stemming to the tokenized text
df_cleaned['ingredients_tokens'] = df_cleaned['ingredients_tokens'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

df_cleaned[['ingredients_text', 'ingredients_tokens']].head()


Unnamed: 0,ingredients_text,ingredients_tokens
26085,"[{'lang': 'main', 'text': 'PACIFIC COD (GADUS ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
16614,"[{'lang': 'main', 'text': 'Organic sugar, orga...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
9037,"[{'lang': 'main', 'text': ""SUGAR, RICE FLOUR, ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
3155,"[{'lang': 'main', 'text': 'ERYTHRITOL, REB A*,...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
10917,"[{'lang': 'main', 'text': 'HIGH FRUCTOSE CORN ...","[[, {, 'lang, ', :, 'main, ', ,, 'text, ', :, ..."
