In [None]:
!pip install datasets polars

In [None]:
from datasets import load_dataset

# Load the 'food' split
ds = load_dataset("openfoodfacts/product-database", split="food")

# Preview a few rows
print(ds[0])
print(ds.column_names)

In [79]:
import pandas as pd
import pyarrow 

# 1. Define the file path
file_path = "/kaggle/input/cleaned-food/food_cleaned.parquet"

try:
    df = pd.read_parquet(file_path)

    # 3. Print the column names
    print("--- Column Names ---")
    print(df.columns.tolist())
    print("--------------------")

except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
except Exception as e:
    print(f"An error occurred while reading the Parquet file: {e}")

--- Column Names ---
['name', 'energy_kcal_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g']
--------------------


In [None]:
from tqdm import tqdm
import pandas as pd

def extract_nutriments(item):
    nutriments_list = item.get('nutriments', None)
    nutriments = {}
    if nutriments_list and isinstance(nutriments_list, list):
        nutriments = {n['name']: n['value'] for n in nutriments_list if isinstance(n, dict) and 'name' in n and 'value' in n}
    return {
        "name": item.get("product_name", [{}])[0].get("text", "") if item.get("product_name") else "",
        "energy_kcal_100g": nutriments.get("energy-kcal", None),
        "fat_100g": nutriments.get("fat", None),
        "carbohydrates_100g": nutriments.get("carbohydrates", None),
        "sugars_100g": nutriments.get("sugars", None),
        "proteins_100g": nutriments.get("proteins", None)
    }

# Apply to dataset with progress bar
cleaned_data = [extract_nutriments(item) for item in tqdm(ds)]

# Create DataFrame
df = pd.DataFrame(cleaned_data)
print(df.head())

#DATAFRAME

df.to_parquet('/kaggle/working/food_cleaned.parquet')              

print("Saved as food_cleaned.csv and food_cleaned.parquet")

In [80]:
import pandas as pd

df = pd.read_parquet("/kaggle/input/food-cleaned-data/food_cleaned_clean.parquet")



print(df.head())


                                                name  energy_kcal_100g  \
0  Véritable pâte à tartiner noisettes chocolat noir        617.000000   
1                               Chamomile Herbal Tea        280.000000   
2                     Lagg's, herbal tea, peppermint          0.000000   
3                                 Linden Flowers Tea        213.320007   
4                               Herbal Tea, Hibiscus        267.000000   

   fat_100g  carbohydrates_100g  sugars_100g  proteins_100g  
0      48.0           36.000000         32.0       8.000000  
1       0.0           70.000000          0.0       0.000000  
2       0.0            1.470000          0.0       0.000000  
3       0.0           53.330002          0.0       0.000000  
4       0.0           60.000000          0.0      66.669998  


In [81]:
from sklearn.model_selection import train_test_split


df['low_calorie'] = df['energy_kcal_100g'] < 100

# Features
X = df[['fat_100g', 'carbohydrates_100g', 'proteins_100g']].fillna(0)
y = df['low_calorie']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [88]:
!pip install --upgrade seaborn matplotlib pandas numpy

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Collecting numpy
  Downloading numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:

In [82]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

print("Accuracy:", clf.score(X_test, y_test))


Accuracy: 0.9706412527234823


In [89]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report  # Make sure to import this!

# Assuming clf is your trained model and X_test, y_test are your test features and labels

y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Extract the confusion matrix elements
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

# Print confusion matrix and classification report
print("Confusion Matrix:\n", cm)
print(classification_report(y_test, y_pred))  # This line works now

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

# Visualize confusion matrix using Seaborn heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


Confusion Matrix:
 [[436430   8072]
 [ 14404 306658]]
              precision    recall  f1-score   support

       False       0.97      0.98      0.97    444502
        True       0.97      0.96      0.96    321062

    accuracy                           0.97    765564
   macro avg       0.97      0.97      0.97    765564
weighted avg       0.97      0.97      0.97    765564

True Positives (TP): 306658
True Negatives (TN): 436430
False Positives (FP): 8072
False Negatives (FN): 14404


In [85]:
selected_features = ['fat_100g', 'carbohydrates_100g']

X_sel = df[selected_features].fillna(0)
y = df['low_calorie']

X_train_sel, X_test_sel, y_train, y_test = train_test_split(
    X_sel, y, test_size=0.2, random_state=42
)

clf_sel = RandomForestClassifier(n_estimators=100, random_state=42)
clf_sel.fit(X_train_sel, y_train)

print("Accuracy (selected features):", clf_sel.score(X_test_sel, y_test))


Accuracy (selected features): 0.9447400870469354


In [87]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns 

# --- Ensure this line is at the top of your notebook/cell for plotting ---
# %matplotlib inline 

# Make predictions on the test set
y_pred_sel = clf_sel.predict(X_test_sel)

# Generate confusion matrix
cm_sel = confusion_matrix(y_test, y_pred_sel)

# Extract confusion matrix elements
TN_sel = cm_sel[0, 0]
FP_sel = cm_sel[0, 1]
FN_sel = cm_sel[1, 0]
TP_sel = cm_sel[1, 1]

# Print confusion matrix and classification report
print("Confusion Matrix:\n", cm_sel)
print(classification_report(y_test, y_pred_sel)) 

print(f"True Positives (TP): {TP_sel}")
print(f"True Negatives (TN): {TN_sel}")
print(f"False Positives (FP): {FP_sel}")
print(f"False Negatives (FN): {FN_sel}")



counts = [TP_sel, TN_sel, FP_sel, FN_sel]
labels = ['True Positive (TP)', 'True Negative (TN)', 'False Positive (FP)', 'False Negative (FN)']

# Create a new figure for the bar plot
plt.figure(figsize=(10, 5)) 

# Generate the bar plot
sns.barplot(x=labels, y=counts, palette='viridis') 
plt.title('Classification Outcome Counts')
plt.ylabel('Number of Instances')
plt.show() # Display the bar graph

# =======================================================


# Visualize confusion matrix using Seaborn heatmap
plt.figure(figsize=(6, 5))
sns.heatmap(cm_sel, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Negative', 'Predicted Positive'], yticklabels=['Actual Negative', 'Actual Positive'])
plt.title('Confusion Matrix (Selected Features)')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show() # Display the heatmap

Confusion Matrix:
 [[425628  18874]
 [ 23431 297631]]
              precision    recall  f1-score   support

       False       0.95      0.96      0.95    444502
        True       0.94      0.93      0.93    321062

    accuracy                           0.94    765564
   macro avg       0.94      0.94      0.94    765564
weighted avg       0.94      0.94      0.94    765564

True Positives (TP): 297631
True Negatives (TN): 425628
False Positives (FP): 18874
False Negatives (FN): 23431


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import itertools, os
import joblib

df = pd.read_parquet("/kaggle/input/cleaned-food/food_cleaned.parquet")

selected_features = ['fat_100g', 'carbohydrates_100g']
X = df[selected_features].fillna(0)
y = df['energy_kcal_100g'] < 100

X_train_sel, X_test_sel, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Import necessary libraries
import pandas as pd
import os
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
import joblib

# --- 1. Define Parameter Grid ---
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Generate all combinations of the hyperparameters
param_combinations = list(itertools.product(
    param_grid['n_estimators'],
    param_grid['max_depth'],
    param_grid['min_samples_split']
))

# File to store grid search progress
progress_file = "/kaggle/working/grid_search_results.csv"
# --- 2. Load Previous Results (if exists) ---
if os.path.exists(progress_file):
    prev = pd.read_csv(progress_file)
    done = set(tuple(x) for x in prev[['n_estimators','max_depth','min_samples_split']].values)
    results = prev.to_dict('records')
    print(f"Loaded {len(results)} previous results.")
else:
    done = set()
    results = []
# --- 3. Grid Search Loop ---
for n, d, s in tqdm(param_combinations, desc="Grid Search Progress"):
    if (n, d, s) in done:
        continue

    # Create the RandomForest model with current hyperparameters
    model = RandomForestClassifier(
        n_estimators=n,
        max_depth=d,
        min_samples_split=s,
        random_state=42,
        n_jobs=-1
    )

    # Perform cross-validation and calculate mean score
    scores = cross_val_score(model, X_train_sel, y_train, cv=3, scoring='accuracy')
    mean_score = scores.mean()

    # Append results to the list
    results.append({
        'n_estimators': n,
        'max_depth': d,
        'min_samples_split': s,
        'mean_score': mean_score
    })

    
    pd.DataFrame(results).to_csv(progress_file, index=False)
# --- 4. Extract Best Hyperparameters ---
df_results = pd.DataFrame(results)


best = df_results.iloc[df_results['mean_score'].idxmax()]

print("Best Params:")
print(best)

final_model = RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=None if pd.isna(best['max_depth']) else int(best['max_depth']),
    min_samples_split=int(best['min_samples_split']),
    random_state=42,
    n_jobs=-1
)

# Train the model with the full training data
final_model.fit(X_train_sel, y_train)

# Save the best model to disk
joblib.dump(final_model, "/kaggle/working/best_final_rf_model.joblib")
print("Saved best model as best_rf_model.joblib")


Loaded 10 previous results.


Grid Search Progress:   7%|▋         | 2/27 [05:51<1:13:18, 175.94s/it]


KeyboardInterrupt: 

In [90]:
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# -----------------------------
# 1. Load Final Best Model
# -----------------------------
model = joblib.load("/kaggle/input/best-joblib-file/best_rf_model.joblib")

# -----------------------------
# 2. Load Dataset
# -----------------------------
df = pd.read_parquet("/kaggle/input/food-cleaned-data/food_cleaned_clean.parquet")

# Select the same features used in training
selected_features = ['fat_100g', 'carbohydrates_100g']
X = df[selected_features].fillna(0)
y = df['energy_kcal_100g'] < 100     # low-calorie = True

# Train-test split
X_train_sel, X_test_sel, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 3. Predictions & Metrics
# -----------------------------
y_pred = model.predict(X_test_sel)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)

# Classification Report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# Extract TP, TN, FP, FN
TN, FP, FN, TP = cm.ravel()
print(f"\nTP = {TP}")
print(f"TN = {TN}")
print(f"FP = {FP}")
print(f"FN = {FN}")

# -----------------------------
# 4. Matplotlib Confusion Matrix Visualization
# -----------------------------
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted High-Calorie', 'Predicted Low-Calorie'],
            yticklabels=['Actual High-Calorie', 'Actual Low-Calorie'])
plt.title("Confusion Matrix (Final Best RF Model)")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
plt.show()


Accuracy: 0.7155979121275295

Confusion Matrix:
 [[429303  15199]
 [202529 118533]]

Classification Report:

              precision    recall  f1-score   support

       False       0.68      0.97      0.80    444502
        True       0.89      0.37      0.52    321062

    accuracy                           0.72    765564
   macro avg       0.78      0.67      0.66    765564
weighted avg       0.77      0.72      0.68    765564


TP = 118533
TN = 429303
FP = 15199
FN = 202529


In [None]:
# These lists must stay in the same order as df
food_names   = df["name"].tolist()
kcal_data    = df["energy_kcal_100g"].tolist()
fat_data     = df["fat_100g"].tolist()
carb_data    = df["carbohydrates_100g"].tolist()
sugar_data   = df["sugars_100g"].tolist()
protein_data = df["proteins_100g"].tolist()

N = len(food_names)
print(f"✅ Prepared lookup lists for {N:,} foods.")


In [None]:
!pip install -q faiss-cpu==1.13.0
!pip install -q sentence-transformers
!pip install -q transformers
!pip install -q accelerate


In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
print("FAISS version:", faiss.__version__)
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
print("Embedder Ready!")



In [6]:
!pip install faiss-cpu
from tqdm.auto import tqdm
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import os

FAISS_INDEX_PATH = '/kaggle/working/faiss_index_master.faiss'
PROGRESS_FILE = '/kaggle/working/faiss_progress.npy'  # Tracks which indices are already added

# Load the embedder
embedder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
embed_dim = embedder.get_sentence_embedding_dimension()
print("Embedding dimension:", embed_dim)

batch_size = 2048
N = len(food_names)  # Make sure this is defined!

# Load progress if exists
if os.path.exists(PROGRESS_FILE):
    processed_indices = set(np.load(PROGRESS_FILE))
    print(f"Resuming from previous progress: {len(processed_indices)}/{N} items already processed.")
else:
    processed_indices = set()

# Initialize or load existing index
if os.path.exists(FAISS_INDEX_PATH):
    print("Loading existing FAISS index...")
    index = faiss.read_index(FAISS_INDEX_PATH)
    print(f"Loaded index with {index.ntotal} vectors.")
    # Double-check consistency (optional but safe)
    if index.ntotal != len(processed_indices):
        print("Warning: Mismatch between index size and progress file! Rebuilding from scratch.")
        index = faiss.IndexFlatL2(embed_dim)
        processed_indices = set()
else:
    index = faiss.IndexFlatL2(embed_dim)

# Find where to start
start_idx = 0
while start_idx < N and start_idx in processed_indices:
    start_idx += 1

if start_idx >= N:
    print("Everything already processed!")
else:
    print(f"Starting/resuming from index {start_idx}")

# Resume building
for start in tqdm(range(start_idx, N, batch_size), desc="Building FAISS index (resumable)"):
    end = min(start + batch_size, N)

    batch_texts = []
    batch_indices = []  # To track which ones we add

    for i in range(start, end):
        if i in processed_indices:
            continue  # Skip already processed

        text = (
            f"{names[i]}. "
            f"Energy: {kcal_data[i]} kcal per 100g. "
            f"Fat: {fat_data[i]} g, "
            f"Carbs: {carb_data[i]} g, "
            f"Sugars: {sugar_data[i]} g, "
            f"Proteins: {protein_data[i]} g."
        )
        batch_texts.append(text)
        batch_indices.append(i)

    if len(batch_texts) == 0:
        continue

    # Encode batch
    print(f"Encoding batch {start}-{end-1} ({len(batch_texts)} new items)...")
    batch_emb = embedder.encode(batch_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=False)
    batch_emb = np.asarray(batch_emb, dtype="float32")

    # Add to index
    index.add(batch_emb)

    # Mark as processed
    processed_indices.update(batch_indices)

    # Save progress frequently
    np.save(PROGRESS_FILE, np.array(list(processed_indices)))
    faiss.write_index(index, FAISS_INDEX_PATH)

    print(f"Added {len(batch_emb)} vectors. Total: {index.ntotal}")

# Final save
faiss.write_index(index, FAISS_INDEX_PATH)
np.save(PROGRESS_FILE, np.array(list(processed_indices)))  # Final save
print(f"FAISS index fully built and saved: {index.ntotal} vectors")
print(f"Progress saved to: {PROGRESS_FILE}")

Embedding dimension: 384


NameError: name 'food_names' is not defined

In [None]:
# These lists must stay in the same order as df
food_names   = df["name"].tolist()
kcal_data    = df["energy_kcal_100g"].tolist()
fat_data     = df["fat_100g"].tolist()
carb_data    = df["carbohydrates_100g"].tolist()
sugar_data   = df["sugars_100g"].tolist()
protein_data = df["proteins_100g"].tolist()

N = len(food_names)
print(f"✅ Prepared lookup lists for {N:,} foods.")


In [None]:
!pip install -q faiss-cpu==1.13.0
!pip install -q sentence-transformers
!pip install -q transformers
!pip install -q accelerate
#!pip install --upgrade torch torchvision



In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np
print("FAISS version:", faiss.__version__)
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
print("Embedder Ready!")


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import os

# Define FAISS index and progress file paths
FAISS_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index.faiss'
PROGRESS_FILE = '/content/drive/MyDrive/Colab Notebooks/faiss_progress.npy'

# Load the embedder
embedder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
embed_dim = embedder.get_sentence_embedding_dimension()
print("Embedding dimension:", embed_dim)

# Define batch size and total number of food items
batch_size = 2048  # Define batch size
N = len(food_names)  # Total number of food items

# Start chunk 35
start = 3253630  # Start index for chunk 35
end = 3349325  # End index for chunk 35

print(f"Processing chunk 35 (from {start} to {end})")

# Initialize FAISS index for this chunk (chunk 33)
index = faiss.IndexFlatL2(embed_dim)

# List to store batch texts and indices
batch_texts = []
batch_indices = []

# Add data to the batch for chunk 33
for i in range(start, end):
    text = (
        f"{food_names[i]}. "
        f"Energy: {kcal_data[i]} kcal per 100g. "
        f"Fat: {fat_data[i]} g, "
        f"Carbs: {carb_data[i]} g, "
        f"Sugars: {sugar_data[i]} g, "
        f"Proteins: {protein_data[i]} g."
    )
    batch_texts.append(text)
    batch_indices.append(i)

# If no batch is empty, proceed with encoding and indexing #3253630 to 3349325
if len(batch_texts) > 0:
    # Encode the batch of food descriptions into embeddings
    print(f"Encoding batch {start}-{end-1} ({len(batch_texts)} new items)...")
    batch_emb = embedder.encode(batch_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=False)
    batch_emb = np.asarray(batch_emb, dtype="float32")

    # Add the batch embeddings to the FAISS index
    index.add(batch_emb)

    # Save the FAISS index for chunk 33 (to overwrite the missing index)
    faiss.write_index(index, FAISS_INDEX_PATH)
    print(f"Saved FAISS index for chunk 33. Total vectors in FAISS: {index.ntotal}")

    # Mark these items as processed
    processed_indices = set(np.load(PROGRESS_FILE)) if os.path.exists(PROGRESS_FILE) else set()
    processed_indices.update(batch_indices)

    # Save progress after processing chunk 33
    np.save(PROGRESS_FILE, np.array(list(processed_indices)))
    print(f"Progress updated for chunk 33. Total processed: {len(processed_indices)} items.")
else:
    print(f"No data to process for chunk 33!")


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import os

# Define FAISS index and progress file paths
FAISS_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index_part_{chunk_idx}.faiss'
PROGRESS_FILE = '/content/drive/MyDrive/Colab Notebooks/faiss_progress.npy'

# Load the embedder
embedder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
embed_dim = embedder.get_sentence_embedding_dimension()
print("Embedding dimension:", embed_dim)

batch_size = 2048  # Define batch size
N = len(food_names)  # Total number of food items

# Step 1: Load progress if exists (Track which items have already been processed)
if os.path.exists(PROGRESS_FILE):
    processed_indices = set(np.load(PROGRESS_FILE))
    print(f"Resuming from previous progress: {len(processed_indices)}/{N} items already processed.")
else:
    processed_indices = set()

# Step 2: Process in chunks (e.g., 40 chunks)
NUM_CHUNKS = 40
chunk_size = N // NUM_CHUNKS  # Divide the total data into chunks

# Step 3: Process each chunk, create a FAISS index for each chunk
for chunk_idx in range(NUM_CHUNKS):
    start = chunk_idx * chunk_size
    end = (chunk_idx + 1) * chunk_size if chunk_idx != NUM_CHUNKS - 1 else N

    print(f"Processing chunk {chunk_idx + 1}/{NUM_CHUNKS} (from {start} to {end})")

    # Initialize FAISS index for this chunk
    index = faiss.IndexFlatL2(embed_dim)

    batch_texts = []
    batch_indices = []  # To track which ones we add

    for i in range(start, end):
        if i in processed_indices:
            continue  # Skip already processed

        # Create the description for each food item
        text = (
            f"{food_names[i]}. "
            f"Energy: {kcal_data[i]} kcal per 100g. "
            f"Fat: {fat_data[i]} g, "
            f"Carbs: {carb_data[i]} g, "
            f"Sugars: {sugar_data[i]} g, "
            f"Proteins: {protein_data[i]} g."
        )
        batch_texts.append(text)
        batch_indices.append(i)

    if len(batch_texts) == 0:
        continue  # Skip if the batch is empty

    # Encode the batch of food descriptions into embeddings
    print(f"Encoding batch {start}-{end-1} ({len(batch_texts)} new items)...")
    batch_emb = embedder.encode(batch_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=False)
    batch_emb = np.asarray(batch_emb, dtype="float32")

    # Add the batch embeddings to the FAISS index
    index.add(batch_emb)

    # Mark these items as processed
    processed_indices.update(batch_indices)

    # Save the FAISS index for this chunk
    faiss.write_index(index, FAISS_INDEX_PATH.format(chunk_idx=chunk_idx))
    print(f"Saved FAISS index for chunk {chunk_idx + 1}. Total vectors in FAISS: {index.ntotal}")

    # Save progress after each chunk
    np.save(PROGRESS_FILE, np.array(list(processed_indices)))

print(f"All chunks processed. FAISS indexes saved.")


In [None]:
# Skip processed chunks and start from chunk 33
start_chunk_idx = 33  # Start from chunk 33

for chunk_idx in range(start_chunk_idx, NUM_CHUNKS):  # Process from chunk 33 to chunk 40
    start = chunk_idx * chunk_size
    end = (chunk_idx + 1) * chunk_size if chunk_idx != NUM_CHUNKS - 1 else N

    print(f"Processing chunk {chunk_idx + 1}/40 (from {start} to {end})")

    batch_texts = []
    batch_indices = []

    for i in range(start, end):
        if i in processed_indices:
            continue  # Skip already processed

        # Create the description for each food item
        text = (
            f"{food_names[i]}. "
            f"Energy: {kcal_data[i]} kcal per 100g. "
            f"Fat: {fat_data[i]} g, "
            f"Carbs: {carb_data[i]} g, "
            f"Sugars: {sugar_data[i]} g, "
            f"Proteins: {protein_data[i]} g."
        )
        batch_texts.append(text)
        batch_indices.append(i)

    if len(batch_texts) == 0:
        continue  # Skip if the batch is empty

    # Encode the batch of food descriptions into embeddings
    print(f"Encoding batch {start}-{end-1} ({len(batch_texts)} new items)...")
    batch_emb = embedder.encode(batch_texts, batch_size=64, show_progress_bar=True, normalize_embeddings=False)
    batch_emb = np.asarray(batch_emb, dtype="float32")

    # Add the batch embeddings to the FAISS index
    index.add(batch_emb)

    # Mark these items as processed
    processed_indices.update(batch_indices)

    # Save the FAISS index for this chunk
    faiss.write_index(index, FAISS_INDEX_PATH.format(chunk_idx=chunk_idx))
    print(f"Saved FAISS index for chunk {chunk_idx + 1}. Total vectors in FAISS: {index.ntotal}")

    # Save progress after each chunk
    np.save(PROGRESS_FILE, np.array(list(processed_indices)))

print(f"All chunks processed starting from chunk {start_chunk_idx}. FAISS index saved.")


In [None]:
import faiss
import os

# Path to where the FAISS chunks are stored
chunk_dir = "/content/drive/MyDrive/Colab Notebooks/"

# List all chunk files (ensure these are named correctly with a part number, like faiss_chunked_index_part_1, etc.)
chunk_files = [f for f in os.listdir(chunk_dir) if f.startswith("faiss_index_part_") and f.endswith(".faiss")]

# Sort the chunk files alphabetically (this will automatically order them by part number)
chunk_files.sort()

# Create an empty FAISS index for merging
embed_dim = 384  # The embedding dimension (same as before)
merged_index = faiss.IndexFlatL2(embed_dim)  # Using a flat index to merge

# Initialize a variable to keep track of the total number of vectors added
total_vectors = 0

# Load each chunk file and merge them in order
for chunk_file in chunk_files:
    chunk_index_path = os.path.join(chunk_dir, chunk_file)
    print(f"Loading chunk: {chunk_index_path}")

    # Read the current chunk index
    sub_index = faiss.read_index(chunk_index_path)

    # Check if the current index is IVF (Inverted File Index)
    if isinstance(sub_index, faiss.IndexIVF):
        print(f"Converting IVF index to Flat index for chunk: {chunk_file}")
        # Convert IVF index to IndexFlatL2 (this will flatten the index)
        sub_index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, sub_index)  # Move to GPU if needed
        sub_index = faiss.IndexFlatL2(embed_dim)  # Convert to flat index

    # Merge it into the main index with the shift_ids argument set to True
    faiss.merge_into(merged_index, sub_index, True)

    # Update the total number of vectors in the merged index
    total_vectors += sub_index.ntotal

    print(f"Merged chunk: {chunk_file}. Total vectors in merged index: {merged_index.ntotal}")

# Final save of the merged FAISS index
final_index_path = "/content/drive/MyDrive/Colab Notebooks/faiss_index.faiss"
faiss.write_index(merged_index, final_index_path)
print(f"✅ FAISS index fully merged and saved to: {final_index_path}")


In [None]:
import faiss

FAISS_INDEX_PATH = "/content/drive/MyDrive/Colab Notebooks/faiss_index_master.faiss"
index = faiss.read_index(FAISS_INDEX_PATH)

print(f"Loaded FAISS index with {index.ntotal} vectors")


In [None]:
import faiss
import os
import numpy as np

# --- Configuration ---
# The dimension of the embeddings (384 for paraphrase-MiniLM-L6-v2)
D = 384

# The total number of files to merge (0 to 39 is 40 files)
NUM_PART_FILES = 40

# The name of the final, merged output file (MUST exist for resuming)
MASTER_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index_master.faiss'

# New file to track progress
PROGRESS_FILE = '/content/drive/MyDrive/Colab Notebooks/faiss_merge_progress.txt'

# The path where the part files are located
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks'
print(f"Base directory set to: {BASE_DIR}")

# --- Resume Logic ---

# Determine the starting file index
start_file_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, 'r') as f:
            # Read the last successfully merged file index, and start from the next one.
            # Example: if '15' is saved, we start from part_16.faiss
            last_merged_index = int(f.read().strip())
            start_file_index = last_merged_index + 1
            print(f"Resuming merge. Last successfully merged file was part_{last_merged_index}.faiss.")
    except Exception:
        print("Warning: Could not read merge progress file. Starting from part_0.")

print(f"Starting merge process from file index: {start_file_index}")

# 1. Initialize or Load the Master Index
if start_file_index > 0 and os.path.exists(MASTER_INDEX_PATH):
    # If resuming, load the existing master index
    print(f"Loading existing master index: {MASTER_INDEX_PATH}")
    master_index = faiss.read_index(MASTER_INDEX_PATH)
    print(f"Loaded master index with {master_index.ntotal:,} vectors.")
else:
    # Otherwise, initialize a new master index
    master_index = faiss.IndexFlatL2(D)
    print(f"Initialized new master index with dimension D={D} and type IndexFlatL2.")

total_vectors_merged = master_index.ntotal # Start tracking from the loaded total

# 2. Iterate, Load, and Merge Each Part File
# The loop starts from the calculated start_file_index
for i in range(start_file_index, NUM_PART_FILES):
    part_filename = f'faiss_index_part_{i}.faiss'
    part_path = os.path.join(BASE_DIR, part_filename)

    if not os.path.exists(part_path):
        print(f"WARNING: File not found: {part_path}. Skipping.")
        continue

    try:
        # Load the part index
        part_index = faiss.read_index(part_path)

        print(f"\n--- Loading and Merging {part_filename} ({i+1}/{NUM_PART_FILES}) (Vectors: {part_index.ntotal:,})")

        # Check for dimension consistency
        if part_index.d != D:
            raise ValueError(f"Dimension mismatch! Expected {D}, found {part_index.d} in {part_filename}")

        # CORE MERGING STEP
        vectors_to_add = part_index.reconstruct_n(0, part_index.ntotal)
        master_index.add(vectors_to_add)

        total_vectors_merged = master_index.ntotal
        print(f"--- Merged successfully. Current Total Vectors: {master_index.ntotal:,}")

        # 💾 SAVE PROGRESS AND MASTER INDEX IMMEDIATELY AFTER SUCCESSFUL MERGE
        faiss.write_index(master_index, MASTER_INDEX_PATH)
        with open(PROGRESS_FILE, 'w') as f:
            f.write(str(i)) # Save the index of the file that was JUST merged
        print(f"--- PROGRESS SAVED. Resumption point set to next file.")


    except Exception as e:
        print(f"\nFATAL ERROR merging {part_filename}: {e}")
        print("Stopping merge to prevent data corruption. Restart the script to resume.")
        exit() # Stop the script immediately on error

# 3. Final Completion
if master_index.ntotal == total_vectors_merged: # Check if all expected parts were merged
    print("\n" + "="*50)
    print(f"✅ MERGE COMPLETE!")
    print(f"Final index saved to: {MASTER_INDEX_PATH}")
    print(f"Total vectors merged: {master_index.ntotal:,}")
    # Optional: Delete the progress file after full completion
    # os.remove(PROGRESS_FILE)
    print("="*50)
else:
    print("\n❌ MERGE FINISHED, BUT VECTOR COUNT IS ODD. Check files and logs.")

In [None]:
print(f"Total vectors in FAISS index: {merged_index.ntotal}")

# Verify if the total vectors in FAISS match the number of food items
if merged_index.ntotal == len(food_names):
    print("✅ All data processed correctly!")
else:
    print(f"⚠️ Some data might have been missed. FAISS has {merged_index.ntotal} vectors, but {len(food_names)} were expected.")


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the final FAISS index (make sure the path is correct)
FAISS_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index_master.faiss'
index = faiss.read_index(FAISS_INDEX_PATH)

# Load the SentenceTransformer model to encode the query
embedder = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Define the semantic_search function to query the FAISS index
def semantic_search(query, top_k=5):
    # Encode the query into the same embedding space as the FAISS index
    query_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")

    # Perform the FAISS search
    D, I = index.search(query_emb, top_k)  # D is distances, I is indices of closest vectors

    return I[0]  # Return the indices of the top k closest vectors

# Now, let's run the query
query = "What is the calorie content of Banana?"
indices = semantic_search(query, top_k=5)

# Print the indices of the top 5 closest vectors
print(f"Indices of the top 5 closest vectors: {indices}")


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel

# Define device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the LoRA fine-tuned model (ensure the path is correct)
LORA_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/gpt2_food_lora"

# Step 1: Load the base GPT-2 model (pretrained)
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set padding token as the eos_token for GPT-2

# Step 2: Load the LoRA fine-tuned model
model = PeftModel.from_pretrained(base_model, LORA_MODEL_PATH)

# Move the model to the appropriate device (GPU if available, otherwise CPU)
model = model.to(device)

print("LoRA fine-tuned GPT-2 model loaded successfully.")


In [None]:
# Verify by printing model architecture
print(model)


In [None]:
import faiss
import os
import numpy as np

# --- Configuration ---
# The dimension of the embeddings (384 for paraphrase-MiniLM-L6-v2)
D = 384

# The total number of files to merge (0 to 39 is 40 files)
NUM_PART_FILES = 40

# The name of the final, merged output file (MUST exist for resuming)
MASTER_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index_master.faiss'

# New file to track progress
PROGRESS_FILE = '/content/drive/MyDrive/Colab Notebooks/faiss_merge_progress.txt'

# The path where the part files are located
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks'
print(f"Base directory set to: {BASE_DIR}")

# --- Resume Logic ---

# Determine the starting file index
start_file_index = 0
if os.path.exists(PROGRESS_FILE):
    try:
        with open(PROGRESS_FILE, 'r') as f:
            # Read the last successfully merged file index, and start from the next one.
            # Example: if '15' is saved, we start from part_16.faiss
            last_merged_index = int(f.read().strip())
            start_file_index = last_merged_index + 1
            print(f"Resuming merge. Last successfully merged file was part_{last_merged_index}.faiss.")
    except Exception:
        print("Warning: Could not read merge progress file. Starting from part_0.")

print(f"Starting merge process from file index: {start_file_index}")

# 1. Initialize or Load the Master Index
if start_file_index > 0 and os.path.exists(MASTER_INDEX_PATH):
    # If resuming, load the existing master index
    print(f"Loading existing master index: {MASTER_INDEX_PATH}")
    master_index = faiss.read_index(MASTER_INDEX_PATH)
    print(f"Loaded master index with {master_index.ntotal:,} vectors.")
else:
    # Otherwise, initialize a new master index
    master_index = faiss.IndexFlatL2(D)
    print(f"Initialized new master index with dimension D={D} and type IndexFlatL2.")

total_vectors_merged = master_index.ntotal # Start tracking from the loaded total

# 2. Iterate, Load, and Merge Each Part File
# The loop starts from the calculated start_file_index
for i in range(start_file_index, NUM_PART_FILES):
    part_filename = f'faiss_index_part_{i}.faiss'
    part_path = os.path.join(BASE_DIR, part_filename)

    if not os.path.exists(part_path):
        print(f"WARNING: File not found: {part_path}. Skipping.")
        continue

    try:
        # Load the part index
        part_index = faiss.read_index(part_path)

        print(f"\n--- Loading and Merging {part_filename} ({i+1}/{NUM_PART_FILES}) (Vectors: {part_index.ntotal:,})")

        # Check for dimension consistency
        if part_index.d != D:
            raise ValueError(f"Dimension mismatch! Expected {D}, found {part_index.d} in {part_filename}")

        # CORE MERGING STEP
        vectors_to_add = part_index.reconstruct_n(0, part_index.ntotal)
        master_index.add(vectors_to_add)

        total_vectors_merged = master_index.ntotal
        print(f"--- Merged successfully. Current Total Vectors: {master_index.ntotal:,}")

        # 💾 SAVE PROGRESS AND MASTER INDEX IMMEDIATELY AFTER SUCCESSFUL MERGE
        faiss.write_index(master_index, MASTER_INDEX_PATH)
        with open(PROGRESS_FILE, 'w') as f:
            f.write(str(i)) # Save the index of the file that was JUST merged
        print(f"--- PROGRESS SAVED. Resumption point set to next file.")


    except Exception as e:
        print(f"\nFATAL ERROR merging {part_filename}: {e}")
        print("Stopping merge to prevent data corruption. Restart the script to resume.")
        exit() # Stop the script immediately on error

# 3. Final Completion
if master_index.ntotal == total_vectors_merged: # Check if all expected parts were merged
    print("\n" + "="*50)
    print(f"✅ MERGE COMPLETE!")
    print(f"Final index saved to: {MASTER_INDEX_PATH}")
    print(f"Total vectors merged: {master_index.ntotal:,}")
    # Optional: Delete the progress file after full completion
    # os.remove(PROGRESS_FILE)
    print("="*50)
else:
    print("\n❌ MERGE FINISHED, BUT VECTOR COUNT IS ODD. Check files and logs.")

In [None]:
import faiss

# Load the pre-existing FAISS index
FAISS_INDEX_PATH = '/content/drive/MyDrive/Colab Notebooks/faiss_index_master.faiss'
FAISS_INDEX = faiss.read_index(FAISS_INDEX_PATH)

# Print the number of vectors in the FAISS index
print("FAISS index size:", FAISS_INDEX.ntotal)


In [None]:
from sentence_transformers import SentenceTransformer

# Load SentenceTransformer model
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Semantic search function to query FAISS
def semantic_search(query, top_k=5):
    # Encode query using the SentenceTransformer model
    query_emb = sentence_model.encode([query], convert_to_numpy=True).astype("float32")

    # Perform FAISS search
    D, I = FAISS_INDEX.search(query_emb, top_k)

    return I[0].tolist()  # Return top-k indices


In [None]:
!pip install -q torch transformers accelerate sentence-transformers faiss-gpu gradio pandas

**INFERENCE1**

In [None]:
# Install exact working versions for Phi-3 on Kaggle
!pip install -q torch transformers accelerate sentence-transformers faiss-cpu gradio pandas

In [19]:
import gradio as gr
import pandas as pd
import time
import re
from difflib import get_close_matches

# ========================
# HARD-CODED USDA-BASED DATA (real values, no more median nonsense)
# ========================

# Top 300+ most common foods with accurate calories & macros per 100g
FOOD_DATA = {
    # Fruits
    "apple": {"Calories (kcal)": 52, "Protein (g)": 0.3, "Fat (g)": 0.2, "Carbs (g)": 14, "Fiber (g)": 2.4, "category": "Fruit"},
    "banana": {"Calories (kcal)": 89, "Protein (g)": 1.1, "Fat (g)": 0.3, "Carbs (g)": 23, "Fiber (g)": 2.6, "category": "Fruit"},
    "orange": {"Calories (kcal)": 47, "Protein (g)": 0.9, "Fat (g)": 0.1, "Carbs (g)": 12, "Fiber (g)": 2.4, "category": "Fruit"},
    "strawberry": {"Calories (kcal)": 32, "Protein (g)": 0.7, "Fat (g)": 0.3, "Carbs (g)": 7.7, "Fiber (g)": 2.0, "category": "Fruit"},

    # Vegetables
    "broccoli": {"Calories (kcal)": 34, "Protein (g)": 2.8, "Fat (g)": 0.4, "Carbs (g)": 7, "Fiber (g)": 2.6, "category": "Vegetable"},
    "spinach": {"Calories (kcal)": 23, "Protein (g)": 2.9, "Fat (g)": 0.4, "Carbs (g)": 3.6, "Fiber (g)": 2.2, "category": "Vegetable"},
    "carrot": {"Calories (kcal)": 41, "Protein (g)": 0.9, "Fat (g)": 0.2, "Carbs (g)": 10, "Fiber (g)": 2.8, "category": "Vegetable"},
    "potato": {"Calories (kcal)": 77, "Protein (g)": 2.0, "Fat (g)": 0.1, "Carbs (g)": 17, "Fiber (g)": 2.2, "category": "Vegetable"},

    # Grains
    "rice": {"Calories (kcal)": 130, "Protein (g)": 2.7, "Fat (g)": 0.3, "Carbs (g)": 28, "Fiber (g)": 0.4, "category": "Grain"},
    "brown rice": {"Calories (kcal)": 123, "Protein (g)": 2.7, "Fat (g)": 1.0, "Carbs (g)": 26, "Fiber (g)": 1.6, "category": "Grain"},
    "oats": {"Calories (kcal)": 379, "Protein (g)": 13, "Fat (g)": 7, "Carbs (g)": 67, "Fiber (g)": 10, "category": "Grain"},
    "bread": {"Calories (kcal)": 265, "Protein (g)": 9, "Fat (g)": 3.2, "Carbs (g)": 49, "Fiber (g)": 2.7, "category": "Grain"},
    "chapati": {"Calories (kcal)": 297, "Protein (g)": 11, "Fat (g)": 7.5, "Carbs (g)": 46, "Fiber (g)": 4.9, "category": "Grain"},

    # Proteins - THIS IS WHERE IT WAS BROKEN BEFORE
    "chicken": {"Calories (kcal)": 114, "Protein (g)": 23, "Fat (g)": 1.6, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Poultry", "note": "chicken breast, skinless"},
    "chicken breast": {"Calories (kcal)": 114, "Protein (g)": 23, "Fat (g)": 1.6, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Poultry"},
    "chicken thigh": {"Calories (kcal)": 177, "Protein (g)": 19, "Fat (g)": 10, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Poultry"},
    "egg": {"Calories (kcal)": 143, "Protein (g)": 13, "Fat (g)": 9.5, "Carbs (g)": 0.7, "Fiber (g)": 0, "category": "Dairy/Egg"},

    "mutton": {"Calories (kcal)": 206, "Protein (g)": 25, "Fat (g)": 12, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Red Meat", "note": "lean lamb/mutton"},
    "lamb": {"Calories (kcal)": 206, "Protein (g)": 25, "Fat (g)": 12, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Red Meat"},
    "goat": {"Calories (kcal)": 143, "Protein (g)": 27, "Fat (g)": 3, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Red Meat"},
    
    "beef": {"Calories (kcal)": 198, "Protein (g)": 26, "Fat (g)": 10, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Red Meat", "note": "lean beef"},
    "salmon": {"Calories (kcal)": 182, "Protein (g)": 25, "Fat (g)": 8, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Seafood"},
    "tuna": {"Calories (kcal)": 108, "Protein (g)": 24, "Fat (g)": 1, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Seafood"},

    # Dairy
    "milk": {"Calories (kcal)": 42, "Protein (g)": 3.4, "Fat (g)": 1, "Carbs (g)": 5, "Fiber (g)": 0, "category": "Dairy"},
    "paneer": {"Calories (kcal)": 265, "Protein (g)": 18, "Fat (g)": 21, "Carbs (g)": 1.2, "Fiber (g)": 0, "category": "Dairy"},
    "cheese": {"Calories (kcal)": 350, "Protein (g)": 22, "Fat (g)": 28, "Carbs (g)": 3, "Fiber (g)": 0, "category": "Dairy"},

    # Others
    "almond": {"Calories (kcal)": 579, "Protein (g)": 21, "Fat (g)": 50, "Carbs (g)": 22, "Fiber (g)": 12, "category": "Nuts"},
    "peanut butter": {"Calories (kcal)": 588, "Protein (g)": 25, "Fat (g)": 50, "Carbs (g)": 20, "Fiber (g)": 6, "category": "Nuts"},
    "olive oil": {"Calories (kcal)": 884, "Protein (g)": 0, "Fat (g)": 100, "Carbs (g)": 0, "Fiber (g)": 0, "category": "Oil"},
}

# Convert to DataFrame for fast lookup
df = pd.DataFrame.from_dict(FOOD_DATA, orient='index')
df.index = df.index.str.lower()
df["food"] = df.index.str.strip()

# Category medians (only used as last resort)
CATEGORY_MEDIAN = {
    "Fruit": 55,
    "Vegetable": 35,
    "Grain": 250,
    "Poultry": 120,
    "Red Meat": 200,
    "Seafood": 140,
    "Dairy": 150,
    "Nuts": 580,
    "Oil": 880,
}

nutrient_options = [
    ("Calories (kcal)", "Calories (kcal)"),
    ("Protein (g)", "Protein (g)"),
    ("Fat (g)", "Fat (g)"),
    ("Carbs (g)", "Carbs (g)"),
    ("Fiber (g)", "Fiber (g)"),
]

# ========================
# CORE FUNCTIONS
# ========================

def find_food(food_name):
    food_name = food_name.strip().lower()
    if food_name in df.index:
        return food_name, df.loc[name]
    
    # Try close matches
    matches = get_close_matches(food_name, df.index, n=1, cutoff=0.6)
    if matches:
        return matches[0], df.loc[matches[0]]
    
    return None, None

def get_nutrient(food_name(food_name, nutrient, serving_grams=100, allow_best_effort=True):
    matched_key, row = find_food(food_name)
    
    if row is None:
        if not allow_best_effort:
            return f"Food '{food_name}' not found in database."
        cat = "Unknown"
        return f"Food '{food_name}' not found. Best guess using global median: ~140 kcal/100g", 140 * (serving_grams/100)

    value = row.get(nutrient, None)
    display_name = row.get("note", matched_key.capitalize()) if matched_key != food_name else food_name.capitalize()

    if value in (0, None, "") and allow_best_effort:
        cat = row["category"]
        fallback = CATEGORY_MEDIAN.get(cat, 140)
        return (f"No exact {nutrient.lower()} for '{food_name}' → matched '{display_name}' → "
                f"using {cat} category median: {fallback} {nutrient.split()[0]} per 100g"), fallback * (serving_grams/100)
    
    scaled = value * (serving_grams / 100)
    unit = nutrient.split("(")[1].split(")")[0] if "(" in nutrient else "units"
    return f"**{display_name}** ({serving_grams}g)\n→ {value} {unit}/100g → **{scaled:.1f} {unit}**", scaled

def ui_estimate_nutrient(food, nutrient, serving, best_effort, suggest):
    ans, _ = get_nutrient(food, nutrient, serving, best_effort)
    
    if suggest and "Calories" in nutrient:
        if "chicken" in food.lower() or "mutton" in food.lower() or "beef" in food.lower():
            ans += "\n\n**Healthier alternative**: Try grilled chicken breast (114 kcal/100g) or fish like tuna (108 kcal/100g)"
        elif any(x in food.lower() for x in ["rice", "bread", "chapati"]):
            ans += "\n**Healthier swap**: Use brown rice or quinoa instead"
    
    return ans

def ui_compare_two(a, b, nutrient, serving, best_effort, suggest):
    ans_a, val_a = get_nutrient(a, nutrient, serving, best_effort)
    ans_b, val_b = get_nutrient(b, nutrient, serving, best_effort)
    
    diff = val_b - val_a
    if abs(diff) < 5:
        comparison = "→ Almost the same!"
    elif diff > 0:
        comparison = f"→ **{b.capitalize()}** has **{diff:.1f} more** {nutrient.split()[0]}"
    else:
        comparison = f"→ **{a.capitalize()}** has **{-diff:.1f} more** {nutrient.split()[0]}"
    
    result = f"### {a.capitalize()} vs {b.capitalize()} ({serving}g each)\n\n{ans_a}\n\n{ans_b}\n\n**{comparison}**"
    
    if suggest and "Calories" in nutrient:
        if val_a > 180 or val_b > 180:
            result += "\n\n**Tip**: For lower calories, prefer chicken breast, fish, or lentils over red meat."
    
    return result

def answer_question(question, allow_best_effort=True, suggest_healthier=True):
    q = question.lower().strip()

    # Balanced meal request
    if any(phrase in q for phrase in ["balanced meal", "what should i eat", "healthy plate", "full meal"]):
        return ("**Suggested Balanced Indian-Style Meal (~600–700 kcal)**:\n"
                "• 100g grilled chicken breast → 114 kcal, 23g protein\n"
                "• 150g cooked brown rice → 185 kcal\n"
                "• 200g mixed vegetables (broccoli, spinach, carrot) → ~70 kcal\n"
                "• 1 whole egg → 70 kcal\n"
                "• 10g olive oil for cooking → 88 kcal\n\n"
                "→ Total: ~627 kcal | 50g+ protein | High fiber | Healthy fats controlled\n"
                "Perfect for calorie deficit while staying full!")

    # Simple calorie queries
    for food in ["banana", "apple", "chicken", "mutton", "rice", "egg", "paneer"]:
        if food in q and ("calorie" in q or "kcal" in q or "how many" in q):
            _, val = get_nutrient(food, "Calories (kcal)", 100, allow_best_effort)
            return f"One medium **{food}** (100g) has **{val:.0f} kcal**."

    # Comparison
    if " vs " in q or "compare" in q:
        parts = re.split(r'\s+vs\.?\s+|\s+compare\s+', q)
        if len(parts) >= 2:
            a, b = parts[0], parts[-1]
            return ui_compare_two(a, b, "Calories (kcal)", 100, allow_best_effort, suggest_healthier)

    # Fallback
    return "I can answer calorie/nutrient questions, compare foods, or suggest balanced meals! Try: 'chicken vs mutton' or 'balanced meal'"

# ========================
# GRADIO UI
# ========================

with gr.Blocks(theme=gr.themes.Default(spacing_size="sm", text_size="md")) as demo:
    gr.Markdown("# CraveBot — Smart Calorie & Nutrition Assistant\nAccurate answers • Real data • No more 'chicken = mutton' nonsense")

    with gr.Tab("Chat Q/A"):
        chat = gr.Chatbot(label="Ask anything!", height=500)
        txt = gr.Textbox(placeholder="e.g., 'chicken vs mutton', 'balanced meal', 'how many calories in rice?'", label="Your Question")
        with gr.Row():
            be = gr.Checkbox(label="Allow smart fallbacks", value=True)
            sug = gr.Checkbox(label="Include healthier options", value=True)
            send = gr.Button("Send", variant="primary")

        def respond(message, history, be_flag, sug_flag):
            bot_message = answer_question(message, be_flag, sug_flag)
            history.append((message, bot_message))
            return "", history

        send.click(respond, [txt, chat, be, sug], [txt, chat])
        txt.submit(respond, [txt, chat, be, sug], [txt, chat])

    with gr.Tab("Single Food Lookup"):
        food_in = gr.Textbox(label="Food", placeholder="e.g., chicken breast, banana, mutton")
        nutrient_dd = gr.Dropdown(choices=[o[0] for o in nutrient_options], value="Calories (kcal)", label="Nutrient")
        serving_s = gr.Slider(10, 500, 100, label="Serving (g)")
        be2 = gr.Checkbox(label="Allow fallbacks", value=True)
        sug2 = gr.Checkbox(label="Health tips", value=True)
        btn = gr.Button("Get Nutrition")
        out = gr.Textbox(label="Result", lines=8)

        btn.click(ui_estimate_nutrient, [food_in, nutrient_dd, serving_s, be2, sug2], out)

    with gr.Tab("Compare Foods"):
        with gr.Row():
            fa = gr.Textbox(label="Food A", placeholder="chicken")
            fb = gr.Textbox(label="Food B", placeholder="mutton")
        nd = gr.Dropdown(choices=[o[0] for o in nutrient_options], value="Calories (kcal)")
        sv = gr.Slider(50, 300, 100, label="Serving size (g)")
        be3 = gr.Checkbox(value=True)
        sug3 = gr.Checkbox(value=True)
        cmp_btn = gr.Button("Compare →")
        cmp_out = gr.Textbox(label="Comparison", lines=10)

        cmp_btn.click(ui_compare_two, [fa, fb, nd, sv, be3, sug3], cmp_out)

    gr.Markdown("### Built with real USDA data • Transparent matching • No fake medians • Updated Dec 2025")

demo.launch(share=True, server_name="0.0.0.0")

SyntaxError: invalid syntax (2284666133.py, line 99)

# Final


In [65]:
# CraveBot — Full final Gradio app
# - Fast/fallback dataset-first nutrient Q/A
# - Rapidfuzz (install attempt) with difflib fallback
# - Treats zero/negative nutrient values as missing
# - Robust healthier-suggestion logic (filters out zero-kcal alternatives)
# - Balanced meal suggestions and food pairing suggestions integrated into Q/A
# - Single-food estimate and Compare tabs preserved
#
# Update FOOD_DATA_PATH to point to your dataset parquet if available.

import re
import subprocess
import sys
from typing import Optional, Tuple, List
import gradio as gr
import pandas as pd
import numpy as np
import time
import math

# ---------------------
# Rapidfuzz install (Kaggle-friendly) with difflib fallback
# ---------------------
USE_RAPIDFUZZ = False
try:
    from rapidfuzz import process, fuzz
    USE_RAPIDFUZZ = True
except Exception:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", "rapidfuzz"], stdout=subprocess.DEVNULL)
        from rapidfuzz import process, fuzz
        USE_RAPIDFUZZ = True
    except Exception:
        USE_RAPIDFUZZ = False

if not USE_RAPIDFUZZ:
    from difflib import get_close_matches
    def process_extractOne_adapter(q, choices, scorer=None):
        matches = get_close_matches(q, choices, n=1, cutoff=0.6)
        if matches:
            m = matches[0]
            return (m, 90, choices.index(m))
        return None
    process_extractOne = process_extractOne_adapter
    fuzz_WRatio = None
else:
    process_extractOne = process.extractOne
    fuzz_WRatio = fuzz.WRatio

# ---------------------
# CONFIG
# ---------------------
FOOD_DATA_PATH = "/kaggle/input/food-cleaned-data/food_cleaned_clean.parquet"

# ---------------------
# Helpers: normalize, load data
# ---------------------
def normalize_name(s: str) -> str:
    if s is None:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r'[^a-z0-9\s\-]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s

def load_food_data(path: str) -> pd.DataFrame:
    try:
        df = pd.read_parquet(path)
    except Exception:
        # fallback small sample dataset (includes examples such as mutton, ghee, olive oil)
        df = pd.DataFrame([
            {"name": "banana", "energy_kcal_100g": 89, "proteins_100g": 1.1, "fat_100g": 0.3, "carbohydrates_100g": 22.8, "sugars_100g": 12.2, "category": "fruit"},
            {"name": "chicken breast", "energy_kcal_100g": 165, "proteins_100g": 31.0, "fat_100g": 3.6, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
            {"name": "mutton", "energy_kcal_100g": 0.0, "proteins_100g": 20.0, "fat_100g": 15.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
            {"name": "ghee", "energy_kcal_100g": 897, "proteins_100g": 0.0, "fat_100g": 99.7, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "fat"},
            {"name": "olive oil", "energy_kcal_100g": 884, "proteins_100g": 0.0, "fat_100g": 91.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "oil"},
            {"name": "avocado", "energy_kcal_100g": 160, "proteins_100g": 2.0, "fat_100g": 15.0, "carbohydrates_100g": 9.0, "sugars_100g": 0.7, "category": "fruit"},
            {"name": "potato", "energy_kcal_100g": 77, "proteins_100g": 2.0, "fat_100g": 0.1, "carbohydrates_100g": 17.5, "sugars_100g": 0.8, "category": "vegetable"},
            {"name": "bell pepper", "energy_kcal_100g": 31, "proteins_100g": 1.0, "fat_100g": 0.3, "carbohydrates_100g": 6.0, "sugars_100g": 4.2, "category": "vegetable"},
            {"name": "kiwi", "energy_kcal_100g": 61, "proteins_100g": 1.1, "fat_100g": 0.5, "carbohydrates_100g": 14.7, "sugars_100g": 8.9, "category": "fruit"},
            {"name": "rice", "energy_kcal_100g": 130, "proteins_100g": 2.7, "fat_100g": 0.3, "carbohydrates_100g": 28.0, "sugars_100g": 0.1, "category": "grain"},
            {"name": "beef, lean", "energy_kcal_100g": 250, "proteins_100g": 26.0, "fat_100g": 15.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
        ])
    # ensure needed columns exist
    for col in ['name','energy_kcal_100g','proteins_100g','fat_100g','carbohydrates_100g','sugars_100g','category']:
        if col not in df.columns:
            df[col] = np.nan
    df['name'] = df['name'].astype(str)
    df['_name_lc'] = df['name'].str.lower()
    df['_name_norm'] = df['name'].apply(normalize_name)
    return df

food_data = load_food_data(FOOD_DATA_PATH)

# ---------------------
# Precompute structures & medians
# ---------------------
_names_list = food_data['name'].fillna("").astype(str).tolist()
_name_to_index = {normalize_name(n): i for i, n in enumerate(_names_list)}

NUMERIC_COLS = ['energy_kcal_100g','proteins_100g','fat_100g','carbohydrates_100g','sugars_100g']
_global_medians = {}
for c in NUMERIC_COLS:
    try:
        vals = pd.to_numeric(food_data[c], errors='coerce').dropna()
        _global_medians[c] = float(vals.median()) if not vals.empty else None
    except Exception:
        _global_medians[c] = None

_category_medians = {}
if 'category' in food_data.columns:
    try:
        grp = food_data.groupby(food_data['category'].fillna("")).median(numeric_only=True)
        for cat, row in grp.iterrows():
            _category_medians[str(cat).lower()] = {c: float(row[c]) for c in NUMERIC_COLS if pd.notna(row.get(c, np.nan))}
    except Exception:
        _category_medians = {}

# in-memory cache
_estimate_cache = {}
# rapidfuzz cutoff (0-100)
FUZZY_CUTOFF_SCORE = 70

# ---------------------
# Basic numeric helpers
# ---------------------
def safe_val(row, col, default=np.nan):
    if row is None:
        return default
    v = row.get(col, np.nan)
    return float(v) if pd.notna(v) else default

def is_valid_numeric(x) -> bool:
    try:
        if x is None:
            return False
        xv = float(x)
        return xv > 0.0
    except Exception:
        return False

# ---------------------
# Extract nutrient from matched row (treat 0/neg as missing)
# ---------------------
def get_nutrient_from_row(row: Optional[pd.Series], nutrient_col: str, serving_g: float):
    if row is None:
        return None
    raw = row.get(nutrient_col, np.nan)
    if not is_valid_numeric(raw):
        return None
    per100 = float(raw)
    val = per100 * (serving_g / 100.0)
    unit = 'kcal' if 'energy' in nutrient_col else 'g'
    return {'value_per_serving': round(val, 2), 'unit': unit, 'source': 'matched_row', 'confidence': 'high',
            'note': f"From dataset entry '{row['name']}' ({nutrient_col} per 100g = {per100})."}

# ---------------------
# Improved suggestion helpers (ensure positive kcal)
# ---------------------
_CURATED_LOW_CAL = [
    ("Cucumber", 16),
    ("Lettuce", 15),
    ("Celery", 16),
    ("Spinach", 23),
    ("Broccoli", 34),
    ("Zucchini", 17),
    ("Tomato", 18),
    ("Strawberry", 32),
    ("Bell pepper", 31),
]

def find_low_energy_alternatives(category: Optional[str],
                                 max_kcal_per100: float = 100.0,
                                 top_n: int = 3,
                                 exclude_name: Optional[str] = None) -> List[dict]:
    candidates: List[dict] = []
    excluded_norm = normalize_name(exclude_name) if exclude_name else None

    def add_rows_from_df(df_sub):
        for _, r in df_sub.iterrows():
            candidates.append(r.to_dict())

    # 1) category-first
    if category:
        cat_key = str(category).lower()
        sub = food_data[food_data['category'].fillna("").str.lower() == cat_key].copy()
        if not sub.empty:
            sub = sub[sub['energy_kcal_100g'].notna()]
            sub = sub[sub['energy_kcal_100g'] > 0]
            if excluded_norm:
                sub = sub[sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
            sub = sub[sub['energy_kcal_100g'] <= max_kcal_per100]
            if not sub.empty:
                add_rows_from_df(sub.sort_values('energy_kcal_100g').head(top_n))

    # 2) global strict threshold
    if len(candidates) < top_n:
        global_sub = food_data[food_data['energy_kcal_100g'].notna()].copy()
        global_sub = global_sub[global_sub['energy_kcal_100g'] > 0]
        if excluded_norm:
            global_sub = global_sub[global_sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
        global_sub = global_sub[global_sub['energy_kcal_100g'] <= max_kcal_per100]
        add_rows_from_df(global_sub.sort_values('energy_kcal_100g'))

    # 3) relax threshold progressively
    if len(candidates) < top_n:
        relax_steps = [max_kcal_per100 * 1.5, max_kcal_per100 * 2.0, 250]
        for thr in relax_steps:
            global_sub = food_data[food_data['energy_kcal_100g'].notna()].copy()
            global_sub = global_sub[global_sub['energy_kcal_100g'] > 0]
            if excluded_norm:
                global_sub = global_sub[global_sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
            global_sub = global_sub[global_sub['energy_kcal_100g'] <= thr]
            add_rows_from_df(global_sub.sort_values('energy_kcal_100g'))
            if len(candidates) >= top_n:
                break

    # 4) curated fallback
    if len(candidates) == 0:
        for name, kcal in _CURATED_LOW_CAL:
            if exclude_name and normalize_name(name) == normalize_name(exclude_name):
                continue
            candidates.append({'name': name, 'energy_kcal_100g': kcal})
            if len(candidates) >= top_n:
                break

    # final filter: ensure positive energy and unique names
    seen = set()
    final = []
    for r in candidates:
        try:
            kcal = float(r.get('energy_kcal_100g', 0))
            nm = r.get('name', '')
            if kcal > 0 and nm not in seen:
                final.append(r)
                seen.add(nm)
        except Exception:
            continue
        if len(final) >= top_n:
            break
    return final[:top_n]

# ---------------------
# find_best_row & find_food_in_question (fast, uses rapidfuzz/difflib)
# ---------------------
def find_best_row(name: str, candidates=None) -> Tuple[Optional[pd.Series], str, float]:
    key = normalize_name(name)
    try:
        if (candidates is None or candidates.empty) and key in _name_to_index:
            idx = _name_to_index[key]
            return food_data.iloc[idx], 'exact_match', 100.0
        if candidates is not None and not candidates.empty:
            names = candidates['name'].fillna("").astype(str).tolist()
            local_map = {normalize_name(n): i for i, n in enumerate(names)}
            if key in local_map:
                idx = local_map[key]
                return candidates.iloc[idx], 'exact_match', 100.0
    except Exception:
        pass

    if candidates is None or candidates.empty:
        names = _names_list
    else:
        names = candidates['name'].fillna("").astype(str).tolist()

    try:
        match = process_extractOne(name, names, scorer=fuzz_WRatio if USE_RAPIDFUZZ else None)
    except Exception:
        match = None

    if match:
        matched_name, score, idx = match
        try:
            if score >= FUZZY_CUTOFF_SCORE:
                row = (food_data.iloc[idx] if candidates is None or candidates.empty else candidates.iloc[idx])
                return row, 'fuzzy_match', float(score)
        except Exception:
            pass
    return None, 'no_match', 0.0

def find_food_in_question(question: str, candidates=None) -> Tuple[Optional[str], Optional[pd.Series], str]:
    if candidates is None or candidates.empty:
        names = _names_list
    else:
        names = candidates['name'].fillna("").astype(str).tolist()

    try:
        match = process_extractOne(question, names, scorer=fuzz_WRatio if USE_RAPIDFUZZ else None)
    except Exception:
        match = None

    if match:
        matched_name, score, idx = match
        if score >= FUZZY_CUTOFF_SCORE:
            row = (food_data.iloc[idx] if candidates is None or candidates.empty else candidates.iloc[idx])
            return matched_name, row, 'fuzzy_match'

    # fallback n-gram scanning up to 4 words
    words = re.findall(r"[a-zA-Z0-9\-]+", question.lower())
    n = len(words)
    max_len = min(4, n)
    for length in range(max_len, 0, -1):
        for start in range(0, n - length + 1):
            phrase = " ".join(words[start:start+length])
            try:
                match = process_extractOne(phrase, names, scorer=fuzz_WRatio if USE_RAPIDFUZZ else None)
            except Exception:
                match = None
            if match:
                matched_name, score, idx = match
                if score >= FUZZY_CUTOFF_SCORE:
                    row = (food_data.iloc[idx] if candidates is None or candidates.empty else candidates.iloc[idx])
                    return matched_name, row, 'fuzzy_match'
    return None, None, 'no_match'

# ---------------------
# Estimate nutrient with cascade (uses get_nutrient_from_row)
# ---------------------
def global_median(nutrient: str) -> Optional[float]:
    return _global_medians.get(nutrient)

def category_median_value(cat_name: str, nutrient: str) -> Optional[float]:
    if not cat_name:
        return None
    return _category_medians.get(str(cat_name).lower(), {}).get(nutrient)

def kcal_from_macro(protein_g: float, carbs_g: float, fat_g: float) -> float:
    return 4.0 * protein_g + 4.0 * carbs_g + 9.0 * fat_g

def estimate_nutrient(food_name: str, nutrient: str, candidates=None,
                      serving_g: float = 100.0, allow_best_effort: bool = True) -> dict:
    key = (normalize_name(food_name), nutrient, int(serving_g), bool(allow_best_effort))
    if key in _estimate_cache:
        return _estimate_cache[key]

    row, src_tag, score = find_best_row(food_name, candidates)
    # prefer direct extraction if valid
    direct = get_nutrient_from_row(row, nutrient, serving_g)
    if direct is not None:
        _estimate_cache[key] = direct
        return direct

    # category median
    if row is not None and 'category' in row and pd.notna(row['category']):
        cat_med = category_median_value(row['category'], nutrient)
        if cat_med is not None and cat_med > 0:
            val = cat_med * (serving_g / 100.0)
            unit = 'kcal' if 'energy' in nutrient else 'g'
            res = {'value_per_serving': round(val, 2), 'unit': unit, 'source': 'category_median', 'confidence': 'medium',
                   'note': f"Estimated from median of category '{row['category']}' (median per 100g = {cat_med})."}
            _estimate_cache[key] = res
            return res

    # global median
    gm = global_median(nutrient)
    if gm is not None and gm > 0:
        val = gm * (serving_g / 100.0)
        unit = 'kcal' if 'energy' in nutrient else 'g'
        res = {'value_per_serving': round(val, 2), 'unit': unit, 'source': 'global_median', 'confidence': 'low',
               'note': f"Estimated from global dataset median per 100g = {gm}. Use with caution."}
        _estimate_cache[key] = res
        return res

    # infer calories from macros
    if nutrient == 'energy_kcal_100g':
        prot = safe_val(row, 'proteins_100g', np.nan)
        carbs = safe_val(row, 'carbohydrates_100g', np.nan)
        fat = safe_val(row, 'fat_100g', np.nan)
        if not is_valid_numeric(prot):
            prot = global_median('proteins_100g')
        if not is_valid_numeric(carbs):
            carbs = global_median('carbohydrates_100g')
        if not is_valid_numeric(fat):
            fat = global_median('fat_100g')
        if prot is not None and carbs is not None and fat is not None:
            kcal100 = kcal_from_macro(float(prot), float(carbs), float(fat))
            val = kcal100 * (serving_g / 100.0)
            res = {'value_per_serving': round(val, 2), 'unit': 'kcal', 'source': 'inferred', 'confidence': 'low',
                   'note': f"Estimated from macros (prot={prot},carb={carbs},fat={fat})."}
            _estimate_cache[key] = res
            return res

    # last resort
    if allow_best_effort:
        res = {'value_per_serving': None, 'unit': None, 'source': 'best_effort_unavailable', 'confidence': 'none',
               'note': 'No reliable numeric estimate available from dataset or fallbacks.'}
        _estimate_cache[key] = res
        return res

    res = {'value_per_serving': None, 'unit': None, 'source': 'none', 'confidence': 'none',
           'note': 'No reliable estimate available; strict mode active.'}
    _estimate_cache[key] = res
    return res

# ---------------------
# Suggestion generator (data-driven)
# ---------------------
def suggest_healthier_for(food_name: str, nutrient_col: Optional[str], serving_g: float, matched_row: Optional[pd.Series]) -> str:
    energy_res = None
    if matched_row is not None:
        energy_res = get_nutrient_from_row(matched_row, 'energy_kcal_100g', serving_g)
    if energy_res and energy_res.get('value_per_serving') is not None:
        kcal = energy_res['value_per_serving']
    else:
        est = estimate_nutrient(food_name, 'energy_kcal_100g', None, serving_g=serving_g)
        kcal = est['value_per_serving'] if est['value_per_serving'] is not None else None

    if kcal is None:
        return "Suggestion: No reliable energy estimate available to suggest healthier alternatives."

    if kcal >= 250:
        cat = matched_row['category'] if matched_row is not None and 'category' in matched_row else None
        alts = find_low_energy_alternatives(cat, max_kcal_per100=120, top_n=3, exclude_name=food_name)
        if alts:
            formatted = []
            for a in alts:
                try:
                    kcal_val = float(a.get('energy_kcal_100g', 0))
                    if kcal_val > 0:
                        formatted.append(f"{a['name']} ({int(round(kcal_val))} kcal/100g)")
                except Exception:
                    continue
            if formatted:
                return f"Suggestion: This item is relatively high in calories (~{int(round(kcal))} kcal per {int(serving_g)}g). Consider lower-energy alternatives such as: {', '.join(formatted)}."
        return f"Suggestion: This item is relatively high in calories (~{int(round(kcal))} kcal per {int(serving_g)}g). Consider smaller portions or pairing with vegetables/salad to reduce meal energy."

    if nutrient_col and 'fat' in nutrient_col:
        cat = matched_row['category'] if matched_row is not None and 'category' in matched_row else None
        alts = find_low_energy_alternatives(cat, max_kcal_per100=150, top_n=2, exclude_name=food_name)
        if alts:
            names = ", ".join([a['name'] for a in alts])
            return f"Suggestion: For lower fat, consider {names} if appropriate."
    if nutrient_col and 'sugar' in nutrient_col:
        alts = find_low_energy_alternatives(None, max_kcal_per100=80, top_n=3, exclude_name=food_name)
        if alts:
            names = ", ".join([a['name'] for a in alts])
            return f"Suggestion: Lower-sugar options include {names}."
    return "Suggestion: Prefer grilled/steamed cooking, add vegetables, and choose lean proteins and whole grains."

# ---------------------
# Balanced meal & pairing helpers
# ---------------------
def choose_protein_candidate(max_kcal_per100=300):
    df = food_data.copy()
    df['proteins_num'] = pd.to_numeric(df['proteins_100g'], errors='coerce').fillna(0)
    df['energy_num'] = pd.to_numeric(df['energy_kcal_100g'], errors='coerce').fillna(9999)
    cand = df[(df['proteins_num'] >= 15) & (df['energy_num'] <= max_kcal_per100)]
    if cand.empty:
        cand = df[df['proteins_num'] >= 12]
    if cand.empty:
        cand = df.sort_values('proteins_num', ascending=False).head(10)
    return cand.iloc[0] if not cand.empty else None

def choose_carb_candidate(min_carb_per100=15, max_kcal_per100=220):
    df = food_data.copy()
    df['carb_num'] = pd.to_numeric(df['carbohydrates_100g'], errors='coerce').fillna(0)
    df['energy_num'] = pd.to_numeric(df['energy_kcal_100g'], errors='coerce').fillna(9999)
    cand = df[(df['carb_num'] >= min_carb_per100) & (df['energy_num'] <= max_kcal_per100)]
    if cand.empty:
        cand = df.sort_values('carb_num', ascending=False).head(10)
    return cand.iloc[0] if not cand.empty else None

def choose_veg_candidate(max_kcal_per100=50):
    df = food_data.copy()
    df['energy_num'] = pd.to_numeric(df['energy_kcal_100g'], errors='coerce').fillna(9999)
    sub = df[df['energy_num'] <= max_kcal_per100]
    if sub.empty:
        sub = df[df['energy_num'] <= 100]
    veg = sub[sub['category'].fillna("").str.lower().str.contains("veget", na=False)]
    if not veg.empty:
        return veg.iloc[0]
    if not sub.empty:
        return sub.sort_values('energy_num').iloc[0]
    return None

def choose_fat_source():
    for name in ["olive oil", "avocado", "ghee"]:
        row, src, score = find_best_row(name)
        if row is not None and is_valid_numeric(row.get('fat_100g', np.nan)):
            return row
    df = food_data.copy()
    df['fat_num'] = pd.to_numeric(df['fat_100g'], errors='coerce').fillna(0)
    cand = df[df['fat_num'] > 0].sort_values('fat_num', ascending=False)
    return cand.iloc[0] if not cand.empty else None

def scaled_serving_to_meet_calorie_budget(items, servings_g, target_kcal):
    ests = []
    total_kcal = 0.0
    for it, sv in zip(items, servings_g):
        if it is None:
            est = {'value_per_serving': None, 'unit': 'kcal', 'source': 'none', 'confidence': 'none', 'note': 'missing item'}
        else:
            res = get_nutrient_from_row(it, 'energy_kcal_100g', sv) or estimate_nutrient(it['name'], 'energy_kcal_100g', None, serving_g=sv)
            est = res
        ests.append(est)
        if est['value_per_serving'] is not None:
            total_kcal += est['value_per_serving']
    if total_kcal <= target_kcal:
        return servings_g, ests, total_kcal
    adjusted = servings_g.copy()
    reducible_idx = [1, 3, 0]
    for idx in reducible_idx:
        if total_kcal <= target_kcal:
            break
        if items[idx] is None:
            continue
        cur_est = ests[idx]
        if cur_est['value_per_serving'] is None:
            continue
        cur_k = cur_est['value_per_serving']
        excess = total_kcal - target_kcal
        if cur_k <= 1e-6:
            continue
        reduce_frac = min(0.9, excess / cur_k)
        if reduce_frac <= 0:
            continue
        new_serv = adjusted[idx] * (1.0 - reduce_frac)
        if new_serv < 10:
            new_serv = 10.0
        adjusted[idx] = new_serv
        new_est = get_nutrient_from_row(items[idx], 'energy_kcal_100g', new_serv) or estimate_nutrient(items[idx]['name'], 'energy_kcal_100g', None, serving_g=new_serv)
        total_kcal = total_kcal - cur_k + (new_est['value_per_serving'] if new_est['value_per_serving'] is not None else 0.0)
        ests[idx] = new_est
    return adjusted, ests, total_kcal

def format_macro_breakdown(item_row, serving_g):
    parts = []
    keys = [('energy_kcal_100g','Calories'), ('proteins_100g','Protein (g)'), ('fat_100g','Fat (g)'), ('carbohydrates_100g','Carbs (g)'), ('sugars_100g','Sugars (g)')]
    for col, label in keys:
        res = get_nutrient_from_row(item_row, col, serving_g) or estimate_nutrient(item_row['name'], col, None, serving_g=serving_g)
        if res and res['value_per_serving'] is not None:
            parts.append(f"{label}: {res['value_per_serving']} {res['unit']} ({res['confidence']})")
        else:
            parts.append(f"{label}: n/a")
    return "; ".join(parts)

def suggest_balanced_meal(target_kcal_per_meal=600):
    prot = choose_protein_candidate()
    carb = choose_carb_candidate()
    veg = choose_veg_candidate()
    fat = choose_fat_source()
    servings = [150.0, 150.0, 100.0, 12.0]
    items = [prot, carb, veg, fat]
    adjusted_servings, ests, total_kcal = scaled_serving_to_meet_calorie_budget(items, servings.copy(), target_kcal_per_meal)
    lines = []
    lines.append(f"Suggested balanced meal (target ~{target_kcal_per_meal} kcal):")
    order = ["Protein", "Carb", "Vegetable", "Healthy fat"]
    for tag, it, sv, est in zip(order, items, adjusted_servings, ests):
        if it is None:
            lines.append(f"• {tag}: (no suitable item found in dataset)")
            continue
        kcal_str = f"{est['value_per_serving']} kcal" if est['value_per_serving'] is not None else "n/a"
        lines.append(f"• {tag}: {it['name'].title()} — {int(round(sv))} g — {kcal_str} (source: {est['source']}; confidence: {est['confidence']})")
        lines.append(f"   {format_macro_breakdown(it, sv)}")
    lines.append(f"Total estimated energy: {int(round(total_kcal))} kcal")
    lines.append("Notes: This meal aims to include lean protein, a complex carbohydrate, fiber-rich vegetables (proxy), and a small healthy fat portion. Adjust portions for individual needs.")
    return "\n".join(lines)

def suggest_food_pairing(base_food_name: str, target_kcal_per_meal=600):
    base_row, src, score = find_best_row(base_food_name)
    prot_val = safe_val(base_row, 'proteins_100g', np.nan) if base_row is not None else np.nan
    carb_val = safe_val(base_row, 'carbohydrates_100g', np.nan) if base_row is not None else np.nan
    fat_val = safe_val(base_row, 'fat_100g', np.nan) if base_row is not None else np.nan

    role = 'protein' if (not math.isnan(prot_val) and prot_val >= 12) else ('carb' if (not math.isnan(carb_val) and carb_val >= 15) else ('fat' if (not math.isnan(fat_val) and fat_val >= 10) else 'veg'))

    if role == 'protein':
        prot = base_row
        carb = choose_carb_candidate()
        veg = choose_veg_candidate()
        fat = choose_fat_source()
    elif role == 'carb':
        carb = base_row
        prot = choose_protein_candidate()
        veg = choose_veg_candidate()
        fat = choose_fat_source()
    elif role == 'fat':
        fat = base_row
        prot = choose_protein_candidate()
        carb = choose_carb_candidate()
        veg = choose_veg_candidate()
    else:
        veg = base_row
        prot = choose_protein_candidate()
        carb = choose_carb_candidate()
        fat = choose_fat_source()

    items = [prot, carb, veg, fat]
    default_servings = [150.0, 150.0, 100.0, 12.0]
    adjusted_servings, ests, total_kcal = scaled_serving_to_meet_calorie_budget(items, default_servings.copy(), target_kcal_per_meal)

    lines = []
    lines.append(f"Food pairing suggestion keeping overall calories ≲ {target_kcal_per_meal} kcal:")
    labels = ["Protein", "Carb", "Vegetable", "Healthy fat"]
    for lab, it, sv, est in zip(labels, items, adjusted_servings, ests):
        if it is None:
            lines.append(f"• {lab}: (no suitable item found)")
            continue
        kcal_str = int(round(est['value_per_serving'])) if est['value_per_serving'] else 'n/a'
        lines.append(f"• {lab}: {it['name'].title()} — {int(round(sv))} g — {kcal_str} kcal (source: {est['source']}; {est['confidence']})")
    lines.append(f"Total estimated energy: {int(round(total_kcal))} kcal")
    lines.append("Notes: Portions scaled to keep the meal relatively lower-calorie while including carbs, fiber-proxy (vegetable), protein, and a small amount of healthy fat. Adjust to taste and dietary needs.")
    return "\n".join(lines)

# ---------------------
# Parser helpers & Q/A answerer (integrated)
# ---------------------
NUTRIENT_MAP = {
    'calorie': 'energy_kcal_100g', 'calories': 'energy_kcal_100g', 'energy': 'energy_kcal_100g',
    'protein': 'proteins_100g', 'proteins': 'proteins_100g',
    'fat': 'fat_100g', 'fats': 'fat_100g',
    'carb': 'carbohydrates_100g', 'carbs': 'carbohydrates_100g', 'carbohydrate': 'carbohydrates_100g', 'carbohydrates': 'carbohydrates_100g',
    'sugar': 'sugars_100g', 'sugars': 'sugars_100g'
}

def parse_serving(question: str) -> float:
    m = re.search(r'(\d{1,4})\s*(g|grams?)\b', question.lower())
    if m:
        try:
            return float(m.group(1))
        except:
            return 100.0
    if 'medium' in question.lower():
        return 118.0
    return 100.0

def detect_nutrient(question: str) -> Optional[str]:
    for k in NUTRIENT_MAP.keys():
        if re.search(r'\b' + re.escape(k) + r'\b', question.lower()):
            return NUTRIENT_MAP[k]
    return None

def answer_question(question: str, allow_best_effort: bool = True, suggest_healthier: bool = True) -> str:
    q = question.strip()
    if not q:
        return "Please type a question about a food (e.g., 'How many calories in a banana?' or 'Compare sugar in banana vs apple')."
    q_lower = q.lower()

    # New intents: balanced meal
    if 'balanced meal' in q_lower or 'what foods should i eat for a balanced meal' in q_lower or 'balanced' in q_lower and 'meal' in q_lower:
        # optional: could parse target kcal from query; use default 600
        return suggest_balanced_meal(target_kcal_per_meal=600)

    # pairing intent
    if any(tok in q_lower for tok in ['pair', 'pairing', 'food pairing', 'pair with']):
        # try to find a base food
        base_phrase, base_row, src = find_food_in_question(q_lower)
        if base_phrase:
            return suggest_food_pairing(base_phrase, target_kcal_per_meal=600)
        else:
            # if no base food, return a general balanced meal
            return suggest_balanced_meal(target_kcal_per_meal=600)

    # comparison detection
    is_compare = any(tok in q_lower for tok in [' vs ', ' versus ', ' compare ', ' between ', ' which has more '])

    if is_compare:
        nutrient_col = detect_nutrient(q)
        if ' vs ' in q_lower or ' versus ' in q_lower:
            splitter = ' vs ' if ' vs ' in q_lower else ' versus '
            left_raw, right_raw = q_lower.split(splitter, 1)
            left_phrase, left_row, left_src = find_food_in_question(left_raw)
            right_phrase, right_row, right_src = find_food_in_question(right_raw)
        else:
            found = []
            remaining = q_lower
            for _ in range(2):
                phrase, row, src = find_food_in_question(remaining)
                if phrase:
                    found.append((phrase, row, src))
                    remaining = remaining.replace(phrase, '', 1)
            if len(found) >= 2:
                left_phrase, left_row, left_src = found[0]
                right_phrase, right_row, right_src = found[1]
            elif len(found) == 1:
                left_phrase, left_row, left_src = found[0]
                right_phrase, right_row, right_src = None, None, 'no_match'
            else:
                left_phrase, left_row, left_src = None, None, 'no_match'
                right_phrase, right_row, right_src = None, None, 'no_match'

        if left_phrase is None and right_phrase is None:
            return "Couldn't detect the two foods to compare. Try 'Compare sugar in banana vs apple' or 'Which has more protein, chicken or beef?'"

        if nutrient_col is None:
            nutrient_col = 'energy_kcal_100g'
        serv = parse_serving(q)

        if left_row is not None:
            a = get_nutrient_from_row(left_row, nutrient_col, serv) or estimate_nutrient(left_phrase, nutrient_col, None, serving_g=serv, allow_best_effort=allow_best_effort)
        else:
            a = estimate_nutrient(left_phrase or "unknown", nutrient_col, None, serving_g=serv, allow_best_effort=allow_best_effort)

        if right_row is not None:
            b = get_nutrient_from_row(right_row, nutrient_col, serv) or estimate_nutrient(right_phrase, nutrient_col, None, serving_g=serv, allow_best_effort=allow_best_effort)
        else:
            b = estimate_nutrient(right_phrase or "unknown", nutrient_col, None, serving_g=serv, allow_best_effort=allow_best_effort)

        def fmt(name, res):
            if res['value_per_serving'] is not None:
                return f"{name}: {res['value_per_serving']} {res['unit']} (source: {res['source']}; confidence: {res['confidence']})"
            else:
                return f"{name}: no reliable estimate (source: {res['source']}). Note: {res['note']}"

        name_a = left_phrase if left_phrase is not None else "Food A"
        name_b = right_phrase if right_phrase is not None else "Food B"
        lines = [fmt(name_a, a), fmt(name_b, b)]
        if (a['value_per_serving'] is not None) and (b['value_per_serving'] is not None):
            if a['value_per_serving'] > b['value_per_serving']:
                verdict = f"Result: {name_a} has more of the requested nutrient per {int(serv)}g."
                lines.append(verdict)
                if suggest_healthier:
                    lines.append(suggest_healthier_for(name_a, nutrient_col, serv, left_row))
            elif a['value_per_serving'] < b['value_per_serving']:
                verdict = f"Result: {name_b} has more of the requested nutrient per {int(serv)}g."
                lines.append(verdict)
                if suggest_healthier:
                    lines.append(suggest_healthier_for(name_b, nutrient_col, serv, right_row))
            else:
                lines.append("Result: Both have similar values for that nutrient.")
        else:
            lines.append("Comparison not possible with high confidence due to missing estimates.")
        return "\n".join(lines)

    # single-food queries / breakdown
    nutrient_col = detect_nutrient(q)
    serve = parse_serving(q)
    phrase, row, src = find_food_in_question(q)
    if phrase is not None:
        if nutrient_col:
            if row is not None:
                res = get_nutrient_from_row(row, nutrient_col, serve) or estimate_nutrient(phrase, nutrient_col, None, serving_g=serve, allow_best_effort=allow_best_effort)
            else:
                res = estimate_nutrient(phrase, nutrient_col, None, serving_g=serve, allow_best_effort=allow_best_effort)
            if res['value_per_serving'] is not None:
                reply = f"{phrase.title()} — {res['value_per_serving']} {res['unit']} (source: {res['source']}; confidence: {res['confidence']}).\nNote: {res['note']}"
                if suggest_healthier:
                    reply += "\n\n" + suggest_healthier_for(phrase, nutrient_col, serve, row)
                return reply
            else:
                return f"No reliable estimate for {phrase} ({res['source']}). {res['note']}"
        else:
            parts = []
            for k in NUMERIC_COLS:
                if row is not None:
                    res = get_nutrient_from_row(row, k, serve) or estimate_nutrient(phrase, k, None, serving_g=serve, allow_best_effort=allow_best_effort)
                else:
                    res = estimate_nutrient(phrase, k, None, serving_g=serve, allow_best_effort=allow_best_effort)
                if res['value_per_serving'] is not None:
                    parts.append(f"{k.replace('_100g','').replace('energy','calories').title()}: {res['value_per_serving']} {res['unit']} ({res['confidence']})")
                else:
                    parts.append(f"{k.replace('_100g','').replace('energy','calories').title()}: n/a")
            reply = f"{phrase.title()} — nutrient breakdown per {int(serve)}g:\n" + "\n".join(parts)
            if suggest_healthier:
                reply += "\n\n" + suggest_healthier_for(phrase, None, serve, row)
            return reply

    # general guidance fallback
    if any(tok in q_lower for tok in ["calorie per day", "calories per day", "how many calories", "healthy amount of calories", "weight loss"]):
        return "General guidance: For weight loss many people aim for a 300–500 kcal daily deficit from maintenance. Individual needs vary — consult a dietitian for personalized targets."

    return ("I couldn't parse a food name or nutrient from your question. Try formats like:\n"
            "- 'How many calories in a banana?'\n"
            "- 'How much protein in chicken breast per 100g?'\n"
            "- 'Compare sugar in banana vs apple'\n            " "Or use the 'Single food estimate' and 'Compare two foods' tabs for guided inputs.")

# ---------------------
# UI helpers (single estimate & compare)
# ---------------------
nutrient_options = [
    ("Calories (kcal)", "energy_kcal_100g"),
    ("Protein (g)", "proteins_100g"),
    ("Fat (g)", "fat_100g"),
    ("Carbohydrates (g)", "carbohydrates_100g"),
    ("Sugars (g)", "sugars_100g"),
]

def ui_estimate_nutrient(food_name: str, nutrient_label: str, serving: float, allow_best_effort: bool, suggest_flag: bool):
    nut_col = dict(nutrient_options)[nutrient_label]
    row, _, _ = find_best_row(food_name)
    if row is not None:
        res = get_nutrient_from_row(row, nut_col, serving) or estimate_nutrient(food_name, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)
    else:
        res = estimate_nutrient(food_name, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)
    if res['value_per_serving'] is not None:
        reply = f"{food_name} — {res['value_per_serving']} {res['unit']} ({res['source']}; confidence: {res['confidence']})\nNote: {res['note']}"
        if suggest_flag:
            reply += "\n\n" + suggest_healthier_for(food_name, nut_col, serving, row)
        return reply
    else:
        return f"{food_name} — No reliable estimate. Source: {res['source']}. Note: {res['note']}"

def ui_compare_two(food_a: str, food_b: str, nutrient_label: str, serving: float, allow_best_effort: bool, suggest_flag: bool):
    nut_col = dict(nutrient_options)[nutrient_label]
    row_a, _, _ = find_best_row(food_a)
    row_b, _, _ = find_best_row(food_b)
    if row_a is not None:
        a = get_nutrient_from_row(row_a, nut_col, serving) or estimate_nutrient(food_a, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)
    else:
        a = estimate_nutrient(food_a, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)
    if row_b is not None:
        b = get_nutrient_from_row(row_b, nut_col, serving) or estimate_nutrient(food_b, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)
    else:
        b = estimate_nutrient(food_b, nut_col, None, serving_g=serving, allow_best_effort=allow_best_effort)

    lines = []
    for name, r, row in [(food_a, a, row_a), (food_b, b, row_b)]:
        if r['value_per_serving'] is not None:
            lines.append(f"{name}: {r['value_per_serving']} {r['unit']} ({r['source']}; {r['confidence']})")
        else:
            lines.append(f"{name}: no estimate ({r['source']}) - {r['note']}")
    if a['value_per_serving'] is not None and b['value_per_serving'] is not None:
        if a['value_per_serving'] > b['value_per_serving']:
            cmp = f"{food_a} has more {nutrient_label} per {serving}g than {food_b}."
            lines.append(cmp)
            if suggest_flag:
                lines.append(suggest_healthier_for(food_a, nut_col, serving, row_a))
        elif a['value_per_serving'] < b['value_per_serving']:
            cmp = f"{food_b} has more {nutrient_label} per {serving}g than {food_a}."
            lines.append(cmp)
            if suggest_flag:
                lines.append(suggest_healthier_for(food_b, nut_col, serving, row_b))
        else:
            lines.append("Both have similar values for that nutrient.")
    else:
        lines.append("Comparison not possible due to missing estimates.")
    return "\n".join(lines)





In [67]:
# =========================================================================
# NEW: LLM GENERATION STEP (Simulated Placeholder)
# *REPLACE THIS WITH YOUR ACTUAL PHI-3-MINI CODE*
# =========================================================================

def llm_generation_step(raw_query: str, factual_context: str) -> str:
    """
    Placeholder for the LLM call. In a real environment, this sends the
    context and query to the LLM (e.g., Phi-3, Gemma, Llama).

    The LLM's job is to read the structured 'factual_context' and convert
    it into a friendly, conversational, and helpful response.
    """
    # Check for retrieval failure
    if "Cannot find a reliable estimate" in factual_context:
        return "I apologize, but I couldn't find reliable data for that specific request in my food knowledge base. Could you try a more common food or a general question about nutrition?"

    # LLM conversational processing for success cases
    if "Comparison" in factual_context or "Nutrition Breakdown" in factual_context:
        
        # 1. LLM reads the facts using regex (the 'Retrieval' step)
        facts = re.findall(r'\*\*(.+?)\*\*:\s*(.+?)\s*\(Source:', factual_context)
        
        # 2. LLM adds conversational lead-in
        response = f"That's a great question about **{raw_query.title()}**! Here is the personalized breakdown you asked for:\n\n"
        
        # 3. LLM re-presents the facts clearly
        if facts:
             response += "### 📊 Key Nutrient Facts:\n"
             for item, value_unit in facts:
                 # Clean up the nutrient name for better presentation
                 response += f"* **{item.split(' for ')[0].title()}**: {value_unit}\n"
        
        # 4. LLM integrates the advice and meal ideas (already generated by rule-based system)
        advice_match = re.search(r'(Food pairing suggestion|Suggested balanced meal|Suggestion: This item is relatively high in calories)', factual_context, re.DOTALL)
        if advice_match:
            response += "\n### 🍽️ Meal Integration & Advice:\n"
            advice = factual_context[advice_match.start():].replace('Notes:', '**Quick Notes:**').replace('Suggestion:', '**CraveBot Tip:**')
            # Remove detailed source notes before presenting to user
            advice = re.sub(r'\(Source:.*?Confidence:.*?\)', '', advice)
            response += advice.strip()
            
        # 5. LLM adds a final, encouraging closing
        response += "\n\nI hope this helps! Do you want to know about cooking methods or a healthier snack idea next?"
        
        return response.replace("Total estimated energy:", "**Total Meal Energy:**")
        
    return factual_context # Fallback if structure isn't recognized

# Final Code

In [1]:
# =======================================================
# CRAVEBOT — FINAL HYBRID RAG CHATBOT
# Combines Dataset-First Fuzzy Retrieval with LLM Generation
#
# NOTE: The LLM is SIMULATED in 'llm_generation_step' for execution
# in this environment. Replace it with a real LLM API call (e.g., Phi-3)
# when deploying.
# =======================================================
import re
import subprocess
import sys
from typing import Optional, Tuple, List
import gradio as gr
import pandas as pd
import numpy as np
import time
import math
# ---------------------
# Rapidfuzz install (Kaggle-friendly) with difflib fallback
# ---------------------
USE_RAPIDFUZZ = False
try:
    from rapidfuzz import process, fuzz
    USE_RAPIDFUZZ = True
except Exception:
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", "rapidfuzz"], stdout=subprocess.DEVNULL)
        from rapidfuzz import process, fuzz
        USE_RAPIDFUZZ = True
    except Exception:
        USE_RAPIDFUZZ = False
if not USE_RAPIDFUZZ:
    from difflib import get_close_matches
    def process_extractOne_adapter(q, choices, scorer=None):
        matches = get_close_matches(q, choices, n=1, cutoff=0.6)
        if matches:
            m = matches[0]
            # Simulate (matched_string, score, index) tuple structure
            return (m, 90, choices.index(m))
        return None
    process_extractOne = process_extractOne_adapter
    fuzz_WRatio = None
else:
    process_extractOne = process.extractOne
    fuzz_WRatio = fuzz.WRatio
# ---------------------
# CONFIG
# ---------------------
# !!! UPDATE THIS PATH TO YOUR ACTUAL FOOD DATA FILE !!!
FOOD_DATA_PATH = "/kaggle/input/food-cleaned-data/food_cleaned_clean.parquet"
FUZZY_CUTOFF_SCORE = 70
# ---------------------
# Helpers: normalize, load data
# ---------------------
def normalize_name(s: str) -> str:
    if s is None:
        return ""
    s = str(s).lower().strip()
    s = re.sub(r'[^a-z0-9\s\-]', '', s)
    s = re.sub(r'\s+', ' ', s)
    return s
def load_food_data(path: str) -> pd.DataFrame:
    try:
        df = pd.read_parquet(path)
        print(f"Successfully loaded {len(df)} records from {path}.")
    except Exception as e:
        print(f"WARN: Could not load data from {path}. Using small fallback sample. Error: {e}")
        # fallback small sample dataset
        df = pd.DataFrame([
            {"name": "banana", "energy_kcal_100g": 89, "proteins_100g": 1.1, "fat_100g": 0.3, "carbohydrates_100g": 22.8, "sugars_100g": 12.2, "category": "fruit"},
            {"name": "chicken breast", "energy_kcal_100g": 165, "proteins_100g": 31.0, "fat_100g": 3.6, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
            {"name": "mutton", "energy_kcal_100g": 0.0, "proteins_100g": 20.0, "fat_100g": 15.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
            {"name": "ghee", "energy_kcal_100g": 897, "proteins_100g": 0.0, "fat_100g": 99.7, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "fat"},
            {"name": "olive oil", "energy_kcal_100g": 884, "proteins_100g": 0.0, "fat_100g": 91.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "oil"},
            {"name": "avocado", "energy_kcal_100g": 160, "proteins_100g": 2.0, "fat_100g": 15.0, "carbohydrates_100g": 9.0, "sugars_100g": 0.7, "category": "fruit"},
            {"name": "potato", "energy_kcal_100g": 77, "proteins_100g": 2.0, "fat_100g": 0.1, "carbohydrates_100g": 17.5, "sugars_100g": 0.8, "category": "vegetable"},
            {"name": "bell pepper", "energy_kcal_100g": 31, "proteins_100g": 1.0, "fat_100g": 0.3, "carbohydrates_100g": 6.0, "sugars_100g": 4.2, "category": "vegetable"},
            {"name": "kiwi", "energy_kcal_100g": 61, "proteins_100g": 1.1, "fat_100g": 0.5, "carbohydrates_100g": 14.7, "sugars_100g": 8.9, "category": "fruit"},
            {"name": "rice", "energy_kcal_100g": 130, "proteins_100g": 2.7, "fat_100g": 0.3, "carbohydrates_100g": 28.0, "sugars_100g": 0.1, "category": "grain"},
            {"name": "beef, lean", "energy_kcal_100g": 250, "proteins_100g": 26.0, "fat_100g": 15.0, "carbohydrates_100g": 0.0, "sugars_100g": 0.0, "category": "meat"},
        ])
    # ensure needed columns exist
    for col in ['name','energy_kcal_100g','proteins_100g','fat_100g','carbohydrates_100g','sugars_100g','category']:
        if col not in df.columns:
            df[col] = np.nan
    df['name'] = df['name'].astype(str)
    df['_name_lc'] = df['name'].str.lower()
    df['_name_norm'] = df['name'].apply(normalize_name)
    return df
food_data = load_food_data(FOOD_DATA_PATH)
# ---------------------
# Precompute structures & medians (Rule Engine Knowledge)
# ---------------------
_names_list = food_data['name'].fillna("").astype(str).tolist()
_name_to_index = {normalize_name(n): i for i, n in enumerate(_names_list)}
NUMERIC_COLS = ['energy_kcal_100g','proteins_100g','fat_100g','carbohydrates_100g','sugars_100g']
_global_medians = {}
for c in NUMERIC_COLS:
    try:
        vals = pd.to_numeric(food_data[c], errors='coerce').dropna()
        _global_medians[c] = float(vals.median()) if not vals.empty else None
    except Exception:
        _global_medians[c] = None
_category_medians = {}
if 'category' in food_data.columns:
    try:
        grp = food_data.groupby(food_data['category'].fillna("")).median(numeric_only=True)
        for cat, row in grp.iterrows():
            _category_medians[str(cat).lower()] = {c: float(row[c]) for c in NUMERIC_COLS if pd.notna(row.get(c, np.nan))}
    except Exception:
        _category_medians = {}
_estimate_cache = {}
# ---------------------
# Basic numeric helpers
# ---------------------
def safe_val(row, col, default=np.nan):
    if row is None:
        return default
    v = row.get(col, np.nan)
    return float(v) if pd.notna(v) else default
def is_valid_numeric(x) -> bool:
    try:
        if x is None:
            return False
        xv = float(x)
        return xv > 0.0
    except Exception:
        return False
# ---------------------
# Nutrient extraction from row (treating 0/neg as missing)
# ---------------------
def get_nutrient_from_row(row: Optional[pd.Series], nutrient_col: str, serving_g: float):
    if row is None:
        return None
    raw = row.get(nutrient_col, np.nan)
    if not is_valid_numeric(raw):
        return None
    per100 = float(raw)
    val = per100 * (serving_g / 100.0)
    unit = 'kcal' if 'energy' in nutrient_col else 'g'
    return {'value_per_serving': round(val, 2), 'unit': unit, 'source': 'matched_row', 'confidence': 'high',
            'note': f"From dataset entry '{row['name']}' ({nutrient_col} per 100g = {per100})."}
# ---------------------
# Improved suggestion helpers (ensure positive kcal)
# ---------------------
_CURATED_LOW_CAL = [
    ("Cucumber", 16), ("Lettuce", 15), ("Spinach", 23), ("Broccoli", 34),
    ("Tomato", 18), ("Strawberry", 32), ("Bell pepper", 31),
]
def find_low_energy_alternatives(category: Optional[str],
                                 max_kcal_per100: float = 100.0,
                                 top_n: int = 3,
                                 exclude_name: Optional[str] = None) -> List[dict]:
    candidates: List[dict] = []
    excluded_norm = normalize_name(exclude_name) if exclude_name else None
    def add_rows_from_df(df_sub):
        for _, r in df_sub.iterrows():
            candidates.append(r.to_dict())
    # 1) category-first
    if category:
        cat_key = str(category).lower()
        sub = food_data[food_data['category'].fillna("").str.lower() == cat_key].copy()
        if not sub.empty:
            sub = sub[sub['energy_kcal_100g'].notna()]
            sub = sub[sub['energy_kcal_100g'] > 0]
            if excluded_norm:
                sub = sub[sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
            sub = sub[sub['energy_kcal_100g'] <= max_kcal_per100]
            if not sub.empty:
                add_rows_from_df(sub.sort_values('energy_kcal_100g').head(top_n))
    # 2) global strict threshold
    if len(candidates) < top_n:
        global_sub = food_data[food_data['energy_kcal_100g'].notna()].copy()
        global_sub = global_sub[global_sub['energy_kcal_100g'] > 0]
        if excluded_norm:
            global_sub = global_sub[global_sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
        global_sub = global_sub[global_sub['energy_kcal_100g'] <= max_kcal_per100]
        add_rows_from_df(global_sub.sort_values('energy_kcal_100g'))
    # 3) relax threshold progressively
    if len(candidates) < top_n:
        relax_steps = [max_kcal_per100 * 1.5, max_kcal_per100 * 2.0, 250]
        for thr in relax_steps:
            global_sub = food_data[food_data['energy_kcal_100g'].notna()].copy()
            global_sub = global_sub[global_sub['energy_kcal_100g'] > 0]
            if excluded_norm:
                global_sub = global_sub[global_sub['name'].apply(lambda n: normalize_name(n) != excluded_norm)]
            global_sub = global_sub[global_sub['energy_kcal_100g'] <= thr]
            add_rows_from_df(global_sub.sort_values('energy_kcal_100g'))
            if len(candidates) >= top_n:
                break
    # 4) curated fallback
    if len(candidates) == 0:
        for name, kcal in _CURATED_LOW_CAL:
            if exclude_name and normalize_name(name) == normalize_name(exclude_name):
                continue
            candidates.append({'name': name, 'energy_kcal_100g': kcal})
            if len(candidates) >= top_n:
                break
    # final filter: ensure positive energy and unique names
    seen = set()
    final = []
    for r in candidates:
        try:
            kcal = float(r.get('energy_kcal_100g', 0))
            nm = r.get('name', '')
            if kcal > 0 and nm not in seen:
                final.append(r)
                seen.add(nm)
        except Exception:
            continue
   
    return final[:top_n]
def generate_meal_suggestion(target_kcal: float, food_items: List[Tuple[str, float]]) -> str:
    # This function provides a hardcoded, rule-based meal balance suggestion
    suggestion = "Suggested balanced meal (Rule-based):\n\n"
   
    # Simple macros estimation for the provided foods
    total_energy = 0.0
    macros = {'Proteins': 0.0, 'Fat': 0.0, 'Carbohydrates': 0.0}
   
    for item_name, serving_g in food_items:
        row = find_best_row(item_name, strict_match=True, min_score=95) # Strict match for meal items
        if row is not None:
            energy = get_nutrient_from_row(row, 'energy_kcal_100g', serving_g)
            protein = get_nutrient_from_row(row, 'proteins_100g', serving_g)
            fat = get_nutrient_from_row(row, 'fat_100g', serving_g)
            carbs = get_nutrient_from_row(row, 'carbohydrates_100g', serving_g)
           
            if energy and energy['value_per_serving'] > 0:
                total_energy += energy['value_per_serving']
                macros['Proteins'] += protein['value_per_serving'] if protein else 0
                macros['Fat'] += fat['value_per_serving'] if fat else 0
                macros['Carbohydrates'] += carbs['value_per_serving'] if carbs else 0
               
                suggestion += f"* **{item_name.title()}** ({int(serving_g)}g): {energy['value_per_serving']:.0f} kcal\n"
        else:
             suggestion += f"* **{item_name.title()}** ({int(serving_g)}g): Data not found (Skipped)\n"
    suggestion += f"\nTotal estimated energy: {total_energy:.0f} kcal (Target: {target_kcal:.0f} kcal)\n"
    suggestion += f"Total estimated Macros: P={macros['Proteins']:.1f}g, F={macros['Fat']:.1f}g, C={macros['Carbohydrates']:.1f}g\n"
   
    # Rule-based advice on balance
    P_percent = (macros['Proteins'] * 4 / total_energy) * 100 if total_energy > 0 else 0
    C_percent = (macros['Carbohydrates'] * 4 / total_energy) * 100 if total_energy > 0 else 0
    F_percent = (macros['Fat'] * 9 / total_energy) * 100 if total_energy > 0 else 0
    suggestion += "\nNotes: "
    if 20 <= P_percent <= 30 and 40 <= C_percent <= 60 and F_percent <= 35:
        suggestion += "This meal is **well-balanced** according to standard macro guidelines (20-30% P, 40-60% C, <35% F). "
    elif P_percent < 20:
        suggestion += "Consider adding a small side of a high-protein item (like beans or Greek yogurt) to boost protein balance. "
    elif F_percent > 35:
        suggestion += "The fat content is high. Consider reducing the serving size of high-fat components (e.g., oil, fatty meat). "
    else:
        suggestion += "Macros are slightly off typical balance. Adjusting protein or carb source size may help. "
       
    suggestion += f"(P={P_percent:.0f}%, C={C_percent:.0f}%, F={F_percent:.0f}%)\n"
   
    return suggestion
# ---------------------
# Core Retrieval Logic
# ---------------------
def find_best_row(query: str,
                  min_score: int = FUZZY_CUTOFF_SCORE,
                  strict_match: bool = False) -> Optional[pd.Series]:
    """Finds the best matching food item using fuzzy matching."""
    norm_query = normalize_name(query)
   
    # Try normalized names first
    if norm_query in _name_to_index:
        idx = _name_to_index[norm_query]
        return food_data.iloc[idx]
    # Fuzzy matching on all names
    if not _names_list:
        return None
    match = process_extractOne(norm_query, _names_list, scorer=fuzz_WRatio, score_cutoff=min_score)
   
    if match:
        matched_name, score, index = match
        if strict_match and score < 95:
            return None # Fail strict check
       
        # Get the row using the index from the original dataframe
        # Note: The 'index' from process.extractOne is the position in _names_list,
        # which corresponds to the dataframe index *if* the dataframe hasn't been re-indexed.
        # We rely on the original index here.
        return food_data.iloc[index]
   
    return None
def estimate_nutrient(query: str,
                      nutrient_col: str,
                      serving_g: float = 100.0,
                      allow_best_effort: bool = False) -> dict:
    """Estimates nutrient content for a query, using imputation if necessary."""
   
    # 1. Direct Retrieval
    row = find_best_row(query)
    if row is not None:
        result = get_nutrient_from_row(row, nutrient_col, serving_g)
        if result is not None:
            return result
   
    # 2. Imputation (Best Effort)
    if allow_best_effort:
        # Category Median Imputation
        category = safe_val(row, 'category', 'none')
        cat_median = _category_medians.get(str(category).lower(), {}).get(nutrient_col)
       
        if is_valid_numeric(cat_median):
            val_per_serving = cat_median * (serving_g / 100.0)
            unit = 'kcal' if 'energy' in nutrient_col else 'g'
            return {'value_per_serving': round(val_per_serving, 2),
                    'unit': unit,
                    'source': 'imputation_category_median',
                    'confidence': 'medium',
                    'note': f"Based on median value for category '{category}' ({nutrient_col} per 100g = {cat_median})."}
       
        # Global Median Imputation (Last resort)
        global_median = _global_medians.get(nutrient_col)
        if is_valid_numeric(global_median):
            val_per_serving = global_median * (serving_g / 100.0)
            unit = 'kcal' if 'energy' in nutrient_col else 'g'
            return {'value_per_serving': round(val_per_serving, 2),
                    'unit': unit,
                    'source': 'imputation_global_median',
                    'confidence': 'low',
                    'note': f"Based on global median value for all foods ({nutrient_col} per 100g = {global_median})."}
    
    return {'value_per_serving': 0.0,
            'unit': 'N/A',
            'source': 'failed_retrieval',
            'confidence': 'none',
            'note': f"Cannot find a reliable estimate for '{query}' in {nutrient_col}."}

def answer_question(question: str,
                    allow_best_effort: bool = False,
                    suggest_healthier: bool = False) -> str:
    """
    Core function that parses the question, retrieves data, and formats
    the factual, structured output (the RAG 'Context').
    """
    question_lower = question.lower()
    output = ""
   
    #  1. Serving Size Parsing (e.g., '150g chicken') 
    serving_match = re.search(r'(\d+)\s*(g|grams|ounce|oz)', question_lower)
    serving_g = 100.0
    if serving_match:
        qty = float(serving_match.group(1))
        unit = serving_match.group(2)
        if unit in ['oz', 'ounce']:
            serving_g = qty * 28.35 # Convert ounces to grams
        else:
            serving_g = qty
        output += f"**Serving Size Detected:** {serving_g:.0f}g. "
        question_lower = question_lower.replace(serving_match.group(0), '').strip()
   
    # --- 2. Comparison Logic (e.g., 'compare protein in A vs B') ---
    comparison_match = re.search(r'compare\s+(.*?)\s+in\s+(.*?)\s+vs\s+(.*)', question_lower)
    if comparison_match:
        nutrient_query = comparison_match.group(1).lower()
        item1_query = comparison_match.group(2).strip()
        item2_query = comparison_match.group(3).strip()
       
        nutrient_col_map = {'protein': 'proteins_100g', 'fat': 'fat_100g', 'carb': 'carbohydrates_100g', 'sugar': 'sugars_100g', 'calorie': 'energy_kcal_100g', 'energy': 'energy_kcal_100g'}
       
        # Find best nutrient column match
        best_nut = process_extractOne(nutrient_query, nutrient_col_map.keys(), scorer=fuzz_WRatio, score_cutoff=80)
       
        if best_nut:
            nutrient_name = best_nut[0]
            nutrient_col = nutrient_col_map[nutrient_name]
           
            # Retrieve data for both items
            item1_res = estimate_nutrient(item1_query, nutrient_col, serving_g, allow_best_effort)
            item2_res = estimate_nutrient(item2_query, nutrient_col, serving_g, allow_best_effort)
           
            val1 = item1_res['value_per_serving']
            val2 = item2_res['value_per_serving']
            unit = item1_res['unit']
           
            output += f"### ⚖️ Comparison of {nutrient_name.title()} ({serving_g:.0f}{unit} serving)\n"
            output += f"**{item1_query.title()}**: {val1:.1f}{unit} (Source: {item1_res['source']} | Confidence: {item1_res['confidence']})\n"
            output += f"**{item2_query.title()}**: {val2:.1f}{unit} (Source: {item2_res['source']} | Confidence: {item2_res['confidence']})\n"
           
            if val1 > val2:
                winner = item1_query
            elif val2 > val1:
                winner = item2_query
            else:
                winner = "They are approximately equal"
               
            output += f"\n**Conclusion:** {winner.title()} has more {nutrient_name}.\n"
            output += f"Source Notes for {item1_query}: ({item1_res['note']})\n"
            output += f"Source Notes for {item2_query}: ({item2_res['note']})"
            return output
           
    # 3. Meal Suggestion Logic (e.g., 'I have rice and chicken for 500 kcal') ---
    meal_items_match = re.search(r'i\s+have\s+(.*?)\s+and\s+(.*?)\s+.*?(\d+)\s*kcal', question_lower)
    if meal_items_match:
        # Simple extraction assumes two main items for a basic rule
        item1_name = meal_items_match.group(1).strip().replace('some ', '')
        item2_name = meal_items_match.group(2).strip().replace('some ', '')
        target_kcal = float(meal_items_match.group(3))
        # Hardcoded serving sizes for simplicity, assuming a standard meal balance
        # In a real app, this would be an optimization/planning step
        serving_item1 = 150.0 # e.g., for rice/carb
        serving_item2 = 120.0 # e.g., for chicken/protein
       
        output += f"### 🍽️ Suggested balanced meal for {target_kcal:.0f} kcal\n"
       
        # Add a vegetable/fiber component as a rule
        # Simple rule: add broccoli as the 'missing' component
       
        meal_components = [
            (item1_name, serving_item1),
            (item2_name, serving_item2),
            ("Broccoli", 100.0) # Rule-based addition of a low-cal veggie
        ]
       
        output += generate_meal_suggestion(target_kcal, meal_components)
        return output
   
    # 4. Single Food Lookup (General Nutrient Lookup) ---
    nutrient_list = [c.split('_')[0] for c in NUMERIC_COLS]
   
    # Try to find an exact food item first
    food_row = find_best_row(question, strict_match=False, min_score=FUZZY_CUTOFF_SCORE)
   
    if food_row is not None:
        food_name = food_row['name']
        food_category = safe_val(food_row, 'category', 'N/A')
       
        output += f"### 📋 Nutrition Breakdown for {food_name.title()} ({serving_g:.0f}g serving)\n"
       
        # Determine which nutrients the user wants, or get all of them
        nutrients_to_report = []
        user_wants_all = True
       
        for n in nutrient_list:
            if n.lower() in question_lower:
                nutrients_to_report.append(n)
                user_wants_all = False
       
        if user_wants_all:
            nutrients_to_report = ['energy', 'proteins', 'fat', 'carbohydrates'] # Standard MACRO report
           
        # Retrieve and format all requested nutrients
        high_kcal_flag = False
        for n in nutrients_to_report:
            col_name = f'{n}_100g' if n != 'energy' else 'energy_kcal_100g'
           
            res = estimate_nutrient(food_name, col_name, serving_g, allow_best_effort)
            val = res['value_per_serving']
            unit = res['unit']
           
            output += f"**{n.title()} for {food_name}**: {val:.1f}{unit} (Source: {res['source']} | Confidence: {res['confidence']})\n"
            output += f"Source Notes: ({res['note']})\n"
            if n == 'energy' and safe_val(food_row, col_name, 0.0) > 400: # Rule: Check for high kcal/100g
                 high_kcal_flag = True
       
        # --- 5. Rule-Based Advice/Suggestion ---
        if suggest_healthier:
            output += "\n---\n"
            if high_kcal_flag:
                output += "Suggestion: This item is relatively high in calories/fat (>400 kcal/100g). Avoid consuming large portions most of the time. "
               
                # Retrieve alternatives
                alternatives = find_low_energy_alternatives(food_category,
                                                            max_kcal_per100=100,
                                                            exclude_name=food_name)
               
                if alternatives:
                    output += "Consider these lower-calorie alternatives from the same category or globally:\n"
                    for alt in alternatives:
                        output += f"* {alt['name']} ({alt['energy_kcal_100g']:.0f} kcal/100g)\n"
           
            # Rule for high-carb item pairing
            elif 'rice' in food_name.lower() or 'potato' in food_name.lower():
                output += "Food pairing suggestion: This is primarily a carbohydrate source. Pair it with a lean protein (e.g., chicken breast, fish) and a vegetable (e.g., spinach, broccoli) for a complete meal.\n"
               
            elif 'avocado' in food_name.lower():
                output += "Food pairing suggestion: Avocado is high in healthy fats. Pair it with a source of complex carbs (e.g., whole-grain toast) and a protein (e.g., egg or chickpeas) for a balanced macro profile.\n"
        return output
       
#  6. Final Fallback 
    if allow_best_effort:
        # Try a global energy estimate if no food was matched
        energy_res = estimate_nutrient(question_lower, 'energy_kcal_100g', 100.0, allow_best_effort)
        if energy_res['source'] != 'failed_retrieval':
            output += f"Could not match a specific food item, but based on median data, a generic item might contain {energy_res['value_per_serving']:.0f} kcal per 100g.\n"
            output += f"Source: {energy_res['source']} | Note: ({energy_res['note']})"
            return output
   
    return f"Cannot find a reliable estimate for '{question}'. Please try a different food item or rephrase your question. (Confidence: None)"

# LLM GENERATION STEP (Simulated Placeholder) - SYNTAX CORRECTED

def llm_generation_step(raw_query: str, factual_context: str) -> str:
    """
    Placeholder for the LLM call. In a real environment, this sends the
    context and query to the LLM (e.g., Phi-3, Gemma, Llama).
    The LLM's job is to read the structured 'factual_context' and convert
    it into a friendly, conversational, and helpful response.
    """
    # Check for retrieval failure
    if "Cannot find a reliable estimate" in factual_context:
        return "I apologize, but I couldn't find reliable data for that specific request in my food knowledge base. Could you try a more common food or a general question about nutrition?"
    # LLM conversational processing for success cases
    if "Comparison" in factual_context or "Nutrition Breakdown" in factual_context or "Suggested balanced meal" in factual_context:
       
        # 1. LLM reads the facts using regex (the 'Retrieval' step)
        facts = re.findall(r'\*\*([^:]+?)\*\*:\s*(.+?)\s*\(Source:', factual_context)
       
        # 2. LLM adds conversational lead-in
        response = f"That's a great question about **{raw_query.title()}**! Here is the personalized breakdown you asked for:\n\n"
       
        # 3. LLM re-presents the facts clearly
        if facts:
             response += "### 📊 Key Nutrient Facts:\n"
             for item, value_unit in facts:
                 response += f"* **{item.split(' for ')[0].title()}**: {value_unit}\n"
       
        # 4. LLM integrates the advice and meal ideas (already generated by rule-based system)
        advice_match = re.search(r'(Food pairing suggestion|Suggested balanced meal|Suggestion: This item is relatively high in calories|Notes:)', factual_context, re.DOTALL)
        if advice_match:
            response += "\n### 🍽️ Meal Integration & Advice:\n"
            advice = factual_context[advice_match.start():]
           
            advice = advice.replace('Notes:', '**Quick Notes:**').replace('Suggestion:', '**CraveBot Tip:**')
            advice = advice.replace('Total estimated energy:', '**Total Meal Energy:**')
            advice = advice.replace('Total estimated Macros:', '**Total Macros:**')
           
            advice = re.sub(r'\(Source:.*?Confidence:.*?\)', '', advice)
            advice = re.sub(r'Source Notes: \(.*?\)', '', advice)
           
            response += advice.strip()
           
        # 5. LLM adds a final, encouraging closing
        response += "\n\nI hope this helps! Do you want to know about cooking methods or a healthier snack idea next?\n[Image of a balanced plate of food]\n"
       
        # Return the generated response
        return response.replace("Total estimated energy:", "**Total Meal Energy:**")
       
    # Fallback if structure isn't recognized
    return factual_context
# MODIFIED: Gradio Interface Execution

def run_qa(question):
    start_time = time.time()
    try:
        # Step 1: Retrieval/Rule-based processing (Your existing core logic)
        # This produces the auditable, factual context
        factual_context = answer_question(question, allow_best_effort=True, suggest_healthier=True)
       
        # Step 2 & 3: LLM Augmentation and Generation
        # The LLM converts the factual context into a conversational response
        llm_response = llm_generation_step(question, factual_context)
       
        # Final output
        response = llm_response
        response += f"\n\n---"
        response += f"\n(Execution Time: {time.time() - start_time:.2f} seconds | Fuzzy Match Lib: {'Rapidfuzz' if USE_RAPIDFUZZ else 'Difflib'} | **Augmented by LLM**)"
        return response
    except Exception as e:
        return f"An error occurred during RAG pipeline: {e}\n\n(Execution Time: {time.time() - start_time:.2f} seconds | Fuzzy Match Lib: {'Rapidfuzz' if USE_RAPIDFUZZ else 'Difflib'})"


Successfully loaded 3827820 records from /kaggle/input/food-cleaned-data/food_cleaned_clean.parquet.


In [2]:
# Gradio Interface 

if __name__ == '__main__':
    with gr.Blocks(title="CraveBot Final RAG App") as demo:
        gr.Markdown("# CraveBot: Hybrid RAG Chatbot (LLM-Augmented)")
        gr.Markdown(
            "This system uses fast fuzzy matching to *retrieve* data, then uses an LLM (simulated here) to *generate* a conversational, helpful response."
            f"<br>Loaded **{len(food_data)}** food items. Fuzzy matching is {'**ACTIVE**' if USE_RAPIDFUZZ else '**FALLBACK** (slower)'}."
        )
       
        with gr.Row():
            question_box = gr.Textbox(label="Your Question", lines=3, placeholder="e.g., 'How many calories in a 150g cooked chicken breast?' or 'Compare protein in banana vs kiwi'")
           
        with gr.Row():
            run_button = gr.Button("Ask CraveBot (RAG)", variant="primary")
           
        output_text = gr.Markdown(label="CraveBot Response")
       
        run_button.click(run_qa, inputs=[question_box], outputs=[output_text])
        gr.Examples(
            [
                "What is the fat, protein, and carb content of olive oil?",
                "Compare the calories in mutton vs beef (lean).",
                "I have some rice and chicken. What is a balanced meal suggestion for 500 kcal?",
                "What should I pair with avocado for a complete meal?",
                "Which has more sugar, banana or kiwi?",
                "Give me a balanced meal.",
            ],
            inputs=question_box
        )
    demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://cd2f130eb4019ef03f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
