In [None]:
!pip install transformers torch scikit-learn pandas numpy



Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd

# Location of the food-allergen CSV; read it into the working frame and
# preview the first rows as the cell's output.
DATASET_PATH = "//foodAllergenDataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Food Item,Ingredients,Allergens,Risk Level,Alternative Suggestions
0,Almond Milk,"Almonds, Water, Sugar",Nuts,High,"Oat Milk, Coconut Milk"
1,Apple Pie,"Apples, Flour, Butter, Eggs","Gluten, Dairy, Eggs",Medium,Gluten-Free Apple Pie
2,Peanut Butter,"Peanuts, Salt, Sugar",Peanuts,High,Sunflower Seed Butter
3,Scrambled Eggs,"Eggs, Butter, Salt","Eggs, Dairy",High,Tofu Scramble
4,Cheeseburger,"Beef, Cheese, Bun, Lettuce","Dairy, Gluten",High,Lettuce-Wrapped Burger


In [None]:
from sklearn.preprocessing import LabelEncoder

# Report how many values are missing in each column
print(df.isnull().sum())

# Replace any missing value with an explicit "Unknown" placeholder.
# Rebinding df (instead of inplace=True) keeps the data flow explicit.
df = df.fillna("Unknown")

# Encode Risk Level as integers. LabelEncoder numbers the classes in sorted
# order, so for the labels in this dataset: High → 0, Medium → 2
# (Low, if present, → 1). encoder.classes_ holds the authoritative mapping and
# encoder.inverse_transform() converts predictions back to the label names.
encoder = LabelEncoder()
df["Risk Level"] = encoder.fit_transform(df["Risk Level"])

# Show updated dataset
df.head()


Food Item                  0
Ingredients                0
Allergens                  1
Risk Level                 0
Alternative Suggestions    0
dtype: int64


Unnamed: 0,Food Item,Ingredients,Allergens,Risk Level,Alternative Suggestions
0,Almond Milk,"Almonds, Water, Sugar",Nuts,0,"Oat Milk, Coconut Milk"
1,Apple Pie,"Apples, Flour, Butter, Eggs","Gluten, Dairy, Eggs",2,Gluten-Free Apple Pie
2,Peanut Butter,"Peanuts, Salt, Sugar",Peanuts,0,Sunflower Seed Butter
3,Scrambled Eggs,"Eggs, Butter, Salt","Eggs, Dairy",0,Tofu Scramble
4,Cheeseburger,"Beef, Cheese, Bun, Lettuce","Dairy, Gluten",0,Lettuce-Wrapped Burger


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# The tokenizer and the encoder weights are loaded from the same pretrained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Turn an ingredient string into a BERT feature vector
def get_bert_embedding(text):
    """Return the [CLS] hidden state of ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 128 tokens) and passed through the module-level ``model``
    with gradient tracking disabled.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0, :]  # sequence position 0 is the [CLS] token
    return cls_vector.squeeze().numpy()


In [None]:
# Embed every entry of the Ingredients column, then stack the vectors into one array
embedded_rows = list(map(get_bert_embedding, df["Ingredients"]))
ingredient_embeddings = np.array(embedded_rows)

# Target (y) → the encoded Risk Level column
y = df["Risk Level"]

# Features (X) → the BERT embeddings computed above
X = ingredient_embeddings


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a 100-tree random forest on the training portion (fit() returns the estimator)
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out rows and report accuracy as a percentage
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 83.33%


In [None]:
import numpy as np

# Ensure required columns exist
required_columns = ["Allergens", "Ingredients", "Alternative Suggestions"]
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Extract the unique allergen names from the dataset.
# Split on bare commas and strip each token so "Gluten,Dairy" and
# "Gluten, Dairy" yield the same names.
allergen_tokens = (
    df["Allergens"]
    .dropna()
    .astype(str)
    .str.lower()
    .str.split(",")
    .explode()
    .str.strip()
)
# "unknown" is the placeholder written by the earlier fillna("Unknown") step,
# not a real allergen; drop it together with any empty tokens.
dataset_allergens = set(allergen_tokens) - {"", "unknown"}

# Clean dataset: Strip spaces and convert to lowercase
df["Ingredients"] = df["Ingredients"].astype(str).str.lower().str.strip()
df["Alternative Suggestions"] = df["Alternative Suggestions"].astype(str).str.strip()

# Map each row's (already normalized) ingredient string to its alternative suggestions.
# NOTE(review): keys are the full per-row ingredient lists (e.g. "almonds, water, sugar"),
# so a lookup by a single ingredient such as "almonds" will generally miss — confirm
# whether a per-ingredient mapping was intended.
alternative_mapping = dict(zip(df["Ingredients"], df["Alternative Suggestions"]))

# NOTE: get_bert_embedding is deliberately not redefined in this cell.
# A placeholder returning np.random.rand(768) ignores the text entirely and
# feeds random noise to the classifier, so the predicted risk level would not
# depend on the user's input. The BERT-based get_bert_embedding defined in the
# embedding cell earlier in this notebook is used, so user input is featurized
# the same way as the training data.

# Function to detect allergens from user input
def detect_allergens(text, allergen_list):
    """Return the allergen names that appear as items in a comma-separated string.

    Args:
        text: Comma-separated ingredient names. Case and whitespace around
            each item are ignored; empty items are skipped.
        allergen_list: Iterable of lowercase allergen names to match against.

    Returns:
        The matching names as an alphabetically sorted list, or ``["None"]``
        when nothing matches.
    """
    # Split on bare commas and strip, so "a,b", "a, b" and "a ,b" parse identically.
    ingredients = {part.strip() for part in text.lower().split(",")}
    ingredients.discard("")
    detected = ingredients.intersection(allergen_list)
    # Sort so the output order does not depend on set iteration order.
    return sorted(detected) if detected else ["None"]

# Look up a suggested alternative for each ingredient
def get_alternative_ingredients(ingredients, mapping):
    """Map every ingredient to its alternative, or a fallback message.

    Each ingredient is stripped and lowercased before being looked up in
    ``mapping``; the returned dict is keyed by the ingredient exactly as given.
    """
    suggestions = {}
    for raw_name in ingredients:
        lookup_key = raw_name.strip().lower()
        suggestions[raw_name] = mapping.get(lookup_key, "No alternative found")
    return suggestions

# Take ingredient input from user
user_input = input("Enter ingredients (comma-separated): ").lower().strip()

# Extract detected allergens
detected_allergens = detect_allergens(user_input, dataset_allergens)

# Convert text to an embedding; wrap it in a list so the classifier sees a single-row batch
user_food_vectorized = np.array([get_bert_embedding(user_input)])

# Predict risk level (clf and encoder come from the training cells) and
# convert the numeric class back to its label name
prediction = clf.predict(user_food_vectorized)
predicted_risk = encoder.inverse_transform(prediction)

# Extract alternative ingredient suggestions.
# Split on bare commas and trim each piece so "a,b" and "a ,  b" parse the same;
# empty pieces (e.g. from a trailing comma) are skipped.
user_ingredients = [part.strip() for part in user_input.split(",") if part.strip()]
alternative_ingredients = get_alternative_ingredients(user_ingredients, alternative_mapping)

# Print results
print(f"\n🔹 Predicted Risk Level: {predicted_risk[0]}")
print(f"🔹 Detected Allergens: {', '.join(detected_allergens)}")

# Print alternative ingredient suggestions
#print("\n🔹 Alternative Ingredient Suggestions:")
#for ingredient, alternative in alternative_ingredients.items():
    #print(f"{ingredient} → {alternative}")



Enter ingredients (comma-separated): peanuts

🔹 Predicted Risk Level: High
🔹 Detected Allergens: peanuts


In [None]:
print(df.head())  # Check first few rows
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()  # Check data types and missing values
print(df["Risk Level"].unique())  # Confirm encoding values
print(df["Alternative Suggestions"].head())  # Ensure alternatives are diverse


        Food Item                  Ingredients            Allergens  \
0     Almond Milk        almonds, water, sugar                 Nuts   
1       Apple Pie  apples, flour, butter, eggs  Gluten, Dairy, Eggs   
2   Peanut Butter         peanuts, salt, sugar              Peanuts   
3  Scrambled Eggs           eggs, butter, salt          Eggs, Dairy   
4    Cheeseburger   beef, cheese, bun, lettuce        Dairy, Gluten   

   Risk Level Alternative Suggestions  
0           0  Oat Milk, Coconut Milk  
1           2   Gluten-Free Apple Pie  
2           0   Sunflower Seed Butter  
3           0           Tofu Scramble  
4           0  Lettuce-Wrapped Burger  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Food Item                30 non-null     object
 1   Ingredients              30 non-null     object
 2   Allergens     

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train the model.
# A dedicated name is used for the classifier: `model` already refers to the BERT
# encoder that get_bert_embedding reads from module scope, and rebinding it to a
# random forest would break any later embedding call.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Check accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Detailed classification report (per-class precision, recall, f1 and support)
print(classification_report(y_test, y_pred))


Accuracy: 0.8333333333333334
              precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           2       1.00      0.50      0.67         2

    accuracy                           0.83         6
   macro avg       0.90      0.75      0.78         6
weighted avg       0.87      0.83      0.81         6

