In [None]:
# -------------------------------
# Part 1: Download, Clean, and Preprocess the Dataset
# -------------------------------
import kagglehub
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

# 📌 Step 1: Download Dataset from Kaggle
# The returned `path` is used below as a directory that is listed for TSV files.
path = kagglehub.dataset_download("openfoodfacts/world-food-facts")
print("Dataset downloaded successfully!")
print("Path to dataset files:", path)

# Locate the TSV file(s). os.listdir() yields names in arbitrary order, so the
# names are sorted to make the "first TSV" choice below deterministic.
tsv_files = sorted(f for f in os.listdir(path) if f.endswith(".tsv"))
if not tsv_files:
    raise FileNotFoundError("No TSV files found in the dataset folder.")

# Load the TSV file (using the first TSV in sorted order)
file_path = os.path.join(path, tsv_files[0])
print(f"Found dataset file: {file_path}")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, avoiding mixed-type columns.
df = pd.read_csv(file_path, delimiter="\t", low_memory=False)

# 📌 Step 2: Extract & Clean Data
# Columns kept for the rest of the pipeline: two text fields, the grade used
# for labeling, and four per-100g numeric fields.
columns_needed = [
    "product_name", "ingredients_text", "nutrition_grade_fr",
    "energy_100g", "fat_100g", "sugars_100g", "salt_100g"
]
# .copy() detaches the subset from `df` so the in-place column writes below
# do not touch the full frame.
df_filtered = df[columns_needed].copy()

# Drop rows that are missing a nutrition grade (needed for labeling)
df_filtered = df_filtered.dropna(subset=["nutrition_grade_fr"])

# For text columns, fill missing values with an empty string.
text_cols = ["product_name", "ingredients_text"]
df_filtered[text_cols] = df_filtered[text_cols].fillna("")

# For numeric columns, fill missing values with 0.
# NOTE(review): 0 is indistinguishable from a genuinely measured zero; these
# columns are not consumed by the text classifier in this notebook.
numeric_cols = ["energy_100g", "fat_100g", "sugars_100g", "salt_100g"]
df_filtered[numeric_cols] = df_filtered[numeric_cols].fillna(0)

# Function to classify food based on nutrition grade
def classify_health(nutrition_grade):
    """Translate a nutrition grade letter into a coarse health label.

    The grade is lower-cased and stripped of surrounding whitespace before
    the lookup, so "A", " a " and "a" are all treated alike.

    Args:
        nutrition_grade: Grade string; must support ``.lower()``.

    Returns:
        "healthy" for grades a and b, "moderately_healthy" for grade c,
        and "unhealthy" for any other value.
    """
    labels_by_grade = {
        "a": "healthy",
        "b": "healthy",
        "c": "moderately_healthy",
    }
    normalized = nutrition_grade.lower().strip()
    # Every grade outside a/b/c falls through to the default label.
    return labels_by_grade.get(normalized, "unhealthy")

# Derive the three-way target label from each row's nutrition grade.
df_filtered["health_label"] = df_filtered["nutrition_grade_fr"].map(classify_health)

# Function to clean the ingredient text
def clean_text(text):
    """Normalize free-form ingredient text for tokenization.

    Steps, in order:
      1. Missing values (``None`` or float NaN) become an empty string rather
         than the literal words "none" / "nan".
      2. The text is lower-cased.
      3. Digit runs are removed.
      4. Every run of punctuation, underscores and whitespace is replaced by
         a single space, so tokens separated only by punctuation
         (e.g. "sugar,salt") stay separate words.
      5. Leading and trailing whitespace is stripped.

    Args:
        text: Value to clean; non-string values are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()                  # Convert to lowercase
    text = re.sub(r"\d+", "", text)           # Remove numbers
    # \W covers punctuation and whitespace; "_" is listed explicitly because
    # the regex engine counts it as a word character.
    text = re.sub(r"[\W_]+", " ", text)
    return text.strip()

# Run every ingredient list through clean_text and keep the result in its own column.
df_filtered["clean_ingredients"] = df_filtered["ingredients_text"].map(clean_text)

# (Optional) Discard rows whose cleaned ingredient text ended up blank.
df_filtered = df_filtered.loc[df_filtered["clean_ingredients"].str.strip().ne("")]

# Stratified 80/20 split (fixed seed) so both parts keep the same label mix,
# then persist each part as CSV for the training cell.
train_df, test_df = train_test_split(
    df_filtered,
    test_size=0.2,
    random_state=42,
    stratify=df_filtered["health_label"],
)
train_df.to_csv("distilbert_train_data.csv", index=False)
test_df.to_csv("distilbert_test_data.csv", index=False)
print("Data preprocessing complete. Training and testing datasets are saved.")

In [None]:
# -------------------------------
# Part 2: Fine-Tune DistilBERT on the Preprocessed Data (GPU-Optimized)
# -------------------------------
import torch
import numpy as np
from datasets import Dataset
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Pick the compute device: CUDA when torch reports one, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Step 1: Read back the train/test CSVs written by the preprocessing cell.
train_path = "distilbert_train_data.csv"
test_path = "distilbert_test_data.csv"
df_train, df_test = pd.read_csv(train_path), pd.read_csv(test_path)

# Encode the string health labels as integer class ids in a new "label" column.
label_mapping = {"healthy": 0, "moderately_healthy": 1, "unhealthy": 2}
df_train = df_train.assign(label=df_train["health_label"].map(label_mapping))
df_test = df_test.assign(label=df_test["health_label"].map(label_mapping))

# Step 2: Load the tokenizer that matches the base checkpoint fine-tuned below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize the ``clean_ingredients`` field of a batch of examples.

    Uses the module-level ``tokenizer``. Each text is truncated and padded
    to exactly 128 tokens.

    Args:
        examples: Mapping with a "clean_ingredients" entry holding the text(s).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["clean_ingredients"], **encode_options)

# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Apply tokenization (batched processing). tokenize_function already takes a
# batch, so it is passed directly instead of through a lambda wrapper.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Step 3: Define Model & Training Parameters (with GPU enhancements)
# num_labels=3 matches the three classes in label_mapping.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
model.to(device)  # Ensure the model is on the correct device

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Half-precision training needs a CUDA device. Enable it only when one is
    # present so the CPU fallback chosen above does not fail at construction.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,          # Use multiple workers for faster data loading
    report_to="none"                   # Disable logging to WandB
)

# Step 4: Define Metrics for Evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. The predicted class is taken
            as the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # The fourth element (support) is not reported.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="weighted"
    )
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Step 5: Fine-Tune the Model
# The Trainer is given the test split as eval_dataset, so the per-epoch
# validation metrics are computed on that split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

# Step 6: Evaluate & Save the Model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Tokenizer and model weights go into the same directory so the pair can be
# loaded back together from that one path.
tokenizer.save_pretrained("distilbert_food_classifier")
model.save_pretrained("distilbert_food_classifier")
print("Model training complete and saved!")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/184036 [00:00<?, ? examples/s]

Map:   0%|          | 0/46010 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4948,0.486645,0.797327,0.791968,0.797327,0.793348
2,0.436,0.449095,0.817474,0.81221,0.817474,0.813797
3,0.3651,0.452806,0.82469,0.82041,0.82469,0.822091




Evaluation Results: {'eval_loss': 0.45280560851097107, 'eval_accuracy': 0.8246902847207129, 'eval_precision': 0.8204097847196095, 'eval_recall': 0.8246902847207129, 'eval_f1': 0.8220914460798221, 'eval_runtime': 46.9683, 'eval_samples_per_second': 979.597, 'eval_steps_per_second': 61.233, 'epoch': 3.0}
Model training complete and saved!


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# 1. Load your fine-tuned model and tokenizer
model_path = "distilbert_food_classifier"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.eval()  # put model in evaluation mode

# 2. Create your raw text input
# (Below is your extracted text; you can put it all in one string)
# NOTE(review): the classifier was fine-tuned on ingredient lists
# (`clean_ingredients`), so an ingredient list is the input it has seen
# during training.
input_text = """Nutrition Facts
1 serving per container
Serving size 1 cup (40g)

Amount per serving
Calories 100
% Daily Value*
Total Fat 1g (1%)
Saturated Fat 0g (0%)
Trans Fat 0g
Cholesterol 0mg (0%)
Sodium 50mg (2%)
Total Carbohydrate 20g (7%)
Dietary Fiber 5g (20%)
Total Sugars 2g
Includes 0g Added Sugars (0%)
Protein 5g

Vitamin D 5mcg (25%)
Calcium 100mg (8%)
Iron 5mg (30%)
Potassium 300mg (6%)

This product is made with whole grains, low sugar, and no artificial ingredients.
"""

# 3. Clean and tokenize the text
# Training texts were passed through clean_text() (defined in Part 1) before
# tokenization. Apply the same normalization here so the model sees input in
# the format it was trained on. This requires the Part 1 cell to have been
# run in the current kernel.
cleaned_text = clean_text(input_text)
inputs = tokenizer(
    cleaned_text,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="pt"
)

# 4. Get predictions from the model
# no_grad() skips gradient tracking, which inference does not need.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()

# 5. Map the predicted class ID back to your labels
# This is the inverse of the label -> id mapping used when training in Part 2.
label_mapping = {0: "healthy", 1: "moderately_healthy", 2: "unhealthy"}
predicted_label = label_mapping[predicted_class_id]

print("Predicted Label:", predicted_label)

Predicted Label: unhealthy
