In [8]:
import pandas as pd

# Paths to the uploaded CSV files
file_paths = [
    'FOOD-DATA-GROUP1.csv',
    'FOOD-DATA-GROUP2.csv',
    'FOOD-DATA-GROUP3.csv',
    'FOOD-DATA-GROUP4.csv',
    'FOOD-DATA-GROUP5.csv'
]

# Combine all CSV files into a single DataFrame
combined_df = pd.concat((pd.read_csv(file) for file in file_paths), ignore_index=True)

# Save the combined DataFrame into a new CSV file
combined_csv_path = 'COMBINED-FOOD-DATA.csv'
combined_df.to_csv(combined_csv_path, index=False)

combined_csv_path, combined_df.head()


('COMBINED-FOOD-DATA.csv',
    Unnamed: 0.1  Unnamed: 0                              food  Caloric Value  \
 0             0           0                      cream cheese             51   
 1             1           1                 neufchatel cheese            215   
 2             2           2  requeijao cremoso light catupiry             49   
 3             3           3                    ricotta cheese             30   
 4             4           4              cream cheese low fat             30   
 
     Fat  Saturated Fats  Monounsaturated Fats  Polyunsaturated Fats  \
 0   5.0             2.9                   1.3                 0.200   
 1  19.4            10.9                   4.9                 0.800   
 2   3.6             2.3                   0.9                 0.000   
 3   2.0             1.3                   0.5                 0.002   
 4   2.3             1.4                   0.6                 0.042   
 
    Carbohydrates  Sugars  ...  Calcium  Copper   I

In [14]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score

# Step 1: Load and View the Data
file_path = 'COMBINED-FOOD-DATA.csv'
df = pd.read_csv(file_path)

# Clean the data (remove unnecessary columns)
df_cleaned = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# Assume a food item is healthy if its "Nutrition Density" is higher than the median
median_nutrition_density = df_cleaned['Nutrition Density'].median()

# Assign label: 1 (Healthy) if Nutrition Density > median, else 0 (Unhealthy)
df_cleaned['label'] = df_cleaned['Nutrition Density'].apply(lambda x: 1 if x > median_nutrition_density else 0)

# Display the cleaned dataset
print("Cleaned Dataset Preview:")
print(df_cleaned.head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df_cleaned[['food', 'label']])

# Step 2: Tokenizer and Model Setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Check for MPS device (e.g., Apple silicon), otherwise default to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
print(f"Device in use: {device}")

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples['food'], padding="max_length", truncation=True, max_length=128)

# Apply tokenizer
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset into train and test sets
dataset_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']

# Step 3: Define Metrics (Accuracy)
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)  # predicted labels
    labels = p.label_ids  # true labels
    acc = accuracy_score(labels, preds)  # accuracy
    return {"accuracy": acc}

# Step 4: Training the Model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=9,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # Updated parameter name
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # tokenizer parameter removed
)


# Train the model
trainer.train()

# Save the trained model
trainer.save_model('./final_model')

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Step 5: Dynamic User Interaction
healthy_foods = set()
unhealthy_foods = set()

# Prediction function
def predict_health_status(food_item):
    inputs = tokenizer(food_item, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1)
    return "Healthy" if prediction.item() == 1 else "Unhealthy"

# User interaction function
def user_interaction():
    while True:
        user_input = input("\nEnter a food item to check its health status (or type 'exit' to quit, 'list' for list of foods): ")

        if user_input.lower() == 'exit':
            print("Exiting the program. Goodbye!")
            break
        elif user_input.lower() == 'list':
            print("\nHealthy Foods List:")
            for food in healthy_foods:
                print(f"- {food}")
            print("\nUnhealthy Foods List:")
            for food in unhealthy_foods:
                print(f"- {food}")
        else:
            health_status = predict_health_status(user_input)
            print(f"The health status of '{user_input}' is: {health_status}")

            if health_status == "Healthy":
                healthy_foods.add(user_input)
            else:
                unhealthy_foods.add(user_input)

# Step 6: Start user interaction
user_interaction()


Cleaned Dataset Preview:
                               food  Caloric Value   Fat  Saturated Fats  \
0                      cream cheese             51   5.0             2.9   
1                 neufchatel cheese            215  19.4            10.9   
2  requeijao cremoso light catupiry             49   3.6             2.3   
3                    ricotta cheese             30   2.0             1.3   
4              cream cheese low fat             30   2.3             1.4   

   Monounsaturated Fats  Polyunsaturated Fats  Carbohydrates  Sugars  Protein  \
0                   1.3                 0.200            0.8   0.500      0.9   
1                   4.9                 0.800            3.1   2.700      7.8   
2                   0.9                 0.000            0.9   3.400      0.8   
3                   0.5                 0.002            1.5   0.091      1.5   
4                   0.6                 0.042            1.2   0.900      1.2   

   Dietary Fiber  ...  Copper  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device in use: mps


Map:   0%|          | 0/2395 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6736,0.685012,0.607516
2,0.7005,0.620454,0.670146
3,0.6637,0.746709,0.676409
4,0.5063,0.703973,0.676409
5,0.3469,1.213111,0.659708
6,0.3662,1.479922,0.661795
7,0.3779,1.628153,0.651357
8,0.2507,1.769095,0.672234
9,0.1166,1.875738,0.682672


Evaluation Results: {'eval_loss': 1.8757377862930298, 'eval_accuracy': 0.6826722338204593, 'eval_runtime': 3.4558, 'eval_samples_per_second': 138.607, 'eval_steps_per_second': 17.362, 'epoch': 9.0}



Enter a food item to check its health status (or type 'exit' to quit, 'list' for list of foods):  quit


The health status of 'quit' is: Healthy



Enter a food item to check its health status (or type 'exit' to quit, 'list' for list of foods):  quit


The health status of 'quit' is: Healthy



Enter a food item to check its health status (or type 'exit' to quit, 'list' for list of foods):  exit


Exiting the program. Goodbye!


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score

# Step 1: Load and View the Data
file_path = 'COMBINED-FOOD-DATA.csv'
df = pd.read_csv(file_path)

# Clean the data (remove unnecessary columns)
df_cleaned = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1']).dropna()

# Assume a food item is healthy if its "Nutrition Density" is higher than the median
median_nutrition_density = df_cleaned['Nutrition Density'].median()

# Assign label: 1 (Healthy) if Nutrition Density > median, else 0 (Unhealthy)
df_cleaned['label'] = df_cleaned['Nutrition Density'].apply(lambda x: 1 if x > median_nutrition_density else 0)

# Display the cleaned dataset
print("Cleaned Dataset Preview:")
print(df_cleaned.head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df_cleaned[['food', 'label']])

# Step 2: Tokenizer and Model Setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Check for MPS device (e.g., Apple silicon), otherwise default to CPU
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
model.to(device)
print(f"Device in use: {device}")

# Tokenize function
def tokenize_function(examples):
    return tokenizer(examples['food'], padding="max_length", truncation=True, max_length=128)

# Apply tokenizer
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset into train and test sets
dataset_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']

# Step 3: Define Metrics (Accuracy)
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Step 4: Training the Model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model('./final_model')

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Step 5: Dynamic User Interaction
healthy_foods = set()
unhealthy_foods = set()

# Prediction function
def predict_health_status(food_item):
    inputs = tokenizer(food_item, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=-1)
    return "Healthy" if prediction.item() == 1 else "Unhealthy"

# Enhanced user interaction function
def user_interaction():
    while True:
        user_input = input("\nEnter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists): ").strip()

        if not user_input:
            print("Please enter a valid food item.")
            continue
        elif user_input.lower() == 'exit':
            print("Exiting the program. Goodbye!")
            break
        elif user_input.lower() == 'list':
            if healthy_foods:
                print("\nHealthy Foods List:")
                for food in healthy_foods:
                    print(f"- {food}")
            else:
                print("\nNo healthy foods yet.")

            if unhealthy_foods:
                print("\nUnhealthy Foods List:")
                for food in unhealthy_foods:
                    print(f"- {food}")
            else:
                print("\nNo unhealthy foods yet.")
        elif user_input.lower() == 'clear':
            healthy_foods.clear()
            unhealthy_foods.clear()
            print("\nFood lists have been cleared.")
        else:
            health_status = predict_health_status(user_input)
            print(f"The health status of '{user_input}' is: {health_status}")

            if health_status == "Healthy":
                healthy_foods.add(user_input)
            else:
                unhealthy_foods.add(user_input)

# Step 6: Start user interaction
user_interaction()


Cleaned Dataset Preview:
                               food  Caloric Value   Fat  Saturated Fats  \
0                      cream cheese             51   5.0             2.9   
1                 neufchatel cheese            215  19.4            10.9   
2  requeijao cremoso light catupiry             49   3.6             2.3   
3                    ricotta cheese             30   2.0             1.3   
4              cream cheese low fat             30   2.3             1.4   

   Monounsaturated Fats  Polyunsaturated Fats  Carbohydrates  Sugars  Protein  \
0                   1.3                 0.200            0.8   0.500      0.9   
1                   4.9                 0.800            3.1   2.700      7.8   
2                   0.9                 0.000            0.9   3.400      0.8   
3                   0.5                 0.002            1.5   0.091      1.5   
4                   0.6                 0.042            1.2   0.900      1.2   

   Dietary Fiber  ...  Copper  

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Device in use: mps


Map:   0%|          | 0/2395 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6467,0.662393,0.626305
2,0.6232,0.630119,0.659708
3,0.5062,0.660168,0.659708
4,0.4913,0.711999,0.672234
5,0.3783,0.739826,0.676409


Evaluation Results: {'eval_loss': 0.7398261427879333, 'eval_accuracy': 0.6764091858037579, 'eval_runtime': 924.4953, 'eval_samples_per_second': 0.518, 'eval_steps_per_second': 0.032, 'epoch': 5.0}



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  chicken


The health status of 'chicken' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  cheese


The health status of 'cheese' is: Healthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  water


The health status of 'water' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  juice


The health status of 'juice' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  soda


The health status of 'soda' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  mango


The health status of 'mango' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  meat


The health status of 'meat' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  fish


The health status of 'fish' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  crab


The health status of 'crab' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  sea food


The health status of 'sea food' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  foood


The health status of 'foood' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  food


The health status of 'food' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  drink


The health status of 'drink' is: Unhealthy



Enter a food item (type 'exit' to quit, 'list' for list, 'clear' to clear lists):  pizza


The health status of 'pizza' is: Healthy
