In [13]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


# Read the recipe dataset from the working directory and preview the first rows.
DATA_PATH = "AI_Recipe_Health_Dataset_1200_rows.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)


Unnamed: 0,recipe_id,calories,protein_g,fat_g,carbs_g,fiber_g,sugar_g,sodium_mg,prep_time_min,ingredients_count,meal_type,diet_type,health_score,rating
0,1,754,8.46,52.17,33.16,3.49,5.12,1566,119,20,Breakfast,Vegan,55,1.1
1,2,195,14.68,35.87,8.05,4.97,32.49,1166,58,10,Snack,Vegan,36,4.2
2,3,106,46.01,12.02,53.6,6.95,10.77,1613,48,6,Breakfast,Non-Veg,13,2.4
3,4,452,37.02,56.69,88.92,13.41,48.66,825,15,20,Dinner,Vegan,80,4.5
4,5,470,35.49,49.62,10.27,5.7,14.47,213,114,10,Breakfast,Non-Veg,36,2.8


In [5]:
# Quick sanity check: (rows, columns) of the frame plus its column index.
(df.shape, df.columns)


((1200, 14),
 Index(['recipe_id', 'calories', 'protein_g', 'fat_g', 'carbs_g', 'fiber_g',
        'sugar_g', 'sodium_mg', 'prep_time_min', 'ingredients_count',
        'meal_type', 'diet_type', 'health_score', 'rating'],
       dtype='object'))

In [6]:
# Derive the binary target: a health_score of 50 or more counts as "Healthy",
# anything below as "Unhealthy". Adds a "label" column to df.
is_healthy = df["health_score"] >= 50
df["label"] = np.where(is_healthy, "Healthy", "Unhealthy")

# Class balance of the derived target.
df["label"].value_counts()


label
Healthy      629
Unhealthy    571
Name: count, dtype: int64

In [7]:
# Columns kept out of the feature matrix: the target itself, the score the
# target is thresholded from, and recipe_id when the column is present
# (presumably a row identifier -- confirm against the dataset description).
drop_cols = ["label", "health_score"] + (
    ["recipe_id"] if "recipe_id" in df.columns else []
)

y = df["label"]
X = df.drop(columns=drop_cols)

X.head(), y.head()


(   calories  protein_g  fat_g  carbs_g  fiber_g  sugar_g  sodium_mg  \
 0       754       8.46  52.17    33.16     3.49     5.12       1566   
 1       195      14.68  35.87     8.05     4.97    32.49       1166   
 2       106      46.01  12.02    53.60     6.95    10.77       1613   
 3       452      37.02  56.69    88.92    13.41    48.66        825   
 4       470      35.49  49.62    10.27     5.70    14.47        213   
 
    prep_time_min  ingredients_count  meal_type diet_type  rating  
 0            119                 20  Breakfast     Vegan     1.1  
 1             58                 10      Snack     Vegan     4.2  
 2             48                  6  Breakfast   Non-Veg     2.4  
 3             15                 20     Dinner     Vegan     4.5  
 4            114                 10  Breakfast   Non-Veg     2.8  ,
 0      Healthy
 1    Unhealthy
 2    Unhealthy
 3      Healthy
 4    Unhealthy
 Name: label, dtype: object)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Partition the feature columns by dtype: numeric ones get standardised,
# everything else is treated as categorical and one-hot encoded.
num_cols = list(X_train.select_dtypes(include=["number"]).columns)
cat_cols = list(X_train.select_dtypes(exclude=["number"]).columns)

numeric_step = ("num", StandardScaler(), num_cols)
# handle_unknown="ignore": categories not seen during fit encode as all zeros
# instead of raising at predict time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])
num_cols, cat_cols


(['calories',
  'protein_g',
  'fat_g',
  'carbs_g',
  'fiber_g',
  'sugar_g',
  'sodium_mg',
  'prep_time_min',
  'ingredients_count',
  'rating'],
 ['meal_type', 'diet_type'])

In [12]:

# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    "Gaussian Naive Bayes": GaussianNB(),
}

# Display name -> (fitted pipeline, accuracy on the held-out test split).
results = {}

for name, clf in models.items():
    # Every model gets its own unfitted copy of the shared preprocessor.
    # Pipeline fits the step objects it is given in place, so putting the same
    # `preprocess` instance into several pipelines would leave them all
    # pointing at one transformer that each later fit overwrites.
    #
    # The one-hot step is forced to dense output for every model: GaussianNB
    # cannot consume sparse matrices, and doing it uniformly removes the need
    # to special-case a model by its display name.
    prep = clone(preprocess).set_params(cat__sparse_output=False)
    pipe = Pipeline([("prep", prep), ("clf", clf)])

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = (pipe, acc)

    # Per-model report: headline accuracy, confusion matrix, per-class metrics.
    print("\n" + "="*60)
    print(name)
    print("Accuracy:", round(acc, 4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print("\nClassification Report:\n", classification_report(y_test, preds))



Logistic Regression
Accuracy: 0.5333
Confusion Matrix:
 [[80 46]
 [66 48]]

Classification Report:
               precision    recall  f1-score   support

     Healthy       0.55      0.63      0.59       126
   Unhealthy       0.51      0.42      0.46       114

    accuracy                           0.53       240
   macro avg       0.53      0.53      0.52       240
weighted avg       0.53      0.53      0.53       240


Random Forest
Accuracy: 0.5292
Confusion Matrix:
 [[80 46]
 [67 47]]

Classification Report:
               precision    recall  f1-score   support

     Healthy       0.54      0.63      0.59       126
   Unhealthy       0.51      0.41      0.45       114

    accuracy                           0.53       240
   macro avg       0.52      0.52      0.52       240
weighted avg       0.53      0.53      0.52       240


Gaussian Naive Bayes
Accuracy: 0.5292
Confusion Matrix:
 [[73 53]
 [60 54]]

Classification Report:
               precision    recall  f1-score   su

In [14]:

# Pick the entry with the highest stored accuracy; on a tie, max() keeps the
# first one in insertion order.
# NOTE(review): the stored accuracy is computed on X_test/y_test, so the winner
# is chosen on the same data used to report its score -- consider selecting via
# cross-validation on the training split instead.
best_name, (best_model, best_acc) = max(
    results.items(), key=lambda item: item[1][1]
)

print("Best model:", best_name)
print("Best accuracy:", round(best_acc, 4))

# Persist the whole fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(best_model, "best_food_healthiness_model.joblib")
print("Saved: best_food_healthiness_model.joblib")


Best model: Logistic Regression
Best accuracy: 0.5333
Saved: best_food_healthiness_model.joblib


In [15]:
# Smoke test: run the selected pipeline on the first held-out row.
# The double brackets keep `sample` a one-row DataFrame, which is what the
# pipeline's column-based preprocessing expects.
sample = X_test.iloc[[0]]
(prediction,) = best_model.predict(sample)

print("Sample prediction:", prediction)
print("Sample input:\n", sample)


Sample prediction: Healthy
Sample input:
       calories  protein_g  fat_g  carbs_g  fiber_g  sugar_g  sodium_mg  \
1001       365      42.46  19.25    61.34    14.47    17.44       1364   

      prep_time_min  ingredients_count meal_type diet_type  rating  
1001             71                 17     Lunch       Veg     2.5  
