In [1]:
#MiniProject Code - Naive Bayes

import pandas as pd
import os
import glob

#Setup of dataset path
base_folder = os.path.join("/Users/jpsalvador/Downloads/FINAL FOOD DATASET")
csv_pattern = os.path.join(base_folder, "FOOD-DATA-GROUP*.csv")

#Features to be used, same with what was written for AdaBoost

feature_cols = [
    "Protein", "Fat", "Carbohydrates", "Caloric Value",
    "Sugars", "Dietary Fiber", "Saturated Fats",
    "Monounsaturated Fats", "Polyunsaturated Fats",
    "Sodium", "Potassium"
]

#Loading of dataset
csv_files = glob.glob(csv_pattern)
print("CSV files found:", csv_files)

#Loading of files and combining 
dataframes = []
for file in csv_files:
    df_temp = pd.read_csv(file)
    dataframes.append(df_temp)

df = pd.concat(dataframes, ignore_index=True)

print("\n\nTotal rows before cleaning:", len(df))

#Cleaning by dropping rows with missing values
df = df.dropna(subset=feature_cols)
print("Total rows after cleaning:", len(df))
print(df.head())

CSV files found: ['/Users/jpsalvador/Downloads/FINAL FOOD DATASET/FOOD-DATA-GROUP1.csv', '/Users/jpsalvador/Downloads/FINAL FOOD DATASET/FOOD-DATA-GROUP3.csv', '/Users/jpsalvador/Downloads/FINAL FOOD DATASET/FOOD-DATA-GROUP2.csv', '/Users/jpsalvador/Downloads/FINAL FOOD DATASET/FOOD-DATA-GROUP5.csv', '/Users/jpsalvador/Downloads/FINAL FOOD DATASET/FOOD-DATA-GROUP4.csv']


Total rows before cleaning: 2395
Total rows after cleaning: 2395
   Unnamed: 0.1  Unnamed: 0                              food  Caloric Value  \
0             0           0                      cream cheese             51   
1             1           1                 neufchatel cheese            215   
2             2           2  requeijao cremoso light catupiry             49   
3             3           3                    ricotta cheese             30   
4             4           4              cream cheese low fat             30   

    Fat  Saturated Fats  Monounsaturated Fats  Polyunsaturated Fats  \
0   5.0 

In [2]:
#Assignment of Labels

def assign_label(row):
    """
    Assignment of labels using the following criteria:
    Muscle Gain if Protein >=15 OR (Protein >=10 AND Caloric Value >= 200)
    Endurance if Carbohydrates >= 30 OR Sugars >= 12 OR (Carbohydrates >=20 AND Sodium >= 140)
    Weight Loss if Caloric Value < 150 AND Fat < 5 AND (Carbohydrates < 20 OR Dietary Fiber >= 3)
    
    Assignment of labels will be done in that order of priority.
    """

    #MUSCLE GAIN
    muscle_gain = (
        (row["Protein"] >= 15) or
        (row["Protein"] >= 10 and row["Caloric Value"] >= 200)
    )

    #ENDURANCE
    endurance = (
        (row["Carbohydrates"] >= 30) or
        (row["Sugars"] >= 12) or
        (row["Carbohydrates"] >= 20 and row["Sodium"] >= 140)
    )

    #WEIGHT LOSS
    weight_loss = (
        (row["Caloric Value"] < 150) and
        (row["Fat"] < 5) and
        ((row["Carbohydrates"] < 20) or (row["Dietary Fiber"] >=3))
    )

    #ASSIGNMENT BASED ON PRIORITY
    if muscle_gain:
        return "Muscle Gain"
    if endurance:
        return "Endurance"
    if weight_loss:
        return "Weight Loss"
    return "No Category"

#Tagging of labels
df["Label"] = df.apply(assign_label, axis=1)
label_counts = df["Label"].value_counts()
print(label_counts)


Label
Weight Loss    1016
Muscle Gain     649
Endurance       386
No Category     344
Name: count, dtype: int64


In [19]:
#Naive Bayes

def run_naive_bayes(df, feature_cols):
    """
    This is the function to run the Gaussian Naive Bayes
    """
    
    #Dropping the No Category rows
    df_model = df[df["Label"] != "No Category"].copy()
    # df_model = df.copy() Tried to include "No Category" but got lower results
    X = df_model[feature_cols].values
    y = df_model["Label"].values
    
    #Cross-Validation
    from sklearn.naive_bayes import GaussianNB
    from sklearn.model_selection import cross_val_score
    
    #Setting of Gaussian Naive Bayes as the NB Algorithm to be used
    nb_model = GaussianNB()
    

    
    #METRICS
    cv_recall = cross_val_score(nb_model, X, y, cv = 5, scoring = 'recall_macro')
    cv_precision = cross_val_score(nb_model, X, y, cv = 5, scoring = 'precision_macro')
    cv_f1 = cross_val_score(nb_model, X, y, cv = 5, scoring = 'f1_macro')
    cv_accuracy = cross_val_score(nb_model, X, y, cv=5, scoring = 'accuracy')
    
    print("\n\n===== GAUSSIAN NAIVE BAYES RESULTS =====")
    #For checking only
    print("Shape of X:", X.shape)
    print("Example row from X:", X[0])
    print("Length of y:", len(y))
    print("Unique Labels:", set(y))
    print("\n")
    print(df_model["Label"].value_counts())
    print("\nMETRICS")
    print("Accuracy scores:", cv_accuracy)
    print("Average Accuracy:", cv_accuracy.mean())
    print("\nRecall scores:", cv_recall)
    print("Average Recall:", cv_recall.mean())
    print("\nPrecision scores:", cv_precision)
    print("Average Precision:", cv_precision.mean())
    print("\nF1 scores:", cv_f1)
    print("Average F1:", cv_f1.mean())

    return {
        "accuracy": cv_accuracy.mean(),
        "precision": cv_precision.mean(),
        "recall": cv_recall.mean(),
        "f1": cv_f1.mean
    }

In [20]:
#SVM

def run_svm_rbf(df, feature_cols):
    """
    This is runs the SVM (RBF Kernel) algorithm.
    """

    #Same setup of removing the 'No Category' wors
    df_model = df[df["Label"] != "No Category"].copy()

    X = df_model[feature_cols].values
    y = df_model["Label"].values
    
    #Importing of libraries for SVM
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler #needed since the features aren't of the same scale
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import cross_val_score

    #SVM Pipeline -- scaling and classifier
    svm_model = Pipeline([
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", random_state=42))
    ])

    #METRICS
    cv_recall = cross_val_score(svm_model, X, y, cv = 5, scoring = 'recall_macro')
    cv_precision = cross_val_score(svm_model, X, y, cv = 5, scoring = 'precision_macro')
    cv_f1 = cross_val_score(svm_model, X, y, cv = 5, scoring = 'f1_macro')
    cv_accuracy = cross_val_score(svm_model, X, y, cv=5, scoring = 'accuracy')

    print("\n\n===== SVM (RBF) RESULTS =====")
     #For checking only
    print("Shape of X:", X.shape)
    print("Example row from X:", X[0])
    print("Length of y:", len(y))
    print("Unique Labels:", set(y))
    print("\n")
    print(df_model["Label"].value_counts())
    print("\nMETRICS")
    print("Accuracy scores:", cv_accuracy)
    print("Average Accuracy:", cv_accuracy.mean())
    print("\nRecall scores:", cv_recall)
    print("Average Recall:", cv_recall.mean())
    print("\nPrecision scores:", cv_precision)
    print("Average Precision:", cv_precision.mean())
    print("\nF1 scores:", cv_f1)
    print("Average F1:", cv_f1.mean())

    return {
        "accuracy": cv_accuracy.mean(),
        "precision": cv_precision.mean(),
        "recall": cv_recall.mean(),
        "f1": cv_f1.mean
    }

In [21]:
#CALLING OF THE FUNCTIONS

nb_results = run_naive_bayes(df,feature_cols)
svm_results = run_svm_rbf(df, feature_cols)



===== GAUSSIAN NAIVE BAYES RESULTS =====
Shape of X: (2051, 11)
Example row from X: [ 0.8  3.6  0.9 49.   3.4  0.1  2.3  0.9  0.   0.   0. ]
Length of y: 2051
Unique Labels: {'Endurance', 'Weight Loss', 'Muscle Gain'}


Label
Weight Loss    1016
Muscle Gain     649
Endurance       386
Name: count, dtype: int64

METRICS
Accuracy scores: [0.87347932 0.89756098 0.90243902 0.93170732 0.85365854]
Average Accuracy: 0.891769034478666

Recall scores: [0.86425632 0.87421199 0.89791128 0.91885471 0.85292524]
Average Recall: 0.8816319079864531

Precision scores: [0.85371738 0.86673314 0.87667136 0.91885471 0.83497036]
Average Precision: 0.8701893903106768

F1 scores: [0.84385223 0.87018346 0.88229028 0.91885471 0.82622815]
Average F1: 0.8682817646374117


===== SVM (RBF) RESULTS =====
Shape of X: (2051, 11)
Example row from X: [ 0.8  3.6  0.9 49.   3.4  0.1  2.3  0.9  0.   0.   0. ]
Length of y: 2051
Unique Labels: {'Endurance', 'Weight Loss', 'Muscle Gain'}


Label
Weight Loss    1016
Muscle G