## Notebook for preprocessing and training


In [22]:
# Import essential libraries for data manipulation and visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
# Import libraries for data preprocessing and modeling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy import stats
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE


## Load and Preview the Data

In [23]:


def load_and_clean_data(file_path, drop_method="drop_critical", critical_columns=None):
    """
    Loads data from a CSV, removes missing values based on the specified method, and displays a preview.
    
    Parameters:
        file_path (str): The path to the CSV file.
        drop_method (str): Method for handling missing values.
                           "drop_all" - Drop rows with any missing value.
                           "drop_critical" - Drop rows with missing values in critical columns.
        critical_columns (list): List of critical columns to check when using the "drop_critical" method.
    
    Returns:
        DataFrame: The cleaned DataFrame.
    """
    # Load the data
    print("Loading data...")
    df = pd.read_csv(file_path)
    print("Data loaded successfully.\n")
    
    # Show initial data preview
    print("### Initial Data Preview")
    display(df.head())
    
    # Drop rows with missing values based on the chosen method
    if drop_method == "drop_all":
        df_cleaned = df.dropna()
    elif drop_method == "drop_critical" and critical_columns is not None:
        df_cleaned = df.dropna(subset=critical_columns)
    else:
        raise ValueError("Invalid drop method or missing critical columns for 'drop_critical' method")
    
    # Show info and missing values summary after cleaning
    print("\n### Dataset Info After Cleaning")
    print(df_cleaned.info())
    
    print("\n### Missing Values in Each Column After Cleaning")
    print(df_cleaned.isnull().sum())
    
    return df_cleaned

# Usage example
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/f_merged_cleaned.csv"
critical_columns = [
    'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 
    'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 
    'proteins_100g', 'salt_100g'
]
df_cleaned = load_and_clean_data(file_path, drop_method="drop_critical", critical_columns=critical_columns)


Loading data...
Data loaded successfully.

### Initial Data Preview


Unnamed: 0,categories,pnns_groups_1,pnns_groups_2,food_groups,nutriscore_grade,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,nutrition-score-fr_100g
0,Proteinpulver,unknown,unknown,unknown,d,406.0,6.6,4.3,6.7,6.7,,80.0,0.5,0.013317,12.0
1,Dark chocolate bar,sugary snacks,chocolate products,chocolate-products,e,578.0,40.9,21.5,40.7,35.0,8.4,7.5,0.0,12.0,19.0
2,Gemüse,fruits and vegetables,vegetables,vegetables,a,208.0,0.0,0.0,8.0,8.0,88.0,0.0,0.23,50.0,-2.0
3,Beverages and beverages preparations,beverages,artificially sweetened beverages,artificially-sweetened-beverages,e,85.0,2.4,1.1,9.6,2.2,1.3,5.7,1.181102,0.0,17.0
4,Plant-based foods and beverages,beverages,artificially sweetened beverages,artificially-sweetened-beverages,d,536.0715,35.714284,8.928572,46.42857,28.57143,10.714286,10.714286,0.089286,,15.0



### Dataset Info After Cleaning
<class 'pandas.core.frame.DataFrame'>
Index: 107256 entries, 1 to 140638
Data columns (total 15 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   categories                                             107254 non-null  object 
 1   pnns_groups_1                                          107256 non-null  object 
 2   pnns_groups_2                                          107256 non-null  object 
 3   food_groups                                            107256 non-null  object 
 4   nutriscore_grade                                       107256 non-null  object 
 5   energy-kcal_100g                                       107256 non-null  float64
 6   fat_100g                                               107256 non-null  float64
 7   saturated-fat_100g                                     107256 non-null  float64
 8   carboh

In [24]:
def finalize_clean_data(df, critical_column=None, fill_method="median", output_path="final_cleaned_data.csv"):
    """
    Finalizes cleaning by filling missing values in 'categories' with 'unknown',
    addressing missing values in a specified critical column, and saving the DataFrame.
    
    Parameters:
        df (DataFrame): The DataFrame with partially cleaned data.
        critical_column (str): Column to address remaining missing values.
        fill_method (str): Method to handle missing values in critical_column.
                           Options: "median" (default), "mean", "fill_zero", "drop".
        output_path (str): Path where the final cleaned data will be saved.
    
    Returns:
        DataFrame: Final cleaned DataFrame.
    """
    # Fill missing values in 'categories' column with 'unknown'
    df['categories'].fillna('unknown', inplace=True)
    
    # Handle missing values in the specified critical column
    if critical_column and fill_method:
        if fill_method == "median":
            df[critical_column].fillna(df[critical_column].median(), inplace=True)
        elif fill_method == "mean":
            df[critical_column].fillna(df[critical_column].mean(), inplace=True)
        elif fill_method == "fill_zero":
            df[critical_column].fillna(0, inplace=True)
        elif fill_method == "drop":
            df.dropna(subset=[critical_column], inplace=True)
        else:
            raise ValueError("Invalid fill method specified.")
    
    # Save final cleaned data
    df.to_csv(output_path, index=False)
    print(f"Final cleaned data successfully saved to {output_path}")
    
    return df

# Apply to the cleaned data
df_final = finalize_clean_data(df_cleaned, 
                               critical_column="fruits-vegetables-nuts-estimate-from-ingredients_100g", 
                               fill_method="median",
                               output_path="C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv")


Final cleaned data successfully saved to C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv


In [25]:
import pandas as pd
from IPython.display import display

def load_and_preview_final_data(file_path):
    """
    Loads the final cleaned CSV and previews the data with basic statistics.
    
    Parameters:
        file_path (str): Path to the final cleaned CSV file.
        
    Returns:
        DataFrame: Loaded DataFrame for further use.
    """
    # Load the final cleaned CSV
    df = pd.read_csv(file_path)
    print("Data loaded successfully from:", file_path)
    
    # Display the first few rows
    print("\n### Initial Data Preview")
    display(df.head())
    
    # Show data info for types and non-null counts
    print("\n### Dataset Info")
    display(df.info())
    
    # Check for any remaining missing values in each column
    print("\n### Remaining Missing Values in Each Column")
    display(df.isnull().sum())
    
    # Descriptive statistics for numerical columns
    print("\n### Descriptive Statistics (Numerical Features)")
    display(df.describe())
    
    return df

# Load and preview the final cleaned data
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv"
df_final_cleaned = load_and_preview_final_data(file_path)


Data loaded successfully from: C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv

### Initial Data Preview


Unnamed: 0,categories,pnns_groups_1,pnns_groups_2,food_groups,nutriscore_grade,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,nutrition-score-fr_100g
0,Dark chocolate bar,sugary snacks,chocolate products,chocolate-products,e,578.0,40.9,21.5,40.7,35.0,8.4,7.5,0.0,12.0,19.0
1,Gemüse,fruits and vegetables,vegetables,vegetables,a,208.0,0.0,0.0,8.0,8.0,88.0,0.0,0.23,50.0,-2.0
2,Beverages and beverages preparations,beverages,artificially sweetened beverages,artificially-sweetened-beverages,e,85.0,2.4,1.1,9.6,2.2,1.3,5.7,1.181102,0.0,17.0
3,Plant-based foods and beverages,beverages,artificially sweetened beverages,artificially-sweetened-beverages,d,536.0715,35.714284,8.928572,46.42857,28.57143,10.714286,10.714286,0.089286,2.905273,15.0
4,Beverages,beverages,artificially sweetened beverages,artificially-sweetened-beverages,d,442.0,22.0,2.6,54.0,25.0,1.4,6.4,0.53,22.666667,13.0



### Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107256 entries, 0 to 107255
Data columns (total 15 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   categories                                             107256 non-null  object 
 1   pnns_groups_1                                          107256 non-null  object 
 2   pnns_groups_2                                          107256 non-null  object 
 3   food_groups                                            107256 non-null  object 
 4   nutriscore_grade                                       107256 non-null  object 
 5   energy-kcal_100g                                       107256 non-null  float64
 6   fat_100g                                               107256 non-null  float64
 7   saturated-fat_100g                                     107256 non-null  float64
 8   carbohydrates_10

None


### Remaining Missing Values in Each Column


categories                                               0
pnns_groups_1                                            0
pnns_groups_2                                            0
food_groups                                              0
nutriscore_grade                                         0
energy-kcal_100g                                         0
fat_100g                                                 0
saturated-fat_100g                                       0
carbohydrates_100g                                       0
sugars_100g                                              0
fiber_100g                                               0
proteins_100g                                            0
salt_100g                                                0
fruits-vegetables-nuts-estimate-from-ingredients_100g    0
nutrition-score-fr_100g                                  0
dtype: int64


### Descriptive Statistics (Numerical Features)


Unnamed: 0,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,nutrition-score-fr_100g
count,107256.0,107256.0,107256.0,107256.0,107256.0,107256.0,107256.0,107256.0,107256.0,107256.0
mean,289.1018,13.699155,4.8416,33.941732,14.952286,3.215402,8.885974,2.006691,19.735023,8.51847
std,238.111434,18.164536,8.159355,30.150176,19.321773,6.275913,21.228585,124.399403,31.488797,9.038231
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.672309,-15.0
25%,118.0,0.88,0.0,8.0,1.18,0.0,1.82,0.115,0.0,0.0
50%,300.0,8.1,1.79,27.14,5.56,1.6,5.88,0.62,2.905273,9.0
75%,423.72911,21.43,7.0,57.5,23.684211,3.8,11.76,1.25,24.053257,15.0
max,22000.0,1900.0,700.0,2000.0,300.0,1100.0,3400.0,39600.0,1800.098419,40.0


#### prepare the data for modeling,  outline a pipeline that includes:

Encoding Categorical Variables: Using one-hot encoding for features like categories, pnns_groups_1, pnns_groups_2, and food_groups.
Balancing the Target Variable: Using downsampling or upsampling to balance classes in the target, nutrition-score-fr_100g.
Standardization/Normalization: Applying scaling to numerical features to prepare for model training.
Splitting Data: Setting aside a sample for testing from the beginning and creating a train-test split from the remaining data.
Taking a Smaller Sample (optional): If the dataset is large, we’ll take a smaller sample for quicker experimentation.

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

def preprocess_data(df, target_column="nutriscore_grade", test_size=0.2, sample_size=0.2):
    # Step 1: Keep only a sample of the data for model training
    df_sample = df.sample(frac=sample_size, random_state=42) if sample_size < 1.0 else df.copy()

    # Step 2: Separate features and target
    X = df_sample.drop(columns=[target_column])
    y = df_sample[target_column]

    # Step 3: Encode categorical features
    X = pd.get_dummies(X, drop_first=True)  # One-hot encode and drop first to avoid multicollinearity

    # Step 4: Handle imbalanced classes with SMOTE
    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Step 5: Standardize numerical features
    scaler = StandardScaler()
    X_balanced = scaler.fit_transform(X_balanced)

    # Step 6: Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test, df_sample

# Apply the function
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_cleaned_data.csv"
df = pd.read_csv(file_path)
X_train, X_test, y_train, y_test, df_test = preprocess_data(df)



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42)
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluation metrics
    print(f"\n{model_name} Performance:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n" + "-"*50 + "\n")
'''
# Example: Save the best model (e.g., Random Forest) if desired
best_model = models["Random Forest"]
joblib.dump(best_model, "best_model.joblib")'''


Training Logistic Regression...

Logistic Regression Performance:
Accuracy: 0.8582740788623141
Classification Report:
               precision    recall  f1-score   support

           a       0.93      0.93      0.93      1244
           b       0.85      0.90      0.88      1239
           c       0.85      0.74      0.79      1208
           d       0.77      0.79      0.78      1237
           e       0.88      0.92      0.90      1260

    accuracy                           0.86      6188
   macro avg       0.86      0.86      0.86      6188
weighted avg       0.86      0.86      0.86      6188

Confusion Matrix:
 [[1157   71   15    1    0]
 [  62 1119   53    5    0]
 [  19  106  899  182    2]
 [   2   19   84  982  150]
 [   0    2    6   98 1154]]

--------------------------------------------------

Training Random Forest...

Random Forest Performance:
Accuracy: 0.9701034259857789
Classification Report:
               precision    recall  f1-score   support

           a     

# Logistic Regression Model Performance

### Model: Logistic Regression

#### Accuracy
- **Overall Accuracy**: 85.83%

#### Classification Report
| Label | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| A     | 0.93      | 0.93   | 0.93     | 1244    |
| B     | 0.85      | 0.90   | 0.88     | 1239    |
| C     | 0.85      | 0.74   | 0.79     | 1208    |
| D     | 0.77      | 0.79   | 0.78     | 1237    |
| E     | 0.88      | 0.92   | 0.90     | 1260    |
| **Macro Avg** | **0.86** | **0.86** | **0.86** | **6188** |
| **Weighted Avg** | **0.86** | **0.86** | **0.86** | **6188** |

#### Confusion Matrix
|      | Predicted A | Predicted B | Predicted C | Predicted D | Predicted E |
|------|-------------|-------------|-------------|-------------|-------------|
| **Actual A** | 1157       | 71          | 15          | 1           | 0           |
| **Actual B** | 62         | 1119        | 53          | 5           | 0           |
| **Actual C** | 19         | 106         | 899         | 182         | 2           |
| **Actual D** | 2          | 19          | 84          | 982         | 150         |
| **Actual E** | 0          | 2           | 6           | 98          | 1154        |

### Interpretation and Next Steps
The Logistic Regression model shows a strong performance with an accuracy of **85.83%**. It has particularly high precision and recall for classes A, B, and E. Class C has lower recall, which might suggest room for improvement, possibly through additional tuning or using other algorithms.

#### Suggested Next Steps
1. **Consider Alternative Models**: Test models such as Random Forest, Gradient Boosting, or other ensemble techniques to see if they improve performance on classes with lower recall.
2. **Model Tuning**: Hyperparameter tuning (e.g., Grid Search or Randomized Search) can help to optimize Logistic Regression or other models.
3. **Evaluate with Other Metrics**: Beyond accuracy, metrics like F1-score (harmonizing precision and recall) are beneficial, especially for imbalanced classes.

Once the best model is selected, it can be saved using joblib for deployment or further integration.


Logistic Regression Model Evaluation
Overview: Logistic Regression achieved an overall accuracy of 85.83% on the test set, suggesting that the model performs reasonably well in predicting the Nutri-Score grades (A, B, C, D, E) based on the features in your dataset. Below is a breakdown of key metrics and insights.

Key Metrics Explained
Accuracy: The model correctly predicted the Nutri-Score grade in about 86% of cases. This high accuracy indicates a generally well-performing model.

Classification Report: This report shows metrics for each class (A, B, C, D, E):

Precision measures how often the model's positive predictions were correct. High precision in classes A and E means that when the model predicts these grades, it’s usually correct.
Recall measures how well the model captures all actual positive cases for each class. Here:
High recall in classes A, B, and E shows that the model performs well in detecting these classes.
Lower recall in Class C (0.74) suggests that the model misses some instances of this class, meaning that a significant portion of actual “C” cases are being misclassified.
F1-Score combines precision and recall into one metric. F1-scores are fairly balanced across all classes, with the highest values in classes A and E.
Confusion Matrix: The matrix provides a detailed breakdown of correct and incorrect predictions for each class:

Diagonal Values: High values on the diagonal (e.g., 1157 for Class A) indicate correct predictions for that class.
Off-Diagonal Values: For example, Class C has several cases (182) misclassified as Class D, suggesting overlap between these two classes. Reducing these misclassifications could improve model performance.
Interpretation and Recommendations
Strengths:

The model performs strongly for Class A and E with high precision and recall, indicating it has learned distinguishing features for these classes well.
An overall balanced accuracy across most classes makes Logistic Regression a stable option for this task.
Areas for Improvement:

Class C Recall: The lower recall for Class C indicates that the model struggles to identify some instances of this class accurately. Additional tuning or feature engineering may help distinguish Class C better.
Class Overlap: Some overlap between classes (e.g., C misclassified as D) might be addressed by experimenting with alternative models that can handle class overlap more effectively, such as Random Forest or Gradient Boosting.
Next Steps to Improve Model Performance
Hyperparameter Tuning: Fine-tune Logistic Regression parameters to potentially improve recall and F1-scores. You can use GridSearchCV or RandomizedSearchCV for efficient tuning.
Alternative Models: Try other models like Random Forest, Gradient Boosting, or Support Vector Machines to see if they improve on Logistic Regression's performance.
Handling Imbalanced Data: Use techniques like SMOTE or class weights to balance the dataset further if specific classes are underrepresented.
Joblib Model Saving: Once an optimal model is identified, use joblib to save the model for deployment or further testing in a production environment.