In [10]:
import pandas as pd

# Load the CSV file (update the path if necessary)
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_2.csv"
df = pd.read_csv(file_path)

# Drop the specified columns
df = df.drop(columns=['categories', 'food_groups'], errors='ignore')

# Save the modified DataFrame to a new CSV file
output_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_4.csv"
df.to_csv(output_path, index=False)

print(f"New CSV file saved at: {output_path}")


New CSV file saved at: C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_4.csv


In [8]:
import pandas as pd

# Load the dataset
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_4.csv"
df = pd.read_csv(file_path)

# Get and print the column names
column_names = df.columns.tolist()
print("Column names:", column_names)



Column names: ['pnns_groups_1', 'pnns_groups_2', 'nutriscore_grade', 'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g']


In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import pandas as pd
import joblib

# Load data
file_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/data/final_csv_4.csv"
df = pd.read_csv(file_path)

# Define columns
numerical_features = [
    'energy-kcal_100g', 'fat_100g', 'saturated-fat_100g', 
    'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 
    'proteins_100g', 'salt_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g'
]
categorical_features = ['pnns_groups_1', 'pnns_groups_2']
target_column = 'nutriscore_grade'

# Separate features and target
X = df[numerical_features + categorical_features]
y = df[target_column]

# Define preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Apply preprocessing and SMOTE
X_preprocessed = preprocessor.fit_transform(X)
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_preprocessed, y)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Define models and parameters for GridSearch
models = {
    "Random Forest Pipeline": {
        "pipeline": Pipeline([
            ('model', RandomForestClassifier(random_state=42))
        ]),
        "params": {
            'model__n_estimators': [50, 100],
            'model__max_depth': [5, 10]
        }
    },
    "Logistic Regression Pipeline": {
        "pipeline": Pipeline([
            ('model', LogisticRegression(max_iter=1000, random_state=42))
        ]),
        "params": {
            'model__C': [0.1, 1, 10],
            'model__solver': ['liblinear']
        }
    }
}

# Train and evaluate models with GridSearchCV
best_pipeline = None
best_score = 0

for model_name, config in models.items():
    print(f"Training {model_name} with GridSearchCV...")

    # Perform GridSearchCV on each model pipeline
    grid_search = GridSearchCV(
        estimator=config['pipeline'],
        param_grid=config['params'],
        cv=2,  # Reduced number of folds for faster execution
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # Best model and performance
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', grid_search.best_estimator_)
        ])

    print(f"\nBest {model_name} Performance:")
    print("Accuracy on CV data:", grid_search.best_score_)
    print("Best Parameters:", grid_search.best_params_)

    # Evaluate on test set
    y_pred = grid_search.predict(X_test)
    print("\nTest Performance:")
    print("Accuracy on test data:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n" + "-"*50 + "\n")

# Save the best pipeline (including preprocessing)
pipeline_path = "C:/data/simplon_dev_ia_projects/flask_projects/nutriscore_prediction_app/trained_models/best_model_pipeline.joblib"
joblib.dump(best_pipeline, pipeline_path)
print(f"Best model pipeline saved successfully at: {pipeline_path}")


Training Random Forest Pipeline with GridSearchCV...

Best Random Forest Pipeline Performance:
Accuracy on CV data: 0.7703699518997966
Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}

Test Performance:
Accuracy on test data: 0.7689576137134002
Classification Report:
               precision    recall  f1-score   support

           a       0.85      0.80      0.83      6153
           b       0.64      0.75      0.69      6299
           c       0.70      0.67      0.69      6293
           d       0.79      0.81      0.80      6097
           e       0.91      0.82      0.86      6135

    accuracy                           0.77     30977
   macro avg       0.78      0.77      0.77     30977
weighted avg       0.78      0.77      0.77     30977

Confusion Matrix:
 [[4939  926  283    5    0]
 [ 640 4709  912   28   10]
 [ 155 1167 4243  713   15]
 [  31  183  492 4910  481]
 [  47  403  115  551 5019]]

--------------------------------------------------

Training