# 📊 Entraînement des modèles ML pour NutriProfil
Ce notebook regroupe les entraînements et sauvegardes de tous les modèles :
- Régression Lasso (prévision consommation)
- Régression linéaire (charge glycémique)
- RandomForestClassifier (risques : diabète, obésité, MCV, cancer colorectal)

Les `.pkl` seront sauvegardés dans `streamlit/`.


In [1]:
# üì¶ Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from pathlib import Path
import os
import requests

# Destination folder for the downloaded model files
destination_dir = "streamlit"
os.makedirs(destination_dir, exist_ok=True)

# Raw GitHub URL under which the model files are fetched
base_url = "https://raw.githubusercontent.com/stevens75010/nutriprofil/main/streamlit/"
model_files = [
    "diabete_risque_randomforest.pkl",
    "obesite_risque_randomforest.pkl",
    "mcv_risque_randomforest.pkl",
    "cancercolorectal_risque_randomforest.pkl",
]

# Seconds to wait for the server before giving up on one download; without a
# timeout, requests.get() can block the notebook indefinitely.
download_timeout = 30

# NOTE(review): these are pickle files fetched over the network. Unpickling
# executes arbitrary code, so only load them if the source repository is trusted.
for filename in model_files:
    file_url = base_url + filename
    local_path = os.path.join(destination_dir, filename)

    # Files that already exist locally are never re-downloaded
    if os.path.exists(local_path):
        print(f"üîÅ D√©j√† pr√©sent : {local_path}")
        continue

    print(f"üì• T√©l√©chargement de {filename}...")
    try:
        r = requests.get(file_url, timeout=download_timeout)
    except requests.RequestException as exc:
        # Connection error or timeout: report it and carry on with the next file
        print(f"‚ùå √âchec du t√©l√©chargement : {file_url} ({exc})")
        continue

    if r.status_code == 200:
        with open(local_path, "wb") as f:
            f.write(r.content)
        print(f"‚úÖ Enregistr√© : {local_path}")
    else:
        print(f"‚ùå √âchec du t√©l√©chargement : {file_url}")


üîÅ D√©j√† pr√©sent : streamlit\diabete_risque_randomforest.pkl
üîÅ D√©j√† pr√©sent : streamlit\obesite_risque_randomforest.pkl
üîÅ D√©j√† pr√©sent : streamlit\mcv_risque_randomforest.pkl
üîÅ D√©j√† pr√©sent : streamlit\cancercolorectal_risque_randomforest.pkl


## 🔢 1. Modèle de prévision de consommation alimentaire (Lasso)

In [2]:
# Load the household consumption data
df_conso = pd.read_csv('streamlit/data/conso-menages-2024.csv')

# One column per year, 2010 through 2024
cols = [str(y) for y in range(2010, 2025)]

# Convert each year column to float: cast to text, remove all whitespace,
# replace commas with dots, then parse as float.
for col in cols:
    as_text = df_conso[col].astype(str)
    without_spaces = as_text.str.replace(r'\s+', '', regex=True)
    df_conso[col] = without_spaces.str.replace(',', '.').astype(float)

# Every year except the last is a feature; the last year is the target
*feature_years, target_year = cols
X = df_conso[feature_years]
y = df_conso[target_year]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Lasso with default hyper-parameters and report R² on the held-out split
model_conso = Lasso().fit(X_train, y_train)
y_pred = model_conso.predict(X_test)
print("R¬≤:", r2_score(y_test, y_pred))
joblib.dump(model_conso, 'streamlit/model_conso.pkl')

R¬≤: 0.9998725252973537


['streamlit/model_conso.pkl']

## 🍚 2. Modèle charge glycémique (régression linéaire)

In [3]:
# Load the food dataset
df_alim = pd.read_csv('streamlit/data/dataframe_complet_rempli_proteines.csv')

# Rewrite one family label to its replacement value, then one-hot encode the
# family column (first category dropped) and append the dummies to the frame.
family = df_alim['Famille_regroupee'].replace('Fruits et l√É¬©gumes', 'Fruits et l√©gumes')
df_alim['Famille_regroupee'] = family
family_dummies = pd.get_dummies(
    df_alim['Famille_regroupee'], prefix='Famille', drop_first=True
)
df_alim = pd.concat([df_alim, family_dummies], axis=1)

# Features: everything except the name, the raw family label, IG and the target CG
non_feature_columns = ['Aliment', 'CG', 'Famille_regroupee', 'IG']
X = df_alim.drop(columns=non_feature_columns)
y = df_alim['CG']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares fit, R² reported on the held-out split
model_cg = LinearRegression().fit(X_train, y_train)
y_pred = model_cg.predict(X_test)
print("R¬≤ CG:", r2_score(y_test, y_pred))
joblib.dump(model_cg, 'streamlit/model_charge_glyc√©mique.pkl')

R¬≤ CG: 0.8907643393336107


['streamlit/model_charge_glyc√©mique.pkl']

## 🧠 3. Modèles de risques santé (RandomForestClassifier)

In [4]:
# Reuse df_alim to train 4 binary classification models, one per health target
targets = ['diabete', 'obesite', 'mcv', 'cancercolorectal']

# Feature matrix: drop the identifier/label columns, then also drop any target
# column already present. The loop below writes the targets into df_alim, so
# without the second drop a re-run of this cell would feed the labels to the
# model as features.
features = (
    df_alim
    .drop(columns=['Aliment', 'Famille_regroupee', 'IG', 'CG'])
    .drop(columns=targets, errors='ignore')
)

# Simulate random target columns for the example (replace with the real
# DataFrame if available). A seeded generator keeps the labels reproducible.
rng = np.random.default_rng(42)
for t in targets:
    df_alim[t] = rng.integers(0, 2, size=len(df_alim))

# NOTE(review): the files written below have the same names as the models
# downloaded into streamlit/ by the first cell, so this loop overwrites them
# with classifiers trained on simulated labels — confirm this is intended.
for t in targets:
    X_train, X_test, y_train, y_test = train_test_split(
        features, df_alim[t], test_size=0.2, random_state=42
    )
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    joblib.dump(model, f'streamlit/{t}_risque_randomforest.pkl')
    # score() on a classifier is mean accuracy, not R², hence the label
    print(f"‚úÖ {t} ‚Äî accuracy :", model.score(X_test, y_test))

‚úÖ diabete ‚Äî R¬≤ approx : 0.5193704600484261
‚úÖ obesite ‚Äî R¬≤ approx : 0.4648910411622276
‚úÖ mcv ‚Äî R¬≤ approx : 0.48426150121065376
‚úÖ cancercolorectal ‚Äî R¬≤ approx : 0.49878934624697335


In [5]:
# Check that the 6 expected model files exist under streamlit/
import os

required_models = [
    "model_charge_glyc√©mique.pkl",
    "model_conso.pkl",
    "diabete_risque_randomforest.pkl",
    "obesite_risque_randomforest.pkl",
    "mcv_risque_randomforest.pkl",
    "cancercolorectal_risque_randomforest.pkl"
]

# Names of the files that could not be found, in the order they were checked
missing = []
# The loop variable is deliberately not called `model`, so this check does not
# overwrite a `model` object left in the notebook namespace by the training cell.
for model_file in required_models:
    path = os.path.join("streamlit", model_file)
    if os.path.exists(path):
        print(f"‚úÖ Pr√©sent : {model_file}")
    else:
        print(f"‚ùå Manquant : {model_file}")
        missing.append(model_file)

# Final summary: success message, or the list of missing files
if not missing:
    print("\nüéâ Tous les mod√®les sont bien pr√©sents !")
else:
    print(f"\n‚ö†Ô∏è Mod√®les manquants : {missing}")


‚úÖ Pr√©sent : model_charge_glyc√©mique.pkl
‚úÖ Pr√©sent : model_conso.pkl
‚úÖ Pr√©sent : diabete_risque_randomforest.pkl
‚úÖ Pr√©sent : obesite_risque_randomforest.pkl
‚úÖ Pr√©sent : mcv_risque_randomforest.pkl
‚úÖ Pr√©sent : cancercolorectal_risque_randomforest.pkl

üéâ Tous les mod√®les sont bien pr√©sents !
