In [None]:
import pandas as pd
import re
import joblib
import numpy as np

from sklearn.preprocessing    import MultiLabelBinarizer, StandardScaler
from sklearn.svm             import SVC
from sklearn.multioutput     import MultiOutputClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics         import classification_report, confusion_matrix, hamming_loss
from sklearn.pipeline        import make_pipeline
from scipy.stats             import loguniform

# 1. Chargement et nettoyage
df = pd.read_csv('datasheet.csv')
df = df.dropna(subset=['afterUse']).reset_index(drop=True)

# 2. Encodage multilabel de la colonne 'afterUse'
split_values = df['afterUse'].str.split(',')
mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(split_values)
one_hot_df = pd.DataFrame(one_hot, columns=mlb.classes_)
final_df = pd.concat([df, one_hot_df], axis=1)

# 3. Extraction des ingrédients et catégorisation
ingredient_categories = {
    'hyaluronic':        r'\b(hyaluron|sodium hyaluronate|hydrolyzed hyaluronic|hyaluronic acid)\b',
    'niacinamide':       r'\b(niacinamide|nicotinamide|vitamin b3)\b',
    'peptide':           r'\b(peptide|palmitoyl|oligopeptide|hexapeptide|sh-polypeptide)\b',
    'vitamin_c':         r'\b(vitamin c|ascorb|ascorbic acid|sodium ascorbyl phosphate)\b',
    'ceramide':          r'\b(ceramide|ceramide np|ceramide ap|ceramide eop)\b',
    'retinol':           r'\b(retinol|retinyl|retinoid|retinaldehyde)\b',
    'aha_bha':           r'\b(glycolic acid|lactic acid|mandelic acid|salicylic acid|betaine salicylate|citric acid|bha)\b',
    'antioxidant':       r'\b(ferulic acid|resveratrol|vitamin e|tocopherol|tocopheryl acetate|camellia sinensis leaf extract|glycyrrhiza glabra root extract|helianthus annuus seed oil|squalane|centella asiatica extract|mica)\b',
    'mineral_spf':       r'\b(zinc oxide|titanium dioxide|iron oxide|ci 77891|ci 77491|ci 77492)\b',
    'growth_factor':     r'\b(epidermal growth factor|egf)\b',
    'probiotic':         r'\b(lactobacillus|bifida ferment|probiotic|ferment extract)\b',
    'hydrating':         r'\b(glycerin|butylene glycol|propanediol|pentylene glycol|panthenol|aloe barbadensis leaf juice|sodium pca|betaine)\b',
    'emollient':         r'\b(cetearyl alcohol|cetyl alcohol|glyceryl stearate|caprylic/capric triglyceride|dimethicone|cocos nucifera oil|butyrospermum parkii butter|hydrogenated lecithin|lecithin)\b',
    'preservative':      r'\b(phenoxyethanol|ethylhexylglycerin|sodium benzoate|chlorphenesin|potassium sorbate|disodium edta)\b',
    'texture_stabilizer':r'\b(carbomer|xanthan gum|hydroxyethylcellulose|polysorbate 20|polysorbate 60|acrylates/c10-30 alkyl acrylate crosspolymer|hydroxyethyl acrylate/sodium acryloyldimethyl taurate copolymer)\b',
    'fragrance':         r'\b(parfum|limonene|linalool|citronellol|geraniol|alcohol denat)\b',
    'solvent':           r'\b(alcohol|propylene glycol|dipropylene glycol|cyclopentasiloxane)\b',
    'ph_adjuster':       r'\b(sodium hydroxide|tromethamine|sodium citrate)\b',
    'colorant':          r'\b(ci 77891|ci 77491|ci 77492)\b',
    'skin_soothing':     r'\b(allantoin|adenosine|centella asiatica extract|glycyrrhiza glabra root extract|camellia sinensis leaf extract)\b',
    '2_hexanediol':      r'\b(2-hexanediol)\b',
    'glyceryl_caprylate':r'\b(glyceryl caprylate)\b',
    'hydroxyacetophenone':r'\b(hydroxyacetophenone)\b',
    'titanium_dioxide':  r'\b(titanium dioxide)\b',
    'peg_100_stearate':  r'\b(peg-100 stearate)\b'
}

def categorize_ingredients(ingredients: str, categories: dict) -> dict:
    lst = ingredients.lower().split(',')
    out = {cat: 0 for cat in categories}
    for cat, patt in categories.items():
        if any(re.search(patt, ing) for ing in lst):
            out[cat] = 1
    return out

ing_cat_series = final_df['ingridients'].dropna().apply(
    categorize_ingredients,
    categories=ingredient_categories
)
ingr_df = pd.DataFrame(ing_cat_series.tolist(), index=ing_cat_series.index)
final_df = pd.concat([final_df, ingr_df], axis=1)

# Sauvegarde intermédiaire
final_df.to_csv("projets.csv", index=False)



# 5. Définition des features et targets
features_columns = [
    'Acne Fighting','Acne Trigger','Anti-Aging','Brightening','Dark Spots',
    'Drying','Eczema','Good For Oily Skin','Hydrating','Irritating',
    'Redness Reducing','Reduces Irritation','Reduces Large Pores',
    'Rosacea','Scar Healing'
]
target_columns = list(ingredient_categories.keys())

X = final_df[features_columns]
y = final_df[target_columns]




In [2]:
X

Unnamed: 0,Acne Fighting,Acne Trigger,Anti-Aging,Brightening,Dark Spots,Drying,Eczema,Good For Oily Skin,Hydrating,Irritating,Redness Reducing,Reduces Irritation,Reduces Large Pores,Rosacea,Scar Healing
0,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0
1,1,1,1,1,0,1,0,1,0,0,1,1,0,0,0
2,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0
3,0,1,1,1,1,1,0,0,0,1,1,1,1,0,0
4,0,1,0,0,0,1,0,1,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17521,0,1,1,1,0,0,0,0,0,1,1,0,0,0,1
17522,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0
17523,1,1,1,1,0,0,0,1,0,0,1,0,0,0,0
17524,1,0,1,1,1,1,1,1,0,1,0,1,1,1,0


In [3]:
y


Unnamed: 0,hyaluronic,niacinamide,peptide,vitamin_c,ceramide,retinol,aha_bha,antioxidant,mineral_spf,growth_factor,...,fragrance,solvent,ph_adjuster,colorant,skin_soothing,2_hexanediol,glyceryl_caprylate,hydroxyacetophenone,titanium_dioxide,peg_100_stearate
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,0,...,1,0,1,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17521,1,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
17522,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
17523,1,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
17524,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from scipy.stats import randint

# 6. Split stratifié multilabel
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=42)
train_idx, test_idx = next(msss.split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# 7. Filtrage des cibles sans variance
valid_targets = [c for c in y_train if y_train[c].nunique() > 1]
dropped = [c for c in y_train if c not in valid_targets]
if dropped:
    print("Dropped targets with only one class:", dropped)
y_train = y_train[valid_targets]
y_test  = y_test[valid_targets]

# 9. RandomizedSearchCV sur Random Forest
base_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
param_dist = {
    'estimator__n_estimators': randint(100, 300),
    'estimator__max_depth': randint(5, 50),
    'estimator__min_samples_split': randint(2, 10),
    'estimator__min_samples_leaf': randint(1, 10),
    'estimator__max_features': ['sqrt', 'log2', None]
}
rand = RandomizedSearchCV(
    estimator=MultiOutputClassifier(base_rf, n_jobs=-1),
    param_distributions=param_dist,
    n_iter=20, cv=3,
    scoring='f1_micro', verbose=2,
    random_state=42, n_jobs=-1
)
rand.fit(X_train, y_train)

# 10. GridSearchCV autour des meilleurs hyperparamètres
best = rand.best_params_
grid_param = {
    'estimator__n_estimators': [
        max(50, int(best['estimator__n_estimators'] * 0.8)),
        best['estimator__n_estimators'],
        int(best['estimator__n_estimators'] * 1.2)
    ],
    'estimator__max_depth': [
        max(5, int(best['estimator__max_depth'] * 0.8)),
        best['estimator__max_depth'],
        int(best['estimator__max_depth'] * 1.2)
    ],
    'estimator__min_samples_split': [best['estimator__min_samples_split']],
    'estimator__min_samples_leaf': [best['estimator__min_samples_leaf']],
    'estimator__max_features': [best['estimator__max_features']]
}
grid = GridSearchCV(
    estimator=MultiOutputClassifier(RandomForestClassifier(random_state=42), n_jobs=-1),
    param_grid=grid_param,
    cv=3, scoring='f1_micro', verbose=2, n_jobs=-1
)
grid.fit(X_train, y_train)

# 11. Évaluation finale
final_model = grid.best_estimator_
y_pred = final_model.predict(X_test)

report = classification_report(
    y_test, y_pred,
    target_names=valid_targets,
    output_dict=True, zero_division=0
)
df_report = pd.DataFrame(report).transpose().sort_values('f1-score', ascending=False)
df_report.to_csv('classification_report_rf.csv', index=True)


Dropped targets with only one class: ['growth_factor']
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [3]:
print("\nPer-target confusion matrices:")
for i, col in enumerate(valid_targets):
    tn, fp, fn, tp = confusion_matrix(y_test.iloc[:, i], y_pred[:, i]).ravel()
    print(f"{col}: TP={tp}, TN={tn}, FP={fp}, FN={fn}")


Per-target confusion matrices:
hyaluronic: TP=778, TN=3458, FP=400, FN=1148
niacinamide: TP=949, TN=4660, FP=113, FN=62
peptide: TP=0, TN=5259, FP=0, FN=525
vitamin_c: TP=155, TN=5151, FP=104, FN=374
ceramide: TP=336, TN=5079, FP=76, FN=293
retinol: TP=289, TN=5399, FP=69, FN=27
aha_bha: TP=1894, TN=3161, FP=102, FN=627
antioxidant: TP=3461, TN=1295, FP=635, FN=393
mineral_spf: TP=304, TN=4548, FP=173, FN=759
probiotic: TP=0, TN=5371, FP=0, FN=413
hydrating: TP=4707, TN=167, FP=816, FN=94
emollient: TP=2884, TN=1279, FP=954, FN=667
preservative: TP=4030, TN=239, FP=1315, FN=200
texture_stabilizer: TP=2129, TN=1591, FP=1158, FN=906
fragrance: TP=1214, TN=2353, FP=807, FN=1410
solvent: TP=2520, TN=1168, FP=1234, FN=862
ph_adjuster: TP=217, TN=3879, FP=170, FN=1518
colorant: TP=112, TN=4963, FP=62, FN=647
skin_soothing: TP=858, TN=3464, FP=414, FN=1048
2_hexanediol: TP=463, TN=4136, FP=267, FN=918
glyceryl_caprylate: TP=0, TN=5447, FP=0, FN=337
hydroxyacetophenone: TP=0, TN=5414, FP=0, F

In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss, classification_report

# 11. Évaluation finale
final_model = grid.best_estimator_
y_pred = final_model.predict(X_test)

# Rapport détaillé
report = classification_report(
    y_test, y_pred,
    target_names=valid_targets,
    output_dict=True, zero_division=0
)
df_report = pd.DataFrame(report).transpose().sort_values('f1-score', ascending=False)
df_report.to_csv('classification_report_rf.csv', index=True)

# Calcul des métriques globales
precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
recall_micro    = recall_score(y_test, y_pred, average='micro', zero_division=0)
f1_micro        = f1_score(y_test, y_pred, average='micro', zero_division=0)

precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall_macro    = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1_macro        = f1_score(y_test, y_pred, average='macro', zero_division=0)

hamming         = hamming_loss(y_test, y_pred)

# Affichage des résultats
print("\n=== Métriques globales ===")
print(f"Hamming loss     : {hamming:.4f}")
print(f"Precision (micro): {precision_micro:.4f}")
print(f"Recall (micro)   : {recall_micro:.4f}")
print(f"F1-score (micro) : {f1_micro:.4f}")
print(f"Precision (macro): {precision_macro:.4f}")
print(f"Recall (macro)   : {recall_macro:.4f}")
print(f"F1-score (macro) : {f1_macro:.4f}")



=== Métriques globales ===
Hamming loss     : 0.1673
Precision (micro): 0.7548
Recall (micro)   : 0.6554
F1-score (micro) : 0.7016
Precision (macro): 0.5415
Recall (macro)   : 0.4472
F1-score (macro) : 0.4699


In [5]:
# 12. Recommandation pour un exemple d’utilisateur
example_input = np.array([[1,0,1,1,0,1,1,0,0,1,1,0,0,1,1]])

pred = final_model.predict(example_input)[0]
recommendations = {valid_targets[i]: bool(pred[i]) for i in range(len(valid_targets))}
print("\nRecommended Ingredients:")
for ing, ok in recommendations.items():
    print(f"{ing}: {'Yes' if ok else 'No'}")

# 13. Sauvegarde du modèle
joblib.dump(final_model, 'multi_target_svm_model.joblib')


Recommended Ingredients:
hyaluronic: No
niacinamide: No
peptide: No
vitamin_c: No
ceramide: No
retinol: No
aha_bha: No
antioxidant: Yes
mineral_spf: No
probiotic: No
hydrating: Yes
emollient: Yes
preservative: Yes
texture_stabilizer: Yes
fragrance: Yes
solvent: Yes
ph_adjuster: No
colorant: No
skin_soothing: No
2_hexanediol: No
glyceryl_caprylate: No
hydroxyacetophenone: No
titanium_dioxide: No
peg_100_stearate: No


['multi_target_svm_model.joblib']

In [6]:
import joblib
import os

def save_pipeline(model, mlb_after, out_dir='saved_model'):
    """
    Sauvegarde le pipeline complet dans le répertoire out_dir :
      - le modèle entraîné (MultiOutputClassifier ou SVC),
      - le binariseur multilabel pour afterUse,
      - le scaler.
    """
    os.makedirs(out_dir, exist_ok=True)

    # 1. Modèle
    joblib.dump(model, os.path.join(out_dir, 'final_model.joblib'))
    # 2. Binariseur afterUse
    joblib.dump(mlb_after, os.path.join(out_dir, 'mlb_afterUse.joblib'))
    

    print(f"Artifacts saved in directory: {out_dir}")


if __name__ == '__main__':
    # … après tous vos entraînements et évaluations …
    # final_model  : le MultiOutputClassifier (ou pipeline) entraîné
    # mlb          : votre MultiLabelBinarizer() appliqué sur afterUse
    # scaler       : le StandardScaler() que vous avez fit
    
    save_pipeline(
        model=final_model,
        mlb_after=mlb,
        out_dir='saved_model_final'
    )


Artifacts saved in directory: saved_model_final


In [7]:
import numpy as np
import pandas as pd
import random
import joblib

# 1. Charger le modèle entraîné
model = joblib.load('multi_target_svm_model.joblib')

# 2. Définir les noms de features (inchangés)
features_columns = [
    'Acne Fighting', 'Acne Trigger', 'Anti-Aging', 'Brightening', 'Dark Spots',
    'Drying', 'Eczema', 'Good For Oily Skin', 'Hydrating', 'Irritating',
    'Redness Reducing', 'Reduces Irritation', 'Reduces Large Pores',
    'Rosacea', 'Scar Healing'
]

# 3. Liste initiale de noms de cibles (sera tronquée si besoin)
valid_targets = [
    'hyaluronic', 'niacinamide', 'peptide', 'vitamin_c', 'ceramide', 'retinol',
    'aha_bha', 'antioxidant', 'mineral_spf', 'growth_factor', 'probiotic',
    'hydrating', 'emollient', 'preservative', 'texture_stabilizer',
    'fragrance', 'solvent', 'ph_adjuster', 'colorant', 'skin_soothing',
    '2_hexanediol', 'glyceryl_caprylate', 'hydroxyacetophenone',
    'titanium_dioxide', 'peg_100_stearate'
]

def predict_ingredients_from_benefits(model, benefit_texts, feature_names, target_names):
    """
    Pour chaque chaîne de benefit_texts, construit le vecteur binaire de features,
    prédit les ingrédients, et renvoie un DataFrame enrichi.
    Adaptation automatique de target_names si leur nombre ne correspond pas.
    """
    # 1) Construction du DataFrame de features
    rows = []
    for txt in benefit_texts:
        lst = [b.strip() for b in txt.split(',') if b.strip()]
        rows.append({feat: int(feat in lst) for feat in feature_names})
    df_feat = pd.DataFrame(rows, columns=feature_names)

    # 2) Prédiction
    preds = model.predict(df_feat)              # shape (n_samples, n_outputs)
    n_out = preds.shape[1]
    if len(target_names) != n_out:
        print(f"⚠️  Ajustement automatique de target_names : {len(target_names)} → {n_out}")
        target_names = target_names[:n_out]

    df_pred = pd.DataFrame(preds, columns=target_names)

    # 3) Concaténation et liste finale
    result = pd.concat([df_feat, df_pred], axis=1)
    result['predicted_ingredients'] = result[target_names] \
        .apply(lambda row: [t for t, v in row.items() if v == 1], axis=1)
    return result

def generate_random_benefit_texts(feature_names, n_samples=5, min_feats=1, max_feats=None, seed=None):
    """
    Génère n_samples de chaînes aléatoires de bénéfices à partir de feature_names.
    """
    if seed is not None:
        random.seed(seed)
    max_feats = min(max_feats or len(feature_names), len(feature_names))

    texts = []
    for _ in range(n_samples):
        k = random.randint(min_feats, max_feats)
        choix = random.sample(feature_names, k)
        texts.append(", ".join(choix))
    return texts

if __name__ == '__main__':
    # 4. Génération de 5 textes aléatoires
    random_texts = generate_random_benefit_texts(
        feature_names=features_columns,
        n_samples=5,
        min_feats=2,
        max_feats=5,
        seed=42
    )
    print("Textes aléatoires générés :")
    for t in random_texts:
        print(" -", t)

    # 5. Prédiction des ingrédients pour ces textes
    df_random_preds = predict_ingredients_from_benefits(
        model=model,
        benefit_texts=random_texts,
        feature_names=features_columns,
        target_names=valid_targets
    )

    # 6. Affichage final
    print("\nIngrédients prédits pour chaque exemple :")
    print(df_random_preds[['predicted_ingredients']])
    df_random_preds.to_csv('predicted_ingredients.csv', index=False)
    print("✅ Résultats écrits dans 'predicted_ingredients.csv'")


Textes aléatoires générés :
 - Acne Fighting, Reduces Irritation
 - Brightening, Scar Healing, Anti-Aging, Reduces Irritation
 - Redness Reducing, Reduces Irritation
 - Irritating, Eczema
 - Acne Fighting, Acne Trigger
⚠️  Ajustement automatique de target_names : 25 → 24

Ingrédients prédits pour chaque exemple :
                               predicted_ingredients
0    [probiotic, emollient, preservative, fragrance]
1  [antioxidant, probiotic, hydrating, emollient,...
2  [hyaluronic, antioxidant, probiotic, emollient...
3  [probiotic, emollient, texture_stabilizer, fra...
4  [probiotic, hydrating, emollient, preservative...
✅ Résultats écrits dans 'predicted_ingredients.csv'


In [8]:
# 4. Recommandations basées sur les préoccupations cutanées
skin_condition_to_benefits = {
    "acne":           ["Acne Fighting", "Reduces Large Pores", "Irritating", "Drying"],
    "redness":        ["Redness Reducing", "Skin Soothing", "Anti-Inflammatory"],
    "dryness":        ["Hydrating", "Skin Soothing", "Reduces Irritation"],
    "wrinkles":       ["Anti-Aging", "Brightening"],
    "sensitive_skin": ["Redness Reducing", "Skin Soothing", "Reduces Irritation"],
}
user_skin_concerns = ["acne", "redness"]
user_benefits = {
    b
    for concern in user_skin_concerns
    for b in skin_condition_to_benefits.get(concern, [])
}
print("User Benefits:", list(user_benefits))
# 8. Mise à l’échelle
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

User Benefits: ['Reduces Large Pores', 'Drying', 'Acne Fighting', 'Redness Reducing', 'Skin Soothing', 'Anti-Inflammatory', 'Irritating']
