## Import et configurations MlFlows

In [2]:
import mlflow
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

MLFLOW_TRACKING_URI = "http://localhost:5555"  
EXPERIMENT_NAME = "stockout_substitution_hyperopt_classifier_ranker_6"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(f"Expérience '{EXPERIMENT_NAME}' introuvable")

EXPERIMENT_ID = experiment.experiment_id
print("Experiment ID:", EXPERIMENT_ID)


Experiment ID: 7


## Chargement de tous les runs

In [3]:
runs = mlflow.search_runs(
    experiment_ids=[EXPERIMENT_ID],
    output_format="pandas"
)

print(f"{len(runs)} runs chargés")
runs.head(3)


101 runs chargés


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.ndcg_at_1,metrics.best_iteration,metrics.hit_rate_at_5,metrics.ndcg_at_3,metrics.hit_rate_at_3,metrics.hit_rate_at_1,metrics.ndcg_at_5,metrics.recall,metrics.logloss,metrics.f1,metrics.precision,metrics.auc,metrics.pr_auc,params.learning_rate,params.num_leaves,params.min_child_samples,params.n_estimators,params.subsample,params.colsample_bytree,params.rsm,params.l2_leaf_reg,params.iterations,params.depth,params.reg_alpha,params.reg_lambda,params.min_child_weight,params.max_depth,params.C,params.penalty,tags.mlflow.source.type,tags.mlflow.user,tags.model_name,tags.mlflow.runName,tags.model_type,tags.mlflow.source.name
0,46ad7c66a9484c7a84cf379747c88287,7,FINISHED,mlflow-artifacts:/7/46ad7c66a9484c7a84cf379747...,2026-01-07 09:34:49.919000+00:00,2026-01-07 09:34:53.200000+00:00,0.636238,4.0,0.71567,0.855774,0.712296,0.636238,0.886897,,,,,,,0.1,63,100,500,0.7,1.0,,,,,,,,,,,LOCAL,eric,LGBMRanker,valuable-conch-790,ranking,/home/eric/.cache/pypoetry/virtualenvs/algo-re...
1,5910e6f65ad645ffb5716d529b2608d4,7,FINISHED,mlflow-artifacts:/7/5910e6f65ad645ffb5716d529b...,2026-01-07 09:34:46.602000+00:00,2026-01-07 09:34:49.815000+00:00,0.637536,17.0,0.715757,0.856644,0.712296,0.637536,0.887357,,,,,,,0.05,31,100,500,1.0,0.7,,,,,,,,,,,LOCAL,eric,LGBMRanker,honorable-smelt-850,ranking,/home/eric/.cache/pypoetry/virtualenvs/algo-re...
2,1d55f3ce4a104f3cbaa7716234813ae2,7,FINISHED,mlflow-artifacts:/7/1d55f3ce4a104f3cbaa7716234...,2026-01-07 09:34:43.218000+00:00,2026-01-07 09:34:46.504000+00:00,0.638141,15.0,0.71567,0.857083,0.712296,0.638141,0.887685,,,,,,,0.1,31,20,1000,0.9,0.9,,,,,,,,,,,LOCAL,eric,LGBMRanker,secretive-ape-612,ranking,/home/eric/.cache/pypoetry/virtualenvs/algo-re...


## Colonnes utiles et nettoyage

In [4]:
metric_cols = [c for c in runs.columns if c.startswith("metrics.")]
param_cols = [c for c in runs.columns if c.startswith("params.")]
tag_cols   = [c for c in runs.columns if c.startswith("tags.")]

cols = (
    ["run_id", "status", "start_time"] +
    metric_cols +
    param_cols +
    tag_cols
)

df = runs[cols].copy()

# Simplification des noms
df.columns = (
    df.columns
      .str.replace("metrics.", "", regex=False)
      .str.replace("params.", "", regex=False)
      .str.replace("tags.", "", regex=False)
)

df.head(2)


Unnamed: 0,run_id,status,start_time,ndcg_at_1,best_iteration,hit_rate_at_5,ndcg_at_3,hit_rate_at_3,hit_rate_at_1,ndcg_at_5,recall,logloss,f1,precision,auc,pr_auc,learning_rate,num_leaves,min_child_samples,n_estimators,subsample,colsample_bytree,rsm,l2_leaf_reg,iterations,depth,reg_alpha,reg_lambda,min_child_weight,max_depth,C,penalty,mlflow.source.type,mlflow.user,model_name,mlflow.runName,model_type,mlflow.source.name
0,46ad7c66a9484c7a84cf379747c88287,FINISHED,2026-01-07 09:34:49.919000+00:00,0.636238,4.0,0.71567,0.855774,0.712296,0.636238,0.886897,,,,,,,0.1,63,100,500,0.7,1.0,,,,,,,,,,,LOCAL,eric,LGBMRanker,valuable-conch-790,ranking,/home/eric/.cache/pypoetry/virtualenvs/algo-re...
1,5910e6f65ad645ffb5716d529b2608d4,FINISHED,2026-01-07 09:34:46.602000+00:00,0.637536,17.0,0.715757,0.856644,0.712296,0.637536,0.887357,,,,,,,0.05,31,100,500,1.0,0.7,,,,,,,,,,,LOCAL,eric,LGBMRanker,honorable-smelt-850,ranking,/home/eric/.cache/pypoetry/virtualenvs/algo-re...


## Separation Ranker et Classifier

In [5]:
df["model_type"] = df.get("model_type", "unknown")
df["model_name"] = df.get("model_name", "unknown")

df_classif = df[df["model_type"] == "classification"].copy()
df_ranker  = df[df["model_type"] == "ranking"].copy()

print("Classifiers:", len(df_classif))
print("Rankers:", len(df_ranker))


Classifiers: 81
Rankers: 20


## Analyse Classifier

Métriques disponibles:
- auc
- pr_auc
- logloss
- precision
- recall
- f1

#### Top modèles par AUC

In [6]:
df_classif_sorted = (
    df_classif
    .sort_values("auc", ascending=False)
)

df_classif_sorted[
    ["model_name", "auc", "pr_auc", "f1", "precision", "recall", "logloss"]
].head(10)


Unnamed: 0,model_name,auc,pr_auc,f1,precision,recall,logloss
21,CatBoostClassifier,0.728301,0.790296,0.697076,0.694752,0.699416,0.593348
33,CatBoostClassifier,0.728088,0.790238,0.69627,0.695968,0.696572,0.593361
25,CatBoostClassifier,0.727761,0.789836,0.698379,0.69586,0.700916,0.593617
75,XGBClassifier,0.727048,0.789316,0.695649,0.696227,0.695072,0.594013
64,XGBClassifier,0.726991,0.789142,0.69612,0.697251,0.694993,0.594209
24,CatBoostClassifier,0.726738,0.789323,0.692552,0.696312,0.688833,0.594369
31,CatBoostClassifier,0.726738,0.789323,0.692552,0.696312,0.688833,0.594369
71,XGBClassifier,0.726716,0.788943,0.694947,0.696407,0.693492,0.59446
69,XGBClassifier,0.726598,0.788784,0.694937,0.695515,0.694361,0.594439
62,XGBClassifier,0.725739,0.788125,0.693279,0.697379,0.689228,0.595311


Observations clés:

AUC max ≈ 0.728 → performance correcte mais pas “state of the art”

CatBoost et XGBoost dominent clairement

Les écarts sont très faibles entre les meilleurs runs (≈ 0.002 d’AUC)

Lecture des métriques (meilleur run CatBoost – id 21)

AUC = 0.7283
→ Bonne capacité de discrimination globale

PR-AUC = 0.7903
→ Solide si classes déséquilibrées

F1 ≈ 0.697
→ Bon compromis précision / rappel

Precision ≈ Recall ≈ 0.695–0.700
→ Modèle bien équilibré

Logloss ≈ 0.593
→ Probabilités plutôt bien calibrées

Conclusion

- CatBoostClassifier est légèrement supérieur, mais XGBClassifier est quasiment équivalent.
- Les différences sont non significatives statistiquement sans test plus poussé.

#### Moyenne performance par modèle (classifier)

In [7]:
classif_summary = (
    df_classif
    .groupby("model_name")[["auc", "pr_auc", "f1", "precision", "recall", "logloss"]]
    .mean()
    .sort_values("auc", ascending=False)
)

classif_summary


Unnamed: 0_level_0,auc,pr_auc,f1,precision,recall,logloss
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
XGBClassifier,0.716703,0.781907,0.687279,0.688384,0.686203,0.605323
CatBoostClassifier,0.715695,0.781355,0.687522,0.68943,0.685642,0.605999
LGBMClassifier,0.711249,0.777924,0.682801,0.685932,0.679723,0.610106
LogReg,0.699013,0.761051,0.687202,0.671237,0.703945,0.619595


Interprétation

XGB et CatBoost sont équivalents en moyenne

CatBoost :
- meilleure précision
- meilleur F1 moyen

XGB :
- meilleur AUC moyen
- meilleur logloss → meilleure calibration

LogReg :
- recall élevé
- mais AUC et logloss faibles → modèle trop simple

Conclusion
- CatBoost / XGB = meilleur compromis global
- LogReg utile seulement si le recall est prioritaire

#### Meilleur run par modèle

In [8]:
best_classif_runs = (
    df_classif
    .sort_values("auc", ascending=False)
    .groupby("model_name")
    .head(1)
)

best_classif_runs[
    ["model_name", "auc", "pr_auc", "f1", "precision", "recall", "logloss"]
]


Unnamed: 0,model_name,auc,pr_auc,f1,precision,recall,logloss
21,CatBoostClassifier,0.728301,0.790296,0.697076,0.694752,0.699416,0.593348
75,XGBClassifier,0.727048,0.789316,0.695649,0.696227,0.695072,0.594013
57,LGBMClassifier,0.72454,0.787475,0.690771,0.69352,0.688043,0.595991
80,LogReg,0.699072,0.761048,0.686965,0.670882,0.703838,0.61966


Conclusion:
- CatBoostClassifier (run 21) est le meilleur choix
- XGB est une alternative quasi équivalente

## Analyse des Rankers

Métriques disponibles:
- ndcg_at_1
- ndcg_at_3
- ndcg_at_5
- hit_rate_at_1
- hit_rate_at_3
- hit_rate_at_5

#### Top runs par NDCG@3

In [9]:
df_ranker_sorted = df_ranker.sort_values("ndcg_at_3", ascending=False)

df_ranker_sorted[
    ["model_name", "ndcg_at_1", "ndcg_at_3", "ndcg_at_5",
     "hit_rate_at_1", "hit_rate_at_3", "hit_rate_at_5"]
].head(10)


Unnamed: 0,model_name,ndcg_at_1,ndcg_at_3,ndcg_at_5,hit_rate_at_1,hit_rate_at_3,hit_rate_at_5
18,LGBMRanker,0.638055,0.857388,0.887532,0.638055,0.712382,0.715757
17,LGBMRanker,0.638228,0.857165,0.887528,0.638228,0.712988,0.715757
2,LGBMRanker,0.638141,0.857083,0.887685,0.638141,0.712296,0.71567
12,LGBMRanker,0.637709,0.856951,0.887459,0.637709,0.712555,0.715757
7,LGBMRanker,0.637709,0.856896,0.887479,0.637709,0.712469,0.715757
6,LGBMRanker,0.637622,0.856866,0.887557,0.637622,0.712209,0.715757
10,LGBMRanker,0.637882,0.856764,0.887377,0.637882,0.712555,0.715757
1,LGBMRanker,0.637536,0.856644,0.887357,0.637536,0.712296,0.715757
3,LGBMRanker,0.635892,0.856134,0.886727,0.635892,0.712123,0.71567
14,LGBMRanker,0.635373,0.855933,0.886806,0.635373,0.712382,0.715757


#### Moyenne des performances Ranker

In [10]:
ranker_summary = (
    df_ranker
    .groupby("model_name")[
        ["ndcg_at_1", "ndcg_at_3", "ndcg_at_5",
         "hit_rate_at_1", "hit_rate_at_3", "hit_rate_at_5"]
    ]
    .mean()
    .sort_values("ndcg_at_3", ascending=False)
)

ranker_summary


Unnamed: 0_level_0,ndcg_at_1,ndcg_at_3,ndcg_at_5,hit_rate_at_1,hit_rate_at_3,hit_rate_at_5
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LGBMRanker,0.636467,0.855982,0.886821,0.636467,0.7123,0.715718


#### Meilleur run par Ranker

In [11]:
best_ranker_runs = (
    df_ranker
    .sort_values("ndcg_at_3", ascending=False)
    .groupby("model_name")
    .head(1)
)

best_ranker_runs[
    ["model_name", "ndcg_at_1", "ndcg_at_3", "ndcg_at_5",
     "hit_rate_at_1", "hit_rate_at_3", "hit_rate_at_5"]
]


Unnamed: 0,model_name,ndcg_at_1,ndcg_at_3,ndcg_at_5,hit_rate_at_1,hit_rate_at_3,hit_rate_at_5
18,LGBMRanker,0.638055,0.857388,0.887532,0.638055,0.712382,0.715757


Résultats clés:
- NDCG@3 ≈ 0.857
- HitRate@3 ≈ 71%
- Stabilité très forte entre runs (écarts minimes)

Interprétation:
- ~71% du temps, un item pertinent est présent dans le top 3
- ~64% du temps, il est en top 1
- Très bonne capacité de tri fin en tête de liste

Moyenne vs meilleur run:
- Moyenne NDCG@3 = 0.8560
- Meilleur run NDCG@3 = 0.8574
=> Hyperparamètres bien maîtrisés, peu de sur-optimisation

#### Inspection des hyperparamètres du meilleur run

In [12]:
best_run = best_ranker_runs.iloc[0]

best_params = best_run[[c for c in best_run.index if c not in [
    "run_id","model_name","model_type","status","start_time"
] and not pd.isna(best_run[c])]]

best_params


ndcg_at_1                                                      0.638055
best_iteration                                                     10.0
hit_rate_at_5                                                  0.715757
ndcg_at_3                                                      0.857388
hit_rate_at_3                                                  0.712382
hit_rate_at_1                                                  0.638055
ndcg_at_5                                                      0.887532
learning_rate                                                       0.1
num_leaves                                                           31
min_child_samples                                                    50
n_estimators                                                        500
subsample                                                           1.0
colsample_bytree                                                    1.0
mlflow.source.type                                              

### Conclusions

Classification vs Ranking — que choisir ?
Si l'objectif est :

- Décision binaire (oui/non)
=> CatBoostClassifier

- Top-K recommandations / priorisation / tri
=> LGBMRanker


_______


- Le meilleur classifier selon l'AUC est : **CatBoostClassifier (run 21, AUC = 0.7283)**
- Le meilleur ranker selon NDCG@3 est : **LGBMRanker (run 18, NDCG@3 = 0.8574)**
- Les Rankers surpassent les classifiers en top-k → **à privilégier en prod**
- Les classifiers restent utiles comme **baseline / fallback**
