## 1. Setup & Configuration


In [1]:
# 1.1 Imports
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import (
    cross_val_score,
    GridSearchCV,
    StratifiedKFold,
    cross_validate,
    train_test_split,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    make_scorer,
)
from sklearn.base import clone
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")
from pathlib import Path
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

print("‚úÖ Imports r√©ussis")

# 1.1.1 Global parameters
RANDOM_STATE = 42  # seed shared by the hold-out split and every CV splitter
N_FOLDS = 5  # number of stratified cross-validation folds
COST_FN = 10  # a false negative is weighted 10x a false positive

# 1.2 MLflow configuration
EXPERIMENT_NAME = "home_credit_default_risk"

# Look the experiment up first so the log line can tell "created" from "found",
# then make it the active experiment.
# Any MLflow failure is reported and re-raised as-is: the previous fallback simply
# repeated create_experiment/set_experiment, i.e. the calls that had just failed,
# so a second failure replaced the original traceback with a less useful one.
try:
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    if experiment is None:
        # The experiment does not exist yet: create it
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        print(f"‚úÖ Exp√©rience cr√©√©e: {EXPERIMENT_NAME} (ID: {experiment_id})")
    else:
        # The experiment already exists: reuse it
        experiment_id = experiment.experiment_id
        print(f"‚úÖ Exp√©rience trouv√©e: {EXPERIMENT_NAME} (ID: {experiment_id})")

    # Every later mlflow.start_run() in this notebook logs into this experiment
    mlflow.set_experiment(EXPERIMENT_NAME)

except Exception as e:
    print(f"‚ö†Ô∏è Erreur MLFlow: {e}")
    raise

print(f"‚úÖ Tracking URI: {mlflow.get_tracking_uri()}")
print(f"‚úÖ Exp√©rience active: {mlflow.get_experiment_by_name(EXPERIMENT_NAME).name}")

# 1.3 Load the preprocessed training table
DATA_PATH = Path("../data")
train_full = pd.read_csv(DATA_PATH.joinpath("train_preprocessed.csv"))

print(f"‚úÖ Donn√©es charg√©es: {train_full.shape}")

# Target vector first, then the feature matrix (everything except TARGET and SK_ID_CURR)
y_full = train_full["TARGET"]
X_full = train_full.drop(columns=["TARGET", "SK_ID_CURR"])

# Stratified 80/20 hold-out: the 20% part is the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_full,
)

# Shapes and class proportions of both parts
print(f"‚úÖ Train: {X_train.shape} - {y_train.value_counts(normalize=True).to_dict()}")
print(f"‚úÖ Val:   {X_val.shape} - {y_val.value_counts(normalize=True).to_dict()}")
print("üîí Val isol√© jusqu'√† l'√©valuation finale")


# Training shape again, plus absolute class counts
print(f"‚úÖ X_train shape: {X_train.shape}")
print(f"‚úÖ y_train distribution: {y_train.value_counts().to_dict()}")


print("\n" + 60 * "=")
print("üéØ CONFIGURATION TERMIN√âE")
print(60 * "=")

‚úÖ Imports r√©ussis


2026/02/13 16:12:36 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/02/13 16:12:36 INFO mlflow.store.db.utils: Updating database tables
2026/02/13 16:12:36 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/13 16:12:36 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/02/13 16:12:36 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/13 16:12:36 INFO alembic.runtime.migration: Will assume non-transactional DDL.


‚úÖ Exp√©rience trouv√©e: home_credit_default_risk (ID: 1)
‚úÖ Tracking URI: sqlite:///mlflow.db
‚úÖ Exp√©rience active: home_credit_default_risk
‚úÖ Donn√©es charg√©es: (307511, 421)
‚úÖ Train: (246008, 419) - {0: 0.9192709180189262, 1: 0.08072908198107379}
‚úÖ Val:   (61503, 419) - {0: 0.9192722306228964, 1: 0.08072776937710356}
üîí Val isol√© jusqu'√† l'√©valuation finale
‚úÖ X_train shape: (246008, 419)
‚úÖ y_train distribution: {0: 226148, 1: 19860}

üéØ CONFIGURATION TERMIN√âE


## 2. Analyse des montants


In [2]:
# Aggregated (non-preprocessed) training table, used only for descriptive statistics
train_agregatted = pd.read_csv(DATA_PATH.joinpath("train_aggregated.csv"))

# Compute every statistic up front, then print the summary in one go
amt_credit_mean = train_agregatted["AMT_CREDIT"].mean()
amt_credit_median = train_agregatted["AMT_CREDIT"].median()
total_credits = train_agregatted["AMT_CREDIT"].sum()  # sum of all credit amounts
n_clients = train_agregatted.shape[0]  # one row per client
default_rate = train_agregatted["TARGET"].mean()  # share of rows with TARGET == 1

print("üìä MONTANTS DE CR√âDIT:")
print(f"   Moyenne : {amt_credit_mean:,.0f}")
print(f"   M√©diane : {amt_credit_median:,.0f}")
print(f"   Min     : {train_agregatted['AMT_CREDIT'].min():,.0f}")
print(f"   Max     : {train_agregatted['AMT_CREDIT'].max():,.0f}")

print(f"\nüí∞ Total des cr√©dits : {total_credits:,.0f}")
print(f"üë• Nombre de clients : {n_clients:,}")
print(f"üìâ Taux de d√©faut : {default_rate:.2%}")

üìä MONTANTS DE CR√âDIT:
   Moyenne : 599,026
   M√©diane : 513,531
   Min     : 45,000
   Max     : 4,050,000

üí∞ Total des cr√©dits : 184,207,084,196
üë• Nombre de clients : 307,511
üìâ Taux de d√©faut : 8.07%


## 3. Score Métier Personnalisé

Le déséquilibre du coût métier :

- Faux Négatif (FN) : Mauvais client prédit bon → Crédit accordé → PERTE
- Faux Positif (FP) : Bon client prédit mauvais → Crédit refusé → Manque à gagner

Hypothèse : Coût FN = 10 × Coût FP


In [3]:
def business_cost_score(y_true, y_pred, cost_fn=10, cost_fp=1):
    """
    Compute the average business cost per client of a set of binary predictions.

    The LOWER the score, the better the model.

    False negatives and false positives are counted directly instead of
    unpacking a confusion matrix, so the function also works when only one
    class is present (e.g. a fold where nothing is predicted positive), a case
    in which a 2x2 unpacking of the confusion matrix fails.

    Parameters:
    -----------
    y_true : array-like
        True labels (0 or 1)
    y_pred : array-like
        Predicted labels (0 or 1), same length as y_true
    cost_fn : float
        Cost of one false negative
    cost_fp : float
        Cost of one false positive

    Returns:
    --------
    float : (n_fn * cost_fn + n_fp * cost_fp) / n_samples

    Raises:
    -------
    ValueError
        If the inputs are empty, have different lengths, or contain values
        other than 0 and 1 (for instance probabilities passed by mistake).
    """
    # np.asarray drops any pandas index, so the comparison below is positional
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size == 0:
        raise ValueError("y_true is empty: the business cost is undefined.")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {y_true.size} and {y_pred.size})."
        )
    if not (np.isin(y_true, (0, 1)).all() and np.isin(y_pred, (0, 1)).all()):
        raise ValueError("y_true and y_pred must only contain the labels 0 and 1.")

    # FN: actual defaulter predicted as good; FP: good client predicted as defaulter
    n_fn = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))
    n_fp = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))

    total_cost = (n_fn * cost_fn) + (n_fp * cost_fp)

    # Normalise by the number of samples so scores are comparable across set sizes
    return total_cost / y_true.size


def business_score_from_proba(y_true, y_proba, threshold=0.5, cost_fn=10, cost_fp=1):
    """
    Compute the business cost from predicted probabilities and a decision threshold.

    Parameters:
    -----------
    y_true : array-like
        True labels (0 or 1)
    y_proba : array-like
        Predicted probability of the positive class; any array-like is accepted
        (list, NumPy array, pandas Series)
    threshold : float
        A sample is predicted positive when its probability is >= threshold
    cost_fn : float
        Cost of one false negative
    cost_fp : float
        Cost of one false positive

    Returns:
    --------
    float : value returned by business_cost_score for the thresholded predictions
    """
    # Coerce first: a plain Python list does not support `>=` against a float
    y_pred = (np.asarray(y_proba) >= threshold).astype(int)
    return business_cost_score(y_true, y_pred, cost_fn, cost_fp)


# Sanity check of the business score on a 4-sample toy example
y_true_test = np.array([0, 0, 1, 1])
y_pred_perfect = np.array([0, 0, 1, 1])  # scenario 1: every label is right
y_pred_fn = np.array([0, 0, 0, 1])  # scenario 2: one default is missed
y_pred_fp = np.array([1, 0, 1, 1])  # scenario 3: one good client is rejected

# Score the three scenarios first, then print the summary
cost_perfect = business_cost_score(
    y_true_test, y_pred_perfect, cost_fn=COST_FN, cost_fp=1
)
cost_fn = business_cost_score(y_true_test, y_pred_fn, cost_fn=COST_FN, cost_fp=1)
cost_fp = business_cost_score(y_true_test, y_pred_fp, cost_fn=COST_FN, cost_fp=1)

print("\nüìä Test du Score M√©tier:")
print(60 * "=")
print(f"Mod√®le parfait : Co√ªt = {cost_perfect:.4f}")
print(f"1 Faux N√©gatif : Co√ªt = {cost_fn:.4f} (= {COST_FN}/4 = {COST_FN / 4:.2f})")
print(f"1 Faux Positif : Co√ªt = {cost_fp:.4f} (= 1/4 = {1 / 4:.2f})")

# Ratio between the two single-error scenarios
print(f"\nüí° Un FN co√ªte {cost_fn / cost_fp:.0f}x plus qu'un FP")


üìä Test du Score M√©tier:
Mod√®le parfait : Co√ªt = 0.0000
1 Faux N√©gatif : Co√ªt = 2.5000 (= 10/4 = 2.50)
1 Faux Positif : Co√ªt = 0.2500 (= 1/4 = 0.25)

üí° Un FN co√ªte 10x plus qu'un FP


### Génère un rapport business complet à partir du score métier CV.

| Avantage         | Détail                                           |
| ---------------- | ------------------------------------------------ |
| **Réutilisable** | Un seul appel pour générer tout le rapport       |
| **Flexible**     | Paramètres personnalisables (LGD, marge, volume) |
| **Automatique**  | Charge les données et calcule tout seul          |
| **Comparable**   | Fonction bonus pour comparer plusieurs modèles   |
| **Intégrable**   | Retourne un dict de KPIs pour MLFlow             |
| **Sauvegardé**   | Génère un fichier .txt horodaté                  |

---


In [4]:
# ============================================
# G√âN√âRATEUR DE RAPPORT BUSINESS AUTOMATIQUE
# ============================================

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime


def generer_rapport_business(
    cv_business_cost,
    amt_credit_mean=None,
    taux_defaut=None,
    # Optional business assumptions
    lgd=0.75,
    profit_margin=0.05,
    volume_annuel=100_000,
    projection_annees=3,
    # Paths
    train_path="../data/train_aggregated.csv",
    output_dir="../reports",
    # Options
    sauvegarder=True,
    afficher=True,
):
    """
    Build a business-impact report from the cross-validated business score.

    Parameters:
    -----------
    cv_business_cost : float
        Business score from cross-validation (e.g. 0.4964)
    amt_credit_mean : float, optional
        Mean credit amount. If None, computed from train_path
    taux_defaut : float, optional
        Default rate. If None, computed from train_path
    lgd : float, default=0.75
        Loss Given Default: fraction of the credit amount counted as lost on a default
    profit_margin : float, default=0.05
        Margin earned on a good client, as a fraction of the credit amount
    volume_annuel : int, default=100000
        Projected number of clients per year
    projection_annees : int, default=3
        Number of years covered by the projection
    train_path : str
        CSV read when amt_credit_mean or taux_defaut is missing; must contain
        AMT_CREDIT (and TARGET when taux_defaut is None)
    output_dir : str
        Directory the report file is written to (created if missing)
    sauvegarder : bool, default=True
        Save the report as a timestamped .txt file
    afficher : bool, default=True
        Print the report

    Returns:
    --------
    dict : every KPI computed for the report. `ratio_fn_fp` is `inf` when the
        false-positive cost is zero and `amelioration_pct` is `nan` when the
        naive baseline cost is zero (both were divisions by zero before).

    Raises:
    -------
    FileNotFoundError
        If the data must be loaded and train_path does not exist.
    ValueError
        If taux_defaut must be computed and the file has no TARGET column.
    """

    # ============================================
    # 1. LOAD THE DATA IF NEEDED
    # ============================================

    if amt_credit_mean is None or taux_defaut is None:
        print(f"üìÇ Chargement de {train_path}...")
        try:
            train = pd.read_csv(train_path)
        except FileNotFoundError as exc:
            # Chain the original error so the underlying OS message stays visible
            raise FileNotFoundError(
                f"‚ùå Fichier non trouv√© : {train_path}\n"
                "Veuillez fournir amt_credit_mean et taux_defaut manuellement."
            ) from exc

        if amt_credit_mean is None:
            amt_credit_mean = train["AMT_CREDIT"].mean()

        if taux_defaut is None:
            if "TARGET" not in train.columns:
                raise ValueError("‚ùå Colonne TARGET absente. V√©rifiez le fichier.")
            taux_defaut = train["TARGET"].mean()

        n_clients_historique = len(train)
        print(f"‚úÖ Donn√©es charg√©es : {n_clients_historique:,} clients\n")
    else:
        # Both inputs were supplied: no historical sample size to report
        n_clients_historique = None

    # ============================================
    # 2. UNIT COSTS
    # ============================================

    cost_fn_euro = amt_credit_mean * lgd
    cost_fp_euro = amt_credit_mean * profit_margin
    # A zero FP cost (profit_margin == 0) makes the ratio unbounded, not an error
    ratio = cost_fn_euro / cost_fp_euro if cost_fp_euro else float("inf")

    # ============================================
    # 3. CONVERT THE SCORE TO EUROS
    # ============================================

    # The score is expressed in units of one false positive (cost_fp = 1),
    # so multiplying by the FP cost in euros gives a cost in euros.
    cout_modele_par_client = cv_business_cost * cost_fp_euro

    # ============================================
    # 4. BASELINE (NAIVE MODEL: ACCEPT EVERYONE)
    # ============================================

    cout_naif_par_client = taux_defaut * cost_fn_euro

    # ============================================
    # 5. SAVINGS
    # ============================================

    economie_par_client = cout_naif_par_client - cout_modele_par_client
    # With a zero baseline cost the relative improvement is undefined
    if cout_naif_par_client:
        amelioration_pct = (economie_par_client / cout_naif_par_client) * 100
    else:
        amelioration_pct = float("nan")

    # Yearly impact
    economie_annuelle = economie_par_client * volume_annuel

    # Multi-year projection
    economie_projection = economie_annuelle * projection_annees

    # ============================================
    # 6. ROUGH FN/FP SPLIT ESTIMATE
    # ============================================

    # Per 1000 clients
    n_bons_1000 = int(1000 * (1 - taux_defaut))
    n_mauvais_1000 = int(1000 * taux_defaut)

    # Back out error counts from the score, using the scoring convention
    # cv_business_cost * 1000 = (fn * 10) + (fp * 1)
    total_cost_1000 = cv_business_cost * 1000

    # NOTE(review): heuristic only. The FN weight 10 is hard-coded here and the
    # `ratio / 15` term equals 1 for the default lgd/profit_margin, which amounts
    # to assuming as many FP as FN. Confirm before relying on these two figures.
    fn_estimate = int(total_cost_1000 / (10 + ratio / 15))
    fp_estimate = int((total_cost_1000 - fn_estimate * 10))

    # ============================================
    # 7. BUILD THE REPORT
    # ============================================

    # One clock reading so the header and the file name carry the same instant
    now = datetime.now()
    timestamp = now.strftime("%Y-%m-%d %H:%M:%S")

    rapport = f"""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë          RAPPORT D'IMPACT BUSINESS - SCORING CR√âDIT                ‚ïë
‚ïë          G√©n√©r√© le {timestamp}                       ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ                          1. CONTEXTE                               ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  Donn√©es de Base:
    ‚Ä¢ Montant moyen des cr√©dits : {amt_credit_mean:,.0f} ‚Ç¨
    ‚Ä¢ Taux de d√©faut historique : {taux_defaut:.2%}
"""

    if n_clients_historique:
        rapport += f"    ‚Ä¢ √âchantillon analys√© : {n_clients_historique:,} clients\n"

    rapport += f"""
  Hypoth√®ses M√©tier:
    ‚Ä¢ Loss Given Default (LGD) : {lgd:.0%}
      ‚Üí Perte moyenne en cas de d√©faut
    ‚Ä¢ Marge sur bon client : {profit_margin:.1%}
      ‚Üí Profit moyen par client remboursant
    ‚Ä¢ Volume projet√© : {volume_annuel:,} clients/an

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ                   2. CO√õTS DES ERREURS                             ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  üí• Faux N√©gatif (FN) - Accepter un mauvais client:
    ‚Ä¢ Co√ªt unitaire : {cost_fn_euro:,.0f} ‚Ç¨
    ‚Ä¢ Calcul : {amt_credit_mean:,.0f} ‚Ç¨ √ó {lgd:.0%} = {cost_fn_euro:,.0f} ‚Ç¨
    ‚Ä¢ Impact : Perte du capital pr√™t√©

  üòï Faux Positif (FP) - Refuser un bon client:
    ‚Ä¢ Co√ªt unitaire : {cost_fp_euro:,.0f} ‚Ç¨
    ‚Ä¢ Calcul : {amt_credit_mean:,.0f} ‚Ç¨ √ó {profit_margin:.1%} = {cost_fp_euro:,.0f} ‚Ç¨
    ‚Ä¢ Impact : Manque √† gagner (marge perdue)

  ‚öñÔ∏è  Ratio d'Importance:
    ‚Ä¢ Un FN co√ªte {ratio:.1f}√ó plus qu'un FP
    ‚Ä¢ Priorit√© : D√©tecter les mauvais clients

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ                  3. PERFORMANCE DU MOD√àLE                          ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  üìä Score M√©tier (Cross-Validation):
    ‚Ä¢ Score normalis√© : {cv_business_cost:.4f}
    ‚Ä¢ Co√ªt r√©el par client : {cout_modele_par_client:,.0f} ‚Ç¨

  üìâ Baseline (Accepter tous les clients):
    ‚Ä¢ Co√ªt par client : {cout_naif_par_client:,.0f} ‚Ç¨
    ‚Ä¢ Calcul : {taux_defaut:.2%} √ó {cost_fn_euro:,.0f} ‚Ç¨ = {cout_naif_par_client:,.0f} ‚Ç¨

  ‚úÖ Am√©lioration:
    ‚Ä¢ R√©duction du co√ªt : {amelioration_pct:.1f}%
    ‚Ä¢ √âconomie par client : {economie_par_client:,.0f} ‚Ç¨

  üîç Estimation des Erreurs (sur 1000 clients):
    ‚Ä¢ {n_mauvais_1000} clients √† risque ‚Üí ~{fn_estimate} accept√©s (FN)
    ‚Ä¢ {n_bons_1000} bons clients ‚Üí ~{fp_estimate} refus√©s (FP)

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ                    4. IMPACT FINANCIER                             ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  üí∞ Par Client:
    ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
    ‚îÇ                     ‚îÇ  Sans Mod√®le     ‚îÇ  Avec Mod√®le     ‚îÇ
    ‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îº‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îº‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§
    ‚îÇ Co√ªt par client     ‚îÇ {cout_naif_par_client:>13,.0f} ‚Ç¨ ‚îÇ {cout_modele_par_client:>13,.0f} ‚Ç¨ ‚îÇ
    ‚îÇ √âconomie            ‚îÇ         -        ‚îÇ {economie_par_client:>13,.0f} ‚Ç¨ ‚îÇ
    ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  üìà Impact Annuel ({volume_annuel:,} clients):
    ‚Ä¢ Co√ªt sans mod√®le : {cout_naif_par_client * volume_annuel:,.0f} ‚Ç¨
      = {(cout_naif_par_client * volume_annuel) / 1_000_000:.2f} millions ‚Ç¨

    ‚Ä¢ Co√ªt avec mod√®le : {cout_modele_par_client * volume_annuel:,.0f} ‚Ç¨
      = {(cout_modele_par_client * volume_annuel) / 1_000_000:.2f} millions ‚Ç¨

    ‚Ä¢ üí∞ √âCONOMIE ANNUELLE : {economie_annuelle:,.0f} ‚Ç¨
                           = {economie_annuelle / 1_000_000:.2f} millions ‚Ç¨

  üéØ Projection {projection_annees} ans:
"""

    # One cumulative line per projected year
    for annee in range(1, projection_annees + 1):
        economie_cumul = economie_annuelle * annee
        rapport += f"    ‚Ä¢ Ann√©e {annee} (cumul√©) : {economie_cumul / 1_000_000:.2f} millions ‚Ç¨\n"

    rapport += f"""
    ‚Ä¢ üèÜ TOTAL {projection_annees} ANS : {economie_projection:,.0f} ‚Ç¨
                     = {economie_projection / 1_000_000:.2f} millions ‚Ç¨
                     = {economie_projection / 1_000_000_000:.2f} milliards ‚Ç¨

‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ                      5. CONCLUSION                                 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

  Le mod√®le de machine learning permet de r√©duire le co√ªt par client
  de {amelioration_pct:.0f}%, en d√©tectant mieux les clients √† risque tout en
  maintenant un taux d'acceptation raisonnable des bons clients.

  ROI: Pour chaque {volume_annuel:,} clients trait√©s par an, le mod√®le
       √©conomise {economie_annuelle / 1_000_000:.2f} millions d'euros.

  Recommandation: ‚úÖ D√©ployer le mod√®le en production

‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
"""

    # ============================================
    # 8. DISPLAY
    # ============================================

    if afficher:
        print(rapport)

    # ============================================
    # 9. SAVE
    # ============================================

    if sauvegarder:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        filename = f"rapport_business_{now.strftime('%Y%m%d_%H%M%S')}.txt"
        filepath = output_path / filename

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(rapport)

        print(f"\n‚úÖ Rapport sauvegard√© : {filepath}")

    # ============================================
    # 10. RETURN THE KPIs
    # ============================================

    kpis = {
        # Inputs
        "cv_business_cost": cv_business_cost,
        "amt_credit_mean": amt_credit_mean,
        "taux_defaut": taux_defaut,
        "lgd": lgd,
        "profit_margin": profit_margin,
        "volume_annuel": volume_annuel,
        # Unit costs
        "cost_fn_euro": cost_fn_euro,
        "cost_fp_euro": cost_fp_euro,
        "ratio_fn_fp": ratio,
        # Performance
        "cout_modele_par_client": cout_modele_par_client,
        "cout_naif_par_client": cout_naif_par_client,
        "economie_par_client": economie_par_client,
        "amelioration_pct": amelioration_pct,
        # Impact
        "economie_annuelle": economie_annuelle,
        "economie_projection": economie_projection,
        # Error estimates
        "fn_estimate_per_1000": fn_estimate,
        "fp_estimate_per_1000": fp_estimate,
    }

    return kpis


## Fonction d'entraînement et de log des modèles, pour sélectionner le modèle à optimiser par grid search (recherche d'hyperparamètres)


In [5]:
def train_and_log_model(model, model_name, X_train, y_train, params=None):
    """Cross-validate a classifier, refit it on the full training set and log the run to MLflow.

    Everything is logged inside one MLflow run named `model_name`: parameters,
    CV AUC and CV business cost (mean and std), the gap between train AUC and
    CV AUC, the train confusion-matrix counts at threshold 0.5, a confusion
    matrix figure and the fitted model.

    Args:
        model: Unfitted scikit-learn compatible classifier exposing `fit` and
            `predict_proba`. It is fitted in place on (X_train, y_train).
        model_name (str): MLflow run name, also logged as the `model_type` param.
        X_train (pandas.DataFrame): Training features.
        y_train (pandas.Series): Binary training target (0/1).
        params (dict, optional): Extra parameters to log. Defaults to None.

    Returns:
        float: Mean business cost over the CV folds, as a positive number
        (lower is better).
    """
    with mlflow.start_run(run_name=model_name):
        # --- 1. Parameters & metadata ---
        if params:
            mlflow.log_params(params)
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("train_shape", str(X_train.shape))
        mlflow.log_param(
            "target_dist", str(y_train.value_counts(normalize=True).to_dict())
        )

        # --- 2. Cross-validation (technical + business metric) ---
        scoring = {
            "auc": "roc_auc",
            "business": make_scorer(
                business_cost_score, cost_fn=COST_FN, cost_fp=1, greater_is_better=False
            ),
        }

        # Train-fold scores are not requested: nothing below reads them and
        # computing them only adds scoring work to every fold.
        cv_results = cross_validate(
            model,
            X_train,
            y_train,
            cv=StratifiedKFold(
                n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE
            ),
            scoring=scoring,
            n_jobs=-1,
        )

        cv_auc_mean = cv_results["test_auc"].mean()
        # greater_is_better=False makes sklearn negate the score, so the fold
        # values are negative costs; flip the sign back to report a positive cost.
        cv_business_cost_mean = -cv_results["test_business"].mean()

        # --- 3. Final fit & predictions on the training set ---
        model.fit(X_train, y_train)
        y_proba = model.predict_proba(X_train)[:, 1]

        # Default 0.5 threshold for the baseline
        y_pred_05 = (y_proba >= 0.5).astype(int)

        # --- 4. Train metrics ---
        train_auc = roc_auc_score(y_train, y_proba)

        # labels=[0, 1] guarantees a 2x2 matrix (and a valid 4-way unpacking)
        # even if the model predicts a single class at this threshold.
        cm = confusion_matrix(y_train, y_pred_05, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        # --- 5. MLflow metrics ---
        mlflow.log_metric("cv_auc_mean", cv_auc_mean)
        mlflow.log_metric("cv_auc_std", cv_results["test_auc"].std())
        mlflow.log_metric("cv_business_cost_mean", cv_business_cost_mean)
        # Same metric name as the tuned run, so both kinds of run can be compared
        mlflow.log_metric("cv_business_cost_std", cv_results["test_business"].std())

        # Overfitting gap: AUC on the data the model was fitted on vs. CV AUC
        mlflow.log_metric("auc_gap", train_auc - cv_auc_mean)

        mlflow.log_metric("train_tn", int(tn))
        mlflow.log_metric("train_fp", int(fp))
        mlflow.log_metric("train_fn", int(fn))
        mlflow.log_metric("train_tp", int(tp))

        # --- 6. Artifacts (figure & model) ---
        fig, ax = plt.subplots(figsize=(6, 5))
        try:
            # Draw on the explicit axes instead of relying on the current-axes state
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
            mlflow.log_figure(fig, "confusion_matrix_default.png")
        finally:
            # Close this figure even if logging fails, so figures do not pile up
            plt.close(fig)

        mlflow.sklearn.log_model(model, "model")

        print(
            f"‚úÖ {model_name} | CV AUC: {cv_auc_mean:.4f} | Gap: {train_auc - cv_auc_mean:.4f}"
        )

        return cv_business_cost_mean

In [7]:
# Baseline models to compare.
# "params" is what gets logged to MLflow for each run. It lists every
# hyperparameter that differs between entries (including class_weight and
# max_depth); otherwise the balanced and non-balanced variants of a model
# would log identical parameters and be indistinguishable in the tracking UI.
models_to_test = [
    {
        "name": "Logistic_Regression_Baseline_balanced",
        "model": LogisticRegression(
            max_iter=1000, random_state=RANDOM_STATE, class_weight="balanced"
        ),
        "params": {"max_iter": 1000, "class_weight": "balanced"},
    },
    {
        "name": "Logistic_Regression_Baseline_non_balanced",
        "model": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        "params": {"max_iter": 1000, "class_weight": None},
    },
    {
        "name": "RandomForest_Baseline",
        "model": RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=RANDOM_STATE,
            class_weight="balanced",
            n_jobs=-1,
        ),
        "params": {"n_estimators": 100, "max_depth": 10, "class_weight": "balanced"},
    },
    {
        "name": "LightGBM_Baseline_balanced",
        "model": lgb.LGBMClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=7,
            random_state=RANDOM_STATE,
            class_weight="balanced",
        ),
        "params": {
            "n_estimators": 100,
            "learning_rate": 0.1,
            "max_depth": 7,
            "class_weight": "balanced",
        },
    },
    {
        "name": "LightGBM_Baseline_non_balanced",
        "model": lgb.LGBMClassifier(
            n_estimators=100, learning_rate=0.1, max_depth=7, random_state=RANDOM_STATE
        ),
        "params": {
            "n_estimators": 100,
            "learning_rate": 0.1,
            "max_depth": 7,
            "class_weight": None,
        },
    },
]

# Train and log every baseline, collecting its CV business cost
results_list = []

print("\n" + "=" * 60)
print("üöÄ D√âMARRAGE DES ENTRA√éNEMENTS BASELINES")
print("=" * 60)

for m in models_to_test:
    score = train_and_log_model(
        m["model"], m["name"], X_train, y_train, params=m["params"]
    )
    results_list.append({"Model": m["name"], "CV Business Cost": score})

# Final comparison: lowest business cost first
print("\n" + "=" * 60)
print("üìä R√âSULTATS FINAUX")
print("=" * 60)
baseline_results = pd.DataFrame(results_list).sort_values("CV Business Cost")
print(baseline_results.to_string(index=False))

best_baseline = baseline_results.iloc[0]["Model"]
print(f"\nüí° Le meilleur mod√®le est : {best_baseline}")


üöÄ D√âMARRAGE DES ENTRA√éNEMENTS BASELINES




‚úÖ Logistic_Regression_Baseline_balanced | CV AUC: 0.7705 | Gap: 0.0057




‚úÖ Logistic_Regression_Baseline_non_balanced | CV AUC: 0.7700 | Gap: 0.0048




‚úÖ RandomForest_Baseline | CV AUC: 0.7545 | Gap: 0.0788
[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46867
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 401
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.448147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46893
[LightGBM] [Info] Number of data points in the tra



‚úÖ LightGBM_Baseline_balanced | CV AUC: 0.7785 | Gap: 0.0442
[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.467164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46868
[LightGBM] [Info] Number of data points in the train set: 196806, number of used features: 402
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432480
[LightGBM] [Info] Start training from score -2.432480
[LightGBM] [Info] Number of positive: 15888, number of negative: 180919
[LightGBM] [Info] Number of positive: 15888, number of negative: 180918
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.309859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[Light



‚úÖ LightGBM_Baseline_non_balanced | CV AUC: 0.7782 | Gap: 0.0460

üìä R√âSULTATS FINAUX
                                    Model  CV Business Cost
               LightGBM_Baseline_balanced          0.499955
    Logistic_Regression_Baseline_balanced          0.511394
                    RandomForest_Baseline          0.535344
           LightGBM_Baseline_non_balanced          0.784300
Logistic_Regression_Baseline_non_balanced          0.787718

üí° Le meilleur mod√®le est : LightGBM_Baseline_balanced


In [8]:
"""
## 4. Hyperparameter Tuning avec GridSearchCV (Harmonis√©)

On optimise le meilleur mod√®le baseline avec GridSearchCV.
On utilise le multi-scoring pour logger l'AUC et le Score M√©tier en m√™me temps.
"""

# NOTE(review): the name is hard-coded rather than taken from the baseline
# comparison; the estimator built below is an LGBMClassifier in any case, so
# confirm the two stay in sync if the baseline ranking changes.
best_baseline_name = "LightGBM_Baseline_balanced"

print(f"\nüîç Hyperparameter Tuning sur {best_baseline_name}")
print("=" * 60)

# 1. Scorers (AUC + business cost); the business scorer is negated by sklearn
#    because greater_is_better=False
scoring = {
    "AUC": "roc_auc",
    "Business_Score": make_scorer(
        business_cost_score, greater_is_better=False, cost_fn=COST_FN, cost_fp=1
    ),
}

# 2. Parameter grid: 2 x 3 x 3 x 3 = 54 candidates
#    (force_row_wise and verbosity are single-valued, i.e. fixed settings)
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [31, 50, 70],  # controls tree complexity
    "min_child_samples": [20, 50, 100],  # regularisation for the class imbalance
    "force_row_wise": [True],
    "verbosity": [-1],
}

# NOTE(review): the earlier remarks about max_depth / num_leaves combinations
# wasting compute do not apply to this grid: max_depth is neither searched nor
# set on the estimator below, so tree complexity is driven by num_leaves
# (together with min_child_samples) only.


# NOTE(review): both the estimator and GridSearchCV request n_jobs=-1; check
# whether this oversubscribes the CPU on the machine running the search.
lgbm_grid = lgb.LGBMClassifier(
    random_state=RANDOM_STATE, class_weight="balanced", n_jobs=-1
)

# 3. GridSearchCV
grid_search = GridSearchCV(
    lgbm_grid,  # type:ignore
    param_grid,
    cv=StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE),
    scoring=scoring,
    refit="Business_Score",  # Crucial: the best model is selected on the business cost
    return_train_score=True,  # Requested to compute overfitting gaps
    n_jobs=-1,
    verbose=2,
)

print("‚è≥ GridSearch en cours...")
grid_search.fit(X_train, y_train)

# --- 4. MLflow logging, using the same metric names as the baseline runs ---

with mlflow.start_run(run_name=f"{best_baseline_name}_Tuned_Complete"):
    # Index of the best candidate (according to the refit metric)
    idx = grid_search.best_index_
    best_model = grid_search.best_estimator_

    # A. Parameters
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_param("model_type", f"{best_baseline_name}_Tuned")
    mlflow.log_param("n_features", X_train.shape[1])

    # B. Cross-validation metrics (means and standard deviations)
    cv_auc_mean = grid_search.cv_results_["mean_test_AUC"][idx]
    # Sign flipped back: the scorer stores negated costs
    cv_cost_mean = -grid_search.cv_results_["mean_test_Business_Score"][idx]

    mlflow.log_metric("cv_auc_mean", cv_auc_mean)
    mlflow.log_metric("cv_auc_std", grid_search.cv_results_["std_test_AUC"][idx])
    mlflow.log_metric("cv_business_cost_mean", cv_cost_mean)
    # A standard deviation is unaffected by the sign flip, so it is logged as-is
    mlflow.log_metric(
        "cv_business_cost_std", grid_search.cv_results_["std_test_Business_Score"][idx]
    )

    # C. Metrics on the training set (overfitting detection)
    y_proba = best_model.predict_proba(X_train)[:, 1]
    y_pred_05 = (y_proba >= 0.5).astype(int)  # Baseline threshold

    train_auc = roc_auc_score(y_train, y_proba)

    # D. Gaps & confusion matrix
    mlflow.log_metric("auc_gap", train_auc - cv_auc_mean)

    tn, fp, fn, tp = confusion_matrix(y_train, y_pred_05).ravel()
    mlflow.log_metric("train_fn", int(fn))
    mlflow.log_metric("train_fp", int(fp))

    # E. Artifacts
    fig, ax = plt.subplots(figsize=(6, 5))
    # No ax= is passed: the heatmap is drawn on the current axes, i.e. the ones just created
    sns.heatmap(confusion_matrix(y_train, y_pred_05), annot=True, fmt="d", cmap="Blues")
    ax.set_title(f"Confusion Matrix - {best_baseline_name} Tuned (Seuil 0.5)")
    mlflow.log_figure(fig, "confusion_matrix_tuned.png")
    plt.close()

    # F. Model
    mlflow.sklearn.log_model(best_model, "model")

    print(f"\n‚úÖ Tuning termin√© et loggu√© de fa√ßon exhaustive.")
    print(f"Meilleurs param√®tres: {grid_search.best_params_}")
    print(f"Meilleur CV AUC: {cv_auc_mean:.4f} | CV Business Cost: {cv_cost_mean:.4f}")


üîç Hyperparameter Tuning sur LightGBM_Baseline_balanced
‚è≥ GridSearch en cours...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=31, verbosity=-1; total time= 1.2min
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=31, verbosity=-1; total time= 1.2min
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=31, verbosity=-1; total time= 1.2min
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=31, verbosity=-1; total time= 1.2min
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=31, verbosity=-1; total time= 1.3min
[CV] END force_row_wise=True, learning_rate=0.01, min_child_samples=20, n_estimators=100, num_leaves=50, verbosity=-1; total time= 1.4min
[CV] END force_row_wise=




‚úÖ Tuning termin√© et loggu√© de fa√ßon exhaustive.
Meilleurs param√®tres: {'force_row_wise': True, 'learning_rate': 0.05, 'min_child_samples': 50, 'n_estimators': 200, 'num_leaves': 50, 'verbosity': -1}
Meilleur CV AUC: 0.7814 | CV Business Cost: 0.4951


In [9]:
def find_optimal_threshold(y_true, y_proba, cost_fn=10, cost_fp=1):
    """
    Scan a fixed grid of decision thresholds and keep the cheapest one.

    Each of the 100 evenly spaced thresholds in [0.02, 0.7] is applied to
    ``y_proba`` (score >= threshold -> class 1) and the resulting labels are
    scored with ``business_cost_score(y_true, y_pred, cost_fn, cost_fp)``.
    The grid starts low because, with false negatives weighted 10x, the best
    threshold usually sits well under 0.5.

    Args:
        y_true: Ground-truth labels, forwarded untouched to the cost function.
        y_proba: Array-like of scores; must support ``>=`` against a scalar
            and ``.astype(int)`` on the result.
        cost_fn: Weight forwarded to ``business_cost_score`` for false negatives.
        cost_fp: Weight forwarded to ``business_cost_score`` for false positives.

    Returns:
        ``(optimal_threshold, optimal_cost, fig)`` where ``fig`` is the
        matplotlib figure of cost versus threshold. The figure is not closed
        here.
    """
    grid = np.linspace(0.02, 0.7, 100)

    # One cost value per candidate threshold, in grid order.
    cost_curve = [
        business_cost_score(y_true, (y_proba >= cut).astype(int), cost_fn, cost_fp)
        for cut in grid
    ]

    # argmin returns the first minimum, i.e. the lowest threshold among ties.
    best_pos = np.argmin(cost_curve)
    best_cut = grid[best_pos]
    best_cost = cost_curve[best_pos]

    # Cost curve with the selected threshold and the 0.5 default marked.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(grid, cost_curve, color="navy", lw=2)
    ax.axvline(best_cut, color="red", linestyle="--", label=f"Optimal: {best_cut:.3f}")
    ax.axvline(0.5, color="orange", linestyle=":", label="D√©faut: 0.500")
    ax.set_title("Recherche du Seuil Minimisant le Co√ªt M√©tier")
    ax.set_xlabel("Seuil")
    ax.set_ylabel("Co√ªt M√©tier")
    ax.legend()

    return best_cut, best_cost, fig


def find_optimal_threshold_cv(
    model, X_train, y_train, cost_fn=10, cost_fp=1, n_folds=5
):
    """
    Pick the decision threshold from out-of-fold (OOF) probabilities.

    A clone of ``model`` is fitted on each stratified training fold and used
    to score only the rows it did not see. The threshold grid is then
    searched on those OOF scores, so the threshold is not tuned on
    predictions made for rows the scoring model was fitted on.

    Args:
        model: Estimator accepted by ``sklearn.base.clone`` that exposes
            ``fit`` and ``predict_proba``.
        X_train: Feature table; rows are selected with ``.iloc``.
        y_train: Labels; rows are selected with ``.iloc``.
        cost_fn: Weight forwarded to ``business_cost_score`` for false negatives.
        cost_fp: Weight forwarded to ``business_cost_score`` for false positives.
        n_folds: Number of stratified folds (shuffled, seeded with
            ``RANDOM_STATE``).

    Returns:
        ``(optimal_threshold, optimal_cost, fig, y_proba_oof)``. ``fig`` is
        already closed when returned; ``y_proba_oof`` holds one OOF score per
        training row.
    """
    from sklearn.base import clone

    splitter = StratifiedKFold(
        n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE
    )
    y_proba_oof = np.zeros(len(y_train))

    print(f"üîÑ Optimisation du seuil en {n_folds}-fold CV...")

    # Fill the OOF vector fold by fold; each row is scored exactly once.
    for fold_no, (fit_rows, holdout_rows) in enumerate(
        splitter.split(X_train, y_train), start=1
    ):
        fold_model = clone(model)
        fold_model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        holdout_scores = fold_model.predict_proba(X_train.iloc[holdout_rows])
        y_proba_oof[holdout_rows] = holdout_scores[:, 1]

        print(f"  Fold {fold_no}/{n_folds} termin√©")

    # Grid search on the OOF scores (100 thresholds in [0.02, 0.7]).
    grid = np.linspace(0.02, 0.7, 100)
    cost_curve = [
        business_cost_score(y_train, (y_proba_oof >= cut).astype(int), cost_fn, cost_fp)
        for cut in grid
    ]

    best_pos = np.argmin(cost_curve)
    best_cut = grid[best_pos]
    best_cost = cost_curve[best_pos]

    # Cost curve; closed before returning, the caller still receives the handle.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(grid, cost_curve, color="navy", lw=2)
    ax.axvline(
        best_cut, color="red", linestyle="--", label=f"Optimal (CV): {best_cut:.3f}"
    )
    ax.axvline(0.5, color="orange", linestyle=":", label="D√©faut: 0.500")
    ax.set_title("Optimisation du Seuil en Cross-Validation (Sans Fuite)")
    ax.set_xlabel("Seuil")
    ax.set_ylabel("Co√ªt M√©tier")
    ax.legend()
    plt.close(fig)

    return best_cut, best_cost, fig, y_proba_oof


# --- Execution ---

# Threshold selection runs on out-of-fold probabilities so the threshold is not
# tuned on predictions made for rows the scoring model was fitted on.
optimal_thresh, optimal_cost, fig_thresh, y_proba_oof = find_optimal_threshold_cv(
    best_model, X_train, y_train, COST_FN, 1, N_FOLDS
)

# Hard labels at the tuned threshold and at the 0.5 default, both from OOF scores.
y_pred_opt = (y_proba_oof >= optimal_thresh).astype(int)
y_pred_def = (y_proba_oof >= 0.5).astype(int)

# --- Final, harmonised MLflow logging ---

with mlflow.start_run(run_name=f"{best_baseline}_FINAL_MODEL"):
    # 1. Hyperparameters and decision threshold
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_param("optimal_threshold", round(optimal_thresh, 3))
    mlflow.log_param("model_type", f"{best_baseline}_Tuned_Final")

    # 2. Cross-validation metrics of the best grid-search candidate
    #    (same metric keys as the baseline runs so they can be compared).
    best_idx = grid_search.best_index_

    cv_auc = grid_search.cv_results_["mean_test_AUC"][best_idx]
    # The stored business score is negated here to log a positive cost.
    cv_cost = -grid_search.cv_results_["mean_test_Business_Score"][best_idx]
    cv_cost_std = grid_search.cv_results_["std_test_Business_Score"][best_idx]

    mlflow.log_metric("cv_auc_mean", cv_auc)
    mlflow.log_metric("cv_business_cost_mean", cv_cost)
    mlflow.log_metric("cv_business_cost_std", cv_cost_std)

    # 3. AUC on the training rows.
    # NOTE(review): despite the "train_" prefix, this AUC is computed from the
    # out-of-fold probabilities, so "auc_gap" compares two cross-validated
    # estimates and stays close to zero. It does not measure a train-vs-CV
    # overfitting gap. Confirm this is intended before relying on it.
    train_auc = roc_auc_score(y_train, y_proba_oof)
    mlflow.log_metric("train_auc", train_auc)
    mlflow.log_metric("auc_gap", train_auc - cv_auc)

    # 4. Business impact of moving the threshold away from 0.5
    cost_def = business_cost_score(y_train, y_pred_def, COST_FN, 1)
    mlflow.log_metric("train_business_cost_at_05", cost_def)
    mlflow.log_metric("train_business_cost_optimal", optimal_cost)

    # Relative gain in percent. A zero baseline cost leaves nothing to improve,
    # so report 0.0 instead of dividing by zero (which would log inf/nan).
    improvement = ((cost_def - optimal_cost) / cost_def) * 100 if cost_def else 0.0
    mlflow.log_metric("threshold_improvement_percent", improvement)

    # 5. Confusion matrix at the optimal threshold (computed once, reused below)
    cm_opt = confusion_matrix(y_train, y_pred_opt)
    tn, fp, fn, tp = cm_opt.ravel()
    mlflow.log_metric("final_train_fn", int(fn))
    mlflow.log_metric("final_train_fp", int(fp))
    mlflow.log_metric("final_train_tp", int(tp))
    mlflow.log_metric("final_train_tn", int(tn))

    # 6. Artifacts
    mlflow.log_figure(fig_thresh, "threshold_optimization_curve.png")

    fig_cm, ax = plt.subplots()
    # Draw on the axes created above rather than on the implicit current axes.
    sns.heatmap(cm_opt, annot=True, fmt="d", cmap="Greens", ax=ax)
    ax.set_title(f"Matrice de Confusion au Seuil Optimal ({optimal_thresh:.3f})")
    mlflow.log_figure(fig_cm, "final_confusion_matrix.png")
    plt.close(fig_cm)

    # 7. Model registry
    mlflow.sklearn.log_model(
        best_model, "model", registered_model_name="Credit_Scoring_Model_LGBM"
    )

    print(f"‚úÖ Mod√®le Final loggu√© et enregistr√© !")
    print(f"Score CV m√©tier: {cv_cost:.4f} (¬± {cv_cost_std:.4f})")
    print(
        f"Gain m√©tier sur train (seuil 0.5 -> {optimal_thresh:.3f}): {improvement:.1f}%"
    )

üîÑ Optimisation du seuil en 5-fold CV...
  Fold 1/5 termin√©
  Fold 2/5 termin√©
  Fold 3/5 termin√©
  Fold 4/5 termin√©
  Fold 5/5 termin√©


2026/02/13 17:07:42 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/02/13 17:07:42 INFO mlflow.store.db.utils: Updating database tables
2026/02/13 17:07:42 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/13 17:07:42 INFO alembic.runtime.migration: Will assume non-transactional DDL.


‚úÖ Mod√®le Final loggu√© et enregistr√© !
Score CV m√©tier: 0.4951 (¬± 0.0068)
Gain m√©tier sur train (seuil 0.5 -> 0.494): 0.2%


Successfully registered model 'Credit_Scoring_Model_LGBM'.
Created version '1' of model 'Credit_Scoring_Model_LGBM'.


In [None]:
"""
## 5. Evaluation finale sur le set de validation (X_val)

Ce set a √©t√© isol√© d√®s le d√©but (20% des donn√©es) et n'a jamais √©t√©
utilis√© pendant l'entra√Ænement ni le tuning. C'est notre estimation
honn√™te de la performance en production.
"""

banner = "=" * 60
print("\n" + banner)
print("üéØ √âVALUATION FINALE SUR LE SET DE VALIDATION (20%)")
print(banner)

# Score the held-out split once, then cut the same probabilities at both thresholds.
y_val_proba = best_model.predict_proba(X_val)[:, 1]
y_val_pred_optimal = (y_val_proba >= optimal_thresh).astype(int)
y_val_pred_default = (y_val_proba >= 0.5).astype(int)

# Ranking metric plus business cost at each threshold.
val_auc = roc_auc_score(y_val, y_val_proba)
val_cost_optimal = business_cost_score(y_val, y_val_pred_optimal, COST_FN, 1)
val_cost_default = business_cost_score(y_val, y_val_pred_default, COST_FN, 1)

# Confusion matrix at the optimal threshold; reused for the heatmap below.
val_cm_optimal = confusion_matrix(y_val, y_val_pred_optimal)
tn_val, fp_val, fn_val, tp_val = val_cm_optimal.ravel()

# Absolute AUC difference between the grid-search CV estimate and validation.
auc_delta = abs(cv_auc - val_auc)

for report_line in (
    f"\nüìä AUC Validation : {val_auc:.4f}",
    f"üìä Business Cost (seuil 0.5)     : {val_cost_default:.4f}",
    f"üìä Business Cost (seuil {optimal_thresh:.3f}) : {val_cost_optimal:.4f}",
    f"\nüìä Matrice de confusion (seuil {optimal_thresh:.3f}) :",
    f"   TN={tn_val:,}  FP={fp_val:,}",
    f"   FN={fn_val:,}  TP={tp_val:,}",
    f"\nüîç Comparaison CV vs Validation :",
    f"   CV AUC          : {cv_auc:.4f}",
    f"   Val AUC         : {val_auc:.4f}",
    f"   √âcart           : {auc_delta:.4f}",
    f"   CV Business Cost: {cv_cost:.4f}",
    f"   Val Business Cost: {val_cost_optimal:.4f}",
):
    print(report_line)

# An AUC difference under 0.02 is treated as consistent generalisation.
verdict = (
    "\n‚úÖ Les scores CV et validation sont coh√©rents ‚Äî le mod√®le g√©n√©ralise bien."
    if auc_delta < 0.02
    else "\n‚ö†Ô∏è √âcart significatif entre CV et validation ‚Äî possible overfitting."
)
print(verdict)

# Record the validation results in their own MLflow run.
with mlflow.start_run(run_name=f"{best_baseline}_VALIDATION"):
    for metric_key, metric_value in (
        ("val_auc", val_auc),
        ("val_business_cost_optimal", val_cost_optimal),
        ("val_business_cost_default", val_cost_default),
        ("val_fn", int(fn_val)),
        ("val_fp", int(fp_val)),
        ("val_tn", int(tn_val)),
        ("val_tp", int(tp_val)),
    ):
        mlflow.log_metric(metric_key, metric_value)

    mlflow.log_param("optimal_threshold", round(optimal_thresh, 3))
    mlflow.log_param("model_type", f"{best_baseline}_Validation")

    fig_val, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(val_cm_optimal, annot=True, fmt="d", cmap="Purples", ax=ax)
    ax.set_title(f"Validation - Confusion Matrix (Seuil {optimal_thresh:.3f})")
    mlflow.log_figure(fig_val, "validation_confusion_matrix.png")
    plt.close(fig_val)

print("\n‚úÖ √âvaluation validation logu√©e dans MLflow")


üéØ √âVALUATION FINALE SUR LE SET DE VALIDATION (20%)

üìä AUC Validation : 0.7852
üìä Business Cost (seuil 0.5)     : 0.4908
üìä Business Cost (seuil 0.494) : 0.4907

üìä Matrice de confusion (seuil 0.494) :
   TN=41,659  FP=14,879
   FN=1,530  TP=3,435

üîç Comparaison CV vs Validation :
   CV AUC          : 0.7814
   Val AUC         : 0.7852
   √âcart           : 0.0038
   CV Business Cost: 0.4951
   Val Business Cost: 0.4907

‚úÖ Les scores CV et validation sont coh√©rents ‚Äî le mod√®le g√©n√©ralise bien.

‚úÖ √âvaluation validation logu√©e dans MLflow


In [12]:
"""
## 6. Comparaison Finale et S√©lection du Meilleur Mod√®le
"""

print("\n" + "=" * 60)
print("üìä COMPARAISON FINALE DES MOD√àLES")
print("=" * 60)

# Pull every run recorded under the experiment into a DataFrame.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise RuntimeError(f"MLflow experiment '{EXPERIMENT_NAME}' does not exist.")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Metric used to rank the runs (lower is better).
ranking_metric = "metrics.cv_business_cost_mean"
if ranking_metric not in runs.columns:
    # The column is absent when no run logged this metric; fail with a clear
    # message instead of a KeyError from sort_values.
    raise RuntimeError(
        f"No run in experiment '{EXPERIMENT_NAME}' logged '{ranking_metric}'; "
        "run the training cells first."
    )

# Ascending sort puts the cheapest run first; runs that never logged the
# metric (NaN) are placed last by pandas' default.
runs_sorted = runs.sort_values(ranking_metric, ascending=True)

# Only runs with an actual cost value are eligible as "best".
ranked_runs = runs_sorted.dropna(subset=[ranking_metric])
if ranked_runs.empty:
    raise RuntimeError(
        f"No run in experiment '{EXPERIMENT_NAME}' has a value for "
        f"'{ranking_metric}'; nothing to rank."
    )

# Show the ten best runs.
print("\nTop 10 Mod√®les (tri√©s par co√ªt m√©tier):")
columns_to_show = [
    "tags.mlflow.runName",
    "metrics.cv_business_cost_mean",
    "metrics.train_auc",
    "metrics.cv_auc_mean",
    "metrics.auc_gap",  # useful to see when it was logged
]

# Keep only the columns that really exist so the display cannot raise a KeyError.
existing_cols = [c for c in columns_to_show if c in runs_sorted.columns]

print(runs_sorted[existing_cols].head(10).to_string(index=False))

# Select the best run: first row that carries a real cost value.
best_run_id = ranked_runs.iloc[0]["run_id"]
best_run_name = ranked_runs.iloc[0]["tags.mlflow.runName"]

print(f"\n‚úÖ Meilleur mod√®le: {best_run_name}")
print(f"‚úÖ Run ID: {best_run_id}")


üìä COMPARAISON FINALE DES MOD√àLES

Top 10 Mod√®les (tri√©s par co√ªt m√©tier):
                      tags.mlflow.runName  metrics.cv_business_cost_mean  metrics.train_auc  metrics.cv_auc_mean  metrics.auc_gap
   LightGBM_Baseline_balanced_FINAL_MODEL                       0.495069           0.781337             0.781354        -0.000016
LightGBM_Baseline_balanced_Tuned_Complete                       0.495069                NaN             0.781354         0.070313
LightGBM_Baseline_balanced_Tuned_Complete                       0.498224                NaN             0.779327         0.044570
               LightGBM_Baseline_balanced                       0.499955                NaN             0.778513         0.044177
               LightGBM_Baseline_balanced                       0.499955                NaN             0.778513         0.044177
    Logistic_Regression_Baseline_balanced                       0.511394                NaN             0.770536         0.005719
    Log

In [13]:
"""
## 7. Chargement des Donn√©es de Test
"""

print(
    "\n" + "=" * 60,
    "üìÇ CHARGEMENT DES DONN√âES DE TEST",
    "=" * 60,
    sep="\n",
)

# Read the preprocessed test set from the project data directory.
test_full = pd.read_csv(DATA_PATH / "test_preprocessed.csv")

# The client identifier is kept aside; every other column is a model feature.
test_ids = test_full["SK_ID_CURR"]
X_test = test_full.drop(columns=["SK_ID_CURR"])

for status_line in (
    f"‚úÖ Donn√©es de test charg√©es: {X_test.shape}",
    f"‚úÖ IDs de test: {len(test_ids)} clients",
    f"‚úÖ Pr√™t pour les pr√©dictions!",
):
    print(status_line)


üìÇ CHARGEMENT DES DONN√âES DE TEST
‚úÖ Donn√©es de test charg√©es: (48744, 420)
‚úÖ IDs de test: 48744 clients
‚úÖ Pr√™t pour les pr√©dictions!


In [None]:
"""
## 8. Model Serving - Test du Mod√®le en Production
"""
# Must match the name the model was registered under in the final logging cell.
model_name = "Credit_Scoring_Model_LGBM"

# Load the latest registered version (rather than the "Production" stage,
# which may be empty).
model_uri = f"models:/{model_name}/latest"
print(f"Chargement du mod√®le depuis : {model_uri}")

# The sklearn flavour is loaded so that predict_proba stays available.
model_production = mlflow.sklearn.load_model(model_uri)

# 1. Align X_test with the feature list exposed by the loaded model.
#    reindex adds missing columns filled with 0 and drops any extra column.
features_cols = model_production.feature_name_
X_test_aligned = X_test.reindex(columns=features_cols, fill_value=0)

alignment_msg = (
    f"‚úÖ Alignement termin√© : {X_test.shape[1]} -> {X_test_aligned.shape[1]} colonnes"
)
print(alignment_msg)

# 2. Smoke test on the first five rows
print(f"\nüß™ Test du Mod√®le en Production :")
print("=" * 60)

X_sample = X_test_aligned.iloc[:5]
predictions_proba = model_production.predict_proba(X_sample)[:, 1]
predictions = (predictions_proba >= optimal_thresh).astype(int)

# 3. Display the sample with its probability and thresholded decision
sample_columns = {
    "SK_ID_CURR": test_ids.iloc[:5].values,
    "Probability": predictions_proba,
    "Prediction": predictions,
}
result_df = pd.DataFrame(sample_columns)
print(result_df.to_string(index=False))

# 4. Build the final submission file
print("\nüì§ G√©n√©ration des pr√©dictions finales pour Kaggle...")
final_proba = model_production.predict_proba(X_test_aligned)[:, 1]

# Probabilities, not thresholded labels, are submitted because the
# competition metric is AUC.
submission = pd.DataFrame({"SK_ID_CURR": test_ids}).assign(TARGET=final_proba)

submission.to_csv("../data/submission.csv", index=False)
print("‚úÖ Fichier de soumission cr√©√© : ../data/submission.csv")

Chargement du mod√®le depuis : models:/Credit_Scoring_Model_LGBM/latest


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

‚úÖ Alignement termin√© : 420 -> 419 colonnes

üß™ Test du Mod√®le en Production :
 SK_ID_CURR  Probability  Prediction
     100001     0.242050           0
     100005     0.635779           1
     100013     0.248509           0
     100028     0.180654           0
     100038     0.518640           1

üì§ G√©n√©ration des pr√©dictions finales pour Kaggle...
‚úÖ Fichier de soumission cr√©√© : ../data/submission.csv


In [None]:
"""
## 9. Documentation et Export
"""

import json

# Persist the decision threshold and the context needed to interpret it.
# Numeric values are cast to built-in types so json.dump cannot fail on a
# numpy scalar that is not a float subclass.
metadata = {
    "best_model_name": best_run_name,
    "best_model_run_id": best_run_id,
    "optimal_threshold": float(optimal_thresh),
    "cost_fn": COST_FN,
    "cost_fp": 1,
    "business_cost_optimal": float(optimal_cost),
    "n_features": int(X_train.shape[1]),
    "n_train_samples": int(X_train.shape[0]),
    "class_distribution": y_train.value_counts().to_dict(),
}

# Create the target directory if needed so the export does not fail on a
# fresh checkout, and write with an explicit encoding.
metadata_path = Path("../models/model_metadata.json")
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with metadata_path.open("w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)

print("\n‚úÖ M√©tadonn√©es sauvegard√©es: model_metadata.json")

print("\n" + "=" * 60)
print("üéâ MODELING TERMIN√â !")
print("=" * 60)
print(f"""
R√âSUM√â:
- Meilleur mod√®le: {best_run_name}
- Co√ªt m√©tier: {optimal_cost:.4f}
- Seuil optimal: {optimal_thresh:.3f}
- Mod√®le en production: {model_name}
- Fichier soumission: submission.csv

PROCHAINES √âTAPES:
1. Lancer l'UI MLFlow: mlflow ui
2. Comparer les runs et visualisations
3. Soumettre √† Kaggle
4. Documenter les r√©sultats
""")