# Notebook 04 : SMOTE vs Class Weight - CORRIGÉ

**Objectif** : Comparer deux approches pour gérer le déséquilibre des classes

## Contenu
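1. Chargement sécurisé des données (avec fallback)
2. Baseline (sans correction du déséquilibre)
3. Approche 1 : SMOTE
4. Approche 2 : class_weight
5. Comparaison visuelle
6. Matrices de confusion comparées
7. Analyse des résultats et décision finale
8. Sauvegarde de la décision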

---

## Configuration

In [None]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, recall_score, 
    precision_score, f1_score, confusion_matrix,
    classification_report
)
from imblearn.over_sampling import SMOTE

# Silence library warnings to keep the notebook output readable.
# NOTE(review): this is a blanket filter, so every warning category is hidden.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Try the new name first, then the legacy one, so the notebook runs on both;
# if neither is available the default Matplotlib style is kept.
_STYLES_CANDIDATS = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_style_retenu = next(
    (nom for nom in _STYLES_CANDIDATS if nom in plt.style.available),
    None,
)
if _style_retenu is not None:
    plt.style.use(_style_retenu)
sns.set_palette('Set2')

print('Imports OK')

## 1. Chargement Sécurisé des Données

On essaie de charger les artifacts. Si ça échoue, on recharge depuis les données brutes.

In [None]:
# Resolve the project root: when the working directory is the "notebooks"
# folder the root is its parent, otherwise it is the working directory itself.
current_dir = Path.cwd()
_racine_projet = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Folders for the serialized artifacts and for the CSV data.
DOSSIER_ARTIFACTS = _racine_projet / 'artifacts'
DOSSIER_DATA = _racine_projet / 'data'

for _libelle, _dossier in (("artifacts", DOSSIER_ARTIFACTS), ("data", DOSSIER_DATA)):
    print(f"Dossier {_libelle}: {_dossier}")

In [None]:
# Fonction de chargement sécurisé
def _charger_depuis_artifacts():
    """Load the train/validation split stored in ``data_split.joblib``.

    The artifact may be either a dict with the keys ``X_train_processed``,
    ``X_valid_processed``, ``y_train`` and ``y_valid``, or a 4-item sequence
    in the order ``(X_train, X_valid, y_train, y_valid)``.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid)``.

    Raises:
        FileNotFoundError: if the artifact file does not exist.
        KeyError / ValueError: if the artifact does not have an expected layout.
    """
    data_split_path = DOSSIER_ARTIFACTS / 'data_split.joblib'
    if not data_split_path.exists():
        raise FileNotFoundError("data_split.joblib non trouve")

    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    data = joblib.load(data_split_path)

    if isinstance(data, dict):
        return (
            data['X_train_processed'],
            data['X_valid_processed'],
            data['y_train'],
            data['y_valid'],
        )

    X_train, X_valid, y_train, y_valid = data
    return X_train, X_valid, y_train, y_valid


def _charger_depuis_donnees_brutes():
    """Rebuild a train/validation split from the CSV files in ``DOSSIER_DATA``.

    The aggregated file is preferred over the raw one. Only int64/float64
    feature columns are kept, the data is split 80/20 (stratified on the
    target, ``random_state=42``) and missing values are filled with medians
    computed on the training part only.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid)``.

    Raises:
        FileNotFoundError: if neither CSV file exists.
        KeyError: if the loaded file has no ``TARGET`` column.
    """
    candidats = (
        DOSSIER_DATA / 'application_train_AGGREGATED.csv',
        DOSSIER_DATA / 'application_train.csv',
    )
    fichier = next((f for f in candidats if f.exists()), None)
    if fichier is None:
        raise FileNotFoundError(
            "Aucun fichier de donnees trouve. "
            "Executez le Notebook 01 ou placez application_train.csv dans data/"
        )

    print(f"Chargement de {fichier.name}...")
    df = pd.read_csv(fichier)
    print(f"OK - Donnees chargees: {df.shape}")

    # Fail with an explicit message instead of a bare KeyError('TARGET').
    if 'TARGET' not in df.columns:
        raise KeyError(f"Colonne 'TARGET' absente de {fichier.name}")

    y = df['TARGET']
    X = df.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')

    # Keep numeric columns only, to keep this fallback preprocessing simple.
    X = X.select_dtypes(include=['int64', 'float64'])

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Medians are learned on the training part only, then applied to both
    # parts, so validation rows do not influence the fill values.
    medianes = X_train.median()
    X_train = X_train.fillna(medianes)
    X_valid = X_valid.fillna(medianes)

    print("Preprocessing simple applique")
    print(f"  Features numeriques: {X_train.shape[1]}")

    print("\nSplit effectue:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_valid: {X_valid.shape}")

    return X_train, X_valid, y_train, y_valid


def charger_donnees():
    """Return the train/validation split, preferring the saved artifacts.

    The split is first read from ``data_split.joblib``. If that fails for any
    reason, the failure is reported and the split is rebuilt from the raw CSV
    data instead.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid)``.

    Raises:
        FileNotFoundError: if the artifacts cannot be used and no CSV file
            is available either.
        KeyError: if the CSV fallback has no ``TARGET`` column.
    """
    try:
        print("Tentative de chargement depuis artifacts...")
        donnees = _charger_depuis_artifacts()
        X_train, y_train = donnees[0], donnees[2]

        print("OK - Chargement depuis artifacts reussi")
        print(f"  X_train shape: {X_train.shape}")
        print(f"  y_train shape: {y_train.shape}")

        return donnees
    except Exception as e:
        # Deliberately broad: any problem with the artifacts (missing file,
        # unreadable pickle, unexpected layout) must trigger the fallback.
        print(f"Echec du chargement depuis artifacts: {e}")
        print("\nTentative de rechargement depuis donnees brutes...")

    # The fallback runs outside the handler so that its own errors are not
    # reported as chained onto the artifact-loading failure.
    return _charger_depuis_donnees_brutes()

# Load the train/validation split (artifacts first, raw data as fallback).
X_train, X_valid, y_train, y_valid = charger_donnees()

# Show the relative frequency of each class in the training target.
print("\nDistribution y_train:", y_train.value_counts(normalize=True), sep="\n")

## 2. Baseline (Sans Correction du Déséquilibre)

In [None]:
import gc
from sklearn.impute import SimpleImputer

print("="*60)
print("BASELINE : Sans correction du desequilibre")
print("="*60)

# Reclaim memory before allocating the new frames.
gc.collect()

# STEP 1: optimize the column types.
# Object columns are coerced to numbers (unparseable values become NaN) and
# 64-bit numeric columns are downcast to float32.
print("Optimisation des types de données...")
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
        X_valid[col] = pd.to_numeric(X_valid[col], errors='coerce')

    if X_train[col].dtype in ['float64', 'int64']:
        X_train[col] = X_train[col].astype('float32')
        X_valid[col] = X_valid[col].astype('float32')

print("Optimisation terminée!")

# Columns that are entirely NaN in the training set (for example text columns
# after the coercion above) have no median. SimpleImputer discards them, which
# would make the imputed array narrower than X_train.columns. Drop them
# explicitly from both sets so the shapes stay aligned.
colonnes_vides = X_train.columns[X_train.isna().all()]
if len(colonnes_vides) > 0:
    print(f"Colonnes entierement vides supprimees: {len(colonnes_vides)}")
    X_train = X_train.drop(columns=colonnes_vides)
    X_valid = X_valid.drop(columns=colonnes_vides)

# STEP 2: report and handle missing values.
print(f"\nNombre de NaN dans X_train: {X_train.isna().sum().sum()}")
print(f"Nombre de NaN dans X_valid: {X_valid.isna().sum().sum()}")

# Median imputation: fitted on the training set, applied to both sets.
print("\nImputation des valeurs manquantes...")
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_valid_imputed = pd.DataFrame(
    imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index
)

print("Imputation terminée!")
print(f"NaN restants dans X_train: {X_train_imputed.isna().sum().sum()}")
print(f"NaN restants dans X_valid: {X_valid_imputed.isna().sum().sum()}")

# Reclaim memory again before training.
gc.collect()

# STEP 3: train the model. The timer covers fitting and both prediction calls.
start = time.time()
model_baseline = RandomForestClassifier(
    n_estimators=50,
    max_depth=8,
    max_features='sqrt',
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
    max_samples=0.8
)

print("\nEntraînement du modèle...")
model_baseline.fit(X_train_imputed, y_train)
y_pred_baseline = model_baseline.predict(X_valid_imputed)
y_proba_baseline = model_baseline.predict_proba(X_valid_imputed)[:, 1]
elapsed = time.time() - start

# Metrics. zero_division=0 makes the "no positive predicted" case explicit:
# the affected scores are reported as 0 instead of relying on a warning.
metrics_baseline = {
    'AUC': roc_auc_score(y_valid, y_proba_baseline),
    'Accuracy': accuracy_score(y_valid, y_pred_baseline),
    'Recall': recall_score(y_valid, y_pred_baseline, zero_division=0),
    'Precision': precision_score(y_valid, y_pred_baseline, zero_division=0),
    'F1': f1_score(y_valid, y_pred_baseline, zero_division=0),
    'Temps (s)': elapsed
}

print("\nResultats:")
for metric, value in metrics_baseline.items():
    if metric == 'Temps (s)':
        print(f"  {metric}: {value:.1f}")
    else:
        print(f"  {metric}: {value:.4f}")

# Confusion matrix
cm_baseline = confusion_matrix(y_valid, y_pred_baseline)
print("\nMatrice de confusion:")
print(cm_baseline)

## 3. Approche 1 : SMOTE

In [None]:
import gc
from sklearn.impute import SimpleImputer

print("="*60)
print("APPROCHE 1 : SMOTE (Synthetic Minority Over-sampling)")
print("="*60)

# Reclaim memory before allocating the new frames.
gc.collect()

# STEP 1: optimize the column types.
# Object columns are coerced to numbers (unparseable values become NaN) and
# 64-bit numeric columns are downcast to float32.
print("Optimisation des types de données...")
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
        X_valid[col] = pd.to_numeric(X_valid[col], errors='coerce')

    if X_train[col].dtype in ['float64', 'int64']:
        X_train[col] = X_train[col].astype('float32')
        X_valid[col] = X_valid[col].astype('float32')

print("Optimisation terminée!")

# Columns that are entirely NaN in the training set have no median.
# SimpleImputer discards them, which would make the imputed array narrower
# than X_train.columns. Drop them explicitly from both sets so the shapes
# stay aligned.
colonnes_vides = X_train.columns[X_train.isna().all()]
if len(colonnes_vides) > 0:
    print(f"Colonnes entierement vides supprimees: {len(colonnes_vides)}")
    X_train = X_train.drop(columns=colonnes_vides)
    X_valid = X_valid.drop(columns=colonnes_vides)

# STEP 2: median imputation, fitted on the training set, applied to both sets.
print("\nImputation des valeurs manquantes...")
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)
X_valid_imputed = pd.DataFrame(
    imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index
)

print(f"NaN restants dans X_train: {X_train_imputed.isna().sum().sum()}")
print(f"NaN restants dans X_valid: {X_valid_imputed.isna().sum().sum()}")

gc.collect()

# STEP 3: apply SMOTE to the training set only; the validation set is left
# untouched. The timer starts here, so it covers resampling, fitting and both
# prediction calls.
start = time.time()
print("\nApplication de SMOTE...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_imputed, y_train)

print("\nAvant SMOTE:")
print(f"  Shape: {X_train_imputed.shape}")
print(f"  Distribution: {y_train.value_counts().to_dict()}")
print("\nApres SMOTE:")
print(f"  Shape: {X_train_smote.shape}")
print(f"  Distribution: {pd.Series(y_train_smote).value_counts().to_dict()}")

# STEP 4: train the model on the resampled data.
print("\nEntrainement du modele...")
model_smote = RandomForestClassifier(
    n_estimators=50,      # kept small to save memory
    max_depth=8,          # kept small to save memory
    max_features='sqrt',
    min_samples_split=10,
    random_state=42,
    n_jobs=-1,
    max_samples=0.8
)

model_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = model_smote.predict(X_valid_imputed)
y_proba_smote = model_smote.predict_proba(X_valid_imputed)[:, 1]
elapsed = time.time() - start

# Metrics. zero_division=0 makes the "no positive predicted" case explicit:
# the affected scores are reported as 0 instead of relying on a warning.
metrics_smote = {
    'AUC': roc_auc_score(y_valid, y_proba_smote),
    'Accuracy': accuracy_score(y_valid, y_pred_smote),
    'Recall': recall_score(y_valid, y_pred_smote, zero_division=0),
    'Precision': precision_score(y_valid, y_pred_smote, zero_division=0),
    'F1': f1_score(y_valid, y_pred_smote, zero_division=0),
    'Temps (s)': elapsed
}

print("\nResultats:")
for metric, value in metrics_smote.items():
    if metric == 'Temps (s)':
        print(f"  {metric}: {value:.1f}")
    else:
        print(f"  {metric}: {value:.4f}")

# Confusion matrix
cm_smote = confusion_matrix(y_valid, y_pred_smote)
print("\nMatrice de confusion:")
print(cm_smote)

## 4. Approche 2 : Class Weight

In [None]:
import gc
from sklearn.impute import SimpleImputer

print("="*60)
print("APPROCHE 2 : Class Weight (Balanced)")
print("="*60)

# Reclaim memory before training.
gc.collect()

# STEP 1: reuse the imputed frames if an earlier cell already built them.
# Both frames are checked: training needs X_train_imputed and evaluation
# needs X_valid_imputed, so either one missing triggers a rebuild.
try:
    _ = (X_train_imputed, X_valid_imputed)
    print("Utilisation des données déjà imputées")
except NameError:
    # Object columns are coerced to numbers (unparseable values become NaN)
    # and 64-bit numeric columns are downcast to float32.
    print("Optimisation des types de données...")
    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_valid[col] = pd.to_numeric(X_valid[col], errors='coerce')

        if X_train[col].dtype in ['float64', 'int64']:
            X_train[col] = X_train[col].astype('float32')
            X_valid[col] = X_valid[col].astype('float32')

    # Columns that are entirely NaN in the training set have no median.
    # SimpleImputer discards them, which would make the imputed array
    # narrower than X_train.columns. Drop them from both sets beforehand.
    colonnes_vides = X_train.columns[X_train.isna().all()]
    if len(colonnes_vides) > 0:
        print(f"Colonnes entierement vides supprimees: {len(colonnes_vides)}")
        X_train = X_train.drop(columns=colonnes_vides)
        X_valid = X_valid.drop(columns=colonnes_vides)

    # Median imputation: fitted on the training set, applied to both sets.
    print("Imputation des valeurs manquantes...")
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_valid_imputed = pd.DataFrame(
        imputer.transform(X_valid),
        columns=X_valid.columns,
        index=X_valid.index
    )
    print(f"NaN restants: {X_train_imputed.isna().sum().sum()}")

gc.collect()

# STEP 2: train the model with class weighting. The timer covers fitting and
# both prediction calls.
start = time.time()
model_balanced = RandomForestClassifier(
    n_estimators=50,           # kept small to save memory
    max_depth=8,               # kept small to save memory
    max_features='sqrt',
    min_samples_split=10,
    class_weight='balanced',   # class weights are derived automatically
    random_state=42,
    n_jobs=-1,
    max_samples=0.8
)

print("Entrainement du modele avec class_weight='balanced'...")
model_balanced.fit(X_train_imputed, y_train)
y_pred_balanced = model_balanced.predict(X_valid_imputed)
y_proba_balanced = model_balanced.predict_proba(X_valid_imputed)[:, 1]
elapsed = time.time() - start

# Metrics. zero_division=0 makes the "no positive predicted" case explicit:
# the affected scores are reported as 0 instead of relying on a warning.
metrics_balanced = {
    'AUC': roc_auc_score(y_valid, y_proba_balanced),
    'Accuracy': accuracy_score(y_valid, y_pred_balanced),
    'Recall': recall_score(y_valid, y_pred_balanced, zero_division=0),
    'Precision': precision_score(y_valid, y_pred_balanced, zero_division=0),
    'F1': f1_score(y_valid, y_pred_balanced, zero_division=0),
    'Temps (s)': elapsed
}

print("\nResultats:")
for metric, value in metrics_balanced.items():
    if metric == 'Temps (s)':
        print(f"  {metric}: {value:.1f}")
    else:
        print(f"  {metric}: {value:.4f}")

# Confusion matrix
cm_balanced = confusion_matrix(y_valid, y_pred_balanced)
print("\nMatrice de confusion:")
print(cm_balanced)

## 5. Comparaison Visuelle

In [None]:
# Build the comparison table: one row per approach, one column per metric.
results_df = pd.DataFrame({
    'Baseline': metrics_baseline,
    'SMOTE': metrics_smote,
    'Class Weight': metrics_balanced
}).T

print("\nTableau comparatif:")
print(results_df.to_string())

# Resolve the figures folder next to the artifacts folder instead of relying
# on the working directory, and create it if needed so savefig cannot fail on
# a missing directory. When the notebook runs from "notebooks/", this is the
# same location as "../figures".
DOSSIER_FIGURES = DOSSIER_ARTIFACTS.parent / 'figures'
DOSSIER_FIGURES.mkdir(parents=True, exist_ok=True)
chemin_figure = DOSSIER_FIGURES / '04_smote_vs_classweight.png'

# One bar chart per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

metrics_to_plot = ['AUC', 'Recall', 'Precision', 'F1', 'Accuracy', 'Temps (s)']
# One color per approach, in row order: Baseline, SMOTE, Class Weight.
colors = ['gray', 'orange', 'green']

for ax, metric in zip(axes.flat, metrics_to_plot):
    est_temps = metric == 'Temps (s)'
    valeurs = results_df[metric]

    bars = ax.bar(valeurs.index, valeurs.values, color=colors, alpha=0.7)
    ax.set_title(f'{metric}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Secondes' if est_temps else 'Score')
    ax.grid(axis='y', alpha=0.3)

    # Write each value above its bar (1 decimal for time, 3 for scores).
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}' if est_temps else f'{height:.3f}',
                ha='center', va='bottom', fontsize=9)

    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(chemin_figure, dpi=300, bbox_inches='tight')
plt.show()

print(f"\nGraphique sauvegarde: {chemin_figure}")

## 6. Matrices de Confusion Comparées

In [None]:
# Resolve the figures folder next to the artifacts folder instead of relying
# on the working directory, and create it if needed so savefig cannot fail on
# a missing directory. When the notebook runs from "notebooks/", this is the
# same location as "../figures".
DOSSIER_FIGURES = DOSSIER_ARTIFACTS.parent / 'figures'
DOSSIER_FIGURES.mkdir(parents=True, exist_ok=True)
chemin_figure_cm = DOSSIER_FIGURES / '04_confusion_matrices_comparison.png'

# One heatmap per approach, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

cms = [
    (cm_baseline, 'Baseline'),
    (cm_smote, 'SMOTE'),
    (cm_balanced, 'Class Weight')
]
# Tick labels for the two classes, in matrix order.
etiquettes_classes = ['Rembourse', 'Defaut']

for ax, (cm, title) in zip(axes, cms):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                cbar=False, square=True)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Vraie Classe')
    ax.set_xlabel('Classe Predite')
    ax.set_xticklabels(etiquettes_classes)
    ax.set_yticklabels(etiquettes_classes)

plt.tight_layout()
plt.savefig(chemin_figure_cm, dpi=300, bbox_inches='tight')
plt.show()

print(f"\nGraphique sauvegarde: {chemin_figure_cm}")

## 7. Analyse et Décision Finale

In [None]:
# Horizontal rule reused by every report header.
_ligne = "=" * 80

print(_ligne)
print("ANALYSE ET DECISION")
print(_ligne)

# The two approaches compared against the baseline, with their display labels.
_candidats = (("SMOTE:", metrics_smote), ("Class Weight:", metrics_balanced))

# Score sections: each prints the baseline value, then every candidate with
# its difference to the baseline multiplied by 100.
_sections = (
    ("\n1. RECALL (Detection des defauts - PRIORITE):", 'Recall'),
    ("\n2. AUC (Capacite discriminante globale):", 'AUC'),
    ("\n3. F1-SCORE (Equilibre global):", 'F1'),
)
for _titre, _cle in _sections:
    print(_titre)
    print(f"   {'Baseline:':<13} {metrics_baseline[_cle]:.4f}")
    for _libelle, _mesures in _candidats:
        _ecart = (_mesures[_cle] - metrics_baseline[_cle]) * 100
        print(f"   {_libelle:<13} {_mesures[_cle]:.4f} ({_ecart:+.1f}%)")

# Timing section: candidates are shown with their ratio to the baseline time.
print("\n4. TEMPS D'ENTRAINEMENT:")
_temps_reference = metrics_baseline['Temps (s)']
print(f"   {'Baseline:':<13} {_temps_reference:.1f}s")
for _libelle, _mesures in _candidats:
    _duree = _mesures['Temps (s)']
    print(f"   {_libelle:<13} {_duree:.1f}s ({_duree / _temps_reference:.1f}x)")

# Pick the winner.
print("\n" + _ligne)
print("DECISION FINALE")
print(_ligne)

# class_weight is retained only if it is at least as good as SMOTE on recall
# and AUC, and strictly faster; in every other case SMOTE is retained.
_class_weight_gagne = all((
    metrics_balanced['Recall'] >= metrics_smote['Recall'],
    metrics_balanced['AUC'] >= metrics_smote['AUC'],
    metrics_balanced['Temps (s)'] < metrics_smote['Temps (s)'],
))

if _class_weight_gagne:
    _verdict = [
        "\nAPPROCHE RETENUE : CLASS_WEIGHT='BALANCED'",
        "\nJustifications:",
        "  1. Recall superieur ou egal a SMOTE",
        "  2. AUC superieur ou egal",
        "  3. Temps d'entrainement plus rapide (important pour re-training)",
        "  4. Pas de generation de donnees synthetiques (plus fiable)",
        "  5. Simplicite d'implementation (un seul parametre)",
    ]
else:
    _verdict = [
        "\nAPPROCHE RETENUE : SMOTE",
        "\nJustifications:",
        "  1. Meilleurs performances globales",
        "  2. Recall ameliore significativement",
    ]
print("\n".join(_verdict))

print("\n" + _ligne)
print("Pour les notebooks suivants, utiliser RandomForestClassifier")
print("avec class_weight='balanced' (ou SMOTE selon decision)")
print(_ligne)

## 8. Sauvegarde de la Décision

In [None]:
# Apply the same rule as the "DECISION FINALE" printout: class_weight is
# retained only if it is at least as good as SMOTE on recall and AUC and is
# strictly faster. Using recall alone here could store a decision that
# contradicts the one shown to the reader.
class_weight_retenu = (
    metrics_balanced['Recall'] >= metrics_smote['Recall']
    and metrics_balanced['AUC'] >= metrics_smote['AUC']
    and metrics_balanced['Temps (s)'] < metrics_smote['Temps (s)']
)

# Everything later notebooks need: the decision, the raw metrics of each
# approach and the comparison table.
resultats = {
    'approche_retenue': 'class_weight' if class_weight_retenu else 'smote',
    'metrics_baseline': metrics_baseline,
    'metrics_smote': metrics_smote,
    'metrics_balanced': metrics_balanced,
    'comparaison_df': results_df
}

# Create the artifacts folder (and any missing parent) if it does not exist.
DOSSIER_ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Persist the results.
chemin_resultats = DOSSIER_ARTIFACTS / 'notebook04_resultats.joblib'
joblib.dump(resultats, chemin_resultats)

print(f"\nResultats sauvegardes: {chemin_resultats}")
print(f"\nApproche retenue: {resultats['approche_retenue'].upper()}")