# **CELL 1 ‚Äî Install libs & imports**

In [1]:
!pip install -q pandas numpy scikit-learn xgboost lightgbm catboost imbalanced-learn joblib

import os
import pickle
import warnings
from datetime import datetime
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, ConfusionMatrixDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore")
np.random.seed(42)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

print("‚úÖ Imports OK")
print("üìÖ", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Imports OK
üìÖ 2025-12-11 19:47:54


# **CELL 2 ‚Äî Project paths**

In [4]:
import os

BASE_DIR = "/content/projet_llm"
NOTEBOOKS_DIR = os.path.join(BASE_DIR, "notebooks")
PROCESSOR_DIR = os.path.join(NOTEBOOKS_DIR, "processors")

for d in [BASE_DIR, NOTEBOOKS_DIR, PROCESSOR_DIR]:
    os.makedirs(d, exist_ok=True)

print("üìÅ BASE_DIR     :", BASE_DIR)
print("üìÅ PROCESSOR_DIR:", PROCESSOR_DIR)


üìÅ BASE_DIR     : /content/projet_llm
üìÅ PROCESSOR_DIR: /content/projet_llm/notebooks/processors


# **CELL 3 ‚Äî Charger les donn√©es pr√©process√©es**

In [5]:
from google.colab import files

print("üì§ Please upload your preprocessing artifacts from your PC:")
print("‚û° At minimum: preprocessed_data.pkl (and optionally vectorizer.pkl, feature_names.pkl, smote_config.pkl)")

uploaded = files.upload()  # Select the .pkl files from your computer

# Move each uploaded file into PROCESSOR_DIR
for filename in uploaded.keys():
    src = filename
    dst = os.path.join(PROCESSOR_DIR, filename)
    print(f"‚û° Moving {src} -> {dst}")
    !mv "{src}" "{dst}"

print("\n‚úÖ Upload & move done.")
print("üìÇ Files in PROCESSOR_DIR:", os.listdir(PROCESSOR_DIR))


üì§ Please upload your preprocessing artifacts from your PC:
‚û° At minimum: preprocessed_data.pkl (and optionally vectorizer.pkl, feature_names.pkl, smote_config.pkl)


Saving feature_names.pkl to feature_names.pkl
Saving preprocessed_data.pkl to preprocessed_data.pkl
Saving smote_config.pkl to smote_config.pkl
Saving vectorizer.pkl to vectorizer.pkl
‚û° Moving feature_names.pkl -> /content/projet_llm/notebooks/processors/feature_names.pkl
‚û° Moving preprocessed_data.pkl -> /content/projet_llm/notebooks/processors/preprocessed_data.pkl
‚û° Moving smote_config.pkl -> /content/projet_llm/notebooks/processors/smote_config.pkl
‚û° Moving vectorizer.pkl -> /content/projet_llm/notebooks/processors/vectorizer.pkl

‚úÖ Upload & move done.
üìÇ Files in PROCESSOR_DIR: ['vectorizer.pkl', 'smote_config.pkl', 'models', 'feature_names.pkl', 'preprocessed_data.pkl']


In [6]:
def load_preprocessed_data(processor_dir=PROCESSOR_DIR):
    print("="*80)
    print("CHARGEMENT DES DONN√âES PR√âPROCESS√âES")
    print("="*80)

    preprocessed_path = os.path.join(processor_dir, "preprocessed_data.pkl")

    with open(preprocessed_path, "rb") as f:
        data = pickle.load(f)

    X_train = data["X_train"]
    X_test  = data["X_test"]
    y_train = data["y_train"]
    y_test  = data["y_test"]

    print("X_train:", X_train.shape, "| X_test:", X_test.shape)
    print("Classes train:", y_train.value_counts(normalize=True).to_dict())
    print("Classes test :", y_test.value_counts(normalize=True).to_dict())

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = load_preprocessed_data()


CHARGEMENT DES DONN√âES PR√âPROCESS√âES
X_train: (31772, 50000) | X_test: (7944, 50000)
Classes train: {1: 0.5051932519199295, 0: 0.4948067480800705}
Classes test : {1: 0.5051611278952669, 0: 0.49483887210473315}


# **CELL 4 ‚Äî D√©finir les mod√®les baseline (CPU only, RAM-friendly)**

In [7]:
def get_baseline_models():
    models = {
        "Logistic Regression": LogisticRegression(
            max_iter=500,
            class_weight="balanced"
        ),

        "Random Forest": RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=10,
            min_samples_leaf=4,
            random_state=42,
            class_weight="balanced",
            n_jobs=-1
        ),

        "XGBoost": xgb.XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=42,
            tree_method="hist",    # ‚úÖ CPU rapide
            n_jobs=1               # limite la RAM
        ),

        "LightGBM": lgb.LGBMClassifier(
            n_estimators=200,
            max_depth=-1,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1
        ),

        "CatBoost": CatBoostClassifier(
            iterations=200,
            depth=6,
            learning_rate=0.05,
            random_state=42,
            auto_class_weights="Balanced",
            verbose=False          # CPU
        )
    }

    print("="*80)
    print("MOD√àLES BASELINE CONFIGUR√âS")
    print("="*80)
    for i, name in enumerate(models.keys(), 1):
        print(f"{i}. {name}")

    return models

baseline_models = get_baseline_models()


MOD√àLES BASELINE CONFIGUR√âS
1. Logistic Regression
2. Random Forest
3. XGBoost
4. LightGBM
5. CatBoost


# **CELL 5 ‚Äî Entra√Æner & √©valuer les mod√®les (fonction g√©n√©rique)**

In [8]:
def train_and_evaluate_models(models, X_train, y_train, X_test, y_test, model_type="Baseline"):
    print("\n" + "="*80)
    print(f"ENTRA√éNEMENT ET √âVALUATION - {model_type}")
    print("="*80 + "\n")

    results = {}

    for name, model in models.items():
        print(f"üîÑ {name}...")
        start_time = time.time()

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        if hasattr(model, "predict_proba"):
            y_pred_proba = model.predict_proba(X_test)[:, 1]
        else:
            if hasattr(model, "decision_function"):
                scores = model.decision_function(X_test)
                s_min, s_max = scores.min(), scores.max()
                y_pred_proba = (scores - s_min) / (s_max - s_min + 1e-9)
            else:
                y_pred_proba = y_pred

        training_time = time.time() - start_time

        results[name] = {
            "model": model,
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "roc_auc": roc_auc_score(y_test, y_pred_proba),
            "training_time": training_time,
            "y_pred": y_pred,
            "y_pred_proba": y_pred_proba,
        }

        print(f"   ‚úÖ ROC-AUC: {results[name]['roc_auc']:.4f} | F1: {results[name]['f1_score']:.4f} | Temps: {training_time:.2f}s\n")

    return results

baseline_results = train_and_evaluate_models(
    baseline_models, X_train, y_train, X_test, y_test, "Baseline"
)



ENTRA√éNEMENT ET √âVALUATION - Baseline

üîÑ Logistic Regression...
   ‚úÖ ROC-AUC: 0.4999 | F1: 0.4941 | Temps: 0.82s

üîÑ Random Forest...
   ‚úÖ ROC-AUC: 0.5055 | F1: 0.5010 | Temps: 15.55s

üîÑ XGBoost...
   ‚úÖ ROC-AUC: 0.5050 | F1: 0.5307 | Temps: 289.08s

üîÑ LightGBM...
[LightGBM] [Info] Number of positive: 16051, number of negative: 15721
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 10.957657 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 946406
[LightGBM] [Info] Number of data points in the train set: 31772, number of used features: 17099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
   ‚úÖ ROC-AUC: 0.5109 | F1: 0.5051 | Temps: 123.61s

üîÑ CatBoost...
   ‚úÖ ROC-AUC: 0.5015 | F1: 0.4890 | Temps: 749.09s



# **CELL 6 ‚ÄîGrilles de param√®tres**

In [9]:
def get_param_distributions():
    param_distributions = {
        # On laisse RF non tun√© pour limiter la RAM/temps
        "XGBoost": {
            "n_estimators": [100, 200, 300],
            "max_depth": [4, 6, 8],
            "learning_rate": [0.01, 0.05, 0.1],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
        },
        "LightGBM": {
            "n_estimators": [100, 200, 300],
            "max_depth": [-1, 10, 20],
            "learning_rate": [0.01, 0.05, 0.1],
            "num_leaves": [31, 63, 127],
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
        },
        "CatBoost": {
            "iterations": [100, 200, 300],
            "depth": [4, 6, 8],
            "learning_rate": [0.01, 0.05, 0.1],
            "l2_leaf_reg": [1, 3, 5, 7],
        },
    }
    return param_distributions

param_distributions = get_param_distributions()

print("="*80)
print("PARAM√àTRES DE RECHERCHE D√âFINIS")
print("="*80)
for model_name, params in param_distributions.items():
    print(f"\n{model_name}:")
    for p in params:
        print("  ‚Ä¢", p)


PARAM√àTRES DE RECHERCHE D√âFINIS

XGBoost:
  ‚Ä¢ n_estimators
  ‚Ä¢ max_depth
  ‚Ä¢ learning_rate
  ‚Ä¢ subsample
  ‚Ä¢ colsample_bytree

LightGBM:
  ‚Ä¢ n_estimators
  ‚Ä¢ max_depth
  ‚Ä¢ learning_rate
  ‚Ä¢ num_leaves
  ‚Ä¢ subsample
  ‚Ä¢ colsample_bytree

CatBoost:
  ‚Ä¢ iterations
  ‚Ä¢ depth
  ‚Ä¢ learning_rate
  ‚Ä¢ l2_leaf_reg


# **CELL 7 ‚ÄîSous-√©chantillon pour le tuning**

In [10]:
from sklearn.utils import resample

# Subset pour tuning (pour √©viter crash RAM)
N_TUNE = 15000  # tu peux descendre √† 8000 si besoin

X_train_tune, y_train_tune = resample(
    X_train, y_train,
    n_samples=min(N_TUNE, X_train.shape[0]),
    random_state=42,
    stratify=y_train
)

print("Subset tuning:", X_train_tune.shape)
print("Distribution dans subset:")
print((y_train_tune.value_counts(normalize=True) * 100).round(2))


Subset tuning: (15000, 50000)
Distribution dans subset:
winner
1    50.52
0    49.48
Name: proportion, dtype: float64


# **CELL 8 ‚ÄîRandomizedSearchCV (mode RAM-friendly)**

In [11]:
def fine_tune_models(baseline_models, param_distributions, X_train, y_train):
    tuned_models = {}
    best_params_dict = {}

    print("\n" + "="*80)
    print("FINE-TUNING AVEC RandomizedSearchCV (mode RAM-friendly)")
    print("="*80)

    for name, base_model in baseline_models.items():
        if name not in param_distributions:
            print(f"\n‚è≠ Pas de search pour {name}")
            tuned_models[name] = base_model
            continue

        print(f"\nüîß Tuning {name}...")

        params = param_distributions[name]
        n_iter = 5
        cv = 3

        search = RandomizedSearchCV(
            base_model,
            params,
            n_iter=n_iter,
            cv=cv,
            scoring="f1",
            verbose=2,
            n_jobs=1,        # ‚úÖ important pour √©viter SIGKILL
            random_state=42
        )

        search.fit(X_train, y_train)

        best_model = search.best_estimator_
        tuned_models[name] = best_model
        best_params_dict[name] = search.best_params_

        print(f"‚úÖ Meilleurs params pour {name}: {search.best_params_}")
        print(f"   Score CV (f1): {search.best_score_:.4f}")

    return tuned_models, best_params_dict

tuned_models, best_params = fine_tune_models(
    baseline_models, param_distributions, X_train_tune, y_train_tune
)



FINE-TUNING AVEC RandomizedSearchCV (mode RAM-friendly)

‚è≠ Pas de search pour Logistic Regression

‚è≠ Pas de search pour Random Forest

üîß Tuning XGBoost...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.6; total time= 3.9min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.6; total time= 3.8min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=8, n_estimators=300, subsample=0.6; total time= 3.8min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.6; total time= 1.3min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.6; total time= 1.3min
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.6; total time= 1.3min
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=6, n_estimators=300, 

# **CELL 9 ‚Äî√âvaluer les mod√®les tun√©s**

In [12]:
tuned_results = train_and_evaluate_models(
    tuned_models, X_train, y_train, X_test, y_test, "Fine-Tuned"
)



ENTRA√éNEMENT ET √âVALUATION - Fine-Tuned

üîÑ Logistic Regression...
   ‚úÖ ROC-AUC: 0.4999 | F1: 0.4941 | Temps: 0.84s

üîÑ Random Forest...
   ‚úÖ ROC-AUC: 0.5055 | F1: 0.5010 | Temps: 15.98s

üîÑ XGBoost...
   ‚úÖ ROC-AUC: 0.5029 | F1: 0.5343 | Temps: 391.50s

üîÑ LightGBM...
[LightGBM] [Info] Number of positive: 16051, number of negative: 15721
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 9.656601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 946406
[LightGBM] [Info] Number of data points in the train set: 31772, number of used features: 17099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
   ‚úÖ ROC-AUC: 0.5098 | F1: 0.5153 | Temps: 299.74s

üîÑ CatBoost...
   ‚úÖ ROC-AUC: 0.5040 | F1: 0.4943 | Temps: 1096.85s



# **CELL 10 ‚Äî Cr√©er les ensembles**

In [13]:
def create_stacking_ensemble(tuned_models):
    print("\n" + "="*80)
    print("CR√âATION DES ENSEMBLES - STACKING & VOTING")
    print("="*80 + "\n")

    estimators = [(name, model) for name, model in tuned_models.items()]

    stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=500, class_weight="balanced"),
        n_jobs=-1
    )

    voting_clf = VotingClassifier(
        estimators=estimators,
        voting="soft",
        n_jobs=-1
    )

    ensemble_models = {
        "Stacking": stacking_clf,
        "Voting (Soft)": voting_clf
    }

    print("‚úÖ Ensembles cr√©√©s.")
    return ensemble_models

ensemble_models = create_stacking_ensemble(tuned_models)



CR√âATION DES ENSEMBLES - STACKING & VOTING

‚úÖ Ensembles cr√©√©s.


# **CELL 11 ‚Äî Entra√Æner & √©valuer les ensembles**

In [14]:
ensemble_results = train_and_evaluate_models(
    ensemble_models, X_train, y_train, X_test, y_test, "Ensemble"
)


ENTRA√éNEMENT ET √âVALUATION - Ensemble

üîÑ Stacking...
   ‚úÖ ROC-AUC: 0.4924 | F1: 0.4969 | Temps: 8216.55s

üîÑ Voting (Soft)...
   ‚úÖ ROC-AUC: 0.5067 | F1: 0.5123 | Temps: 1830.16s



#**CELL 12 ‚Äî Tableau comparatif final**

In [15]:
def create_comprehensive_comparison(baseline_results, tuned_results, ensemble_results):
    comparison_data = []

    for name, m in baseline_results.items():
        comparison_data.append({
            "Mod√®le": f"{name} (Baseline)",
            "Type": "Baseline",
            "Accuracy": m["accuracy"],
            "Precision": m["precision"],
            "Recall": m["recall"],
            "F1-Score": m["f1_score"],
            "ROC-AUC": m["roc_auc"],
            "Temps (s)": m["training_time"],
        })

    for name, m in tuned_results.items():
        comparison_data.append({
            "Mod√®le": f"{name} (Tuned)",
            "Type": "Fine-Tuned",
            "Accuracy": m["accuracy"],
            "Precision": m["precision"],
            "Recall": m["recall"],
            "F1-Score": m["f1_score"],
            "ROC-AUC": m["roc_auc"],
            "Temps (s)": m["training_time"],
        })

    for name, m in ensemble_results.items():
        comparison_data.append({
            "Mod√®le": f"{name} (Ensemble)",
            "Type": "Ensemble",
            "Accuracy": m["accuracy"],
            "Precision": m["precision"],
            "Recall": m["recall"],
            "F1-Score": m["f1_score"],
            "ROC-AUC": m["roc_auc"],
            "Temps (s)": m["training_time"],
        })

    df_comparison = pd.DataFrame(comparison_data)
    df_comparison = df_comparison.sort_values(
        by="ROC-AUC", ascending=False
    ).reset_index(drop=True)

    return df_comparison

df_comparison = create_comprehensive_comparison(
    baseline_results, tuned_results, ensemble_results
)

df_comparison.to_csv(os.path.join(PROCESSOR_DIR, "model_comparison_final.csv"), index=False)

display(df_comparison.style.format({
    "Accuracy": "{:.4f}",
    "Precision": "{:.4f}",
    "Recall": "{:.4f}",
    "F1-Score": "{:.4f}",
    "ROC-AUC": "{:.4f}",
    "Temps (s)": "{:.2f}",
}))

best_model_row = df_comparison.iloc[0]
print("\nüèÜ MEILLEUR MOD√àLE GLOBAL:", best_model_row["Mod√®le"])
print("   Type:", best_model_row["Type"])
print("   ROC-AUC:", best_model_row["ROC-AUC"])


Unnamed: 0,Mod√®le,Type,Accuracy,Precision,Recall,F1-Score,ROC-AUC,Temps (s)
0,LightGBM (Baseline),Baseline,0.5087,0.5142,0.4964,0.5051,0.5109,123.61
1,LightGBM (Tuned),Fine-Tuned,0.5111,0.5161,0.5146,0.5153,0.5098,299.74
2,Voting (Soft) (Ensemble),Ensemble,0.5089,0.514,0.5106,0.5123,0.5067,1830.16
3,Random Forest (Baseline),Baseline,0.504,0.5094,0.4929,0.501,0.5055,15.55
4,Random Forest (Tuned),Fine-Tuned,0.504,0.5094,0.4929,0.501,0.5055,15.98
5,XGBoost (Baseline),Baseline,0.503,0.5074,0.5562,0.5307,0.505,289.08
6,CatBoost (Tuned),Fine-Tuned,0.4987,0.504,0.4849,0.4943,0.504,1096.85
7,XGBoost (Tuned),Fine-Tuned,0.5037,0.5079,0.5637,0.5343,0.5029,391.5
8,CatBoost (Baseline),Baseline,0.4984,0.5037,0.4752,0.489,0.5015,749.09
9,Logistic Regression (Baseline),Baseline,0.4989,0.5041,0.4844,0.4941,0.4999,0.82



üèÜ MEILLEUR MOD√àLE GLOBAL: LightGBM (Baseline)
   Type: Baseline
   ROC-AUC: 0.5108995801802372


#**CELL 13 ‚Äî Tableau des am√©liorations (baseline vs tuned)**

In [16]:
def analyze_improvements(baseline_results, tuned_results):
    rows = []
    for name, base_metrics in baseline_results.items():
        if name not in tuned_results:
            continue
        tuned_metrics = tuned_results[name]
        base_auc = base_metrics["roc_auc"]
        tuned_auc = tuned_metrics["roc_auc"]
        gain_abs = tuned_auc - base_auc
        gain_pct = (gain_abs / base_auc) * 100 if base_auc > 0 else 0.0

        rows.append({
            "Mod√®le": name,
            "Baseline AUC": base_auc,
            "Tuned AUC": tuned_auc,
            "Gain absolu": gain_abs,
            "Am√©lioration (%)": gain_pct,
        })

    df_improvements = pd.DataFrame(rows).sort_values(
        by="Am√©lioration (%)", ascending=False
    )

    return df_improvements

df_improvements = analyze_improvements(baseline_results, tuned_results)
df_improvements.to_csv(os.path.join(PROCESSOR_DIR, "model_improvements.csv"), index=False)
df_improvements


Unnamed: 0,Mod√®le,Baseline AUC,Tuned AUC,Gain absolu,Am√©lioration (%)
4,CatBoost,0.50154,0.503985,0.002446,0.487643
0,Logistic Regression,0.49986,0.49986,0.0,0.0
1,Random Forest,0.505535,0.505535,0.0,0.0
3,LightGBM,0.5109,0.509829,-0.00107,-0.209529
2,XGBoost,0.505015,0.502924,-0.002091,-0.413962


# **CELL 14 ‚Äî √âvaluer les mod√®les tun√©s**

In [17]:
tuned_results = train_and_evaluate_models(
    tuned_models, X_train, y_train, X_test, y_test, "Fine-Tuned"
)


ENTRA√éNEMENT ET √âVALUATION - Fine-Tuned

üîÑ Logistic Regression...
   ‚úÖ ROC-AUC: 0.4999 | F1: 0.4941 | Temps: 1.49s

üîÑ Random Forest...
   ‚úÖ ROC-AUC: 0.5055 | F1: 0.5010 | Temps: 15.60s

üîÑ XGBoost...
   ‚úÖ ROC-AUC: 0.5029 | F1: 0.5343 | Temps: 391.86s

üîÑ LightGBM...
[LightGBM] [Info] Number of positive: 16051, number of negative: 15721
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 11.703305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 946406
[LightGBM] [Info] Number of data points in the train set: 31772, number of used features: 17099
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
   ‚úÖ ROC-AUC: 0.5098 | F1: 0.5153 | Temps: 287.66s

üîÑ CatBoost...
   ‚úÖ ROC-AUC: 0.5040 | F1: 0.4943 | Temps: 1099.97s



# **CELL 14 ‚Äî Sauvegarder best model + tuned + ensembles**

In [18]:
def save_best_final_model(baseline_results, tuned_results, ensemble_results,
                          tuned_models, ensemble_models, best_params,
                          processor_dir=PROCESSOR_DIR):
    print("\n" + "="*80)
    print("üíæ SAUVEGARDE DU MEILLEUR MOD√àLE FINAL")
    print("="*80 + "\n")

    all_results = {}
    all_models = {}

    for name, m in baseline_results.items():
        all_results[f"{name} (Baseline)"] = m
        all_models[f"{name} (Baseline)"] = m["model"]

    for name, m in tuned_results.items():
        all_results[f"{name} (Tuned)"] = m
        all_models[f"{name} (Tuned)"] = tuned_models[name]

    for name, m in ensemble_results.items():
        all_results[f"{name} (Ensemble)"] = m
        all_models[f"{name} (Ensemble)"] = ensemble_models[name]

    best_model_name, best_metrics = max(
        all_results.items(), key=lambda x: x[1]["roc_auc"]
    )
    best_model = all_models[best_model_name]

    models_dir = os.path.join(processor_dir, "models")
    os.makedirs(models_dir, exist_ok=True)

    best_model_path = os.path.join(models_dir, "best_model_final.pkl")
    with open(best_model_path, "wb") as f:
        pickle.dump(best_model, f)
    print(f"‚úÖ Meilleur mod√®le sauvegard√©: {best_model_path}")

    metadata = {
        "model_name": best_model_name,
        "model_type": ("Ensemble" if "Ensemble" in best_model_name else
                       "Fine-Tuned" if "Tuned" in best_model_name else "Baseline"),
        "metrics": {
            "accuracy": best_metrics["accuracy"],
            "precision": best_metrics["precision"],
            "recall": best_metrics["recall"],
            "f1_score": best_metrics["f1_score"],
            "roc_auc": best_metrics["roc_auc"],
        },
        "training_time": best_metrics["training_time"],
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "best_params": best_params if "(Tuned)" in best_model_name else None,
    }

    metadata_path = os.path.join(models_dir, "best_model_final_metadata.pkl")
    with open(metadata_path, "wb") as f:
        pickle.dump(metadata, f)
    print(f"‚úÖ M√©tadonn√©es sauvegard√©es: {metadata_path}")

    all_tuned_path = os.path.join(models_dir, "all_tuned_models.pkl")
    with open(all_tuned_path, "wb") as f:
        pickle.dump(tuned_models, f)
    print(f"‚úÖ Tous les mod√®les tun√©s sauvegard√©s: {all_tuned_path}")

    ensemble_path = os.path.join(models_dir, "ensemble_models.pkl")
    with open(ensemble_path, "wb") as f:
        pickle.dump(ensemble_models, f)
    print(f"‚úÖ Mod√®les ensemble sauvegard√©s: {ensemble_path}")

    print("\n‚úÖ Sauvegarde termin√©e.")

    return best_model_name, best_model, metadata

best_model_name, best_model, metadata = save_best_final_model(
    baseline_results, tuned_results, ensemble_results,
    tuned_models, ensemble_models, best_params
)



üíæ SAUVEGARDE DU MEILLEUR MOD√àLE FINAL

‚úÖ Meilleur mod√®le sauvegard√©: /content/projet_llm/notebooks/processors/models/best_model_final.pkl
‚úÖ M√©tadonn√©es sauvegard√©es: /content/projet_llm/notebooks/processors/models/best_model_final_metadata.pkl
‚úÖ Tous les mod√®les tun√©s sauvegard√©s: /content/projet_llm/notebooks/processors/models/all_tuned_models.pkl
‚úÖ Mod√®les ensemble sauvegard√©s: /content/projet_llm/notebooks/processors/models/ensemble_models.pkl

‚úÖ Sauvegarde termin√©e.


# **CELL 15 ‚Äî Rapport texte final**

In [19]:
def generate_comprehensive_report(df_comparison, df_improvements, best_model_name,
                                  metadata, baseline_results, tuned_results, ensemble_results):
    report = f"""
{'='*80}
üìä RAPPORT FINAL COMPLET - MOD√âLISATION LLM PREFERENCES
{'='*80}

üìÖ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
üéØ Projet: LLM Preference Classification (Chatbot Arena)

{'‚îÄ'*80}
üìà PIPELINE COMPLET EX√âCUT√â
{'‚îÄ'*80}

1. ‚úÖ Mod√®les Baseline: {len(baseline_results)} mod√®les
2. ‚úÖ Fine-Tuning: RandomizedSearchCV sur {len(tuned_results)} mod√®les
3. ‚úÖ Ensemble Learning: Stacking + Voting
4. ‚úÖ Comparaison compl√®te: {len(df_comparison)} configurations test√©es

{'‚îÄ'*80}
üèÜ MEILLEUR MOD√àLE GLOBAL
{'‚îÄ'*80}

Mod√®le: {best_model_name}
Type: {metadata['model_type']}
ROC-AUC: {metadata['metrics']['roc_auc']:.4f}
F1-Score: {metadata['metrics']['f1_score']:.4f}
Recall: {metadata['metrics']['recall']:.4f}
Accuracy: {metadata['metrics']['accuracy']:.4f}
"""

    report += f"""
{'‚îÄ'*80}
üìä TOP 5 MOD√àLES
{'‚îÄ'*80}
"""
    for idx, row in df_comparison.head(5).iterrows():
        report += f"""
{idx+1}. {row['Mod√®le']} ({row['Type']})
   AUC: {row['ROC-AUC']:.4f} | F1: {row['F1-Score']:.4f} | Recall: {row['Recall']:.4f}
"""

    report += f"""
{'‚îÄ'*80}
üîß IMPACT DU FINE-TUNING
{'‚îÄ'*80}
"""
    for _, row in df_improvements.iterrows():
        report += f"""
{row['Mod√®le']}:
  Base: {row['Baseline AUC']:.4f} ‚Üí Tuned: {row['Tuned AUC']:.4f}
  Am√©lioration: {row['Am√©lioration (%)']:+.2f}% ({row['Gain absolu']:+.4f})
"""

    report_path = os.path.join(PROCESSOR_DIR, "modeling_report_final.txt")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(report)
    print(f"\nüíæ Rapport sauvegard√©: {report_path}")

generate_comprehensive_report(
    df_comparison, df_improvements, best_model_name,
    metadata, baseline_results, tuned_results, ensemble_results
)



üìä RAPPORT FINAL COMPLET - MOD√âLISATION LLM PREFERENCES

üìÖ Date: 2025-12-12 02:37:23
üéØ Projet: LLM Preference Classification (Chatbot Arena)

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìà PIPELINE COMPLET EX√âCUT√â
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

1. ‚úÖ Mod√®les Baseline: 5 mod√®les
2. ‚úÖ Fine-Tuning: RandomizedSearchCV sur 5 mod√®les
3. ‚úÖ Ensemble Learning: Stacking + Voting
4. ‚úÖ Comparaison compl√®te: 12 configurations test√©es

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î

# **CELL 16 ‚Äî ZIP du projet + sauvegarde dans Google Drive + t√©l√©chargement**

In [20]:
from google.colab import drive
drive.mount('/content/drive')

PROJECT_DIR = BASE_DIR
ZIP_PATH = "/content/projet_llm_mlops.zip"

print("üì¶ Zippage du projet...")
!zip -r "{ZIP_PATH}" "{PROJECT_DIR}"

DEST_DRIVE_PATH = "/content/drive/MyDrive/projet_llm_mlops.zip"
!cp "{ZIP_PATH}" "{DEST_DRIVE_PATH}"

print("üéâ All done! ZIP saved to Google Drive ‚Üí", DEST_DRIVE_PATH)

# T√©l√©chargement direct sur ton PC (optionnel)
from google.colab import files
files.download(ZIP_PATH)


ValueError: mount failed