In [1]:
# Hyperparameter tuning basique pour RandomForest et XGBoost
# - get_dummies (encodage simple)
# - split train/test
# - évalue défaut vs après tuning (CV=5)
# - imprime RMSE/MAE/R² avant/après
# - XGBoost est optionnel (skip si non installé)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Optional dependency: XGBoost is used only when it can be imported.
# HAS_XGB records the outcome so later sections can skip the XGBoost steps.
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception:
    # Any exception raised during the import (not only ImportError) disables XGBoost.
    HAS_XGB = False

# Seed passed as random_state to the train/test split, the models and the searches.
RANDOM_STATE = 42

def metrics(y_true, y_pred):
    """Return the regression scores ``(rmse, mae, r2)`` for a set of predictions.

    RMSE is the square root of the mean squared error; MAE and R² come
    straight from the corresponding scikit-learn metric functions.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

def print_scores(title, y_true, y_pred):
    """Print one aligned line of RMSE / MAE / R² for *title* and return the scores.

    The values are computed by ``metrics(y_true, y_pred)`` and returned as
    ``(rmse, mae, r2)`` so callers can store exactly what was printed.
    """
    scores = metrics(y_true, y_pred)
    rmse, mae, r2 = scores
    print(f"{title:30s} -> RMSE:{rmse:8.2f}  MAE:{mae:8.2f}  R²:{r2:7.4f}")
    return scores

# 1) Load the data
csv_path = "ensurance.csv"  # change if needed
df = pd.read_csv(csv_path)

# 2) Target (y) is the "charges" column
y = df["charges"]

# 3) Features (X): every other column, with categoricals one-hot encoded
#    (first level of each category dropped)
X = pd.get_dummies(df.drop(columns=["charges"]), drop_first=True)

# 4) 80/20 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)

default_results = []
print("=== Scores par défaut (avant tuning) ===")

# Baseline: predict the training-set mean of the target for every test row.
baseline = y_train.mean()
y_pred_base = np.full(y_test.shape, baseline, dtype=float)
base_scores = print_scores("Baseline-Mean", y_test, y_pred_base)
default_results.append(("Baseline-Mean", *base_scores))

# A) Random forest without hyperparameter tuning.
rf_default = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)
rf_default.fit(X_train, y_train)
y_pred_rf_def = rf_default.predict(X_test)
rf_def_scores = print_scores("RandomForest (default)", y_test, y_pred_rf_def)
default_results.append(("RandomForest (default)", *rf_def_scores))

# B) XGBoost without tuning, only when the import succeeded.
if not HAS_XGB:
    print("XGBoost non installé -> on saute cette partie (pip install xgboost)")
else:
    # NOTE: n_estimators and tree_method are set explicitly for this untuned
    # run rather than left to the library's own defaults.
    xgb_default = XGBRegressor(
        objective="reg:squarederror",
        random_state=RANDOM_STATE,
        n_estimators=300,
        tree_method="auto",
    )
    xgb_default.fit(X_train, y_train)
    y_pred_xgb_def = xgb_default.predict(X_test)
    xgb_def_scores = print_scores("XGBoost (default)", y_test, y_pred_xgb_def)
    default_results.append(("XGBoost (default)", *xgb_def_scores))

print("\n=== Tuning (RandomizedSearchCV, CV=5) ===")
tuned_results = []

# Random forest search space: discrete candidate values sampled at random.
param_dist_rf = {
    "n_estimators": [200, 400, 800, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}

rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)
rf_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist_rf,
    n_iter=20,  # number of sampled parameter combinations
    scoring="neg_mean_squared_error",  # maximising -MSE, i.e. minimising MSE
    cv=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    refit=True,  # refit on all of X_train with the best hyperparameters
)
rf_search.fit(X_train, y_train)

rf_best = rf_search.best_estimator_
y_pred_rf_best = rf_best.predict(X_test)
print(f"\nRandomForest best params: {rf_search.best_params_}")
# best_score_ is the mean negative MSE over the folds, so the value printed
# here is the square root of the mean fold MSE.
print(f"RandomForest CV best RMSE (moyenne folds): {np.sqrt(-rf_search.best_score_):.2f}")
# print_scores returns the metrics it prints; reuse them instead of evaluating
# the same predictions a second time with metrics().
rmse, mae, r2 = print_scores("RandomForest (tuned)", y_test, y_pred_rf_best)
tuned_results.append(("RandomForest (tuned)", rmse, mae, r2))

# XGBoost search space and randomized search (skipped when XGBoost is unavailable).
if HAS_XGB:
    param_dist_xgb = {
        "n_estimators": [300, 600, 900, 1200],
        "max_depth": [3, 4, 5, 6, 8],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "subsample": [0.7, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0],
        "min_child_weight": [1, 3, 5],
        # "reg_lambda": [1.0, 1.5, 2.0],  # uncomment to also try regularization
    }
    xgb = XGBRegressor(objective="reg:squarederror", random_state=RANDOM_STATE)
    xgb_search = RandomizedSearchCV(
        xgb,
        param_distributions=param_dist_xgb,
        n_iter=20,  # number of sampled parameter combinations
        scoring="neg_mean_squared_error",  # maximising -MSE, i.e. minimising MSE
        cv=5,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        refit=True,  # refit on all of X_train with the best hyperparameters
    )
    xgb_search.fit(X_train, y_train)

    xgb_best = xgb_search.best_estimator_
    y_pred_xgb_best = xgb_best.predict(X_test)
    print(f"\nXGBoost best params: {xgb_search.best_params_}")
    # best_score_ is the mean negative MSE over the folds, so the value printed
    # here is the square root of the mean fold MSE.
    print(f"XGBoost CV best RMSE (moyenne folds): {np.sqrt(-xgb_search.best_score_):.2f}")
    # print_scores returns the metrics it prints; reuse them instead of
    # evaluating the same predictions a second time with metrics().
    tuned_results.append(
        ("XGBoost (tuned)", *print_scores("XGBoost (tuned)", y_test, y_pred_xgb_best))
    )

print("\n=== Résumé avant vs après (sur test) ===")
def to_df(rows):
    """Build a results table from ``(model, rmse, mae, r2)`` rows.

    The table has columns Model / RMSE / MAE / R2, is sorted by ascending
    RMSE, and gets a fresh 0..n-1 index.
    """
    table = pd.DataFrame(rows, columns=["Model", "RMSE", "MAE", "R2"])
    ranked = table.sort_values(by="RMSE")
    return ranked.reset_index(drop=True)

# Print each results table under its heading (before tuning, then after).
for heading, rows in (
    ("Avant tuning:", default_results),
    ("\nAprès tuning:", tuned_results),
):
    print(heading)
    print(to_df(rows))

=== Scores par défaut (avant tuning) ===
Baseline-Mean                  -> RMSE:12465.61  MAE: 9593.34  R²:-0.0009
RandomForest (default)         -> RMSE: 4576.30  MAE: 2550.08  R²: 0.8651
XGBoost non installé -> on saute cette partie (pip install xgboost)

=== Tuning (RandomizedSearchCV, CV=5) ===

RandomForest best params: {'n_estimators': 1200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 5}
RandomForest CV best RMSE (moyenne folds): 4673.68
RandomForest (tuned)           -> RMSE: 4379.02  MAE: 2518.53  R²: 0.8765

=== Résumé avant vs après (sur test) ===
Avant tuning:
                    Model          RMSE          MAE        R2
0  RandomForest (default)   4576.299916  2550.078471  0.865103
1           Baseline-Mean  12465.610442  9593.338461 -0.000919

Après tuning:
                  Model        RMSE          MAE        R2
0  RandomForest (tuned)  4379.02331  2518.534658  0.876483
