In [2]:
# Tuning SVR (RBF) avec cible en log et Pipeline pour éviter les fuites en CV
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Single seed reused as `random_state` for the train/test split and the
# randomized hyper-parameter search, so repeated runs give the same results.
RANDOM_STATE = 42

def metrics(y_true, y_pred):
    """Compute regression quality scores for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and coefficient of determination, in that order.
    """
    # RMSE is derived from the MSE so it is expressed in the target's units.
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# 1) Load the data
df = pd.read_csv("ensurance.csv")

# 2) Target (y) and features (X)
# 3) Simple encoding: one-hot encode the features, dropping the first level
#    of each categorical column.
y = df["charges"]
X = pd.get_dummies(df.drop(columns=["charges"]), drop_first=True)

# 4) Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# 5) The model is trained on log1p(y); the test target stays in original units
y_train_log = np.log1p(y_train)

# 6) Pipeline: StandardScaler (all columns are already numeric) followed by an
#    RBF-kernel SVR. Keeping the scaler inside the pipeline means it is re-fit
#    on the training part of every CV fold.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("svr", SVR(kernel="rbf")),
    ]
)

# 7) Small search space over the SVR hyper-parameters
param_dist = dict(
    svr__C=[10, 50, 100],
    svr__epsilon=[0.1, 0.2, 0.3],
    svr__gamma=["scale", 0.1, 0.05],
)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=12,
    # The search is fit on the log target, so this MSE is measured in log space.
    scoring="neg_mean_squared_error",
    cv=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    refit=True,
)

# 8) Fit on (X_train, log1p(y_train))
search.fit(X_train, y_train_log)

print("Best params (SVR log-target):", search.best_params_)
# best_score_ is a negated MSE; flip the sign before taking the square root.
print("CV best RMSE (log-espace):", np.sqrt(-search.best_score_))

# 9) Evaluate in original units: predict in log space with the refit best
#    estimator, then invert log1p with expm1.
y_pred_log = search.predict(X_test)
y_pred = np.expm1(y_pred_log)
rmse, mae, r2 = metrics(y_test, y_pred)
print(f"SVR log-target (tuned) -> RMSE:{rmse:.2f}  MAE:{mae:.2f}  R²:{r2:.4f}")

Best params (SVR log-target): {'svr__gamma': 0.05, 'svr__epsilon': 0.1, 'svr__C': 50}
CV best RMSE (log-espace): 0.37907878009603113
SVR log-target (tuned) -> RMSE:4999.70  MAE:2321.32  R²:0.8390
