In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# --- Helper for metrics ---
def metrics(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is the square root of ``mean_squared_error``; MAE and R² are taken
    directly from ``mean_absolute_error`` and ``r2_score``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

# --- Load data ---
csv_path = "ensurance.csv"  # <- keep your path; change if needed
df = pd.read_csv(csv_path)

# Target is the "charges" column; every other column is a feature.
# Features are one-hot encoded with the first level of each category dropped.
X = pd.get_dummies(df.drop(columns=["charges"]), drop_first=True)
y = df["charges"]

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Numeric columns to standardise, restricted to the ones actually present
numeric_cols = [col for col in ("age", "bmi", "children") if col in X_train.columns]

# The scaler learns its statistics from TRAIN only and is then applied to both
# splits, so nothing about the test rows leaks into the scaling.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
if numeric_cols:
    scaler.fit(X_train[numeric_cols])
    X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Baseline (predict mean): every test row gets the mean of the training target
baseline = y_train.mean()
y_pred_base = np.full(y_test.shape, baseline, dtype=float)
rmse_b, mae_b, r2_b = metrics(y_test, y_pred_base)

# Each entry is (model name, RMSE, MAE, R²) measured on the test split
results = [("Baseline-Mean", rmse_b, mae_b, r2_b)]

# Settings shared by all the fixed RBF models below
rbf_params = dict(kernel="rbf", C=10, epsilon=0.2, gamma="scale")

# A) SVR RBF, no scaling
svr_rbf_ns = SVR(**rbf_params).fit(X_train, y_train)
pred = svr_rbf_ns.predict(X_test)
results.append(("SVR RBF no scaling",) + tuple(metrics(y_test, pred)))

# B) SVR RBF, with scaling
svr_rbf_s = SVR(**rbf_params).fit(X_train_scaled, y_train)
pred = svr_rbf_s.predict(X_test_scaled)
results.append(("SVR RBF with scaling",) + tuple(metrics(y_test, pred)))

# C) SVR Linear, no scaling
svr_lin_ns = SVR(kernel="linear", C=10, epsilon=0.2).fit(X_train, y_train)
pred = svr_lin_ns.predict(X_test)
results.append(("SVR linear no scaling",) + tuple(metrics(y_test, pred)))

# D) SVR RBF + scaling + LOG TARGET (often big win)
# The model is trained on log1p(y); predictions are mapped back with expm1 so
# the metrics are computed in the original units of the target.
y_train_log = np.log1p(y_train)
svr_rbf_log = SVR(**rbf_params).fit(X_train_scaled, y_train_log)
pred_log = svr_rbf_log.predict(X_test_scaled)
pred = np.expm1(pred_log)
results.append(("SVR RBF scaled + log(y)",) + tuple(metrics(y_test, pred)))

# E) Tiny parameter sweep for RBF on LOG TARGET (keep it simple)
#
# Model selection must not look at the test set: choosing the configuration
# with the lowest *test* RMSE makes the reported "best" score optimistically
# biased. Instead, a validation fold is carved out of the training data, the
# winner is picked on that fold, and the test metrics are used for reporting
# only.
# NOTE: X_train_scaled was standardised with statistics from the whole
# training set, so the validation fold is not perfectly unseen by the scaler;
# for this tiny sweep that is an accepted simplification.
X_fit, X_val, y_fit_log, y_val_log = train_test_split(
    X_train_scaled, y_train_log, test_size=0.25, random_state=42
)
y_val = np.expm1(y_val_log)  # validation target back in original units

best = None
best_val_rmse = np.inf
for C in [10, 50, 100]:
    for eps in [0.1, 0.2, 0.3]:
        for gamma in ["scale", 0.1, 0.05]:
            # Selection score: fit on the reduced training fold, score on the
            # validation fold (RMSE in original units, like the test metric).
            val_model = SVR(kernel="rbf", C=C, epsilon=eps, gamma=gamma)
            val_model.fit(X_fit, y_fit_log)
            val_rmse, _, _ = metrics(y_val, np.expm1(val_model.predict(X_val)))

            # Reporting: refit on the full training set and evaluate on test.
            svr = SVR(kernel="rbf", C=C, epsilon=eps, gamma=gamma)
            svr.fit(X_train_scaled, y_train_log)
            p = np.expm1(svr.predict(X_test_scaled))
            rmse, mae, r2 = metrics(y_test, p)
            name = f"SVR RBF scaled + log(y) C={C} eps={eps} gamma={gamma}"
            results.append((name, rmse, mae, r2))

            # `best` keeps the (name, test RMSE, test MAE, test R²) tuple of
            # the configuration with the lowest *validation* RMSE.
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best = (name, rmse, mae, r2)

# Print results
print("\nResults (lower RMSE/MAE is better, higher R² is better):")
# The sweep names are longer than the original fixed width of 35, which left
# the metric columns misaligned. Pad to the longest name instead (never less
# than 35, so short tables look the same as before).
name_width = max(35, max((len(row[0]) for row in results), default=0))
for name, rmse, mae, r2 in results:
    print(f"{name:{name_width}s} -> RMSE:{rmse:8.2f}  MAE:{mae:8.2f}  R²:{r2:7.4f}")

# `best` is the (name, RMSE, MAE, R²) tuple recorded by the sweep
print("\nBest (by RMSE) from the tiny sweep:")
print(best)


Results (lower RMSE/MAE is better, higher R² is better):
Baseline-Mean                       -> RMSE:12465.61  MAE: 9593.34  R²:-0.0009
SVR RBF no scaling                  -> RMSE:12844.20  MAE: 8190.24  R²:-0.0626
SVR RBF with scaling                -> RMSE:12782.60  MAE: 8283.54  R²:-0.0525
SVR linear no scaling               -> RMSE:12136.51  MAE: 6025.51  R²: 0.0512
SVR RBF scaled + log(y)             -> RMSE: 5447.12  MAE: 2888.76  R²: 0.8089
SVR RBF scaled + log(y) C=10 eps=0.1 gamma=scale -> RMSE: 5162.18  MAE: 2364.04  R²: 0.8284
SVR RBF scaled + log(y) C=10 eps=0.1 gamma=0.1 -> RMSE: 5027.01  MAE: 2301.63  R²: 0.8372
SVR RBF scaled + log(y) C=10 eps=0.1 gamma=0.05 -> RMSE: 4770.90  MAE: 2188.85  R²: 0.8534
SVR RBF scaled + log(y) C=10 eps=0.2 gamma=scale -> RMSE: 5447.12  MAE: 2888.76  R²: 0.8089
SVR RBF scaled + log(y) C=10 eps=0.2 gamma=0.1 -> RMSE: 4979.88  MAE: 2785.03  R²: 0.8403
SVR RBF scaled + log(y) C=10 eps=0.2 gamma=0.05 -> RMSE: 4758.95  MAE: 2747.40  R²: 0.8541
S