In [1]:
# Super simple Linear Regression on your CSV
# No pipelines, no complex preprocessing

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1) Load the data
csv_path = "ensurance.csv"  # <- make sure the file is in the same folder
df = pd.read_csv(csv_path)

# 2) Split into features (X) and target (y)
y = df["charges"]
X = df.drop(columns=["charges"])

# 3) Turn text columns into numbers (one-hot encoding) the easy way
#    drop_first=True drops the first level of every categorical column, so a
#    column with k categories becomes k-1 indicator columns (the dropped level
#    is implied when all of its indicators are 0).
X = pd.get_dummies(X, drop_first=True)

# 4) Split into train (80%) and test (20%); random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5) Make the model
model = LinearRegression()

# 6) Train (fit) the model
model.fit(X_train, y_train)

# 7) Predict on the test set
y_pred = model.predict(X_test)

# Show the fitted intercept immediately followed by the coefficient array.
# One print is enough: the f-string already contains model.coef_, and print()
# returns None, so a second print chained with `and` would never execute.
print(f"hada :{model.intercept_}{model.coef_}")

# 8) Evaluate on the held-out test set
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression (basic) results on test set")
print(f"RMSE: {rmse:.2f}")
print(f"MAE:  {mae:.2f}")
print(f"R²:   {r2:.4f}")

# 9) (Optional) Show a few predictions vs real values
#    .values drops the shuffled index so both columns line up positionally
preview = pd.DataFrame({
    "Actual": y_test.values[:10],
    "Predicted": y_pred[:10]
})
print("\nSample predictions (first 10):")
print(preview)

hada :-11931.219050326692[ 2.56975706e+02  3.37092552e+02  4.25278784e+02 -1.85916916e+01
  2.36511289e+04 -3.70677326e+02 -6.57864297e+02 -8.09799354e+02]
Linear Regression (basic) results on test set
RMSE: 5796.28
MAE:  4181.19
R²:   0.7836

Sample predictions (first 10):
        Actual     Predicted
0   9095.06825   8969.550274
1   5272.17580   7068.747443
2  29330.98315  36858.410912
3   9301.89355   9454.678501
4  33750.29180  26973.173457
5   4536.25900  10864.113164
6   2117.33885    170.280841
7  14210.53595  16903.450287
8   3732.62510   1092.430936
9  10264.44210  11218.343184


In [7]:
# Compare LinearRegression, RandomForestRegressor, XGBRegressor, and SVR
# Simple style: pandas.get_dummies + train_test_split + fit/predict + metrics
# Note: SVR needs scaling; we'll scale only for SVR to keep it simple.

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Try XGBoost if installed; skip if not
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# 1) Read the dataset
csv_path = "ensurance.csv"  # <- update if your file has a different name/path
df = pd.read_csv(csv_path)

# 2) "charges" is the target; every other column is a feature
X = df.drop(columns=["charges"])
y = df["charges"]

# 3) One-hot encode the categorical columns (first level of each is dropped)
X = pd.get_dummies(X, drop_first=True)

# 4) Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5) Candidate models, keyed by the label used in the printed results
models = {}
models["LinearRegression"] = LinearRegression()
models["RandomForestRegressor"] = RandomForestRegressor(random_state=42)
models["SVR"] = SVR()  # default RBF kernel; needs scaling

# XGBoost is optional: it is only added when its import succeeded earlier
if HAS_XGB:
    models.update({
        "XGBRegressor": XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            verbosity=0,
        )
    })

# 6) Helper to compute metrics
def eval_metrics(y_true, y_pred):
    """Score predictions against the true values.

    Returns a 3-tuple ``(rmse, mae, r2)``: the square root of
    ``mean_squared_error``, then ``mean_absolute_error``, then ``r2_score``,
    each computed on ``(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = (
        np.sqrt(mse),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )
    return scores

# 7) Reference point: a constant predictor that always outputs the mean of the
#    training target. Any useful model should beat this RMSE.
baseline = y_train.mean()
y_pred_base = np.full(y_test.shape, baseline, dtype=float)
rmse_b = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_base))
print(f"Baseline (predict mean) RMSE: {rmse_b:.2f}\n")

# 8) Fit every model on the training split and score it on the test split
results = []
for name, model in models.items():
    # Default: feed the one-hot-encoded frames to the estimator unchanged
    X_train_use, X_test_use = X_train, X_test
    if name == "SVR":
        # SVR is the only model given standardized inputs. The scaler is
        # fitted on the training split and then reused on the test split.
        # NOTE(review): only the features are scaled; y_train stays in its
        # original units and SVR keeps its default C/epsilon. Worth checking
        # whether that is why the recorded SVR RMSE is worse than the mean
        # baseline.
        scaler = StandardScaler()
        X_train_use = scaler.fit_transform(X_train)
        X_test_use = scaler.transform(X_test)

    # Train, then predict the held-out rows
    model.fit(X_train_use, y_train)
    y_pred = model.predict(X_test_use)

    # Score and remember the row for the summary table
    rmse, mae, r2 = eval_metrics(y_test, y_pred)
    results.append(dict(Model=name, RMSE=rmse, MAE=mae, R2=r2))

    print(
        f"{name} results:",
        f"  RMSE: {rmse:.2f}",
        f"  MAE:  {mae:.2f}",
        f"  R²:   {r2:.4f}",
        sep="\n",
    )

    # Only the linear model gets a prediction preview, to keep output short
    if name == "LinearRegression":
        preview = pd.DataFrame({"Actual": y_test.values[:5], "Predicted": y_pred[:5]})
        print("  Sample predictions:\n", preview, "\n")

# 9) One row per model, best (lowest RMSE) first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("RMSE").reset_index(drop=True)
print("\nSummary (sorted by RMSE):")
print(results_df)

Baseline (predict mean) RMSE: 12465.61

LinearRegression results:
  RMSE: 5796.28
  MAE:  4181.19
  R²:   0.7836
  Sample predictions:
         Actual     Predicted
0   9095.06825   8969.550274
1   5272.17580   7068.747443
2  29330.98315  36858.410912
3   9301.89355   9454.678501
4  33750.29180  26973.173457 

RandomForestRegressor results:
  RMSE: 4576.30
  MAE:  2550.08
  R²:   0.8651
SVR results:
  RMSE: 12889.10
  MAE:  8612.41
  R²:   -0.0701

Summary (sorted by RMSE):
                   Model          RMSE          MAE        R2
0  RandomForestRegressor   4576.299916  2550.078471  0.865103
1       LinearRegression   5796.284659  4181.194474  0.783593
2                    SVR  12889.096315  8612.408423 -0.070082


In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -----------------------
# Helper: IQR bounds
# -----------------------
def iqr_bounds(series, k=1.5):
    """Return the ``(lower, upper)`` interquartile-range fences of ``series``.

    Parameters
    ----------
    series : pandas.Series (or any object with a compatible ``quantile`` method)
        Values to compute the fences from.
    k : float, default 1.5
        How many IQRs the fences extend beyond the quartiles. The default
        reproduces the previously hard-coded 1.5 multiplier.

    Returns
    -------
    tuple
        ``(q1 - k * iqr, q3 + k * iqr)`` where ``q1``/``q3`` are the 0.25 and
        0.75 quantiles and ``iqr = q3 - q1``.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return lower, upper

# 1) Read the dataset
csv_path = "ensurance.csv"  # change if different
df = pd.read_csv(csv_path)

# 2) "charges" is the target; the remaining columns are the features
X = df.drop(columns=["charges"])
y = df["charges"]

# 3) One-hot encode the categorical columns (first level of each is dropped)
X = pd.get_dummies(X, drop_first=True)

# 4) 80/20 split with a fixed seed. The IQR bounds further down are computed
#    from the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------
# 5) IQR handling
# -----------------------

# 5a) Target outliers: CAP y_train using IQR fences computed on y_train only
y_lo, y_hi = iqr_bounds(y_train)
# Both fences are applied. NOTE(review): for a right-skewed target such as
# medical charges the lower fence presumably sits below every observed value,
# which would make the lower clip a no-op; confirm against the data.
y_train_capped = y_train.clip(lower=y_lo, upper=y_hi)

# 5b) Feature outliers: clip numeric columns using bounds learned from X_train
# Work on explicit copies: the frames returned by train_test_split are row
# selections of X, and assigning columns into such selections can raise
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Only the columns that actually exist after encoding are clipped
numeric_cols = [c for c in ["age", "bmi", "children"] if c in X_train.columns]
for col in numeric_cols:
    lo, hi = iqr_bounds(X_train[col])
    # clip both train and test using TRAIN bounds (no leakage)
    X_train[col] = X_train[col].clip(lower=lo, upper=hi)
    X_test[col] = X_test[col].clip(lower=lo, upper=hi)

# -----------------------
# 6) Standardize the features (statistics come from the training split only)
# -----------------------
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# -----------------------
# 7) Train SVR with hand-picked C=10, epsilon=0.2 and gamma="scale"
# -----------------------
# NOTE(review): the target is capped but not rescaled, so C and epsilon act on
# values in the original charge units. The recorded R² for this cell is
# negative; scaling the target or tuning C/epsilon looks worth trying.
svr = SVR(C=10, epsilon=0.2, gamma="scale")
svr.fit(X_train_s, y_train_capped)

# 8) Predict the held-out rows
y_pred = svr.predict(X_test_s)

# 9) Score against the ORIGINAL (uncapped) test target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "SVR with IQR capping (target) + IQR clipping (features) + scaling",
    f"RMSE: {rmse:.2f}",
    f"MAE:  {mae:.2f}",
    f"R²:   {r2:.4f}",
    sep="\n",
)

# Optional: first 5 predictions next to the true values
print("\nSample predictions:")
sample = pd.DataFrame({"Actual": y_test.values[:5], "Predicted": y_pred[:5]})
print(sample)

SVR with IQR capping (target) + IQR clipping (features) + scaling
RMSE: 12845.02
MAE:  8190.24
R²:   -0.0628

Sample predictions:
        Actual     Predicted
0   9095.06825   9312.603710
1   5272.17580   8813.004922
2  29330.98315  10227.751613
3   9301.89355   9373.441122
4  33750.29180   8134.539415
