In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [3]:
# 1) Load data
df = pd.read_csv("file.csv")   # download from Kaggle and put here
print(df.shape)
print(df.head())

(1338, 7)
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [4]:
# 2) Features / target
X = df.drop(columns=["charges"])
y = df["charges"]

In [5]:
# 3) Column groups
num_cols = ["age", "bmi", "children"]
cat_cols = ["sex", "smoker", "region"]

In [7]:
# 4) Preprocessing
num_transformer = Pipeline([
    ("scaler", StandardScaler())
])
# Use sparse=False to get dense arrays; if you get a DeprecationWarning, replace with sparse_output=False for newer sklearn
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ],
    remainder="drop",
)

In [8]:
# 5) Full pipeline: preprocessor + model
pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

In [9]:

# 6) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [10]:
# 7) (Optional) small grid search for decent hyperparams
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
}
grid = GridSearchCV(pipe, param_grid, cv=4, n_jobs=-1, scoring="neg_mean_absolute_error", verbose=1)
grid.fit(X_train, y_train)

best = grid.best_estimator_
print("Best params:", grid.best_params_)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
Best params: {'model__max_depth': 10, 'model__n_estimators': 100}


In [13]:
# 8) Evaluate

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

y_pred = best.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

# 9) Save pipeline (contains preprocessor + model)
joblib.dump(best, "insurance_pipeline.joblib")
print("Saved pipeline to insurance_pipeline.joblib")

MAE: 2540.46, RMSE: 21174753.46, R2: 0.864
Saved pipeline to insurance_pipeline.joblib
