In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/health_insurance_cost.csv")

In [4]:
df.head()

Unnamed: 0,Id,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,1,39.0,male,23.2,91,Yes,0,No,southeast,1121.87
1,2,24.0,male,30.1,87,No,0,No,southeast,1131.51
2,3,,male,33.3,82,Yes,0,No,southeast,1135.94
3,4,,male,33.7,80,No,0,No,northwest,1136.4
4,5,,male,34.1,100,No,0,No,northwest,1137.01


In [5]:
df.drop("Id", axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,39.0,male,23.2,91,Yes,0,No,southeast,1121.87
1,24.0,male,30.1,87,No,0,No,southeast,1131.51
2,,male,33.3,82,Yes,0,No,southeast,1135.94
3,,male,33.7,80,No,0,No,northwest,1136.4
4,,male,34.1,100,No,0,No,northwest,1137.01


In [7]:
df.isnull().sum()

age              5
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           3
claim            0
dtype: int64

In [8]:
df["age"].fillna(df["age"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].median(), inplace=True)


In [9]:
df.isnull().sum()

age              0
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           3
claim            0
dtype: int64

In [10]:
df["region"].fillna(df["region"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["region"].fillna(df["region"].mode()[0], inplace=True)


In [11]:
df.isnull().sum()

age              0
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           0
claim            0
dtype: int64

In [12]:
X = df.drop(columns=["claim"], errors="ignore")

In [13]:
X

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region
0,39.0,male,23.2,91,Yes,0,No,southeast
1,24.0,male,30.1,87,No,0,No,southeast
2,38.0,male,33.3,82,Yes,0,No,southeast
3,38.0,male,33.7,80,No,0,No,northwest
4,38.0,male,34.1,100,No,0,No,northwest
...,...,...,...,...,...,...,...,...
1335,44.0,female,35.5,88,Yes,0,Yes,northwest
1336,59.0,female,38.1,120,No,1,Yes,northeast
1337,30.0,male,34.5,91,Yes,3,Yes,northwest
1338,37.0,male,30.4,106,No,0,Yes,southeast


In [14]:
X.shape

(1340, 8)

In [15]:
y = df["claim"]

In [16]:
y.shape

(1340,)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['age', 'bmi', 'bloodpressure', 'children']
Categorical columns: ['gender', 'diabetic', 'smoker', 'region']


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

model.fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [26]:
from sklearn.metrics import r2_score

In [27]:
y_pred = model.predict(X_test)

In [28]:
r2 = r2_score(y_test, y_pred)

In [29]:
print(r2)

0.8199366766169075


In [30]:
accuracy = r2_score(y_test, y_pred) * 100
print("Accuracy:", accuracy, "%")

Accuracy: 81.99366766169075 %


In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "regressor__n_estimators": [200, 400, 600],
    "regressor__max_depth": [None, 10, 20, 30],
    "regressor__min_samples_split": [2, 5, 10],
}

grid = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Score:", grid.best_score_)
print("Best Params:", grid.best_params_)

Best Score: 0.7925335391857663
Best Params: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 600}


In [32]:
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

from sklearn.metrics import r2_score
print("Improved Accuracy:", r2_score(y_test, y_pred) * 100, "%")

Improved Accuracy: 83.45529191998192 %


In [36]:
import joblib
import os

# create model folder if not exists
os.makedirs("../model", exist_ok=True)

# save best model
joblib.dump(best_model, "../model/insurance_cost_model.pkl")

print("Best model saved successfully.")

Best model saved successfully.


In [37]:
df

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,39.0,male,23.2,91,Yes,0,No,southeast,1121.87
1,24.0,male,30.1,87,No,0,No,southeast,1131.51
2,38.0,male,33.3,82,Yes,0,No,southeast,1135.94
3,38.0,male,33.7,80,No,0,No,northwest,1136.40
4,38.0,male,34.1,100,No,0,No,northwest,1137.01
...,...,...,...,...,...,...,...,...,...
1335,44.0,female,35.5,88,Yes,0,Yes,northwest,55135.40
1336,59.0,female,38.1,120,No,1,Yes,northeast,58571.07
1337,30.0,male,34.5,91,Yes,3,Yes,northwest,60021.40
1338,37.0,male,30.4,106,No,0,Yes,southeast,62592.87
