In [4]:
# Data Loading
import pandas as pd
# Read the insurance records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="insurance.csv")
# Preview the first ten rows as the cell's displayed value
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [5]:
# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = df.shape
print("Dataset Shape: ", dataset_shape)

Dataset Shape:  (1338, 7)


In [9]:
# Count null entries per column, then confirm when the whole frame is complete
missing_values = df.isna().sum()
print(missing_values)

total_missing = missing_values.sum()
if not total_missing:
    print("No missing values found in the dataset.")

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
No missing values found in the dataset.


In [10]:
# Cap BMI values that fall more than 1.5 * IQR outside the quartiles.
# NOTE: the capped values are written back into df['bmi'], so if this cell is
# executed a second time the "before" summary already describes capped data.
bmi = df['bmi']
Q1, Q3 = bmi.quantile(0.25), bmi.quantile(0.75)
IQR = Q3 - Q1

# Distance from each quartile to its fence
fence_offset = 1.5 * IQR
lower_bound = Q1 - fence_offset
upper_bound = Q3 + fence_offset

print(f"Q1: {Q1}, Q3: {Q3}")
print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")

print("BMI before clipping:")
print(bmi.describe())

df['bmi'] = bmi.clip(lower=lower_bound, upper=upper_bound)

print("BMI after clipping:")
print(df['bmi'].describe())

Q1: 26.29625, Q3: 34.69375
Lower Bound: 13.7, Upper Bound: 47.290000000000006
BMI before clipping:
count    1338.000000
mean       30.650034
std         6.056926
min        15.960000
25%        26.296250
50%        30.400000
75%        34.693750
max        47.290000
Name: bmi, dtype: float64
BMI after clipping:
count    1338.000000
mean       30.650034
std         6.056926
min        15.960000
25%        26.296250
50%        30.400000
75%        34.693750
max        47.290000
Name: bmi, dtype: float64


In [11]:
# Feature Engineering: add the square of age as an extra column
df['age_squared'] = df['age'].pow(2)

print("New feature 'age_squared' created.")
# Show the source column next to the derived one for a quick sanity check
preview_columns = ['age', 'age_squared']
print(df[preview_columns].head())

New feature 'age_squared' created.
   age  age_squared
0   19          361
1   18          324
2   28          784
3   33         1089
4   32         1024


In [12]:
# Encoding Categorical Variables
# List the object-dtype columns so the reader can see what will be encoded
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns detected:", categorical_columns.tolist())

# One-hot encode into a new frame; drop_first=True omits the first level of
# each encoded variable
df_encoded = pd.get_dummies(data=df, drop_first=True)

print("Data after One-Hot Encoding:")
encoded_head = df_encoded.head()
print(encoded_head)
print("New shape after encoding:", df_encoded.shape)

Categorical columns detected: ['sex', 'smoker', 'region']
Data after One-Hot Encoding:
   age     bmi  children      charges  age_squared  sex_male  smoker_yes  \
0   19  27.900         0  16884.92400          361     False        True   
1   18  33.770         1   1725.55230          324      True       False   
2   28  33.000         3   4449.46200          784      True       False   
3   33  22.705         0  21984.47061         1089      True       False   
4   32  28.880         0   3866.85520         1024      True       False   

   region_northwest  region_southeast  region_southwest  
0             False             False              True  
1             False              True             False  
2             False              True             False  
3              True             False             False  
4              True             False             False  
New shape after encoding: (1338, 10)


In [13]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Columns of the encoded frame that get standardised
numerical_columns = ['age', 'bmi', 'children', 'age_squared']

scaler = StandardScaler()
# Fit the scaler on the selected columns and write the scaled values back
numeric_block = df_encoded[numerical_columns]
scaled_values = scaler.fit_transform(numeric_block)
df_encoded[numerical_columns] = scaled_values

print("Numerical features after Standard Scaling:")
print(df_encoded[numerical_columns].head())

Numerical features after Standard Scaling:
        age       bmi  children  age_squared
0 -1.438764 -0.454201 -0.908614    -1.220462
1 -1.509965  0.515300 -0.078767    -1.253341
2 -0.797954  0.388125  1.580926    -0.844579
3 -0.441948 -1.312218 -0.908614    -0.573551
4 -0.513149 -0.292342 -0.908614    -0.631311


In [14]:
# Final look at the manually preprocessed frame: first rows, then dimensions
encoded_preview = df_encoded.head()
print(encoded_preview)
print("Final Shape:", df_encoded.shape)

        age       bmi  children      charges  age_squared  sex_male  \
0 -1.438764 -0.454201 -0.908614  16884.92400    -1.220462     False   
1 -1.509965  0.515300 -0.078767   1725.55230    -1.253341      True   
2 -0.797954  0.388125  1.580926   4449.46200    -0.844579      True   
3 -0.441948 -1.312218 -0.908614  21984.47061    -0.573551      True   
4 -0.513149 -0.292342 -0.908614   3866.85520    -0.631311      True   

   smoker_yes  region_northwest  region_southeast  region_southwest  
0        True             False             False              True  
1       False             False              True             False  
2       False             False              True             False  
3       False              True             False             False  
4       False              True             False             False  
Final Shape: (1338, 10)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Separate the regression target ("charges") from the predictor columns
y = df["charges"]
X = df.drop(columns="charges")

print("Feature columns:", X.columns.tolist())
print("Target column:", y.name)

Feature columns: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'age_squared']
Target column: charges


In [17]:
# Identify column types
# Select by dtype *family* rather than by exact dtype name: 'number' covers
# every integer/float width, and the categorical filter covers object columns,
# pandas' dedicated string dtype and categoricals. These lists feed a
# ColumnTransformer, which by default drops any column it is not told about,
# so a column missed here (e.g. an int32 column, or a text column stored with
# the string dtype) would silently disappear from the model inputs.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(
    include=['object', 'string', 'category']
).columns

print("Numerical Features:", list(numeric_features))
print("Categorical Features:", list(categorical_features))


Numerical Features: ['age', 'bmi', 'children', 'age_squared']
Categorical Features: ['sex', 'smoker', 'region']


In [26]:
#  Train–test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state fixes the partition
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (1070, 7)
Test set size: (268, 7)


In [18]:
# Numeric branch: median imputation followed by standard scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

print("\nNumerical preprocessing pipeline:")
print(numeric_pipeline)



Numerical preprocessing pipeline:
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])


In [19]:
# Categorical branch: most-frequent imputation followed by one-hot encoding
# (handle_unknown='ignore' is passed to the encoder)
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

print("\nCategorical preprocessing pipeline:")
print(categorical_pipeline)



Categorical preprocessing pipeline:
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))])


In [20]:
# Combine both branches: each sub-pipeline is paired with its own column list
branch_specs = [
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=branch_specs)

print("\nCombined Preprocessor:")
print(preprocessor)



Combined Preprocessor:
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['age', 'bmi', 'children', 'age_squared'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['sex', 'smoker', 'region'], dtype='object'))])


In [21]:
# End-to-end estimator: the shared preprocessor feeding a seeded random forest
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[('preprocessing', preprocessor), ('model', forest)])
print(pipeline)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'bmi', 'children', 'age_squared'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
               

In [23]:
# Smoke-test that the whole pipeline fits end to end. Use the training split
# only, so the held-out test rows are never seen by a fitted estimator before
# the final evaluation.
pipeline.fit(X_train, y_train)
print("Pipeline fitted successfully without errors.")


Pipeline fitted successfully without errors.


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


In [25]:
# Candidate regressors to compare, keyed by the name used in printed reports
models = {}
models["Linear Regression"] = LinearRegression()
models["Decision Tree Regressor"] = DecisionTreeRegressor(random_state=42)
models["Random Forest Regressor"] = RandomForestRegressor(n_estimators=100, random_state=42)

In [27]:
# model comparison: 5-fold cross-validated R² of every candidate, computed on
# the training split with the same preprocessing in front of each model

model_scores = {}

for name, model in models.items():
    steps = [('preprocessing', preprocessor), ('model', model)]
    pipeline = Pipeline(steps=steps)

    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')

    # Keep the mean fold score for the later selection step
    model_scores[name] = scores.mean()

    print(f"{name}")
    print(f"R² Scores: {scores}")
    print(f"Mean R² Score: {scores.mean():.4f}")
    print("-" * 55)

Linear Regression
R² Scores: [0.71742595 0.80845205 0.72290208 0.6636284  0.7647723 ]
Mean R² Score: 0.7354
-------------------------------------------------------
Decision Tree Regressor
R² Scores: [0.72157578 0.75858358 0.58872398 0.64304831 0.77656193]
Mean R² Score: 0.6977
-------------------------------------------------------
Random Forest Regressor
R² Scores: [0.81651711 0.90248408 0.80000597 0.7886559  0.83557698]
Mean R² Score: 0.8286
-------------------------------------------------------


In [28]:
# Recap: mean cross-validated R² per candidate, in insertion order
for model_name in model_scores:
    score = model_scores[model_name]
    print(f"{model_name}: Mean R² = {score:.4f}")

Linear Regression: Mean R² = 0.7354
Decision Tree Regressor: Mean R² = 0.6977
Random Forest Regressor: Mean R² = 0.8286


In [29]:
# Pick the candidate whose mean cross-validated R² is highest
best_model_name = max(model_scores, key=lambda candidate: model_scores[candidate])
print("\nSelected Primary Model:", best_model_name)



Selected Primary Model: Random Forest Regressor


In [30]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Final training pipeline: shared preprocessor followed by a seeded
# 100-tree random forest
final_steps = [
    ('preprocessing', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
]
final_pipeline = Pipeline(steps=final_steps)

print("Final Training Pipeline:",final_pipeline)

Final Training Pipeline: Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'bmi', 'children', 'age_squared'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ign

In [32]:
# Fit on the training split; the call's return value is the cell's display
final_pipeline.fit(X=X_train, y=y_train)

In [35]:
# Score the fitted pipeline on the same rows it was trained on (in-sample)
train_score = final_pipeline.score(X=X_train, y=y_train)
print("Training R² Score:", train_score)


Training R² Score: 0.9748259667787313


In [36]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated R² of the final pipeline on the training split
cv_scores = cross_val_score(final_pipeline, X_train, y_train, cv=5, scoring='r2')

# Per-fold scores, then their mean and (population) standard deviation
print("Cross-Validation R² scores:", cv_scores)
print("Average R² score:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-Validation R² scores: [0.81651711 0.90248408 0.80000597 0.7886559  0.83557698]
Average R² score: 0.8286480078146969
Standard Deviation: 0.04016632925546411


In [37]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid; the 'model__' prefix addresses the pipeline step
# named 'model'
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated R²
grid_search = GridSearchCV(
    estimator=final_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Parameters tested:")
print(param_grid)

print("\nBest Parameters Found:")
print(grid_search.best_params_)

print("Best Cross-Validation R² Score:")
print(grid_search.best_score_)


Parameters tested:
{'model__n_estimators': [100, 200], 'model__max_depth': [None, 10, 20], 'model__min_samples_split': [2, 5]}

Best Parameters Found:
{'model__max_depth': 10, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best Cross-Validation R² Score:
0.8335573115172623


In [38]:
# Keep the best pipeline found by the grid search for the evaluation step
best_model = grid_search.best_estimator_

print("Best Model Selected:", best_model, sep="\n")


Best Model Selected:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'bmi', 'children', 'age_squared'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'

In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# MODEL PERFORMANCE EVALUATION
# Predict on the held-out test split and compare against the true charges
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error
r2 = r2_score(y_test, y_pred)

metric_report = (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R² Score:", r2),
)
for label, value in metric_report:
    print(label, value)

Mean Absolute Error (MAE): 2523.513358445535
Root Mean Squared Error (RMSE): 4525.697713903115
R² Score: 0.8680701632740027
