#### 1. Setup and Data Loading

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('insurance.csv')

#### 2. Exploratory Data Analysis (EDA) and Preprocessing

In [4]:
df['charges_log'] = np.log(df['charges'])

In [5]:
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

#### 3. Feature Engineering and Transformation

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

#### 4. Model Selection and Training

In [7]:
X = df.drop(['charges', 'charges_log'], axis=1)
y = df['charges_log']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Create a pipeline with a model
gb_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', GradientBoostingRegressor(random_state=42))])

# Train the model
gb_model.fit(X_train, y_train)

#### 5. Prediction and Evaluation

In [9]:
# Make predictions and inverse transform
y_pred_log = gb_model.predict(X_test)
y_pred = np.exp(y_pred_log)
y_test_orig = np.exp(y_test)

# Calculate metrics
rmse = mean_squared_error(y_test_orig, y_pred, squared=False)
r2 = r2_score(y_test_orig, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

Root Mean Squared Error (RMSE): 4480.402246423151
R-squared (R2): 0.8706977886591148


#### 6. Hyperparameter Tuning

In [12]:
from sklearn.model_selection import GridSearchCV

# Define a new pipeline
gb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', GradientBoostingRegressor(random_state=42))])

# Define the hyperparameters
param_grid = {
    'regressor__n_estimators': [100, 200, 500],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    'regressor__max_depth': [3, 5, 7]
}

# cross-validation
grid_search = GridSearchCV(gb_pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)
print("Best R-squared score: ", grid_search.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found:  {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 500}
Best R-squared score:  0.8373924143063821
