In [None]:
# Importing necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor


In [None]:
# Load the insurance dataset into a DataFrame.
# NOTE(review): the path lives under /content/drive, so a Colab Drive mount is presumably required — confirm.
insurance_csv_path = "/content/drive/MyDrive/Colab Notebooks/health insurance premium/insurance.csv.csv"
insurance = pd.read_csv(insurance_csv_path)

In [None]:
# Preview the first five rows to sanity-check that the file loaded as expected.
insurance.head()

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
insurance.info()

In [None]:
# Show descriptive statistics for the columns that describe() summarises by default.
insurance.describe()

In [None]:
# List the column names of the loaded frame.
insurance.columns

# ***Data Preprocessing***

In [None]:
# Count missing values per column (inspection only; nothing is imputed or dropped by this cell).
insurance.isnull().sum()

In [None]:
# Count fully duplicated rows (inspection only; no rows are removed by this cell).
insurance.duplicated().sum()

In [None]:
# Remove exact duplicate rows; the surviving rows keep their original index labels.
insurance = insurance.drop_duplicates()


In [None]:
# Mean age, BMI, number of children and charges for each region.
region_summary_columns = ['age', 'bmi', 'children', 'charges']
grouped_data = insurance.groupby('region')[region_summary_columns].agg('mean')

print(grouped_data)


In [None]:
# Distinct values present in the 'sex' column.
insurance['sex'].unique()

In [None]:
# Distinct values present in the 'smoker' column.
insurance['smoker'].unique()

In [None]:
# Distinct values present in the 'region' column.
insurance['region'].unique()

In [None]:
# Preview the first five rows again before plotting.
insurance.head()

# ***Exploratory Data Analysis (EDA):***

In [None]:
# Joint KDE plot of charges against BMI, drawn with filled density contours.
sns.jointplot(x='bmi', y='charges', data=insurance, kind='kde', fill = True)

In [None]:
# Pie chart of the mean charges per region.
# The per-region means are computed once and reused for both the wedge labels and sizes.
mean_charges_by_region = insurance.groupby('region')['charges'].mean()
region = mean_charges_by_region.index
region_expense = mean_charges_by_region.values

# Pull every one of the four wedges slightly away from the centre.
explode = [0.01] * 4
palette_color = sns.color_palette('pastel')

plt.pie(
    region_expense,
    labels=region,
    colors=palette_color,
    explode=explode,
    autopct='%.0f%%',
)
plt.title("Avg expense region ")
plt.show()

In [None]:
# Strip plot of charges for each children count, coloured by that same count.
plt.figure(figsize=(10, 6))
sns.stripplot(
    data=insurance,
    x='children',
    y='charges',
    hue='children',
    palette='magma',
)
plt.title('Medical Expenses by Number of Children')
plt.show()

In [None]:
# Scatter plot of charges against age, with points coloured by smoker status.
sns.scatterplot(
    data=insurance,
    x='age',
    y='charges',
    hue='smoker',
)
plt.title("Expenses by age(smoker-wise)")
plt.show()

In [None]:
# Pie chart of the mean charges per smoker group.
# The per-group means are computed once and reused for both the wedge labels and sizes.
mean_charges_by_smoker = insurance.groupby('smoker')['charges'].mean()
smoker = mean_charges_by_smoker.index
smoker_expense = mean_charges_by_smoker.values

# One offset per wedge; the second wedge is pulled out further than the first.
explode = [0.01, 0.05]
palette_color = sns.color_palette('pastel')

plt.pie(
    smoker_expense,
    labels=smoker,
    colors=palette_color,
    explode=explode,
    autopct='%.0f%%',
)
plt.title("Avg expense of smoker ")
plt.show()

In [None]:
# Histogram of charges with a KDE overlay, split by smoker status.
sns.histplot(data= insurance,x='charges',kde=True,hue='smoker')


In [None]:
# Swarm plot of charges for each sex, coloured by that same column.
plt.figure(figsize=(10, 6))
sns.swarmplot(
    data=insurance,
    x='sex',
    y='charges',
    hue='sex',
    palette='magma',
)
plt.title('Medical Expenses by sex')
plt.show()

In [None]:
# One-hot encode the categorical columns (presumably sex, smoker and region — confirm dtypes).
# drop_first=False keeps a dummy column for every category level.
# NOTE: this overwrites `insurance`, so re-running the cell operates on the already-encoded frame.
insurance = pd.get_dummies(insurance, drop_first=False)
insurance.head()

# ***Correlation Analysis:***

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = insurance.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='viridis',
    linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

***Input and output variables***

In [None]:
# The target is the 'charges' column; the features are every remaining column.
y = insurance['charges']
X = insurance.drop(columns='charges')

***Splitting data***

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

***Standard Scaling***

In [None]:
# Standardise the features so they share a similar scale.
# The scaler learns its mean/std from the training split only; the test split is
# then transformed with those same statistics. Refitting on X_test (fit_transform)
# would scale the two splits differently and leak test-set statistics into evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ***Decision tree regressor***

In [None]:
# Baseline decision tree: depth capped at 5 with at least 10 samples per leaf.
# random_state is fixed so repeated runs build the same tree (tie-breaking between
# equally good splits is otherwise random), matching the seed used during tuning.
dt = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10, random_state=42)
dt.fit(X_train, y_train)

### Training Accuracy

In [None]:
# Predict on the training split and report the fit metrics.
dt_train_predictions = dt.predict(X_train)

# MSE is computed once and reused for the RMSE line.
dt_train_mse = mean_squared_error(y_train, dt_train_predictions)
print("MAE for train set:", mean_absolute_error(y_train, dt_train_predictions))
print("MSE for train set:", dt_train_mse)
print("RMSE for train set:", np.sqrt(dt_train_mse))
print("R2 Score for train set:", r2_score(y_train, dt_train_predictions))

### Testing Accuracy

In [None]:
# Predict on the test split and report the same metrics.
dt_test_predictions = dt.predict(X_test)

# MSE is computed once and reused for the RMSE line.
dt_test_mse = mean_squared_error(y_test, dt_test_predictions)
print("MAE for test set:", mean_absolute_error(y_test, dt_test_predictions))
print("MSE for test set:", dt_test_mse)
print("RMSE for test set:", np.sqrt(dt_test_mse))
print("R2 Score for test set:", r2_score(y_test, dt_test_predictions))

# ***Hyperparameter tuning***

**Performing Bayesian-optimization cross-validation on the Decision Tree Regressor**

In [None]:
# Install the Bayesian optimisation package (and scikit-learn) into the runtime.
# NOTE(review): versions are unpinned; consider `%pip install` with pinned versions for reproducibility.
!pip install bayesian-optimization scikit-learn


In [None]:
# Importing necessary libraries for hyperparameter tuning
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

In [None]:
# Objective function for tuning the decision tree.
def dt_cv_r2(max_depth, min_samples_split, min_samples_leaf):
    """Return the mean 5-fold cross-validated R² of a decision tree.

    The optimizer proposes continuous values, so every hyperparameter is
    truncated to an int before being handed to the estimator.

    Scoring uses the training split only, so the held-out test rows play no
    part in choosing hyperparameters and the later test metrics stay unbiased.

    Parameters
    ----------
    max_depth, min_samples_split, min_samples_leaf : float
        Candidate values proposed by the optimizer.

    Returns
    -------
    float
        Mean R² across the five folds (higher is better).
    """
    estimator = DecisionTreeRegressor(
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )
    # error_score='raise' surfaces fitting failures instead of scoring them as NaN.
    cval = cross_val_score(estimator, X_train, y_train, scoring='r2', cv=5, error_score='raise')
    return cval.mean()


In [None]:
# Search ranges (low, high) for each decision-tree hyperparameter.
pbounds = dict(
    max_depth=(1, 20),
    min_samples_split=(2, 20),
    min_samples_leaf=(1, 20),
)

In [None]:
# Bayesian optimisation of the decision-tree objective over the bounds above.
optimizer = BayesianOptimization(f=dt_cv_r2, pbounds=pbounds, random_state=42, verbose=2)

# 10 random initial points followed by 50 optimisation iterations.
optimizer.maximize(init_points=10, n_iter=50)


In [None]:
# Best-scoring parameters; all three tree parameters are truncated to integers.
best_params = {
    param_name: int(param_value)
    for param_name, param_value in optimizer.max['params'].items()
}
print(best_params)


### Training Accuracy

In [None]:
# Fit a decision tree with the tuned parameters and score it on the training split.

# random_state matches the seed used inside the tuning objective, so the final
# model is built the same way as the candidates that were scored, and reruns agree.
best_model = DecisionTreeRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Predictions on the data the model was fitted on.
bayes_opt_dt_train_predictions = best_model.predict(X_train)

# Error metrics; RMSE is derived from the MSE already computed.
mae = mean_absolute_error(y_train, bayes_opt_dt_train_predictions)
mse = mean_squared_error(y_train, bayes_opt_dt_train_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, bayes_opt_dt_train_predictions)

print("MAE for train set:", mae)
print("MSE for train set:", mse)
print("RMSE for train set:", rmse)
print("R2 for train set:", r2)

### Testing Accuracy

In [None]:
# Score the current best_model on the held-out test split.
bayes_opt_dt_test_predictions = best_model.predict(X_test)

# Error metrics; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, bayes_opt_dt_test_predictions)
mse = mean_squared_error(y_test, bayes_opt_dt_test_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, bayes_opt_dt_test_predictions)

# Report each metric on its own line.
for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2", r2)):
    print(f"{metric_name} for test set:", metric_value)


# ***Gradient Boosting regression***

In [None]:
# Baseline gradient boosting model: 100 shallow trees (depth 2).
# random_state is fixed so repeated runs produce the same ensemble, matching the
# seed used when tuning this model family.
gb = GradientBoostingRegressor(max_depth=2, n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

### Training Accuracy

In [None]:
# Predict on the training split and report the fit metrics.
gb_train_predictions = gb.predict(X_train)

# MSE is computed once and reused for the RMSE line.
gb_train_mse = mean_squared_error(y_train, gb_train_predictions)
print("MAE for train set:", mean_absolute_error(y_train, gb_train_predictions))
print("MSE for train set:", gb_train_mse)
print("RMSE for train set:", np.sqrt(gb_train_mse))
print("R2 Score for train set:", r2_score(y_train, gb_train_predictions))

### Testing Accuracy

In [None]:
# Predict on the test split and report the same metrics.
gb_test_predictions = gb.predict(X_test)

# MSE is computed once and reused for the RMSE line.
gb_test_mse = mean_squared_error(y_test, gb_test_predictions)
print("MAE for test set:", mean_absolute_error(y_test, gb_test_predictions))
print("MSE for test set:", gb_test_mse)
print("RMSE for test set:", np.sqrt(gb_test_mse))
print("R2 Score for test set:", r2_score(y_test, gb_test_predictions))

# ***Hyperparameter Tuning***

**Performing bayesian-optimization Cross Validation on Gradient Boosting Regressor**

In [None]:
# Objective function for tuning the gradient boosting regressor.
def gb_cv_r2(max_depth, n_estimators, learning_rate):
    """Return the mean 5-fold cross-validated R² of a gradient boosting regressor.

    The optimizer proposes continuous values, so max_depth and n_estimators are
    truncated to ints; learning_rate is passed through unchanged.

    Scoring uses the training split only, so the held-out test rows play no
    part in choosing hyperparameters and the later test metrics stay unbiased.

    Parameters
    ----------
    max_depth, n_estimators, learning_rate : float
        Candidate values proposed by the optimizer.

    Returns
    -------
    float
        Mean R² across the five folds (higher is better).
    """
    estimator = GradientBoostingRegressor(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        random_state=42
    )
    # error_score='raise' surfaces fitting failures instead of scoring them as NaN.
    cval = cross_val_score(estimator, X_train, y_train, scoring='r2', cv=5, error_score='raise')
    return cval.mean()

In [None]:
# Search ranges (low, high) for each gradient-boosting hyperparameter.
pbounds = dict(
    max_depth=(1, 20),
    n_estimators=(10, 100),
    learning_rate=(0.01, 1),
)

In [None]:
# Bayesian optimisation of the gradient-boosting objective over the bounds above.
optimizer = BayesianOptimization(f=gb_cv_r2, pbounds=pbounds, random_state=42, verbose=2)

# 10 random initial points followed by 50 optimisation iterations.
optimizer.maximize(init_points=10, n_iter=50)

In [None]:
# Best-scoring parameters; the two count-like ones are truncated to integers,
# while learning_rate is left as proposed.
best_params = dict(optimizer.max['params'])
for integer_param in ('max_depth', 'n_estimators'):
    best_params[integer_param] = int(best_params[integer_param])
print(best_params)

### Training Accuracy

In [None]:
# Fit a gradient boosting model with the tuned parameters and score it on the training split.

# random_state matches the seed used inside the tuning objective, so the final
# model is built the same way as the candidates that were scored, and reruns agree.
best_model = GradientBoostingRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Predictions on the data the model was fitted on.
bayes_opt_gb_train_predictions = best_model.predict(X_train)

# Error metrics; RMSE is derived from the MSE already computed.
mae = mean_absolute_error(y_train, bayes_opt_gb_train_predictions)
mse = mean_squared_error(y_train, bayes_opt_gb_train_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, bayes_opt_gb_train_predictions)

print("MAE for train set:", mae)
print("MSE for train set:", mse)
print("RMSE for train set:", rmse)
print("R2 for train set:", r2)

### Testing Accuracy

In [None]:
# Score the current best_model on the held-out test split.
bayes_opt_gb_test_predictions = best_model.predict(X_test)

# Error metrics; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, bayes_opt_gb_test_predictions)
mse = mean_squared_error(y_test, bayes_opt_gb_test_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, bayes_opt_gb_test_predictions)

# Report each metric on its own line.
for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2", r2)):
    print(f"{metric_name} for test set:", metric_value)

# ***Cat Boost Regression***

In [None]:
# Install CatBoost into the runtime.
# NOTE(review): version is unpinned; consider `%pip install` with a pinned version for reproducibility.
!pip install catboost

In [None]:
# Baseline CatBoost regressor limited to 10 boosting iterations, fitted on the training split.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from catboost import CatBoostRegressor
cat = CatBoostRegressor(iterations=10)
cat.fit(X_train, y_train)

### Training Accuracy

In [None]:
# Predict on the training split and report the fit metrics.
cat_train_predictions = cat.predict(X_train)

# MSE is computed once and reused for the RMSE line.
cat_train_mse = mean_squared_error(y_train, cat_train_predictions)
print("MAE for train set:", mean_absolute_error(y_train, cat_train_predictions))
print("MSE for train set:", cat_train_mse)
print("RMSE for train set:", np.sqrt(cat_train_mse))
print("R2 Score for train set:", r2_score(y_train, cat_train_predictions))

### Testing Accuracy

In [None]:
# Predict on the test split and report the same metrics.
cat_test_predictions = cat.predict(X_test)

# MSE is computed once and reused for the RMSE line.
cat_test_mse = mean_squared_error(y_test, cat_test_predictions)
print("MAE for test set:", mean_absolute_error(y_test, cat_test_predictions))
print("MSE for test set:", cat_test_mse)
print("RMSE for test set:", np.sqrt(cat_test_mse))
print("R2 Score for test set:", r2_score(y_test, cat_test_predictions))

# ***Hyperparameter tuning***

**Performing bayesian-optimization Cross Validation on Cat Boost Regressor**

In [None]:
# Objective function for tuning the CatBoost regressor.
def cat_cv_r2(learning_rate, depth, l2_leaf_reg, random_strength, bagging_temperature, border_count):
    """Return the mean 5-fold cross-validated R² of a CatBoost regressor.

    The optimizer proposes continuous values, so depth and border_count are
    truncated to ints; the remaining parameters are passed through unchanged.

    Scoring uses the training split only, so the held-out test rows play no
    part in choosing hyperparameters and the later test metrics stay unbiased.

    Parameters
    ----------
    learning_rate, depth, l2_leaf_reg, random_strength, bagging_temperature, border_count : float
        Candidate values proposed by the optimizer.

    Returns
    -------
    float
        Mean R² across the five folds (higher is better).
    """
    estimator = CatBoostRegressor(
        learning_rate=learning_rate,
        depth=int(depth),
        l2_leaf_reg=l2_leaf_reg,
        random_strength=random_strength,
        bagging_temperature=bagging_temperature,
        border_count=int(border_count),
        random_state=42,
        verbose=0
    )
    # error_score='raise' surfaces fitting failures instead of scoring them as NaN.
    cval = cross_val_score(estimator, X_train, y_train, scoring='r2', cv=5, error_score='raise')
    return cval.mean()

In [None]:
# Search ranges (low, high) for each CatBoost hyperparameter.
pbounds = dict(
    learning_rate=(0.01, 1),
    depth=(1, 16),
    l2_leaf_reg=(0.01, 10),
    random_strength=(0.01, 10),
    bagging_temperature=(0.01, 10),
    border_count=(1, 20),
)

In [None]:
# Bayesian optimisation of the CatBoost objective over the bounds above.
optimizer = BayesianOptimization(f=cat_cv_r2, pbounds=pbounds, random_state=42, verbose=2)

# 10 random initial points followed by 50 optimisation iterations.
optimizer.maximize(init_points=10, n_iter=50)

In [None]:
# Best-scoring parameters; depth and border_count are truncated to integers,
# while the remaining parameters are left as proposed.
best_params = dict(optimizer.max['params'])
for integer_param in ('depth', 'border_count'):
    best_params[integer_param] = int(best_params[integer_param])
print(best_params)

### Training Accuracy

In [None]:
# Fit a CatBoost model with the tuned parameters and score it on the training split.

# random_state and verbose match the settings used inside the tuning objective, so
# the final model is built the same way as the candidates that were scored and the
# per-iteration training log does not flood the cell output.
best_model = CatBoostRegressor(**best_params, random_state=42, verbose=0)
best_model.fit(X_train, y_train)

# Predictions on the data the model was fitted on.
bayes_opt_cat_train_predictions = best_model.predict(X_train)

# Error metrics; RMSE is derived from the MSE already computed.
mae = mean_absolute_error(y_train, bayes_opt_cat_train_predictions)
mse = mean_squared_error(y_train, bayes_opt_cat_train_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_train, bayes_opt_cat_train_predictions)

print("MAE for train set:", mae)
print("MSE for train set:", mse)
print("RMSE for train set:", rmse)
print("R2 for train set:", r2)

### Testing Accuracy

In [None]:
# Score the current best_model on the held-out test split.
bayes_opt_cat_test_predictions = best_model.predict(X_test)

# Error metrics; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, bayes_opt_cat_test_predictions)
mse = mean_squared_error(y_test, bayes_opt_cat_test_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, bayes_opt_cat_test_predictions)

# Report each metric on its own line.
for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("R2", r2)):
    print(f"{metric_name} for test set:", metric_value)

# ***Comparison***

In [None]:
# Actual-vs-predicted scatter plots on the training split, one figure per model.
# Every model gets the same figure layout, so the plots are produced in a loop
# instead of six copy-pasted stanzas.
train_predictions_by_model = {
    'Decision Tree Regressor': dt_train_predictions,
    'Bayesian Optimization on Decision Tree Regressor': bayes_opt_dt_train_predictions,
    'Gradient Boosting Regressor': gb_train_predictions,
    'Bayesian Optimization on Gradient Boosting Regressor': bayes_opt_gb_train_predictions,
    'Cat Boost Regressor': cat_train_predictions,
    'Bayesian Optimization on Cat Boost Regressor': bayes_opt_cat_train_predictions,
}

# End points of the dashed y = x reference line (a perfect prediction lies on it).
train_value_range = [y_train.min(), y_train.max()]

for model_name, train_predictions in train_predictions_by_model.items():
    plt.figure(figsize=(10, 6))
    plt.scatter(y_train, train_predictions)
    plt.plot(train_value_range, train_value_range, 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'{model_name} - Train Data')
    plt.show()


In [None]:
# Actual-vs-predicted scatter plots on the test split, one figure per model.
# Every model gets the same figure layout, so the plots are produced in a loop
# instead of six copy-pasted stanzas.
test_predictions_by_model = {
    'Decision Tree Regressor': dt_test_predictions,
    'Bayesian Optimization on Decision Tree Regressor': bayes_opt_dt_test_predictions,
    'Gradient Boosting Regressor': gb_test_predictions,
    'Bayesian Optimization on Gradient Boosting Regressor': bayes_opt_gb_test_predictions,
    'Cat Boost Regressor': cat_test_predictions,
    'Bayesian Optimization on Cat Boost Regressor': bayes_opt_cat_test_predictions,
}

# End points of the dashed y = x reference line; it spans the full test range
# even though only the first 200 points are scattered.
test_value_range = [y_test.min(), y_test.max()]

for model_name, test_predictions in test_predictions_by_model.items():
    plt.figure(figsize=(10, 6))
    # Only the first 200 test rows are drawn to keep the figure readable.
    plt.scatter(y_test[:200], test_predictions[:200])
    plt.plot(test_value_range, test_value_range, 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'{model_name} - Test Data')
    plt.show()
