## Model Building

In [1]:
import numpy as np
import pandas as pd

# feature scaling and encoding
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# train and testing data
from sklearn.model_selection import train_test_split

# regression evaluation metrics
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error

# models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import pickle

In [2]:
# Relative path: the CSV is expected next to the notebook's working directory.
file_path = 'insurance.csv'

# Read the raw dataset and preview its first rows.
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## Data pre-processing

In [3]:
# Work on a copy so the raw `data` frame loaded above stays untouched.
df = data.copy()

In [5]:
def preprocess_inputs(df, test_size=0.2, random_state=100):
    """Split the data and build model-ready train/test feature frames.

    The one-hot encoder and the scaler are fitted on the training split only
    and then applied to the test split, so no test-set statistics are used
    during fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'expenses', the categorical columns
        'sex', 'smoker', 'region' and the numerical columns 'age', 'bmi',
        'children'. The frame is copied, never modified.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=100
        Forwarded to ``train_test_split``.

    Returns
    -------
    x_train, x_test : pandas.DataFrame
        Scaled numerical columns followed by the one-hot encoded columns,
        indexed like the rows of the corresponding split.
    y_train, y_test : pandas.Series
        The 'expenses' values of each split.
    encoder : OneHotEncoder
        Encoder fitted on the training categorical columns.
    scaler : StandardScaler
        Scaler fitted on the training numerical columns.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    target_col = 'expenses'

    # categorical columns
    cat_cols = ['sex', 'smoker', 'region']

    # numerical columns
    num_cols = ['age', 'bmi', 'children']

    # fail early with one clear message instead of an opaque lookup error later
    missing = [col for col in [target_col, *cat_cols, *num_cols] if col not in df.columns]
    if missing:
        raise KeyError(f'preprocess_inputs: missing required columns {missing}')

    df = df.copy()

    # splitting the dataset into X and y
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # one hot encode categorical columns (fit on train only)
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    x_train_encode = encoder.fit_transform(x_train[cat_cols])
    x_test_encode = encoder.transform(x_test[cat_cols])

    # the generated column names are the same for both splits, so compute them once
    encoded_cols = encoder.get_feature_names_out(cat_cols)
    x_train_encoded = pd.DataFrame(x_train_encode, index=x_train.index, columns=encoded_cols)
    x_test_encoded = pd.DataFrame(x_test_encode, index=x_test.index, columns=encoded_cols)

    # scale numerical columns (fit on train only)
    scaler = StandardScaler()
    x_train_scale = scaler.fit_transform(x_train[num_cols])
    x_test_scale = scaler.transform(x_test[num_cols])

    x_train_scaled = pd.DataFrame(x_train_scale, index=x_train.index, columns=num_cols)
    x_test_scaled = pd.DataFrame(x_test_scale, index=x_test.index, columns=num_cols)

    # concatenate numerical and categorical features
    x_train = pd.concat([x_train_scaled, x_train_encoded], axis=1)
    x_test = pd.concat([x_test_scaled, x_test_encoded], axis=1)

    return x_train, x_test, y_train, y_test, encoder, scaler

In [6]:
# Build the train/test features and targets; the fitted encoder and scaler are
# kept because they are pickled at the end of the notebook.
x_train, x_test, y_train, y_test, encoder, scaler = preprocess_inputs(df)

In [7]:
# Report the dimensions of every split in a single print call, one per line.
print(
    f'x_train shape {x_train.shape}',
    f'y_train shape {y_train.shape}',
    f'x_test shape {x_test.shape}',
    f'y_test shape {y_test.shape}',
    sep='\n',
)

x_train shape (1070, 8)
y_train shape (1070,)
x_test shape (268, 8)
y_test shape (268,)


In [9]:
# Preview the first rows of the processed training features.
x_train.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
1306,-0.718505,-1.429287,-0.915145,0.0,1.0,0.0,0.0,0.0
124,0.558926,0.518872,1.606414,0.0,0.0,1.0,0.0,0.0
588,1.552484,0.843565,-0.915145,0.0,0.0,0.0,0.0,0.0
1127,-0.292695,0.843565,0.765894,0.0,0.0,0.0,1.0,0.0
201,0.629894,0.242883,-0.074626,0.0,0.0,0.0,1.0,0.0


In [10]:
# Preview the first training targets; the index matches x_train above.
y_train.head()

1306    16115.30
124     10115.01
588     13635.64
1127     5836.52
201      8871.15
Name: expenses, dtype: float64

In [11]:
# Preview the first rows of the processed test features.
x_test.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
12,-1.144316,0.600045,-0.915145,1.0,0.0,0.0,0.0,1.0
306,-0.789474,-0.520146,0.765894,0.0,0.0,0.0,0.0,1.0
318,0.346021,-0.503912,-0.915145,0.0,0.0,1.0,0.0,0.0
815,-1.357221,0.12924,-0.915145,0.0,0.0,0.0,1.0,0.0
157,-1.499158,-0.893544,-0.915145,1.0,1.0,0.0,0.0,0.0


In [12]:
# Preview the first test targets.
y_test.head()

12      1826.84
306    20177.67
318     7421.19
815     1877.93
157    15518.18
Name: expenses, dtype: float64

## Model Assessment

**1. Linear Regression** 

In [13]:
# Linear regression with default settings, fitted on the training split only.
model_lr = LinearRegression()

model_lr.fit(x_train, y_train)

In [14]:
# Predict expenses for the held-out test features.
y_pred_lr = model_lr.predict(x_test)

In [15]:
# r2_score expects (y_true, y_pred) and is NOT symmetric: swapping the
# arguments measures variance explained relative to the predictions instead
# of the true targets and yields a different (wrong) score.
r2_score(y_test, y_pred_lr)

0.7946953084832675

In [16]:
# Pass (y_true, y_pred) as the metric API documents; MAE is symmetric so the
# value is unchanged, but the order now matches every other metric call.
mean_absolute_error(y_test, y_pred_lr)

np.float64(3916.3077180168257)

In [17]:
# Score the linear model on the held-out split.
y_pred_lr = model_lr.predict(x_test)

# Each metric is called as metric(y_true, y_pred), in the order of the names on the left.
linear_reg_mse, linear_reg_rmse, linear_reg_mae, linear_reg_r2_score = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, root_mean_squared_error,
                   mean_absolute_error, r2_score)
)

# Evaluation Metrics
print(
    f'Mean Squared Error using Linear Regression : {linear_reg_mse}',
    f'Root Mean Squared Error using Linear Regression : {linear_reg_rmse}',
    f'Absolute Mean Error using Linear Regression: {linear_reg_mae}',
    'The r2_score using Linear Regression : {}'.format(linear_reg_r2_score),
    sep='\n',
)

Mean Squared Error using Linear Regression : 32193435.27377557
Root Mean Squared Error using Linear Regression : 5673.925913666442
Absolute Mean Error using Linear Regression: 3916.3077180168257
The r2_score using Linear Regression : 0.7946953084832675


**2. Decision Tree**

In [18]:
# Seed the tree: features are randomly permuted at each split, so an unseeded
# DecisionTreeRegressor can produce different scores on every run. The value
# matches the seed already used for the train/test split.
model_dt = DecisionTreeRegressor(random_state=100)

model_dt.fit(x_train, y_train)

In [20]:
# Score the decision tree on the held-out split.
y_pred_dt = model_dt.predict(x_test)

# Each metric is called as metric(y_true, y_pred), in the order of the names on the left.
decision_tree_mse, decision_tree_rmse, decision_tree_mae, decision_tree_r2_score = (
    metric(y_test, y_pred_dt)
    for metric in (mean_squared_error, root_mean_squared_error,
                   mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error using Decision Tree Regressor : {decision_tree_mse}",
    f"Root Mean Squared Error using Decision Tree Regressor : {decision_tree_rmse}",
    f"Mean Absolute Error using Decision Tree Regressor : {decision_tree_mae}",
    f"r2_score using Decision Tree Regressor : {decision_tree_r2_score}",
    sep="\n",
)

Mean Squared Error using Decision Tree Regressor : 40957567.612830594
Root Mean Squared Error using Decision Tree Regressor : 6399.809966931096
Mean Absolute Error using Decision Tree Regressor : 3199.425223880597
r2_score using Decision Tree Regressor : 0.7388044887872652


**3. KNN**

In [21]:
# K-nearest-neighbours regressor with default settings, fitted on the
# scaled/encoded training features.
model_knn = KNeighborsRegressor()

model_knn.fit(x_train, y_train)

In [27]:
# Score the KNN model on the held-out split.
y_pred_knn = model_knn.predict(x_test)

# Each metric is called as metric(y_true, y_pred), in the order of the names on the left.
knn_mse, knn_rmse, knn_mae, knn_r2_score = (
    metric(y_test, y_pred_knn)
    for metric in (mean_squared_error, root_mean_squared_error,
                   mean_absolute_error, r2_score)
)

print(
    f"Mean squared error using KNN is {knn_mse}",
    f"Root mean squared error using KNN is {knn_rmse}",
    f"Mean absolute error using KNN is {knn_mae}",
    f"r2_score using KNN is {knn_r2_score}",
    sep="\n",
)

Mean squared error using KNN is 43979681.832229085
Root mean squared error using KNN is 6631.717864341718
Mean absolute error using KNN is 4107.901992537313
r2_score using KNN is 0.7195317947654702


**4. Random Forest**

In [23]:
# Seed the forest: bootstrap sampling and per-split feature selection are
# random, so an unseeded RandomForestRegressor gives different scores on every
# run. The value matches the seed already used for the train/test split.
model_rf = RandomForestRegressor(random_state=100)

model_rf.fit(x_train, y_train)

In [28]:
# Keep the Random Forest predictions in their own variable; storing them in
# y_pred_lr would silently overwrite the Linear Regression predictions.
y_pred_rf = model_rf.predict(x_test)

random_forest_mse = mean_squared_error(y_test, y_pred_rf)
random_forest_rmse = root_mean_squared_error(y_test, y_pred_rf)
random_forest_mae = mean_absolute_error(y_test, y_pred_rf)
random_forest_r2_score = r2_score(y_test, y_pred_rf)

print(f"Mean Squared Error using Random Forest Regressor : {random_forest_mse}")
print(f"Root Mean Squared Error using Random Forest Regressor : {random_forest_rmse}")
print(f"Mean Absolute Error using Random Forest Regressor : {random_forest_mae}")
# r2_score is a goodness-of-fit score, not an error, so the label says so.
print(f"r2_score using Random Forest Regressor : {random_forest_r2_score}")

Mean Squared Error using Random Forest Regressor : 19408126.03678522
Root Mean Squared Error using Random Forest Regressor : 4405.465473339363
Mean Absolute Error using Random Forest Regressor : 2688.6941267164184
r2_score Error using Random Forest Regressor : 0.8762300669370986


**5. Gradient Boosting**

In [29]:
# Seed the booster: the individual trees use randomness when choosing splits,
# and this is the model that gets pickled, so the saved artifact should be
# reproducible. The value matches the seed already used for the train/test split.
model_gb = GradientBoostingRegressor(random_state=100)

model_gb.fit(x_train, y_train)

In [30]:
# Score the gradient boosting model on the held-out split.
y_pred_gb = model_gb.predict(x_test)

gradient_boosting_mse = mean_squared_error(y_test, y_pred_gb)
gradient_boosting_rmse = root_mean_squared_error(y_test, y_pred_gb)
gradient_boosting_mae = mean_absolute_error(y_test, y_pred_gb)
gradient_boosting_r2_score = r2_score(y_test, y_pred_gb)

print(f"Mean Squared Error using Gradient Boosting Regressor : {gradient_boosting_mse}")
print(f"Root Mean Squared Error using Gradient Boosting Regressor : {gradient_boosting_rmse}")
print(f"Mean Absolute Error using Gradient Boosting Regressor : {gradient_boosting_mae}")
# Label typo fixed ("r2_sccore" -> "r2_score").
print(f"r2_score using Gradient Boosting Regressor : {gradient_boosting_r2_score}")

Mean Squared Error using Gradient Boosting Regressor : 15704838.744873183
Root Mean Squared Error using Gradient Boosting Regressor : 3962.9330987127682
Mean Absolute Error using Gradient Boosting Regressor : 2355.419844888246
r2_sccore using Gradient Boosting Regressor : 0.899846753028476


In [31]:
# One (Model, RMSE, r2_score) record per fitted regressor.
model_scores = [
    ('Linear Regression', linear_reg_rmse, linear_reg_r2_score),
    ('Decision Tree', decision_tree_rmse, decision_tree_r2_score),
    ('Random Forest', random_forest_rmse, random_forest_r2_score),
    ('Gradient Boosting', gradient_boosting_rmse, gradient_boosting_r2_score),
    ('KNN', knn_rmse, knn_r2_score),
]
models = pd.DataFrame(model_scores, columns=['Model', 'RMSE', 'r2_score'])

# Display a best-first ranking; `models` itself keeps the insertion order.
models.sort_values(by='r2_score', ascending=False, ignore_index=True)

Unnamed: 0,Model,RMSE,r2_score
0,Gradient Boosting,3962.933099,0.899847
1,Random Forest,4405.465473,0.87623
2,Linear Regression,5673.925914,0.794695
3,Decision Tree,6399.809967,0.738804
4,KNN,6631.717864,0.719532


- As we can see from the evaluation metrics, both RMSE and r2_score are quite good for the Gradient Boosting and Random Forest algorithms.<br>
- Among all the regressors, KNN has the lowest r2_score and the highest RMSE in the table above, with the single Decision Tree only slightly better. A single, unpruned decision tree tends to overfit the training data, which may explain why it trails the tree ensembles (Random Forest and Gradient Boosting) that combine many trees.<br>
- With the lowest RMSE and the highest r2_score, Gradient Boosting clearly emerges as the best performer, so we select Gradient Boosting as our final algorithm.

## Save the best model

In [33]:
## save encoder
# Persist the fitted encoder returned by preprocess_inputs so it can be
# reloaded later (see the commented-out load snippet below).
with open('encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

## load encoder
# with open('encoder.pkl', 'rb') as f:
#     encoder = pickle.load(f)

In [32]:
## save scaler
# Persist the fitted scaler returned by preprocess_inputs so it can be
# reloaded later (see the commented-out load snippet below).
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

## load scaler
# with open('scaler.pkl', 'rb') as f:
#     s = pickle.load(f)

In [34]:
## save model
# Persist the fitted Gradient Boosting model, the one selected above.
with open('model.pkl', 'wb') as f:
    pickle.dump(model_gb, f)

In [35]:
## load model
# Round-trip check: read back the file written in the previous cell.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that this notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as f:
    m = pickle.load(f)