In [48]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

import pickle

In [6]:
# Load the raw insurance data set from the working directory.
df = pd.read_csv('insurance.csv')

In [7]:
# Preview the first rows to check the columns and value formats.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
# Column dtypes and non-null counts (used to spot missing values and text columns).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [10]:
# Encode the text columns as integer codes so the estimators can consume them.
# NOTE(review): `region` is a nominal category; the 0-3 codes impose an
# arbitrary order that a linear model will treat as meaningful. Consider
# one-hot encoding if the linear baselines matter.
category_codes = {
    'sex': {"female": 0, "male": 1},
    'region': {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3},
    'smoker': {"yes": 1, "no": 0},
}

for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # Series.map silently turns every value that is not a key of `codes` into
    # NaN. That happens for unexpected categories and also when this cell is
    # re-run on the already-encoded frame, so fail loudly before overwriting.
    if encoded.isna().any():
        unexpected = sorted(df.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(
            f"Column {column!r} contains values without an integer code: {unexpected}. "
            "If this cell was already executed, reload `df` before running it again."
        )
    df[column] = encoded

In [11]:
# Separate the regression target from the predictor columns.
y = df['charges']
X = df.drop(columns=['charges'])

In [12]:
# Hold out a test split; the seed keeps the partition identical across runs.
# The split proportions are train_test_split's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
)

In [17]:
def run_model(model, print_values = True, return_predictions = False):
    """Fit ``model`` on the training split and score it on the test split.

    The data is taken from the notebook-level variables ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``; they must exist before the call.
    ``model`` is fitted in place, so any previous fit is replaced.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    print_values : bool, default True
        When true, print MAE and RMSE rounded to three decimals.
    return_predictions : bool, default False
        When true, the test-set predictions are returned as well.

    Returns
    -------
    tuple
        ``(mae, rmse)``, or ``(y_predictions, mae, rmse)`` when
        ``return_predictions`` is true.
    """
    model.fit(X_train, y_train)
    y_predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_predictions)
    # RMSE is derived as sqrt(MSE) instead of passing `squared=False`, which
    # is deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, y_predictions))

    if print_values:
        print(f"MAE: {round(mae, 3)}")
        print(f"RMSE: {round(rmse, 3)}")
    if return_predictions:
        return y_predictions, mae, rmse
    return mae, rmse

Dummy Model

In [13]:
# Naive baseline: a constant prediction (the 'mean' strategy), used as the
# reference that the real models must beat.
dummy = DummyRegressor(strategy = 'mean')

In [19]:
# Fit the baseline on the training split and record its test-set MAE / RMSE.
mae_dummy, rmse_dummy = run_model(dummy)

MAE: 9561.634
RMSE: 12818.544


Linear Regression

In [20]:
# Ordinary least-squares model with default settings.
linreg = LinearRegression()

In [21]:
# Fit the linear model on the training split and record its test-set MAE / RMSE.
mae_linreg, rmse_linreg = run_model(linreg)

MAE: 4031.97
RMSE: 5824.124


RandomForestRegressor

In [22]:
# Random forest with default hyper-parameters; seeded so results are repeatable.
rf = RandomForestRegressor(random_state = 2022)

In [23]:
# Fit the untuned forest on the training split and record its test-set MAE / RMSE.
mae_rf, rmse_rf = run_model(rf)

MAE: 2543.137
RMSE: 4638.465


RandomForestRegressor Tuning

In [43]:
# Base estimator for the search; seeded so every candidate forest is repeatable.
rf = RandomForestRegressor(random_state = 2022)

# Hyper-parameter search space.
# `1.0` (consider every feature at each split) replaces the former 'auto'
# option: for RandomForestRegressor 'auto' had exactly that meaning, but it was
# deprecated in scikit-learn 1.1 (the source of the warning this cell used to
# emit) and removed in 1.3.
params = {
    'n_estimators': [10, 30, 50, 100],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': [10, 50, 100],
    'bootstrap': [True, False]
}

# Shuffled, seeded 5-fold split so every candidate is scored on the same folds.
CV = KFold(n_splits=5, shuffle=True, random_state=2022)

# `random_state` fixes which parameter combinations are sampled; without it
# the selected model could differ between runs of the notebook.
# `error_score='raise'` surfaces a failing candidate instead of scoring it NaN.
rf_search = RandomizedSearchCV(rf,
                          params,
                          scoring = 'neg_root_mean_squared_error',
                          error_score = 'raise',
                          n_jobs = -1,
                          verbose = 0,
                          cv = CV,
                          random_state = 2022)

rf_search.fit(X_train, y_train)

  warn(


In [44]:
# Take the best forest found by the search and score it on the test split.
# run_model fits the estimator again on the training split before predicting.
# NOTE: this overwrites the `mae_rf` / `rmse_rf` values recorded for the
# untuned forest, so only the tuned scores are available afterwards.
rf_model = rf_search.best_estimator_
mae_rf, rmse_rf = run_model(rf_model)

MAE: 2520.307
RMSE: 4321.354


  warn(


In [49]:
# Test-set RMSE per model, keyed by the label shown in the comparison table.
# `rmse_rf` is whichever random-forest score was assigned most recently.
metrics = dict(
    DummyModel=rmse_dummy,
    LinearRegression=rmse_linreg,
    RandomForestRegressor=rmse_rf,
)

In [50]:
# One row per model, then display them ordered from lowest (best) RMSE upward.
df_metrics = pd.DataFrame(list(metrics.items()), columns=['Model', 'RMSE'])
df_metrics.sort_values(by='RMSE')

Unnamed: 0,Model,RMSE
2,RandomForestRegressor,4321.35443
1,LinearRegression,5824.124033
0,DummyModel,12818.543772


In [51]:
# Persist the tuned forest to disk.
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
output_file = 'model_rf_tune.bin'
model = rf_model

with open(output_file, mode='wb') as f_out:
    pickle.dump(model, file=f_out)