In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error

In [3]:
from time import time

## Load dataset

In [4]:
df = pd.read_csv("insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1333 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB


In [6]:
df.isnull().sum()

age         0
sex         0
bmi         5
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
df.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [8]:
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1333.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.658545,1.094918,,,13270.422265
std,14.04996,,6.092785,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.315,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.675,2.0,,,16639.912515


## Data preparation

In [9]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

gender_num = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(gender_num)

smoker_num = {'no': 0, 'yes': 1}
df['smoker'] = df['smoker'].map(smoker_num)

df = pd.get_dummies(df, columns=['region'], prefix='region')

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


## Train / Validation / Test data split

In [10]:
features = df.drop('charges', axis=1)
target = df['charges']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(target), 2))

0.6
0.2
0.2


## Standardization of features

In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train.astype(np.float))
X_val = sc.transform(X_val.astype(np.float))
X_test = sc.transform(X_test.astype(np.float))

## Model training and evaluation

### Model 1: Linear Regression

In [12]:
LR = LinearRegression()
LR = LR.fit(X_train, y_train)
round(LR.score(X_train, y_train), 4)

0.7483

In [13]:
scores = cross_val_score(LR, X_train, y_train.values.ravel(), cv=5)
print(round(scores.mean(), 4))

0.7359


In [14]:
LR

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Using GridSearchCV

In [15]:
def print_best_score(hp_optimizer):
    print('BEST SCORE: {} - PARAMS: {}\n'.format(round(hp_optimizer.best_score_, 3), hp_optimizer.best_params_))

    means = hp_optimizer.cv_results_['mean_test_score']
    stds = hp_optimizer.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, hp_optimizer.cv_results_['params']):
        print('Score: {} (+/-{}) for Params: {}'.format(round(mean, 4), round(std * 2, 4), params))

### Model 2: Multilayer Perceptron

In [16]:
mlp = MLPRegressor(solver='lbfgs')
parameters = {
    'hidden_layer_sizes': [(10,), (20,), (30,), (40,), (50,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

CV_1 = GridSearchCV(mlp, parameters, cv=5, iid=False)
CV_1.fit(X_train, y_train.values.ravel())

print_best_score(CV_1)

BEST SCORE: 0.808 - PARAMS: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}

Score: 0.8083 (+/-0.0558) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
Score: 0.8062 (+/-0.0361) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
Score: 0.8068 (+/-0.0317) for Params: {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
Score: 0.7919 (+/-0.035) for Params: {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'constant'}
Score: 0.7952 (+/-0.0448) for Params: {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'invscaling'}
Score: 0.7823 (+/-0.0643) for Params: {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'adaptive'}
Score: 0.761 (+/-0.034) for Params: {'activation': 'relu', 'hidden_layer_sizes': (30,), 'learning_rate': 'constant'}
Score: 0.7819 (+/-0.0493) for Params: {'activation': 're

In [17]:
MLP = CV_1.best_estimator_
MLP

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(10,), learning_rate='constant',
             learning_rate_init=0.001, max_iter=200, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
             validation_fraction=0.1, verbose=False, warm_start=False)

### Model 3: Decision Tree

In [18]:
dt = DecisionTreeRegressor()
parameters = {
    'min_samples_leaf': np.arange(9, 15, 1, int),
    'max_depth': np.arange(2, 8, 1, int)
}

CV_2 = GridSearchCV(dt, parameters, cv=5, iid=False)
CV_2.fit(X_train, y_train.values.ravel())

print_best_score(CV_2)

BEST SCORE: 0.835 - PARAMS: {'max_depth': 4, 'min_samples_leaf': 13}

Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 9}
Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 10}
Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 11}
Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 12}
Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 13}
Score: 0.8049 (+/-0.0567) for Params: {'max_depth': 2, 'min_samples_leaf': 14}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_samples_leaf': 9}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_samples_leaf': 10}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_samples_leaf': 11}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_samples_leaf': 12}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_samples_leaf': 13}
Score: 0.8251 (+/-0.0359) for Params: {'max_depth': 3, 'min_sam

In [19]:
DT = CV_2.best_estimator_
DT

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=13,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

### Model 4: Random Forest

In [20]:
rf = RandomForestRegressor()
parameters = {
    'n_estimators': np.arange(10, 120, 20),
    'max_depth': np.arange(1, 11, 2),
    'min_samples_leaf': np.arange(1, 11, 2, int),
}

CV_3 = GridSearchCV(rf, parameters, cv=5, iid=False)
CV_3.fit(X_train, y_train.values.ravel())

print_best_score(CV_3)

BEST SCORE: 0.847 - PARAMS: {'max_depth': 9, 'min_samples_leaf': 9, 'n_estimators': 90}

Score: 0.5817 (+/-0.1172) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 10}
Score: 0.5815 (+/-0.1204) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 30}
Score: 0.5825 (+/-0.1193) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 50}
Score: 0.5817 (+/-0.1213) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 70}
Score: 0.5811 (+/-0.1212) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 90}
Score: 0.5812 (+/-0.1211) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 110}
Score: 0.5823 (+/-0.1204) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 10}
Score: 0.5822 (+/-0.1206) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 30}
Score: 0.5816 (+/-0.1215) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 50}
Score: 0.581 (+/-0.1218) for

Score: 0.8421 (+/-0.0235) for Params: {'max_depth': 9, 'min_samples_leaf': 5, 'n_estimators': 30}
Score: 0.8449 (+/-0.0218) for Params: {'max_depth': 9, 'min_samples_leaf': 5, 'n_estimators': 50}
Score: 0.8442 (+/-0.0216) for Params: {'max_depth': 9, 'min_samples_leaf': 5, 'n_estimators': 70}
Score: 0.8433 (+/-0.022) for Params: {'max_depth': 9, 'min_samples_leaf': 5, 'n_estimators': 90}
Score: 0.8448 (+/-0.0225) for Params: {'max_depth': 9, 'min_samples_leaf': 5, 'n_estimators': 110}
Score: 0.8433 (+/-0.0291) for Params: {'max_depth': 9, 'min_samples_leaf': 7, 'n_estimators': 10}
Score: 0.8445 (+/-0.0316) for Params: {'max_depth': 9, 'min_samples_leaf': 7, 'n_estimators': 30}
Score: 0.8442 (+/-0.0267) for Params: {'max_depth': 9, 'min_samples_leaf': 7, 'n_estimators': 50}
Score: 0.8458 (+/-0.025) for Params: {'max_depth': 9, 'min_samples_leaf': 7, 'n_estimators': 70}
Score: 0.8452 (+/-0.0246) for Params: {'max_depth': 9, 'min_samples_leaf': 7, 'n_estimators': 90}
Score: 0.8457 (+/-0.0

In [21]:
RF = CV_3.best_estimator_
RF

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=9, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=90,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

### Model 5: Gradient Boosting

In [22]:
gb = GradientBoostingRegressor()
parameters = {
    'n_estimators': np.arange(10, 120, 20),
    'max_depth': np.arange(1, 13, 2),
    'min_samples_leaf': np.arange(1, 15, 2, int),
}

CV_4 = GridSearchCV(gb, parameters, cv=5, iid=False)
CV_4.fit(X_train, y_train.values.ravel())

print_best_score(CV_4)

BEST SCORE: 0.846 - PARAMS: {'max_depth': 3, 'min_samples_leaf': 3, 'n_estimators': 50}

Score: 0.5117 (+/-0.0749) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 10}
Score: 0.6756 (+/-0.0528) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 30}
Score: 0.7165 (+/-0.0507) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 50}
Score: 0.729 (+/-0.0511) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 70}
Score: 0.7335 (+/-0.0529) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 90}
Score: 0.7347 (+/-0.0533) for Params: {'max_depth': 1, 'min_samples_leaf': 1, 'n_estimators': 110}
Score: 0.5117 (+/-0.0749) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 10}
Score: 0.6756 (+/-0.0528) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 30}
Score: 0.7165 (+/-0.0507) for Params: {'max_depth': 1, 'min_samples_leaf': 3, 'n_estimators': 50}
Score: 0.729 (+/-0.0511) for 

Score: 0.8314 (+/-0.0222) for Params: {'max_depth': 9, 'min_samples_leaf': 13, 'n_estimators': 50}
Score: 0.8267 (+/-0.0216) for Params: {'max_depth': 9, 'min_samples_leaf': 13, 'n_estimators': 70}
Score: 0.8229 (+/-0.023) for Params: {'max_depth': 9, 'min_samples_leaf': 13, 'n_estimators': 90}
Score: 0.8193 (+/-0.0232) for Params: {'max_depth': 9, 'min_samples_leaf': 13, 'n_estimators': 110}
Score: 0.7003 (+/-0.0232) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 10}
Score: 0.7691 (+/-0.0449) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 30}
Score: 0.7677 (+/-0.0587) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 50}
Score: 0.7662 (+/-0.0574) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 70}
Score: 0.7722 (+/-0.0477) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 90}
Score: 0.762 (+/-0.0556) for Params: {'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 110}
Score: 0.7

In [23]:
GB = CV_4.best_estimator_
GB

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=3, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

## Evaluation of best models on validation dataset

In [24]:
mdl_names = ['LR', 'MLP', 'DT', 'RF', 'GB']
mdl_list = [LR, MLP, DT, RF, GB]
models = dict(zip(mdl_names, mdl_list))

In [25]:
def evaluate_model(name, model, features, target):
    start = time()
    pred = model.predict(features)
    end = time()
    r2 = round(r2_score(target, pred), 4)
    mae = round(mean_absolute_error(target, pred), 2)
    medae = round(median_absolute_error(target, pred), 2)
    print('{} -- R2 Score: {} / MAE: {} / MedAE: {} / Latency: {}ms'.format(name, 
                                                                            r2, 
                                                                            mae, 
                                                                            medae, 
                                                                            round((end - start)*1000, 1)))

In [26]:
for name, mdl in models.items():
    evaluate_model(name, mdl, X_val, y_val)

LR -- R2 Score: 0.7139 / MAE: 4179.57 / MedAE: 2785.77 / Latency: 0.0ms
MLP -- R2 Score: 0.8096 / MAE: 2979.49 / MedAE: 1581.12 / Latency: 104.2ms
DT -- R2 Score: 0.8351 / MAE: 2622.11 / MedAE: 1742.41 / Latency: 0.0ms
RF -- R2 Score: 0.8415 / MAE: 2509.42 / MedAE: 1597.56 / Latency: 26.0ms
GB -- R2 Score: 0.8442 / MAE: 2495.53 / MedAE: 1521.99 / Latency: 1.0ms


## Final model evaluation on test dataset

In [27]:
evaluate_model('Gradient Boosting', models['GB'], X_test, y_test)

Gradient Boosting -- R2 Score: 0.8929 / MAE: 2292.25 / MedAE: 1465.38 / Latency: 1.0ms
