In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Load the insurance dataset from the notebook's input directory.
insurance_csv = "../input/insurance/insurance.csv"
df = pd.read_csv(insurance_csv)

Dataset: from *Machine Learning with R* by Brett Lantz, with regions located in the USA.

In [None]:
## Data Info
# Print pandas' built-in summary of the loaded frame (DataFrame.info).
df.info()

In [None]:
# Preview the first rows, transposed so that each column is shown as a row.
df.head().transpose()

In [None]:
## Missing Values
# Number of missing entries in each column.
df.isna().sum()

In [None]:
## Target Variable
### Distribution
# Histogram of the target column with a KDE curve overlaid.
target = df["charges"]
sns.displot(target, kde=True, color="b")
plt.show()

In [None]:
# Print the value frequencies of every object-dtype column.
object_columns = [name for name in df.columns if df[name].dtype == 'O']
for column in object_columns:
    print(column)
    print(df[column].value_counts(), "\n\n")

In [None]:
## Feature Engineering

from sklearn.preprocessing import LabelEncoder

# Binary categoricals become 0/1 integer flags.
sex_map = {'male': 1, 'female': 0}
smoker_map = {'yes': 1, 'no': 0}

encoded_sex = df['sex'].map(sex_map)
df['sex'] = encoded_sex.astype('int64')

encoded_smoker = df['smoker'].map(smoker_map)
df['smoker'] = encoded_smoker.astype('int64')

# Region is replaced by one integer code per label.
LE = LabelEncoder()
region_codes = LE.fit(df['region']).transform(df['region'])
df['region'] = region_codes
# Alternative idea (not applied): the southeast region has the highest expenses,
# so region could instead be collapsed to southeast = 2 and every other region = 1:
# df['region'] = df['region'].replace(('southeast', 'southwest', 'northwest', 'northeast'), (2, 1, 1, 1))
# df['region'].value_counts()

In [None]:
# Drop every row that has a missing value, printing the row count before and after.
rows_before = len(df)
print(rows_before)
df = df.dropna()

rows_after = len(df)
print(rows_after)

## Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

sex_map = {'male':1, 'female':0}
smoker_map = {'yes':1, 'no':0}


def _encode_binary(series, mapping):
    """Replace the raw labels in `series` with their integer codes from `mapping`.

    The encoding is idempotent: a column whose non-missing values are already
    the integer codes is returned untouched. Without this guard, running the
    cell on an already-encoded column looks up 0/1 in a dict keyed by strings
    and silently turns the whole column into NaN.

    Parameters:
        series: the column to encode.
        mapping: dict of raw label -> integer code.

    Returns:
        The encoded column. Missing values stay missing, as with a plain
        `Series.map`.

    Raises:
        ValueError: if a non-missing value is neither a known label nor an
            existing code.
    """
    codes = list(mapping.values())
    if series.dropna().isin(codes).all():
        return series  # already encoded - nothing to do
    encoded = series.map(mapping)
    # Series.map yields NaN for unknown keys; tell those apart from inputs
    # that were already missing.
    unmapped = encoded.isna() & series.notna()
    if unmapped.any():
        raise ValueError(
            "Unexpected values in column {!r}: {}".format(
                series.name, sorted(series[unmapped].astype(str).unique())
            )
        )
    return encoded


df['sex'] = _encode_binary(df['sex'], sex_map)
df['smoker'] = _encode_binary(df['smoker'], smoker_map)

# Label-encode region only while it still holds text labels. Re-fitting on an
# already-encoded column would replace the fitted LE, whose classes_ hold the
# original region names, with one fitted on integers.
if not pd.api.types.is_numeric_dtype(df['region']):
    LE = LabelEncoder()
    df['region'] = LE.fit_transform(df['region'])
# Southeast region makes highest expense so let region southeast = 2 and others are 1
# df['region'] = df['region'].replace(('southeast', 'southwest', 'northwest', 'northeast'), (2, 1, 1, 1))
# df['region'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Separate the feature matrix from the target column.
target_column = 'charges'
feature_frame = df.drop(columns=target_column)
X = feature_frame.values
y = df[target_column].values

# Hold out 20% of the rows for testing, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Report the shape of each array produced by the split.
split_shapes = (
    ('Size of x_train = ', X_train.shape),
    ('Size of x_test  = ', X_test.shape),
    ('Size of y_train = ', y_train.shape),
    ('Size of y_test  = ', y_test.shape),
)
for label, shape in split_shapes:
    print(label, shape)

In [None]:
## Feature Scaling

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

## Model Selection
# The two ensembles are stochastic; seed them so the scores printed below are
# reproducible from run to run.
models = [
    ('LR', LinearRegression()),
    ('RF', RandomForestRegressor(random_state=0)),
    ('GBR', GradientBoostingRegressor(random_state=0)),
]

## Model Evaluation
results = []  # one (r2, rmse) tuple per model, aligned with `names`
names = []
for name, model in models:
    # Fit on the scaled training split and score on the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    ## Cross Validation
    # 7-fold cross-validation with the estimator's default scorer, run on the
    # full dataset X, y (the arrays as they were before the split and scaling).
    cv = cross_val_score(model, X, y, cv = 7)

    # Store the computed score, not the r2_score function object.
    results.append((r2, rmse))
    names.append(name)
    print("Cross Validation - Reported accuracy should not have high variance")
    print(cv)
    print()
    # Scale to a percentage before rounding to avoid float artefacts such as
    # 89.60000000000001 in the printed value.
    print('{}:R2 {}% Accuracy - RMSE: Predicted Values +/-{}'.format(name, round(r2 * 100, 1), rmse))

Linear Regression, Random Forest and Gradient Boosting regressors give increasingly reliable results, in that order.

The Gradient Boosting model returns an R² of 89.6% and a Root Mean Squared Error of 4063.9423. This can be interpreted as the model explaining almost 90% of the variance in charges, with predictions deviating from the actual values by roughly +/- 4,000 on average.

Now we tune the parameters of the best performing Regressor.

In [None]:
### Hyper Parameter Tuning
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 4 * 3 * 3 * 4 * 5 = 720 candidates, each cross-validated,
# so this cell is expensive to run.
param_grid = {
    # 'ls' / 'lad' were renamed to 'squared_error' / 'absolute_error' and the
    # old spellings are rejected by current scikit-learn releases.
    'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
    # None (use all features) replaces the removed 'auto' option, which meant
    # the same thing for this regressor.
    'max_features': [None, 'sqrt', 'log2'],
    'learning_rate':[0.05, 0.1, 0.2],
    'max_depth':[1, 2, 10, 150],
    'n_estimators':[100, 150, 500, 750, 1000]
}

# Seed the estimator so the search picks the same winner on every run, and let
# the candidates be fitted in parallel on all available cores.
GBR = GradientBoostingRegressor(random_state = 0)
GBR_cv = GridSearchCV(estimator = GBR, param_grid = param_grid, verbose = 1, n_jobs = -1)
GBR_cv.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
params = GBR_cv.best_params_
print(params)

In [None]:
### Pipeline
# from sklearn.pipeline import make_pipeline

# Rebuild the regressor from every tuned hyper-parameter. Passing only
# learning_rate, loss and max_features would silently drop the tuned max_depth
# and n_estimators and fall back to their defaults. The seed keeps the fit
# reproducible.
model = GradientBoostingRegressor(random_state = 0, **params)
model.fit(X_train, y_train)

In [None]:
def evaluate_model(model, X_test, y_test, modelName, DataImb):
    """Print and return the R2 score and RMSE of a fitted model on a test set.

    The model is not fitted here; it must already be trained by the caller.

    Parameters:
        model: fitted estimator exposing `predict`.
        X_test: features to predict on.
        y_test: true target values for `X_test`.
        modelName: label printed after "Model" and returned first.
        DataImb: label printed after "Data Balancing Type" and returned second.

    Returns:
        list: [modelName, DataImb, r2, rmse]
    """
    print('------------------------------------------------')
    print("Model ", modelName, end="\n")
    print("Data Balancing Type ", DataImb)

    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    for label, value in (("R2 Score", r2), ("RMSE", rmse)):
        print(label, value)
    return [modelName, DataImb, r2, rmse]


evaluate_model(model, X_test, y_test, 'Gradient Boosting Regressor', "Auctual Data")

Not much improvement over our original score; however, the tuned Gradient Boosting Regressor returns an R² of ~90% and a Root Mean Squared Error of 3998.0224.

Better Tuning methods and inputs can still be explored.