In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the insurance CSV from the Kaggle input directory into a DataFrame
data = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
# Split the frame into predictors (X) and the regression target (y)
X = data.drop(columns=['charges'])
y = data['charges']

# numerical columns
# Detected through the dtype API rather than by comparing against the string
# 'object', so string-backed dtypes are never misclassified as numeric.
# Only predictors are scanned, so the target never has to be removed afterwards.
numerical = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]

# categorical columns: every predictor that is not numeric
categorical = [col for col in X.columns if col not in numerical]

print("Numerical columns are: ", numerical)
print("Categorical columns are: ", categorical)

In [None]:
# Dataset dimensions as (rows, columns)
data.shape

# 1 - EDA

In [None]:
# Column labels of the raw dataset
data.columns

In [None]:
# Per-column dtype and non-null count summary
data.info()

In [None]:
# Preview the first rows of the raw dataset
data.head()

# 1.0 Missing values

In [None]:
# Columns holding at least one null entry
null_flags = data.isnull()
cols_with_na = [col for col in data.columns if null_flags[col].any()]

# Share of missing entries in each of those columns
null_flags[cols_with_na].mean()

Good! This dataset has no missing values!

## Distribution of the target

In [None]:
import seaborn as sns
from scipy import stats
from scipy.stats import norm

In [None]:
# Optional seaborn styling, currently disabled:
# sns.set_style('white')
# sns.set_color_codes(palette='deep')

# Distribution plot of the target variable
sns.displot(data['charges'])
plt.show()

In [None]:
# Probability Plot
# Plots the ordered target values against theoretical quantiles
# (scipy's probplot compares against a normal distribution by default).
fig = plt.figure()
res = stats.probplot(data['charges'], plot=plt)

plt.show()

In [None]:
# Shape statistics (skewness and kurtosis) of the target
target = data['charges']
skewness, kurtosis = target.skew(), target.kurt()
print("Skewness: {:.4f}".format(skewness))
print("Kurtosis: {:.4f}".format(kurtosis))

## EDA on numerical data

In [None]:
# Summary statistics of the numerical predictors
data[numerical].describe()

## Discrete variables

In [None]:
# Numerical columns with fewer than 20 distinct values are treated as discrete
discrete_vars = [col for col in numerical if data[col].nunique(dropna=False) < 20]

print("Discrete variables: ", discrete_vars)

In [None]:
# Histogram of the 'children' column across all policies
data['children'].hist()
plt.title('Distribution of Children')  # typo fixed: previously 'Chilren'
plt.ylabel('Number of policies')
plt.show()

In [None]:
def analyze_discrete(df, var):
    """Bar-plot the median of ``charges`` for every level of ``var``.

    Parameters
    ----------
    df : DataFrame containing ``var`` and a ``charges`` column.
    var : name of the column to group by; also used as the plot title.
    """
    median_by_level = df.groupby(var)['charges'].median()
    median_by_level.plot.bar()
    plt.title(var)
    plt.ylabel('Median Charges')
    plt.show()


for column in discrete_vars:
    analyze_discrete(data, column)

The number of children shows no clear relationship with median charges, so we may consider dropping this variable when modeling.

# 1.1 Continuous variables

In [None]:
# Numerical columns that were not flagged as discrete are treated as continuous
continuous_vars = list(filter(lambda col: col not in discrete_vars, numerical))

print("Continuous variables: ", continuous_vars)

In [None]:
# Include the target in the continuous-variable plots.
# The membership guard keeps this cell idempotent: re-running it no longer
# appends a duplicate 'charges' entry (which would duplicate every later plot).
if 'charges' not in continuous_vars:
    continuous_vars.append('charges')

In [None]:
def analyze_continuous(df, var):
    """Draw a 50-bin histogram of column ``var`` of ``df``.

    The column name is used as both the title and the x-axis label.
    """
    df[var].hist(bins=50)
    plt.title(var)
    plt.xlabel(var)
    plt.ylabel('Number of policies')
    plt.show()


for column in continuous_vars:
    analyze_continuous(data, column)

Age is not normally distributed. BMI is roughly bell-shaped, but it skews to the left.
Charges (i.e. the target value) do not follow a normal distribution.

Let's evaluate if a logarithmic transformation of the variables returns values that follow a normal distribution:

In [None]:
import numpy as np

# Applying a logarithmic transformation to continuous variables
def analyze_log_continuous(df, var):
    """Plot a 50-bin histogram of log(``df[var]``).

    Nothing is drawn when the column contains zero or negative values,
    because the natural logarithm is undefined there.

    Parameters
    ----------
    df : DataFrame containing column ``var``; it is not modified.
    var : name of the column to transform and plot.
    """
    values = df[var]
    # Check the frame that was passed in. The previous version inspected the
    # global `data`, so the guard silently ignored the `df` argument.
    if (values <= 0).any():
        return

    np.log(values).hist(bins=50)
    plt.title(var)
    plt.ylabel('Number of policies')
    plt.xlabel(var)
    plt.show()

# Plot the log-transformed histogram for every continuous variable
for var in continuous_vars:
    analyze_log_continuous(data, var)

## Correlation to target

After the logarithmic transformation above, we get a better spread of values for BMI and Charges.

In [None]:
# Correlation matrix
fig, ax = plt.subplots(figsize=(8,6))
# Correlate numeric columns only: since pandas 2.0 `DataFrame.corr()` no longer
# drops string columns silently and raises when it meets them.
corr_matrix = data.select_dtypes(include='number').corr()

# Draw on the axes created above instead of relying on the implicit current axes
sns.heatmap(corr_matrix, vmax=0.6, annot=True, cmap='hot', ax=ax)
plt.show()

In [None]:
# One scatter panel per numerical feature, laid out in a single row.
# The previous version created a 1x3 grid with plt.subplots and then drew into a
# different 3x3 grid via plt.subplot, stacking the plots over empty axes and
# leaving most of the tall figure blank.
# squeeze=False keeps `axes` two-dimensional even when only one feature exists.
fig, axes = plt.subplots(nrows=1, ncols=len(numerical), figsize=(20, 6), squeeze=False)

for ax, feature in zip(axes.ravel(), numerical):
    ax.scatter(x=feature, y='charges', data=data)
    ax.set_title('%s vs Charges' % feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('Charges')

plt.show()

There is almost no correlation between Charges and Number of Children, as shown in both the scatter plot and the correlation matrix.
Charges tend to increase as Age increases. There is also a slightly positive correlation between BMI and Charges.

In [None]:
# explore the relationship between the transformed target and variables

def transform_analyze_continuous(df, var):
    """Scatter log(``df[var]``) against log(``df['charges']``).

    Nothing is drawn when ``var`` contains zero or negative values,
    because the natural logarithm is undefined there.

    Parameters
    ----------
    df : DataFrame containing ``var`` and ``charges``; it is not modified.
    var : name of the predictor column to plot on the x-axis.
    """
    # Check the frame that was passed in. The previous version inspected the
    # global `data`, so the guard silently ignored the `df` argument.
    if (df[var] <= 0).any():
        return

    # Both series are derived from the untouched input, so the target is
    # logged exactly once even if `var` happens to be 'charges'.
    log_feature = np.log(df[var])
    log_target = np.log(df['charges'])

    plt.scatter(x=log_feature, y=log_target)
    plt.title(var)
    plt.xlabel(var)
    plt.ylabel('charges')
    plt.show()
        
# Plot every continuous predictor against the target; the target itself is skipped
for var in continuous_vars:
    if var != 'charges':
        transform_analyze_continuous(data, var)

# 1.2 Outliers

In [None]:
def find_outliers(df, var):
    """Draw a box plot of log(``df[var]``) to expose outliers.

    Nothing is drawn when the column contains zero or negative values,
    because the natural logarithm is undefined there.

    Parameters
    ----------
    df : DataFrame containing column ``var``; it is not modified.
    var : name of the column to transform and plot.
    """
    # Check the frame that was passed in. The previous version inspected the
    # global `data`, so the guard silently ignored the `df` argument.
    if (df[var] <= 0).any():
        return

    # Log only the column of interest; a one-column frame keeps DataFrame.boxplot usable
    logged = np.log(df[[var]])
    logged.boxplot(column=var)
    plt.title(var)
    plt.ylabel(var)
    plt.show()
        
# Box-plot every continuous variable on the log scale
for var in continuous_vars:
    find_outliers(data, var)

BMI contains outliers

In [None]:
# Identify BMI outliers with the IQR rule (this cell only reports them;
# the actual removal happens later in the notebook)
bmi_mean, bmi_median = data['bmi'].mean(), data['bmi'].median()
print("Mean: {:.3f} \tMedian: {:.3f}".format(bmi_mean, bmi_median))

# Fence width as a multiple of the inter-quartile range
margin = 1.5
q25, q75 = np.percentile(data['bmi'], [25, 75])
iqr = q75 - q25

print("Q25: {:.3f} \tQ75: {:.3f} \tIQR: {:.3f}".format(q25, q75, iqr))

lower_cutoff = q25 - (margin * iqr)
upper_cutoff = q75 + (margin * iqr)
print("Lower cutoff: {:.3f} \tUpper cutoff: {:.3f}".format(lower_cutoff, upper_cutoff))

# Values lying outside either fence
beyond_fences = (data['bmi'] < lower_cutoff) | (data['bmi'] > upper_cutoff)
outliers = data.loc[beyond_fences, 'bmi'].tolist()
print("BMI outliers: ", outliers)

# 1.3 Categorical variables

In [None]:
# Preview the categorical columns
data[categorical].head()

### Cardinality: number of labels

In [None]:
# Number of distinct labels in each categorical column
data[categorical].nunique()

All categorical variables show low cardinality.

### Rare labels

In [None]:
def analyze_rare_labels(df, var, rare_perc):
    """Return the labels of ``var`` whose share of rows is below ``rare_perc``.

    The share of a label is the count of its non-null ``charges`` values
    divided by the total number of rows in ``df``.

    Parameters
    ----------
    df : DataFrame containing ``var`` and a ``charges`` column.
    var : name of the categorical column to inspect.
    rare_perc : threshold expressed as a fraction of all rows.

    Returns
    -------
    Series indexed by label, holding the share of each rare label.
    """
    share = df.groupby(var)['charges'].count().div(len(df))
    return share.loc[share < rare_perc]

# Report labels covering less than 1% of the rows, one categorical column at a time
for var in categorical:
    print(analyze_rare_labels(data, var, 0.01))

No rare labels are found.

## Correlation between categorical variables and target

In [None]:
# Median charges per label, for every categorical column
for var in categorical:
    analyze_discrete(data, var)

`smoker` shows a clear difference in median charges, and `region` shows a slight difference. `sex` shows no difference, so we may consider dropping this variable.

# 2 - Feature Engineering

## Setting the seed

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# visualize all columns
# Call set_option on the public `pd` namespace; `pd.pandas` is not a documented
# attribute and only resolves as a side effect of how the package imports itself.
pd.set_option('display.max_columns', None)

In [None]:
# Dataset dimensions before outlier removal
data.shape

In [None]:
# Preview the data before feature engineering
data.head()

# 2.0 Remove outliers

In [None]:
# Keep only the rows whose BMI lies inside the IQR fences computed earlier
bmi_out_of_range = (data['bmi'] < lower_cutoff) | (data['bmi'] > upper_cutoff)
new_data = data[~bmi_out_of_range]
print("New data size: ", new_data.shape)

## Separate dataset into train and test

In [None]:
# Predictors and target taken from the outlier-free frame
y = new_data['charges']
X = new_data.drop(columns=['charges'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Shapes of the training and test predictor frames
print(X_train.shape, X_test.shape)

# 2.1 Numerical variables transformation

In [None]:
# Numerical predictor names detected earlier
numerical

In [None]:
# Numerical predictors of the training split before transformation
X_train[numerical].head()

In [None]:
# Log transform the positive numerical variables.
# `assign` builds new frames instead of writing columns into the objects
# returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning.
# NOTE: X_train / X_test are rebound here, so re-running this cell would apply
# the logarithm a second time.
log_vars = ['age', 'bmi']
X_train = X_train.assign(**{var: np.log(X_train[var]) for var in log_vars})
X_test = X_test.assign(**{var: np.log(X_test[var]) for var in log_vars})

# 2.2 Encoding categorical variables

In [None]:
# Categorical predictor names detected earlier
categorical

In [None]:
# Categorical predictors of the training split before encoding
X_train[categorical].head()

In [None]:
# One-hot encode the categorical predictors of the training split
X_train_encoded = pd.get_dummies(X_train[categorical])

# Encode the test split, then force it onto the training dummy columns.
# Encoding each split independently yields different column sets whenever a
# label is missing from one split: labels unseen in training are dropped and
# labels absent from the test split are filled with 0.
X_test_encoded = (
    pd.get_dummies(X_test[categorical])
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)

## Combine dataset

In [None]:
# Join the numerical predictors with the one-hot encoded categorical ones.
# The numerical column names come from the `numerical` list built earlier
# rather than a second hard-coded copy, so the two cannot drift apart.
X_train = pd.concat([X_train[numerical], X_train_encoded], axis=1)
X_test = pd.concat([X_test[numerical], X_test_encoded], axis=1)

In [None]:
# Remember the column labels: the scaler below returns plain arrays without them
train_columns = X_train.columns

# 2.3 Feature Scaling

In [None]:
scaler = MinMaxScaler()

# Fit the scaler on the training split only, then apply the same scaling to the test split
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap the scaled arrays back into labelled DataFrames.
# Reusing the target's index keeps X and y aligned by label; without it the
# frames would get a fresh 0..n-1 index while y_train / y_test keep the original
# row labels, and any label-based join or arithmetic would silently misalign.
X_train = pd.DataFrame(X_train, columns=train_columns, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=train_columns, index=y_test.index)

In [None]:
# # let's now save the train and test sets for the next notebook!

# X_train.to_csv('xtrain.csv', index=False)
# X_test.to_csv('xtest.csv', index=False)

# 3 - Feature Selection

In [None]:
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.feature_selection import SelectFromModel

In [None]:
# Feature selection with an L1-penalised linear model.
# alpha is the penalty strength: the larger it is, the fewer features survive.
lasso_selector = Lasso(alpha=0.005, random_state=0)

# SelectFromModel wraps the estimator and keeps the features whose fitted
# coefficients are non-zero.
sel_ = SelectFromModel(lasso_selector)
sel_.fit(X_train, y_train)

In [None]:
# Boolean mask with one entry per column of X_train: True where the feature was kept
sel_.get_support()

In [None]:
# Translate the selector's boolean mask into column labels
support_mask = sel_.get_support()
selected_features = X_train.columns[support_mask]

print(selected_features)

# 4 - Base Models with all features

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from sklearn.model_selection import cross_val_score, KFold, cross_validate
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [None]:
lin_reg_model = LinearRegression()
# NOTE: the two names below have the penalty labels swapped (Ridge applies an
# L2 penalty, Lasso an L1 penalty). They are kept because later cells refer to them.
ridge_l1_model = Ridge(alpha=1.0, solver='auto', random_state=0)
lasso_l2_model = Lasso(alpha=0.0005, random_state=0)
elastic_net_model = ElasticNet(alpha=0.001, l1_ratio=0.8, max_iter=10000, random_state=0)
tree_model = DecisionTreeRegressor(random_state=0)
# max_features=1.0 uses every feature at each split. That is what 'auto' meant
# for the forest regressor before scikit-learn deprecated it (1.1) and removed
# it (1.3), so results are unchanged on old versions and the cell runs on new ones.
forest_model = RandomForestRegressor(bootstrap=True, max_features=1.0, n_jobs=-1, random_state=0)
gb_model = GradientBoostingRegressor(loss='huber', random_state=0)
xgb_model = xgb.XGBRegressor(learning_rate=0.01, nthread=-1, random_state=0)
lgb_model = lgb.LGBMRegressor(learning_rate=0.01, objective='regression', random_state=0)

# Polynomial regression of degree 2-4. Each pipeline owns its own
# LinearRegression so that fitting one never overwrites the state of another
# or of `lin_reg_model`.
poly2_pipeline = Pipeline([('transform', PolynomialFeatures(degree=2)), ('model', LinearRegression())])
poly3_pipeline = Pipeline([('transform', PolynomialFeatures(degree=3)), ('model', LinearRegression())])
poly4_pipeline = Pipeline([('transform', PolynomialFeatures(degree=4)), ('model', LinearRegression())])

In [None]:
# Every candidate estimator compared in the baseline round
base_models = [lin_reg_model, ridge_l1_model, lasso_l2_model, elastic_net_model, 
               tree_model, forest_model, gb_model, xgb_model, lgb_model, 
               poly2_pipeline, poly3_pipeline, poly4_pipeline]

# 3-fold cross-validation with shuffling; the fixed seed keeps the folds repeatable
kf = KFold(n_splits=3, random_state=5, shuffle=True)

# Metrics gathered per fold; cross_validate exposes them as 'test_NMSE' and 'test_R2'
scoring = {'NMSE': 'neg_mean_squared_error', 'R2': 'r2'}

def train_models(model_list, X, y):
    """Cross-validate every estimator and collect its summary statistics.

    Uses the module-level ``scoring`` mapping and ``kf`` splitter.

    Parameters
    ----------
    model_list : iterable of estimators (Pipelines are reported by polynomial degree).
    X : feature matrix passed to ``cross_validate``.
    y : target passed to ``cross_validate``.

    Returns
    -------
    list of dict, one per estimator, holding its name, mean fit time and the
    per-fold / mean / standard deviation of RMSE and R2.
    """
    results = []

    for estimator in model_list:
        label = type(estimator).__name__
        # Pipelines are told apart by the degree of their polynomial transform
        if label == 'Pipeline':
            label = 'PolyReg_Degree%s' % estimator.get_params()['transform__degree']

        print("Model fitting: %s" % label)

        cv_out = cross_validate(estimator, X, y, scoring=scoring, cv=kf)
        # The scorer returns negated MSE, so flip the sign before the square root
        rmse = np.sqrt(-1 * cv_out['test_NMSE'])
        r2 = cv_out['test_R2']

        results.append({
            'Model name': label,
            'Mean Fit Time': round(cv_out['fit_time'].mean(), 4),
            'RMSE Scores': np.around(rmse, 4),
            'Mean RMSE': round(rmse.mean(), 4),
            'RMSE STD': round(rmse.std(), 4),
            'R2 Scores': np.around(r2, 4),
            'Mean R2': round(r2.mean(), 4),
            'R2 STD': round(r2.std(), 4),
        })

    return results


base_model_performance_matrix = train_models(base_models, X_train, y_train)

In [None]:
# Rank the baseline models by mean cross-validated RMSE (lowest first)
pd.DataFrame(base_model_performance_matrix).sort_values(by='Mean RMSE', ascending=True)

Gradient Boosting, Random Forest, and Polynomial Regression with degree 2 and degree 3 are promising models.

# 4.2 Promising models using selected features

In [None]:
# Shortlist of models re-evaluated on the selected features only
promising_models = [forest_model, gb_model, poly2_pipeline, poly3_pipeline]


promising_model_performance_matrix = train_models(promising_models, X_train[selected_features], y_train)

In [None]:
# Rank the shortlisted models by mean cross-validated RMSE (lowest first)
pd.DataFrame(promising_model_performance_matrix).sort_values(by='Mean RMSE', ascending=True)

# 5 - Hyperparameters Tuning

## 5-1 Gradient Boosting

In [None]:
from sklearn.model_selection import GridSearchCV


def print_cv_results(grid_search):
    """Print the RMSE of every parameter combination that was searched.

    ``grid_search`` must expose ``cv_results_`` with a 'mean_test_score'
    entry holding negated MSE values and a matching 'params' entry.
    One line is printed per combination: the RMSE followed by the parameters.
    """
    results = grid_search.cv_results_
    # Scores are negated MSE, so flip the sign before taking the square root
    rmse_values = np.sqrt(-np.asarray(results['mean_test_score']))
    for rmse, candidate in zip(rmse_values, results['params']):
        print(rmse, candidate)

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Gradient Boosting
# param_grid = [
#     {"learning_rate": [0.01, 0.05, 0.1, 0.25, 0.5, 1],
# #      'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200], #np.arange(300, 1000, 200),
# #      'max_depth': np.linspace(1, 32, 32, endpoint=True),
# #      'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
# #      'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
# #      'max_features': list(range(1,len(selected_features)))
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

In [None]:
# def plot_cv_resutls(grid_search):
#     grid_df = pd.DataFrame(grid_search.cv_results_)
#     x = grid_df.iloc[:, 4]
#     y = np.sqrt(-grid_df['mean_test_score'])
#     plt.title(grid_df.columns[4])
#     plt.xlabel(grid_df.columns[4])
#     plt.ylabel('RMSE')
#     plt.plot(x, y, label='RMSE', c='b')
#     plt.show()

# def print_cv_results(grid_search):
#     cvres = grid_search.cv_results_
#     for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
#         print(np.sqrt(-mean_score), params)

In [None]:
# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {"learning_rate": [0.1],
#      'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200, 500],
# #      'max_depth': np.linspace(1, 32, 32, endpoint=True),
# #      'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
# #      'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
# #      'max_features': list(range(1,len(selected_features)))
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {"learning_rate": [0.1],
#      'n_estimators': [100],
#      'max_depth': np.linspace(1, 32, 32, endpoint=True),
# #      'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
# #      'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
# #      'max_features': list(range(1,len(selected_features)))
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {"learning_rate": [0.1],
#      'n_estimators': [100],
#      'max_depth': [3], # np.linspace(1, 32, 32, endpoint=True),
#      'min_samples_split': np.arange(1, 20, 1)
# #      'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
# #      'max_features': list(range(1,len(selected_features)))
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {"learning_rate": [0.1],
#      'n_estimators': [100],
#      'max_depth': [3], # np.linspace(1, 32, 32, endpoint=True),
#      'min_samples_split': [7], # np.arange(1, 20, 1)
#      'min_samples_leaf': np.arange(1, 20, 1),
# #      'max_features': list(range(1,len(selected_features)))
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {"learning_rate": [0.1],
#      'n_estimators': [100],
#      'max_depth': [3],
#      'min_samples_split': [7],
#      'min_samples_leaf': [10],
#      'max_features': ['auto', 'sqrt', 'log2', None]
#     }]

# grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# Final gradient boosting grid: a single combination, the best values found
# by the step-by-step searches above.
param_grid = [
    {"learning_rate": [0.1],
     'n_estimators': [100],
     'max_depth': [3],
     'min_samples_split': [7],
     'min_samples_leaf': [10],
     # None considers every feature at each split. That is what 'auto' meant
     # for gradient boosting before scikit-learn removed the option in 1.3,
     # where passing 'auto' raises a parameter validation error.
     'max_features': [None]
    }]

grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
grid_search.fit(X_train[selected_features], y_train)

print_cv_results(grid_search)
# Best parameters together with the corresponding cross-validated RMSE
grid_search.best_params_, np.sqrt(-grid_search.best_score_)

## 5-2 Random Forest

In [None]:
# param_grid = [
#     {'bootstrap': [True, False],
#      'max_depth': [10, 20, 30, 40, 50, None],
# #      'max_features': ['auto', 'sqrt'],
# #      'min_samples_leaf': np.arange(1, 20, 1),
# #      'min_samples_split': np.arange(1, 20, 1),
# #      'n_estimators': np.arange(100, 2000, 100)
#     }]

# grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {'bootstrap': [True],
#      'max_depth': np.arange(1, 11, 1)# [10, 20, 30, 40, 50, None],
# #      'max_features': ['auto', 'sqrt'],
# #      'min_samples_leaf': np.arange(1, 20, 1),
# #      'min_samples_split': np.arange(1, 20, 1),
# #      'n_estimators': np.arange(100, 2000, 100)
#     }]

# grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {'bootstrap': [True],
#      'max_depth': [5],
#      'max_features': ['auto', 'sqrt'],
#      'min_samples_leaf': np.arange(1, 20, 1),
# #      'min_samples_split': np.arange(1, 20, 1),
# #      'n_estimators': np.arange(100, 2000, 100)
#     }]

# grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {'bootstrap': [True],
#      'max_depth': [5],
#      'max_features': ['auto'],
#      'min_samples_leaf': [6],
#      'min_samples_split': np.arange(1, 21, 1),
# #      'n_estimators': np.arange(100, 2000, 100)
#     }]

# grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# param_grid = [
#     {'bootstrap': [True],
#      'max_depth': [5],
#      'max_features': ['auto'],
#      'min_samples_leaf': [6],
#      'min_samples_split': [2],
#      'n_estimators': np.arange(100, 2000, 100)
#     }]

# grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
# grid_search.fit(X_train[selected_features], y_train)

# print_cv_results(grid_search)
# grid_search.best_params_, np.sqrt(-grid_search.best_score_)

In [None]:
# Final random forest grid: a single combination, the best values found
# by the step-by-step searches above.
param_grid = [
    {'bootstrap': [True],
     'max_depth': [5],
     # 1.0 uses every feature at each split. That is what 'auto' meant for the
     # forest regressor before scikit-learn removed the option in 1.3, where
     # passing 'auto' raises a parameter validation error.
     'max_features': [1.0],
     'min_samples_leaf': [6],
     'min_samples_split': [2],
     'n_estimators': [300] #np.arange(100, 2000, 100)
    }]

grid_search = GridSearchCV(forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3, n_jobs=-1)
grid_search.fit(X_train[selected_features], y_train)

print_cv_results(grid_search)
# Best parameters together with the corresponding cross-validated RMSE
grid_search.best_params_, np.sqrt(-grid_search.best_score_)

# 6-1 Performance Evaluation

In [None]:
# Tuned hyperparameters found by the grid searches above.
# 'auto' is no longer accepted for max_features (removed in scikit-learn 1.3);
# the replacements below select all features, which is what 'auto' resolved
# to for each of these two regressors.
gb_params = {'learning_rate': 0.1,
  'max_depth': 3,
  'max_features': None,
  'min_samples_leaf': 10,
  'min_samples_split': 7,
  'n_estimators': 100}

forest_params = {'bootstrap': True,
  'max_depth': 5,
  'max_features': 1.0,
  'min_samples_leaf': 6,
  'min_samples_split': 2,
  'n_estimators': 300}

In [None]:
# Rebuild both estimators from the tuned hyperparameter dictionaries
forest_model = RandomForestRegressor(n_jobs=-1, random_state=0, **forest_params)
gb_model = GradientBoostingRegressor(loss='huber', random_state=0, **gb_params)

# 6-1a Gradient Boosting Model - Performance Evaluation

In [None]:
# Fit the tuned gradient boosting model on the selected features
gb_model.fit(X_train[selected_features], y_train)

train_pred = gb_model.predict(X_train[selected_features])
test_pred = gb_model.predict(X_test[selected_features])

# Each MSE is computed once; the RMSE is derived from it
train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

# Training-split metrics
print("Train MSE: {}".format(int(train_mse)))
print("Train RMSE: {}".format(int(sqrt(train_mse))))
print("Train R2 score: {}".format(r2_score(y_train, train_pred)))
print('\n')

# Test-split metrics
print("Test MSE: {}".format(int(test_mse)))
print("Test RMSE: {}".format(int(sqrt(test_mse))))
print("Test R2 score: {}".format(r2_score(y_test, test_pred)))

In [None]:
# True and predicted charges plotted against the age feature.
# X_test['age'] was log-transformed and min-max scaled earlier in the notebook,
# so the axis label states that instead of implying raw age in years.
plt.scatter(X_test['age'], y_test, c='r', label='True')
plt.scatter(X_test['age'], test_pred, label='Predicted')
plt.legend(loc='best')
plt.xlabel('Age (log-transformed, min-max scaled)')
plt.ylabel('Charges')
plt.title('True/ Predicted Charges vs. Age')
plt.show()

In [None]:
# Predicted versus true charges on the test split
fig, ax = plt.subplots()
ax.scatter(y_test, test_pred)
ax.set_xlabel("True charges")
ax.set_ylabel("Predicted charges")
ax.set_title("Evaluation of Predictions")
plt.show()

In [None]:
# Distribution of the test-split residuals (true minus predicted)
errors = y_test - test_pred
errors.hist(bins=20)
plt.show()

# 6-2a Gradient Boosting - Feature Importance

In [None]:
# Importances of the fitted gradient boosting model, largest first
gb_scores = np.abs(gb_model.feature_importances_.ravel())
importance = pd.Series(gb_scores, index=selected_features).sort_values(ascending=False)

ax = importance.plot.bar(figsize=(12,6))
plt.ylabel('Gradient Boosting Model Coefficients')
plt.title('Feature Importance')

# Write each bar's value just above it
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate("%.6f" % bar_top,
                (bar_centre, bar_top),
                ha='center', va='center',
                xytext=(0, 10),
                textcoords='offset points')

plt.show()

pd.DataFrame(importance)

# 6-1b Random Forest Model - Performance Evaluation

In [None]:
# Random Forest Model with best params
forest_model.fit(X_train[selected_features], y_train)

train_pred = forest_model.predict(X_train[selected_features])
test_pred = forest_model.predict(X_test[selected_features])

# Metrics of train set predictions
print("Train MSE: {}".format(int(mean_squared_error(y_train, train_pred))))
print("Train RMSE: {}".format(int(sqrt(mean_squared_error(y_train, train_pred)))))
print("Train R2 score: {}".format(r2_score(y_train, train_pred)))
print('\n')

# Metrics of test set predictions
print("Test MSE: {}".format(int(mean_squared_error(y_test, test_pred))))
print("Test RMSE: {}".format(int(sqrt(mean_squared_error(y_test, test_pred)))))
print("Test R2 score: {}".format(r2_score(y_test, test_pred)))

In [None]:
# True and predicted charges plotted against the age feature.
# X_test['age'] was log-transformed and min-max scaled earlier in the notebook,
# so the axis label states that instead of implying raw age in years.
plt.scatter(X_test['age'], y_test, c='r', label='True')
plt.scatter(X_test['age'], test_pred, label='Predicted')
plt.legend(loc='best')
plt.xlabel('Age (log-transformed, min-max scaled)')
plt.ylabel('Charges')
plt.title('True/ Predicted Charges vs. Age')
plt.show()

In [None]:
# Predicted versus true charges on the test split
fig, ax = plt.subplots()
ax.scatter(y_test, test_pred)
ax.set_xlabel("True charges")
ax.set_ylabel("Predicted charges")
ax.set_title("Evaluation of Predictions")
plt.show()

In [None]:
# Distribution of the test-split residuals (true minus predicted)
errors = y_test - test_pred
errors.hist(bins=20)
plt.show()

# 6-2b Random Forest - Feature Importance

In [None]:
# Importances of the fitted random forest model, largest first.
# sort_values returns a new Series, so no in-place mutation is needed.
forest_scores = np.abs(forest_model.feature_importances_.ravel())
importance = pd.Series(forest_scores, index=selected_features).sort_values(ascending=False)

ax = importance.plot.bar(figsize=(12,6))
# Label corrected: this cell plots the random forest, and the values are
# feature importances (the old text was copied from the gradient boosting cell).
plt.ylabel('Random Forest Feature Importances')
plt.title('Feature Importance')

# Write each bar's value just above it
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate("%.6f" % bar_top,
                (bar_centre, bar_top),
                ha='center', va='center',
                xytext=(0, 10),
                textcoords='offset points')

plt.show()

pd.DataFrame(importance)