In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, entry) for entry in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset and preview its first rows.
insurance_csv = '/kaggle/input/insurance/insurance.csv'
df = pd.read_csv(insurance_csv)
df.head()

In [None]:
# Heatmap of the boolean null mask (one cell per DataFrame entry).
null_mask = df.isnull()
sns.heatmap(null_mask)

In [None]:
# `df.info()` prints its report and returns None, so it runs as its own
# statement; the row count is the value the cell displays.
df.info()
len(df)

In [None]:
# Split the columns by dtype: object (string) columns are treated as
# categorical features, numeric columns as numerical features.
cats = df.select_dtypes('O')
# 'number' selects every integer/float width, whereas the 'int'/'float'
# aliases resolve to platform-dependent widths and can miss columns.
numerical = df.select_dtypes('number')
target = 'charges'
# The target is numeric too, so remove it from the feature frame.
numerical = numerical.drop(columns=target)
cat = cats.columns
cats.columns, numerical.columns

In [None]:
# One violin plot of the target per categorical feature.
for column in cats.columns:
    sns.catplot(x=column, y=target, kind='violin', data=df)

Males and Females have a similar charge distribution, however, males have a slightly larger distribution at costs near 45k, while having a slightly smaller distribution near charges of 5k. We should keep this parameter
Smoking has a great effect on the insurance charges, accounting for most, if not all, of the charges above 35k, while non-smokers account for all of the charges around <5k. This is a very important parameter to keep.
Region has very little effect on the charges. There is a slight change in distribution among the regions, similar to the male/female split. I'll add this in a different model to see if it makes any difference.

In [None]:
# Box plot of the target, followed by the share of rows with charges above 45k.
sns.boxplot(data=df, x=target)
high_charge_rows = df.loc[df.charges > 45000]
len(high_charge_rows) / len(df)

In [None]:
# Fraction of rows whose charges exceed 45k.
above_45k = df['charges'] > 45000
len(df.loc[above_45k]) / len(df)

Charges > 45k make up about 3% of our data; however, they are closely tied to high age and smoking. I'll keep them to help predict those values.

In [None]:
# Scatter every numerical feature against the target: one row of panels per
# numerical feature, one column per categorical feature (used as the hue).
# The grid is sized from the column lists instead of assuming a 3 x 3 layout.
n_rows, n_cols = len(numerical.columns), len(cat)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
ax = ax.flatten()
for i, column in enumerate(numerical.columns):
    for j, hue_column in enumerate(cat):
        # Row-major position of panel (i, j) in the flattened axes array.
        sns.scatterplot(x=column, y=target, hue=hue_column, ax=ax[n_cols * i + j], data=df)

Our target variable is 'Charges' which I will interchange with costs

1. The top three graphs show charges increasing as the insured ages. By eye, the increase is about 25% for the highest band, and maybe 400% for the lowest band.
    However, the charges at each age are highly segregated into three bands, with the highest and lowest bands being attributable to the smoking/non-smoking
    divide. At the youngest ages, the highest band is an order of magnitude higher in cost than the lowest band. The reasoning behind the middle band cannot be
    determined from the current graphs. It does appear that men pay more than women, but maybe this is a smoking correlation? I'll check to see
    what women/men smoking looks like, and whether there are separate bands for this.
2. BMI is not really a factor for charges if you don't smoke; for smokers, however, there is a strong correlation between BMI and charges
3. Having more kids doesn't seem to have an effect until after three children, although this might be due to a lack of samples
4. The effect of regions is difficult to see due to the muddied graphs. For example, orange (southeast) appears to have fewer children and a higher BMI,
    especially when compared to green (northwest).
    
More investigation

In [None]:
# Follow-up violin plots: charges by smoker split by sex, age by sex, and
# BMI by sex split by smoker.
violin_opts = dict(kind='violin', data=df)
sns.catplot(x='smoker', y=target, hue='sex', split=True, **violin_opts)
sns.catplot(x='sex', y='age', **violin_opts)
# Disabled alternative: sns.scatterplot(x = 'age', y = target, hue = 'smoker', style = 'sex', data = df)
sns.catplot(x='sex', y='bmi', hue='smoker', split=True, **violin_opts)

Non-smoking males and females have essentially the same cost distributions. Smokers have two 'central' charges, one near 20k and one near 40k. Male smokers are more likely than female smokers to be in the 40k region, and this is not due to any differences in the age distributions (which could also account for the differences in charges). So the bands in the charges with age can be seen as non-smokers (lowest band), mostly smoking women with some smoking men.

In the bottom graph, we see that women who smoke have a lower bmi, while men who smoke have a higher bmi. BMI is correlated with the insurance prices for smokers, so this may explain why smoking women have a distribution that favors lower costs.


The important parameters appear to be:
1. Smoking, this clearly has the largest effect
2. BMI, there's a strong effect with increasing BMI if the client is a smoker
3. Age

Parameters that might have some contributions:
1. Sex, the distributions for smoking charges are different for men and women, however this is likely due to smoking women having a lower bmi than men.
2. Region, which shows only a slight change in the charge distribution between regions.

In [None]:
from collections import defaultdict

from xgboost import XGBRegressor

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
def get_scores(X, y, z, folds, models, model_names):
    """Fit every model on every fold and collect train/validation RMSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Regression target aligned with ``X``; also indexed with ``.iloc``.
    z : array-like
        Labels handed to ``folds.split(X, z)``.
    folds : object with a ``split(X, z)`` method
        Yields ``(train_idx, val_idx)`` pairs of positional indices.
    models : sequence of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. They are fitted in
        place, so after the call each one holds the fit from the last fold.
    model_names : sequence of str
        Keys under which the scores are stored, paired with ``models``.

    Returns
    -------
    tuple of (defaultdict(list), defaultdict(list))
        ``(train_rmse, val_rmse)``: each maps a model name to its per-fold
        RMSE values, in fold order.
    """
    train_rmse = defaultdict(list)
    val_rmse = defaultdict(list)

    for train_idx, val_idx in folds.split(X, z):
        # The slices depend only on the fold, so build them once per fold
        # rather than once per model.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        for model, name in zip(models, model_names):
            model.fit(X_train, y_train)
            # RMSE is taken as sqrt(MSE): the `squared=False` shortcut of
            # `mean_squared_error` is deprecated since scikit-learn 1.4
            # (scheduled for removal in 1.6).
            train_rmse[name].append(np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
            val_rmse[name].append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))

    return train_rmse, val_rmse

In [None]:
def print_scores(name, scores, score_name, typ):
    """Print the average, maximum and minimum of a collection of scores.

    Parameters
    ----------
    name : str
        Label of the model the scores belong to.
    scores : sequence of float
        Scores to summarise (e.g. one value per cross-validation fold).
    score_name : str
        Name of the metric, used in the printed labels (e.g. ``'RMSE'``).
    typ : str
        Prefix describing the data the scores come from (e.g. ``'Train'``).

    Raises
    ------
    ValueError
        If ``scores`` is empty.
    """
    npscores = np.array(scores)
    if npscores.size == 0:
        # Fail with a clear message instead of a RuntimeWarning from the
        # mean followed by numpy's generic reduction error.
        raise ValueError('scores must contain at least one value')
    avg = npscores.mean()
    mx = npscores.max()
    mn = npscores.min()
    # The stray '+ ' that used to precede "Average" was a formatting slip.
    print(f'{typ} {name}:\n Average {score_name}: {avg} \n Max {score_name}: {mx} \n Min {score_name}: {mn}')

In [None]:
# Baseline comparison on the three features the EDA flagged as most important.
# `assign` builds a new frame with a real integer 0/1 smoker column; assigning
# through `.loc` on a slice of `df` raises SettingWithCopyWarning and can
# leave the column with object dtype.
X = df[['smoker', 'bmi', 'age']].assign(
    smoker=lambda frame: (frame['smoker'] == 'yes').astype(int)
)
y = df[target]
# Bucket the charges into 10k-wide bins; the bins are the stratification labels.
z = y.map(lambda x: x // 10000)

# One seed for the fold shuffling and the random forest, so reruns give the
# same scores.
SEED = 2

xgb = XGBRegressor()
forest = RandomForestRegressor(random_state=SEED)
ridge = Ridge()
svr = SVR()
knn = KNeighborsRegressor()

models = [ridge, forest, xgb, svr, knn]
model_names = ['Ridge', 'Forest', 'XGB', 'SVR', 'KNN']

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

train_rmse, val_rmse = get_scores(X, y, z, skf, models, model_names)

for name in model_names:
    print_scores(name, train_rmse[name], 'RMSE', 'Train')
    print_scores(name, val_rmse[name], 'RMSE', 'Val')

We can see that the Random Forest and XGB are overfitting the data, based on the average scores; however, they perform better than the Ridge Regressor. What is more concerning is that the overfitting is causing unstable results: for example, the Val RMSE for the forest regressor ranges from 1823 to 4882, with the largest value more than 2.5 times the smallest RMSE. I'm going to check a few more parameters before I start playing with the estimators to try to get them to be more stable.

KNN and SVR seem like really bad model choices

In [None]:
# Same comparison with `sex` added. Both binary columns are encoded as
# integer 0/1 flags on a new frame (no write into a slice of `df`).
X = df[['smoker', 'bmi', 'age', 'sex']].assign(
    smoker=lambda frame: (frame['smoker'] == 'yes').astype(int),
    sex=lambda frame: (frame['sex'] == 'male').astype(int),
)

models = [ridge, forest, xgb]
model_names = ['Ridge', 'Forest', 'XGB']

train_rmse, val_rmse = get_scores(X, y, z, skf, models, model_names)

for name in model_names:
    print_scores(name, train_rmse[name], 'RMSE', 'Train')
    print_scores(name, val_rmse[name], 'RMSE', 'Val')

So we can see that sex is fairly unimportant, as expected

In [None]:
# Same comparison with `region` added as a one-hot encoded feature. The
# smoker flag is encoded as integer 0/1 on a new frame (no write into a
# slice of `df`).
X = df[['smoker', 'bmi', 'age', 'region']].assign(
    smoker=lambda frame: (frame['smoker'] == 'yes').astype(int)
)

# One-hot encode `region`; every other column passes through unchanged.
ct = ColumnTransformer([('OneHotEncoding', OneHotEncoder(), ['region'])], remainder = 'passthrough')
models = [ridge, forest, xgb]
model_names = ['Ridge', 'Forest', 'XGB']
# Wrap each model so the encoding is fitted inside every fold.
piped_models = [Pipeline([('Onehot', ct), ('Model', model)]) for model in models]

train_rmse, val_rmse = get_scores(X, y, z, skf, piped_models, model_names)

for name in model_names:
    print_scores(name, train_rmse[name], 'RMSE', 'Train')
    print_scores(name, val_rmse[name], 'RMSE', 'Val')

The region is also a fairly unimportant feature, which is somewhat expected

In [None]:
#ct = ColumnTransformer([('Scaling', MinMaxScaler(), ['bmi']), ('OneHotEncoding', OneHotEncoder(), ['region'])], remainder = 'passthrough')
#pipe = Pipeline([('Column Transformer', ct), ('forest', RandomForestRegressor())])
#^Legacy stuff keeping for future references

# Back to the three core features, with the smoker flag encoded as integer
# 0/1 on a new frame (no write into a slice of `df`).
X = df[['smoker', 'bmi', 'age']].assign(
    smoker=lambda frame: (frame['smoker'] == 'yes').astype(int)
)
# Limit tree count and leaf count to rein in the forest's overfitting.
param_grid = {'n_estimators': [25, 50, 75], 'max_leaf_nodes': [8, 10, 12]}
# The estimator is seeded so the search result is reproducible; scoring is
# negative MSE, evaluated on the stratified folds produced by `skf`.
grid = GridSearchCV(
    RandomForestRegressor(random_state=2),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=skf.split(X, z),
)
grid.fit(X, y)

In [None]:
# Best hyper-parameters and the matching RMSE (best_score_ is negative MSE).
best_rmse = np.sqrt(-grid.best_score_)
grid.best_params_, best_rmse

In [None]:
# Tabulate the search results and convert the negative-MSE score columns
# (split0_test_score through mean_test_score) to RMSE.
grid_results = pd.DataFrame(grid.cv_results_)
score_block = grid_results.loc[:, 'split0_test_score':'mean_test_score']
grid_results[score_block.columns] = np.sqrt(-score_block)
grid_results

In [None]:
#ct = ColumnTransformer([('Scaling', MinMaxScaler(), ['bmi']), ('OneHotEncoding', OneHotEncoder(), ['region'])], remainder = 'passthrough')
#pipe = Pipeline([('Column Transformer', ct), ('ridge', Ridge())])

# Search the Ridge regularisation strength on the stratified folds from `skf`.
ridge_alphas = [0, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]
param_grid = {'alpha': ridge_alphas}
grid = GridSearchCV(
    Ridge(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=skf.split(X, z),
)
grid.fit(X, y)

In [None]:
# Best alpha and the matching RMSE (best_score_ is negative MSE).
best_rmse = np.sqrt(-grid.best_score_)
grid.best_params_, best_rmse

In [None]:
#ct = ColumnTransformer([('Scaling', MinMaxScaler(), ['bmi']), ('OneHotEncoding', OneHotEncoder(), ['region'])], remainder = 'passthrough')
#X = df[['smoker', 'bmi', 'age', 'region']]
#X.loc[:,'smoker'] = X.loc[:,'smoker'].map(lambda x: 1 if x == 'yes' else 0)
#pipe = Pipeline([('Transformer', ct), ('forest', forest)])

# Re-score a size-limited forest on the same folds; the estimator is seeded
# so the scores are reproducible.
forest = RandomForestRegressor(max_leaf_nodes=10, n_estimators=50, random_state=2)
forest_name = 'Forest'
train_rmse, val_rmse = get_scores(X, y, z, skf, [forest], [forest_name])

# Pass the display name, not the estimator object, so the printed header is
# a short label rather than the estimator's repr.
print_scores(forest_name, train_rmse[forest_name], 'RMSE', 'Train')
print_scores(forest_name, val_rmse[forest_name], 'RMSE', 'Val')

Now, instead of heavily overfitting the training data, we are only slightly overfitting it, with the avg Val RMSE ~5% higher than the avg Train RMSE. As a result, our Val RMSE decreased by 10%, a fairly big difference.

In [None]:
#Legacy Stuff
#from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

#rskf = RepeatedStratifiedKFold(n_splits = 3, n_repeats = 5, random_state = 2)
#no_grid_forest = RandomForestRegressor()
#no_grid_pipe = Pipeline([('Transformer', ct), ('no_forest', no_grid_forest)])
#grid_pipe = Pipeline([('Transformer', ct), ('forest', forest)])

#no_grid_score = cross_val_score(no_grid_pipe, X, y, scoring = 'neg_mean_squared_error', cv = rskf.split(X, z))
#grid_score = cross_val_score(grid_pipe, X, y, scoring = 'neg_mean_squared_error', cv = rskf.split(X, z))
#no_grid_score = cross_val_score(no_grid_pipe, X, y, scoring = 'neg_mean_squared_error', cv = 5)
#grid_score = cross_val_score(grid_pipe, X, y, scoring = 'neg_mean_squared_error', cv = 5)




We increased the avg 

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=2)
forest.fit(X_train, y_train)
preds = forest.predict(X_val)

# Overlay actual (blue) and predicted (red) charges against age. The labels
# feed the legend so the figure can be read on its own.
pred_ax = sns.scatterplot(x=X_val['age'], y=y_val, color='b', label='Actual')
sns.scatterplot(x=X_val['age'], y=preds, color='r', label='Predicted', ax=pred_ax)
pred_ax.set(xlabel='age', ylabel=target, title='Hold-out set: actual vs predicted charges');

The fit looks pretty good for the topmost band and seems fine for the middle band; however, we are typically overestimating the lowest band for some reason. Finally, let's look at the MAE, which shouldn't exaggerate the effects of outliers.

In [None]:
# Mean absolute error of the hold-out predictions.
val_mae = mean_absolute_error(y_val, preds)
val_mae