In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

#Importing Packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy.random as nr
import scipy.stats as ss
import math
import statsmodels.stats.weightstats as ws
from statsmodels.stats.power import tt_ind_solve_power
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model, mixture
import sklearn.metrics as sklm
from sklearn.ensemble import AdaBoostRegressor
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns


import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
#Importing the Data
# Read the insurance CSV from the Kaggle input directory into the `ins` DataFrame.
ins = pd.read_csv('../input/insurance.csv')
# Display the first rows as a quick sanity check of the load.
ins.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
ins.shape

# About the Data   
**age**: Age of primary beneficiary.  
**sex**: Insurance contractor gender: female, male.   
**bmi**: Body mass index, providing an understanding of body, weights that are relatively high or low relative to height, objective index of body weight (kg / m ^ 2) using the ratio of height to weight, ideally 18.5 to 24.9.   
**children**: Number of children covered by health insurance / Number of dependents.  
**smoker**: Whether the contractor is a smoker or not.  .  
**region**: The beneficiary's residential area in the US, northeast, southeast, southwest, northwest.  
**charges**: Individual medical costs billed by health insurance.  


Even though the number of features in this dataset is relatively small, it still tells us a lot.  
One very important story we get from this dataset is the effect smoking has on our lives, as well as on our pockets.  
As we will discover, smoking pops out as the main factor behind the variance of the data, especially on charges, which is an indicator on the overall well-being of the sample's  individuals.  
In fact, the difference between smokers and non-smokers is so striking that predicting whether someone is a smoker or not from the data via a ML model is proved to be so easy and so precise, with 99% accuracy.   
Thus, along with the other interesting insights we gained from this table, I hope that this data will be a warning for smokers, and non-smokers, about the hazardous nature of this substance.

**Procedures:**  
    1- Visualize and inspect the data to gain as much insight into it as we can.  
    2- Clean, Transform and Feature Engineer the data as needed.  
    3- Construct Machine Learning Models and test them.  


Checking Data Types

In [None]:
# Per-column dtypes; shows which columns are numeric and which are stored as strings.
ins.dtypes

In [None]:
#Checking the type of the loaded object (pd.read_csv returns a pandas DataFrame, not an array)
type(ins)

In [None]:
#Checking for missing Data
# One count per column is enough: isnull() and isna() are aliases in pandas, and the
# former `... or str(isna().sum())` operand could never be evaluated because the
# concatenated message string is always non-empty (truthy).
for col in ins.columns:
    print(col + ' ' + 'missing values:' + str(ins[col].isnull().sum()))

In [None]:
#Describing the Statistical Properties of the Data
# Summary statistics of the numeric columns, rounded to 3 decimals for readability.
ins.describe().round(3)

In [None]:
#Inspecting the correlation between the features
# Restrict to numeric columns first: the frame also holds string columns (sex, smoker,
# region), and DataFrame.corr() raises on non-numeric data in current pandas releases.
ins.select_dtypes(include='number').corr()

In [None]:
# Pearson correlation matrix of the four numeric columns, drawn as an annotated heatmap.
corrs = ins.loc[:, ['age', 'bmi', 'charges', 'children']].corr()
sns.heatmap(
    corrs,
    annot=True,
    center=0,
    cmap="YlGnBu",
    linewidths=0.5,
)

We can see that the correlation between the data is positive but weak. 
The highest correlation is between medical charges and age (0.3), which is not that big either. 

In [None]:
#Looking at the correlation between charges and the rest of the numerical data.
# Numeric columns only: DataFrame.corr() raises on the string columns in current pandas.
ins.select_dtypes(include='number').corr()['charges']

Next, we calculate the confidence intervals (CI) for the correlation values *r* between the various categories. This requires a transformation from the r-space into the z-space (the Fisher transform) and then back to the r-space. 

In [None]:
def r_z(r): ## transform distribution
    """Map a correlation coefficient ``r`` into z-space.

    Returns ``ln((1 + r) / (1 - r)) / 2``.
    """
    ratio = (1 + r) / (1 - r)
    half_log = math.log(ratio) / 2.0
    return half_log

def z_r(z): ## inverse transform distribution
    """Map a z-space value back to a correlation coefficient in [-1, 1].

    ``(e^(2z) - 1) / (e^(2z) + 1)`` is exactly ``tanh(z)``. Using ``math.tanh``
    avoids the OverflowError that ``math.exp(2 * z)`` raises for large ``z``
    and saturates cleanly at +/-1 instead.
    """
    return math.tanh(z)

def r_conf_int(r, alpha, n):
    """Confidence interval for a correlation coefficient.

    The interval is built in z-space (via ``r_z``), where the standard error
    is ``1 / sqrt(n - 3)``, and the two end points are mapped back to r-space
    with ``z_r``.

    Parameters
    ----------
    r : correlation coefficient.
    alpha : significance level; the interval covers ``1 - alpha``.
    n : number of observations the correlation was computed from.

    Returns
    -------
    (lo, hi) tuple of interval bounds in r-space.
    """
    centre = r_z(r)
    std_err = 1.0 / math.sqrt(n - 3)
    # Two-sided normal critical value scaled by the z-space standard error.
    half_width = ss.norm.ppf(1 - alpha/2) * std_err
    lower = z_r(centre - half_width)
    upper = z_r(centre + half_width)
    return (lower, upper)

def correlation_sig(df, col1, col2, alpha=0.05):
    """Print the Pearson correlation of two columns with its CI and p-value.

    Parameters
    ----------
    df : DataFrame holding both columns.
    col1, col2 : names of the columns to correlate.
    alpha : significance level of the confidence interval (default 0.05,
        the value that was previously hard-coded).
    """
    pearson_cor = ss.pearsonr(x = df[col1], y = df[col2])
    # The interval width depends on the real sample size, so take it from the
    # data instead of assuming 1000 observations.
    conf_ints = r_conf_int(pearson_cor[0], alpha, len(df))
    print('Correlation: ' + col1 + ' and ' + col2 + ' = %4.3f with CI of %4.3f to %4.3f and p_value %4.3e' 
        % (pearson_cor[0], conf_ints[0], conf_ints[1], pearson_cor[1]))

# Report correlation, confidence interval and p-value of charges against each numeric feature.
for feature in ('age', 'bmi', 'children'):
    correlation_sig(ins, 'charges', feature)

The CIs around the r values are narrow, hence the r values obtained are well defined, and so are the p-values. However, 
the p-value of charges/children is much bigger than the other two, which are basically 0. Still, it is less than the threshold value of alpha = 0.05.

In [None]:

# One histogram per numeric column, 15 bins each, arranged on a 3x3 grid.
ins.hist(layout = (3, 3), figsize=(12, 9), color='blue', grid=False, bins=15)

**Histogram Observations:**  
-We can see that only the bmi feature is normally distributed, with a mean slightly above the maximum accepted value = 30.  
-Age seems to be uniformly distributed, except for the young ages at the far left, where we have more data coming from this age group.  
-Charges and children features are right-skewed.  
-For children, this is expected, as people prefer to have few (or no) children these days rather than larger families. Also, as parents get older, their children won't be considered dependents anymore. So it makes sense to have more samples with fewer children.  
-The skewness of Charges indicates that there are few people who are being charged higher than average. This may lead to some bias in the study.
-The skewness and non-normal distribution of these categories is partly responsible for the low correlation we find between them.

In [None]:
#Setting up the Frequency Table
def count_unique(ins, cols):
    """Print the value frequencies of each requested column.

    Parameters
    ----------
    ins : DataFrame to inspect.
    cols : iterable of column names; one frequency table is printed per name.
    """
    for name in cols:
        heading = '\n' + 'For column ' + name
        print(heading)
        frequencies = ins[name].value_counts()
        print(frequencies)

# 'region' is included so that the per-region counts discussed in the notes that
# follow this cell are actually part of the printed frequency table.
cat_cols = ['age', 'charges', 'bmi', 'children', 'sex', 
            'smoker', 'region']
count_unique(ins, cat_cols)

The above frequency table help us get some insight about the data, especially for the categorical ones:  
1- The sex column indicates that we have a fair distribution between males and females, which is good for machine learning.  
2- However, the smoker category is not evenly distributed; there are far more non-smokers than smokers. This could cause a problem when building an ML model. We may need to increase the weight of the smoker category (the minority class).  
3- Interestingly, almost half of the sample have no children. Those most probably be of the younger age group (around 22 years) which was a large group as we saw earlier from the histogram plot. Also, older parents will not have their children as dependent, so they will not be counted.  
4- The ages of the sample ranges from 22 to 69 years old.  
5- The sample is fairly distributed between the four considered regions.

In [None]:
# Visualizing the frequency of the smoker category:
# value_counts() gives one count per distinct value; the bar chart makes the imbalance visible.
counts = ins['smoker'].value_counts() 
counts.plot.bar(color = 'blue', grid=False) 
      

In [None]:
#For the numeric data, we can compare them with the combined histogram and KDE plots:
def plot_density_hist(ins, cols, bins = 10, hist = False):
    """Draw one density plot (with rug marks) per column, each in its own figure.

    Parameters
    ----------
    ins : DataFrame holding the columns.
    cols : iterable of column names to plot.
    bins : number of histogram bins.
    hist : whether to draw the histogram bars in addition to the density curve.
    """
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # histplot/kdeplot/rugplot are the documented replacements.
    for column in cols:
        sns.set_style("whitegrid")
        sns.distplot(ins[column], hist = hist, bins = bins, rug = True)
        plt.title('Histogram of ' + column)
        plt.xlabel(column)
        plt.ylabel('')
        plt.show()

plot_cols = ['bmi', 'charges', 'age', 'children']
plot_density_hist(ins, plot_cols, hist = True, bins = 20)

The data is much clearer now. (Notice how all the data is normalized on the y axis such that the area under the plot is equal to 1.)  
1- Notice how the bmi is fairly normal with few outliers to the right, but not much to make a noticable skew.  
2- We knew that the charges were right-skewed but now we can see a small bump between 30000 and 40000$ which we may need to investigate further.  
3- Again, the number of participants around the age of 20 is too large compared to the rest. This could be the reason why the 
children plot is so strongly skewed to the right and has no bell shape at all.

In [None]:
#Checking the skewness of charges
# scipy's skewtest returns a test statistic and p-value for the null hypothesis that
# the sample's skewness matches that of a normal distribution.
skew = ss.skewtest(ins['charges'])
skew

In [None]:
pair_cols = ["age", "charges", "children", "bmi", 'smoker', 'sex'] 
# seaborn renamed pairplot's `size` argument to `height`; the old name is deprecated.
sns.pairplot(ins[pair_cols], hue='smoker', palette="Set2", diag_kind="kde", height=2)


1. It is obvious from the above pairplot the huge impact  smoker category has on the insurance charges rates, compared to the other categories. In fact, the smoker=yes are the main reason behind the bump we saw earlier in the charges histogram, between the values of 30000 and 50000, the one responsible for the right-skewing of the chart.   
2. Also, smokers with children between 1 and 3, have higher insurance charges than non-smokers with the same number of children.  
3. Smoking doesn't seem to have an effect on the average bmi of the sample. 
4. age does have an effect on raising the charges, but again it is when combined with smoking that the charges get substantially bigger.


We now do a 2-D scatter plot between the numerical values and charges. This will help us dig deeper into the relationships between 
these variables especially, the relationship charges have with the other variables as to give us an insight on how to proceed in building the machine learning model. 

In [None]:
def plot_scatter(ins, cols, col_y = 'charges'):
    """Scatter ``col_y`` against each column in ``cols``, one figure per column.

    Parameters
    ----------
    ins : DataFrame holding the data.
    cols : iterable of column names used for the x axis.
    col_y : column name used for the y axis.
    """
    for feature in cols:
        figure = plt.figure(figsize=(7,6))
        axes = figure.gca()
        ins.plot.scatter(x = feature, y = col_y, ax = axes)
        axes.set_title('Scatter plot of ' + col_y + ' vs. ' + feature)
        axes.set_xlabel(feature)
        axes.set_ylabel(col_y)
        plt.show()

num_cols = ['age', 'bmi', 'children']
plot_scatter(ins, num_cols)

The above plots suffer from some over-plotting of the data, especially the bmi-charges one. To resolve this we can use transparency in the data points or use a different type of plot, the contour plot.

In [None]:
def plot_desity_2d(ins, cols, col_y = 'charges', kind ='kde'):
    """Draw a joint plot of ``col_y`` against each column in ``cols``.

    Parameters
    ----------
    ins : DataFrame holding the data.
    cols : iterable of column names used for the x axis.
    col_y : column name used for the y axis.
    kind : joint-plot kind forwarded to ``sns.jointplot`` (default 'kde').
    """
    for col in cols:
        sns.set_style("whitegrid")
        # x and y are passed by keyword: current seaborn accepts only `data`
        # positionally, so the positional form fails there.
        sns.jointplot(x=col, y=col_y, data=ins, kind=kind)
        plt.xlabel(col) # Set text for the x axis
        plt.ylabel(col_y)# Set text for y axis
        plt.show()

plot_desity_2d(ins, num_cols)

**From the above plots, scatter and contour we infer the following:**  
1- No plot exhibits a colinear behavior with another one, which indicates that the effect of these categories on charges are most likely independent from each other and should all be considered in the machine learning model.  
2- The charges-age plot:  
It exhibits a strange pattern with what it looks like three parallel lines. the lowest one is the denser one with the charges increasing with age, which makes sense. However, we then have something like a jump in the charges leaving an empty area and then another pattern resembling the lower one, but with less density. Then we have another jump and another similar behavior but with a lower density than the previous two. 
My initial guess is this has to do with how insurance companies work; that insurance payments jumps in cost from a certain package to another one.  
3- Charges-children plot:  
We see that the charges are somehow fairly distributed between the families having 0-3 children, with some outliers of course. Charges drops for families having more than 3 children, which is very interesting. Do larger families live healthier lives than those with less children? Or maybe they have no time or resources to take a very good care of themselves so they neglect some health issues, unless it is very urgent, compared to those with smaller families who have better chances of taking much care about themselves?  
Now, the above chart indicates that even though most of the younger generation have low average medical charges, as expected, some, on the other hand, have high charges. This most probably is not related to children, as those younger-agers don't seem to have children as shown in the plot below between the age-children categories.  So, some of the younger generation are being charged with high medical charges, same as those of 50 years and older. And as we are going to see later on, those are of the smokers = yes category.  
4-charges-bmi plot:  
Now this is a very interesting plot. Here we see two distributions of the data:  
One horizontal, which is the major one as obvious from the contour plot, and another less-dense diagonal one, which I believe is the more logical one, with the health charges increasing with the increase of the bmi index all the way to obesity.  
The horizontal line is centered around 30 bmi, close to the accepted values, indicating that those people with average weight suffer less health problems than those overweight.  
However, there is some proportion of those who are really overweight but with small insurance charges. This could be due to either:  
     1- those people having no medical problems,  
     2- or they still young of age, which is very common these days, so the health risks of their overweight has not yet manifested,   
     3-or they have not the financial ability to take a good care of themselves and pay high medical insurance rates.  


To further check my young-age/few-children assumption, I plot these two categories together using a joint KDE plot to have a better visualization. From the plot, we find that the young-agers and old-agers are the ones with the least number of children/dependents, as expected, with the densest area around the young-agers. 

In [None]:
# x/y passed by keyword: current seaborn accepts only `data` as a positional argument.
sns.jointplot(x='age', y='children', data=ins, kind='kde')

Well, we can gain a better insight on the above charts by looking at the correlation heat map for the two categories of the smoker feature separately. As shown below, we see that when we look at smokers only, the correlation between charges and bmi increases noticeably (from 0.2 to 0.81), while the charges-age correlation increases slightly. However, when looking at non-smokers only, it is the correlation between age and charges that increases, while the bmi-charges correlation actually decreases.  
What this implies is the following:  
for non-smokers, it is logical that their medical charges will increase with age. in fact, for those people, even high bmi doesn't seem to affect their medical charges much.  
However, for smokers, their high bmi starts to pose greater risk on their health, and starting from a younger age (thus the age doesn’t seem to be as important anymore). Thus, smoking and obesity are a very dangerous mix.


In [None]:
# Correlation heatmap of the numeric columns, restricted to smokers.
ins_smoker = ins.loc[ins['smoker'] == 'yes']
corrs = ins_smoker.loc[:, ['age', 'bmi', 'charges', 'children']].corr()
sns.heatmap(corrs, annot=True, center=0, cmap="YlGnBu", linewidths=0.5)
plt.title('Smokers')

In [None]:
# Correlation heatmap of the numeric columns, restricted to non-smokers.
ins_non_smoker = ins.loc[ins['smoker'] == 'no']
corrsn = ins_non_smoker.loc[:, ['age', 'bmi', 'charges', 'children']].corr()
sns.heatmap(corrsn, annot=True, center=0, cmap="YlGnBu", linewidths=0.5)
plt.title('Non-Smokers')

Let us use the scatter plot to find the relationship between the charges and the rest of the features depending on smoking as a column and sex as a hue.

In [None]:

# One regression scatter per numeric feature, split into smoker / non-smoker panels
# and coloured by sex. x, y and data are passed by keyword because current seaborn
# no longer accepts them positionally.
for feature in ('age', 'bmi', 'children'):
    sns.lmplot(x=feature, y='charges', data=ins, x_jitter=.15, y_jitter=.15,
               col="smoker", hue='sex', scatter_kws={'alpha':0.1}, fit_reg=True)


It is obvious from the above plots that smoking has a huge effect on how charges is related to the other categories. In fact, the strange behavior we saw earlier in the previous scatter plots are almost gone when we separated them based on the smoking category.  
In two plots, the smoker category showed a gap in the charges data. this could be due the smoking habits of the smokers; those heavy smokers require more medical charges than moderate or light smokers.   
It is also very interesting to see how young adults with no children and are smokers differ very much in their charges than their non-smoking peers. Hence we conclude that smoking is a very big problem for young adults today.  
Sex, on the other hand, doesn't seem to produce a noticeable difference. 
I ran the same test but for the region category and I didn't find that much of a difference between the four regions.


In [None]:
# Compare charges between smokers and non-smokers: group mean and standard deviation.
charges_grouped = ins[['charges','smoker']].groupby('smoker')
for heading, table in ((' Mean by smoker', charges_grouped.mean()),
                       ('\n Standard deviation by smoker', charges_grouped.std())):
    print(heading)
    print(table.round(2))

It is obvious how much smokers and non-smokers differ in their medical charges, and also in their standard deviation; the std of smokers is more than double that of non-smokers. Hence non-smokers seem to have their medical charges varying little about the mean; in other words, their medical health seems to be, more or less, similar. Smokers, on the other hand, vary a lot around the mean, which indicates that their bodies respond differently to smoking (e.g. some may develop lung cancer while others won't).

We now run a 2-tailed t-test to test whether the difference we see above in the smoking category could be due to chance, in other words, due to sampling error (H0), or whether it is a real difference (H1). 

In [None]:

def t_test_two_samples(a, b, alpha, alternative='two-sided'):
    """Two-sample t-test of ``a`` against ``b`` with a CI for the mean difference.

    Parameters
    ----------
    a, b : the two samples to compare.
    alpha : significance level of the confidence interval.
    alternative : 'two-sided', 'larger' or 'smaller' (statsmodels naming). It is
        now applied to the test statistic / p-value as well as to the interval;
        previously only the interval honoured it.

    Returns
    -------
    pd.Series indexed by DegFreedom, Difference, Statistic, PValue, Low95CI, High95CI.
    """
    diff = a.mean() - b.mean()
    means = ws.CompareMeans(ws.DescrStatsW(a), ws.DescrStatsW(b))
    # Pooled-variance t-test, i.e. the same statistic scipy's ttest_ind yields by
    # default, but taken from statsmodels so that `alternative` is respected.
    t_stat, p_value, _ = means.ttest_ind(alternative=alternative, usevar='pooled')
    confint = means.tconfint_diff(alpha=alpha, alternative=alternative, usevar='unequal') 
    degfree = means.dof_satt()
    # NOTE(review): the statistic and p-value assume equal variances while the
    # interval and degrees of freedom use the unequal-variance form; consider
    # usevar='unequal' throughout. The 'Low95CI'/'High95CI' labels are fixed
    # text and only describe the interval when alpha == 0.05.
    index = ['DegFreedom', 'Difference', 'Statistic', 'PValue', 'Low95CI', 'High95CI']
    return pd.Series([degfree, diff, t_stat, p_value, confint[0], confint[1]], index = index)   
   
test = t_test_two_samples(ins.loc[ins.smoker == 'yes', 'charges'], ins.loc[ins.smoker == 'no', 'charges'], 0.05)
test

The value of *t* is very big, as expected, = 46, and the p-value is basically 0, hence for alpha = 0.05 we reject the null hypothesis H0 and we accept H1; that what we are seeing cannot be generated by mistake.
However, the confidence interval is very big, on the order 10^4, and both values of the interval are positive, which indicates  we are 95% confident that the difference between the charges of  smokers and non-smokers is always on the order of 10^4 more, and never less.

Next we determine the power of our t-test by first calculating the d-value (effect size), which is the difference between the means of the smokers and non-smokers, divided by the std of the smoker=yes category.

In [None]:
# Effect size d: difference of the group means divided by the smokers' standard
# deviation. The means are computed from the data instead of being typed in by hand,
# so the cell stays correct if the data changes.
smoker_charges = ins.loc[ins.smoker == 'yes', 'charges']
non_smoker_charges = ins.loc[ins.smoker == 'no', 'charges']
d = (smoker_charges.mean() - non_smoker_charges.mean())/(np.std(smoker_charges))
print('d = ' + str(d))
# NOTE(review): nobs1 is documented as the size of sample 1 and ratio as nobs2/nobs1;
# 1337 with ratio=1 looks like the whole data set rather than the group sizes - confirm.
tt_ind_solve_power(effect_size=d, nobs1 = 1337, alpha=0.05, power=None, ratio=1, alternative='two-sided')

Both *d* and *power* are very large (100% power), indicating that the number of samples (1337) is more than adequate to detect the difference in the two categories.  
So, how many participants do we need to get this difference and for a power let say = 0.8 (80%)?

In [None]:
# Solve for nobs1 (left as None): the per-group sample size needed to reach 80% power
# at alpha = 0.05 for an effect size of 2.
tt_ind_solve_power(effect_size=2, nobs1 = None, alpha=0.05, power=0.8, ratio=1, alternative='two-sided')

5 only! 
In other words, we only needed 5 participants to be able to detect the difference in medical charges between smokers and non-smokers.
This should be considered a real warning to smokers.

To appreciate the above value of 5, we run the same test but for sex category now, as we find that the difference that sex imply on the charges is minimal. 

In [None]:
# Compare charges between the sexes: group mean and standard deviation.
charges_grouped_sex = ins[['charges','sex']].groupby('sex')
for heading, table in ((' Mean by sex', charges_grouped_sex.mean()),
                       ('\n Standard deviation by sex', charges_grouped_sex.std())):
    print(heading)
    print(table.round(2))

In [None]:
#We determine the power of the t-test by first calculating the d-value:
# Difference of the male and female mean charges divided by the male standard
# deviation; the means are computed from the data rather than typed in by hand.
male_charges = ins.loc[ins.sex == 'male', 'charges']
female_charges = ins.loc[ins.sex == 'female', 'charges']
d = (male_charges.mean() - female_charges.mean())/(np.std(male_charges))
print('d = ' + str(d))
# NOTE(review): nobs1 is documented as the size of sample 1 and ratio as nobs2/nobs1;
# 1337 with ratio=1 looks like the whole data set rather than the group sizes - confirm.
tt_ind_solve_power(effect_size=d, nobs1 = 1337, alpha=0.05, power=None, ratio=1, alternative='two-sided')

Both d and power are much smaller now; the power is still OK though (70%), because the sample size is large enough.
So, how many participant do we need to get this difference and for a power let say = 0.8 (80%)?

In [None]:
# Solve for nobs1 (left as None): the per-group sample size needed to reach 80% power
# at alpha = 0.05 for the hard-coded effect size of 0.107.
tt_ind_solve_power(effect_size=0.107, nobs1 = None, alpha=0.05, power=0.8, ratio=1, alternative='two-sided')

This is close to our own sample population, however, it is orders of magnitude greater than 5. Hence, detecting a difference in charges based on smoker category is much easier than doing the same based on gender.

In [None]:
#The above difference due to smoking can be illustrated through box and violin charts as seen below
def plot_box(ins, cols, col_y = 'charges'):
    """Draw one box plot of ``col_y`` per grouping column in ``cols``.

    Parameters
    ----------
    ins : DataFrame holding the data.
    cols : iterable of column names used as the grouping (x) variable.
    col_y : column name whose distribution is shown on the y axis.
    """
    for col in cols:
        sns.set_style("whitegrid")
        # x and y by keyword: current seaborn accepts only `data` positionally.
        sns.boxplot(x=col, y=col_y, data=ins)
        plt.xlabel(col) 
        plt.ylabel(col_y)
        plt.show()
        
cat_cols = ['sex', 'smoker', 'region', 'children']
plot_box(ins, cat_cols)  

From the above box plot we find again that the biggest effect on  charges is the smoker category.

In [None]:
#Violin Plot: it helps get a sense on the effect each category has on the charges label:
def plot_violin(ins, cols, col_y = 'charges'):
    """Draw one violin plot of ``col_y`` per grouping column, split by smoker status.

    Parameters
    ----------
    ins : DataFrame holding the data (must contain a 'smoker' column for the hue).
    cols : iterable of column names used as the grouping (x) variable.
    col_y : column name whose distribution is shown on the y axis.
    """
    for col in cols:
        sns.set_style("whitegrid")
        # x and y by keyword: current seaborn accepts only `data` positionally.
        sns.violinplot(x=col, y=col_y, data=ins, hue='smoker')
        plt.xlabel(col) 
        plt.ylabel(col_y)
        plt.show()
cat_cols = ['sex', 'smoker', 'region', 'children']       
plot_violin(ins, cat_cols)

The most striking Violin plot is between Charges and smoker, where the non-smoker plot is horizontally flattened out, minimizing its effect on the amount of medical charges, while that of smokers is vertically extended indicating a large positive correlation between smoking and the required medical charges.

# Machine Learning
We now conduct the following ML models on the data:  
1- A simple regression model to predict charges from the data,  
2- An AdaBoost model to predict the charges from the data,  
3- A classification model to predict whether someone is a smoker or non-smoker,  
4- A clustering model.


As we saw previously, the charges plot suffers a very big right-skew. This is not good for machine learning. So, we need to do some data engineering first, such as taking the log of charges to make them more normally distributed.

In [None]:
# Store the natural log of charges as a new column (this mutates `ins`), then
# inspect its distribution with a 10-bin histogram.
ins['charges_log'] = np.log(ins['charges'])
plt.hist(ins['charges_log'], bins = 10)

In [None]:
# Numeric columns only: DataFrame.corr() raises on the string columns in current pandas.
ins.select_dtypes(include='number').corr()['charges_log']

# Regression  
We start with the regression model. The steps we need to take are as follows:  
1- Transform the label value (charges) into a more normal distribution (which we already did),  
2- Transform the dataframe into a numpy array to be read by the scikit-learn package,  
3- Transform the categorial values into binary dummy variables.  
4- Split the data set into train and test data sets,   
5- Scale the numeric variables as to all have the same weight in the machine learning algorithm,    
6- Fit the linear regression model using the scikit-learn package,  
7- Test and evaluate the performance of the ML model and see if it need improvement.

We start by encoding the categorical features into binary dummy variables using the one-hot encoding method. For the sex and smoker categories, the binary code will consist of double digits [1, 0] or [0, 1] because we only have two sub categories in each (e.g. male/female, smoker/non-smoker).  
For the region category we will have 4 digits (3 zeros and one 1) because we have four subcategories to take into account.

In [None]:
def encode_string(cat_feature):
    """One-hot encode a single categorical column.

    The values are first mapped to integer codes with LabelEncoder and those
    codes are then expanded with OneHotEncoder.

    Parameters
    ----------
    cat_feature : 1-D array-like of category values.

    Returns
    -------
    Dense 2-D array with one indicator column per distinct category.
    """
    enc = preprocessing.LabelEncoder()
    enc_cat_feature = enc.fit_transform(cat_feature)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    return ohe.fit_transform(enc_cat_feature.reshape(-1,1)).toarray()

# smoker is encoded with the same helper as the other categorical columns instead of
# a hand-written copy of its body; the column order (smoker, sex, region) is unchanged.
Features = encode_string(ins['smoker'])

categorical_columns = ['sex', 'region']

for col in categorical_columns:
    temp = encode_string(ins[col])
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:4, :])  

In [None]:
#Append the numeric features to the right of the one-hot encoded block
numeric_block = np.array(ins[['age', 'bmi', 'children']])
Features = np.concatenate([Features, numeric_block], axis = 1)
Features[:2,:]

In [None]:
#We randomly split the row indices and hold out 200 samples as the test set.
# Seeding numpy's global RNG first is presumably what makes the split repeatable,
# since no random_state is passed to train_test_split - confirm.
nr.seed(1234)
labels = np.array(ins['charges_log'])
indx = range(Features.shape[0])
# After this call indx[0] holds the training row indices and indx[1] the test row indices.
indx = ms.train_test_split(indx, test_size = 200)
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [None]:
#We must now scale the numeric values so the larger ones will not bias the model.
# Only columns 8 onward are scaled - this assumes the first 8 columns are the one-hot
# indicators built earlier (smoker, sex, region); TODO confirm if the encoding changes.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = preprocessing.StandardScaler().fit(x_train[:,8:])
x_train[:,8:] = scaler.transform(x_train[:,8:])
x_test[:,8:] = scaler.transform(x_test[:,8:])
print(x_train.shape)
x_train[:6,:]

In [None]:
#We can now build the ML model using linear regression package of scikit-learn:
# No intercept is fitted (fit_intercept = False).
# NOTE(review): the feature matrix holds a full set of indicator columns for each
# categorical variable; such groups are linearly dependent on each other, which is a
# plausible source of very large coefficients - consider dropping one level per group.
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(x_train, y_train)
print(lin_mod.intercept_)
print(lin_mod.coef_)

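The magnitude of some of the coefficients is too large (10^12), which makes me wonder! A likely cause is the dummy-variable trap: smoker, sex and region are each encoded with a full set of one-hot columns, and every such group sums to 1 for each row, so the feature matrix is perfectly collinear and the individual coefficients are not uniquely determined. 
 We move now to evaluating the model to see how accurate it is: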

In [None]:
def print_metrics(y_true, y_predicted, n_parameters):
    """Print regression error metrics plus R^2 and adjusted R^2.

    Parameters
    ----------
    y_true : array of observed target values.
    y_predicted : array of model predictions, same length as ``y_true``.
    n_parameters : number of model parameters used in the adjusted-R^2 penalty.
    """
    r2 = sklm.r2_score(y_true, y_predicted)
    penalty = (n_parameters - 1)/(y_true.shape[0] - n_parameters) * (1 - r2)
    r2_adj = r2 - penalty
    mse = sklm.mean_squared_error(y_true, y_predicted)

    rows = [
        ('Mean Square Error      = ', mse),
        ('Root Mean Square Error = ', math.sqrt(mse)),
        ('Mean Absolute Error    = ', sklm.mean_absolute_error(y_true, y_predicted)),
        ('Median Absolute Error  = ', sklm.median_absolute_error(y_true, y_predicted)),
        ('R^2                    = ', r2),
        ('Adjusted R^2           = ', r2_adj),
    ]
    for caption, value in rows:
        print(caption + str(value))

y_score = lin_mod.predict(x_test)
print_metrics(y_test, y_score, 6)

We plot the residuals in a histogram, the more normal and narrow the plot around 0 the better the model is.

In [None]:
def hist_resids(y_test, y_score):
    """Plot the distribution of the residuals ``y_test - y_score``.

    Parameters
    ----------
    y_test : array of observed values.
    y_score : array of predicted values, same length as ``y_test``.
    """
    # Both inputs are reshaped to column vectors before subtracting.
    residuals = y_test.reshape(-1,1) - y_score.reshape(-1,1)
    sns.distplot(residuals)
    plt.title('Histogram of residuals')
    plt.xlabel('Residual value')
    plt.ylabel('count')

hist_resids(y_test, y_score)

It looks good, somehow skewed to the right.  
Next we compare the distribution of the residuals with a normal distribution in a Q-Q plot:

In [None]:
def resid_qq(y_test, y_score):
    """Draw a normal Q-Q (probability) plot of the residuals ``y_test - y_score``.

    Parameters
    ----------
    y_test : array of observed values.
    y_score : array of predicted values, same length as ``y_test``.
    """
    resids = np.subtract(y_test.reshape(-1,1), y_score.reshape(-1,1))
    ss.probplot(resids.flatten(), plot = plt)
    # probplot draws the ordered residuals against theoretical normal quantiles, so
    # the labels describe that; the earlier labels were copied from the
    # residual-vs-prediction plot and did not match this figure.
    plt.title('Q-Q plot of residuals')
    plt.xlabel('Theoretical quantiles')
    plt.ylabel('Ordered residuals')
    
resid_qq(y_test, y_score)   

It looks ok, but diverging on the far right. To examine more we plot a residual plot:

In [None]:
def resid_plot(y_test, y_score):
    """Scatter the residuals ``y_test - y_score`` against the predicted values.

    Parameters
    ----------
    y_test : array of observed values.
    y_score : array of predicted values, same length as ``y_test``.
    """
    # Keep both series 1-D: regplot expects flat vectors, and the previous
    # (n, 1)-shaped residual column is not accepted by current seaborn.
    predicted = np.ravel(y_score)
    resids = np.ravel(y_test) - predicted
    # x and y by keyword: current seaborn no longer accepts them positionally.
    sns.regplot(x=predicted, y=resids, fit_reg=False)
    plt.title('Residuals vs. predicted values')
    plt.xlabel('Predicted values')
    plt.ylabel('Residual')

resid_plot(y_test, y_score) 

It looks fine, but with some divergence. However, all the above was done using the charges_log. We need now to go back to the original form of the charges and test the model.

In [None]:
# Undo the log transform so the residuals are expressed in the original charge units.
y_test_untransform = np.exp(y_test)
y_score_untransform = np.exp(y_score)
resid_plot(y_test_untransform, y_score_untransform)

We see that the model is not that good after all. There is a big divergent, especially for large values of charges. 
This, I believe, is mainly due to the divergent behavior charges exhibited with the smoker category. So I believe that we need to do the same modeling but for the smokers and non-smokers separately and see how it will work.
But before that I will do another regression using the Adaboost model and see if I can get a better fitting.

In [None]:
# Fit an AdaBoost regressor with default settings on the same training split
# (target is still the log of charges).
adab = AdaBoostRegressor()  
adab.fit(x_train, y_train)

In [None]:
# Predictions of the AdaBoost model on the held-out test rows (log-charge scale).
yada_score= adab.predict(x_test)

In [None]:
# Same metrics report as for the linear model; 6 is the parameter count used for adjusted R^2.
print_metrics(y_test, yada_score, 6) 

In [None]:
# hist_resids is already defined for the linear model earlier in the notebook; reuse it
# instead of redefining an identical function under the same name.
hist_resids(y_test, yada_score) 

In [None]:
# Undo the log transform so the AdaBoost residuals are in the original charge units.
y_test_untransform = np.exp(y_test)
yada_score_untransform = np.exp(yada_score)
resid_plot(y_test_untransform, yada_score_untransform)

As obvious from the above, this model is far better than the linear regression one. the MSE is much smaller '0.02' and the residuals are very few and with much smaller dispersion, between 5000 and -5000 compared to +30000 and -50000 in the previous model. Hence this is a far superior model in this case.

# Classification
Now I do a classification model to predict whether someone is a smoker or not from his insurance data. 
First, we start by digitizing the smoker category, however not in a binary code, but in single digit for each subcategory, e.g. 1 for Yes and 0 for no. This is so we won't get a double entry into the algorithm model. This is why I don't use the one-hot-encoder here.

In [None]:
# Encode smoker as a single integer column; LabelEncoder assigns the codes in sorted
# class order.
enc = preprocessing.LabelEncoder()
label = enc.fit_transform(np.array(ins['smoker']))
label.shape

In [None]:
#We proceed as before, reusing the encode_string helper defined in the regression
#section instead of redefining an identical copy of it here.
categorical_columns = ['sex']

# Column layout: region indicators, then sex indicators, then the numeric columns.
Features = encode_string(ins['region'])
for col in categorical_columns:
    temp = encode_string(ins[col])
    Features = np.concatenate([Features, temp], axis = 1)

Features = np.concatenate([Features, np.array(ins[['bmi', 'charges','children', 'age']])], axis = 1)
print(Features.shape)
print(Features[:2, :])

In [None]:
# Randomly split the row indices, holding out 200 samples as the test set.
# Seeding numpy's global RNG is presumably what makes the split repeatable, since no
# random_state is passed to train_test_split - confirm.
nr.seed(1144)
indx = range(Features.shape[0])
# After this call indx[0] holds the training row indices and indx[1] the test row indices.
indx = ms.train_test_split(indx, test_size = 200)
x_train = Features[indx[0],:]
y_train = np.ravel(label[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(label[indx[1]])
x_train.shape


In [None]:
#Scaling the numeric values
# Columns 6 onward are scaled; the scaler is fitted on the training rows only
# and the same fitted transform is then applied to both splits.
numeric_cols = np.s_[:, 6:]
scaler = preprocessing.StandardScaler()
scaler.fit(x_train[numeric_cols])
x_train[numeric_cols] = scaler.transform(x_train[numeric_cols])
x_test[numeric_cols] = scaler.transform(x_test[numeric_cols])
x_train[:2]

We now build the logistic regression model (a linear model passed through the non-linear logistic function). Note that I included a class-weight correction to account for the fact that there are far fewer smokers than non-smokers.

In [None]:
# Logistic regression for smoker (label 1) vs non-smoker (label 0).
#
# class_weight: the earlier hand-written weights {0: 0.8, 1: 0.2} gave the larger
# weight to class 0, the majority (non-smoker) class, which is the opposite of a
# correction for smokers being under-represented. 'balanced' weights each class
# inversely to its frequency in y_train, so the minority class is up-weighted.
# NOTE(review): metrics quoted in the markdown further down were obtained with
# the old weights and need to be re-read after re-running.
#
# fit_intercept=False: no separate intercept is estimated, so intercept_ below
# is reported as zero.
logistic_mod = linear_model.LogisticRegression(fit_intercept = False, class_weight = 'balanced')
logistic_mod.fit(x_train, y_train)
print(logistic_mod.intercept_)
print(logistic_mod.coef_)

Below are the predicted probabilities for each smoker class (first column: no, second column: yes), computed with the logistic function.

In [None]:
# Per-class probabilities for the held-out rows; show the first five rows.
probabilities = logistic_mod.predict_proba(x_test)
print(probabilities[:5])

Now we set a threshold value to turn the above probabilities into binary digits of 1 and 0 to compare with the test values. We chose a threshold of 0.5.

In [None]:
def score_model(probs, threshold):
    """Turn class probabilities into hard 0/1 predictions.

    Parameters
    ----------
    probs : array-like, 2-D with at least two columns
        Per-row class probabilities; only column 1 is read.
    threshold : float
        A row is scored 1 when its column-1 value is strictly greater than
        this, otherwise 0.

    Returns
    -------
    numpy.ndarray
        One integer score (0 or 1) per row of ``probs``.
    """
    # asarray lets nested lists through; the comparison is vectorised instead of
    # looping over the column in Python.
    positive_probs = np.asarray(probs)[:, 1]
    return (positive_probs > threshold).astype(int)
# Score the test-set probabilities at a 0.5 cut-off, then print the first
# hundred predictions followed by the first hundred true labels for comparison.
scores = score_model(probabilities, 0.5)
print(scores[:100])
print(y_test[:100])

In [None]:
#We set up a confusion table to examine our scores:
def print_metrics(label, scores):
    """Print a confusion matrix, the accuracy and per-class precision/recall/F1.

    "Positive" means the higher of the two sorted labels (class 1 for 0/1
    labels), the same class whose probability column ``score_model`` and
    ``plot_auc`` read; "negative" means the lower one (class 0).

    Parameters
    ----------
    label : array-like
        True class labels (two classes expected; the table is indexed as 2x2).
    scores : array-like
        Predicted class labels, same length as ``label``.
    """
    metrics = sklm.precision_recall_fscore_support(label, scores)
    conf = sklm.confusion_matrix(label, scores)
    # scikit-learn orders confusion-matrix rows/columns and the per-class metric
    # arrays by sorted label, so index 1 is the positive class and index 0 the
    # negative class. The table previously printed index 0 under "positive".
    pos, neg = 1, 0
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[pos,pos] + '             %5d' % conf[pos,neg])
    print('Actual negative    %6d' % conf[neg,pos] + '             %5d' % conf[neg,neg])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(label, scores))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][pos] + '          %6d' % metrics[3][neg])
    print('Precision  %6.2f' % metrics[0][pos] + '        %6.2f' % metrics[0][neg])
    print('Recall     %6.2f' % metrics[1][pos] + '        %6.2f' % metrics[1][neg])
    print('F1         %6.2f' % metrics[2][pos] + '        %6.2f' % metrics[2][neg])

print_metrics(y_test, scores) 

In [None]:
#Plotting the ROC curve
def plot_auc(label, probs):
    """Draw the ROC curve for the column-1 probabilities, with the AUC in the legend.

    ``label`` holds the true classes and ``probs`` the per-class probabilities;
    only column 1 of ``probs`` is used. A dashed red diagonal is drawn as a
    reference line.
    """
    false_pos_rate, true_pos_rate, _ = sklm.roc_curve(label, probs[:,1])
    area = sklm.auc(false_pos_rate, true_pos_rate)
    legend_text = 'AUC = %0.2f' % area

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, color = 'orange', label = legend_text)
    plt.legend(loc = 'lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

plot_auc(y_test, probabilities)  

The accuracy of the classification model is 94%: of the 200 test values, the model mislabeled only 13. The ROC curve tells the same story, with an area under the curve of 0.99.  
This would seem too good to be true, were it not for the fact that, as we saw earlier, the smoker and non-smoker values are very well separated. This makes it much easier for the model to predict them correctly.  
Nevertheless, the model seems to have missed a big chunk of the smoker category, where it is only 66% accurate.  
Interestingly, when I ran the same model without class_weight, the results were much better: only 5 were missed, from the non-smoker category, with an accuracy of 99%. So I guess that when the two classes are very well separated, it may be better not to use weight correction.  
That being said, a new data set with a larger overlap between the two smoker classes may not be predicted as well as this one was.  

# Clustering
I now try to build a clustering model. 

In [None]:
# K-means with two clusters on bmi, age and children; the cluster id of each row
# picks its colour in the two charges scatter plots (bmi on the left, age on the right).
# NOTE(review): algorithm='full' is spelled 'lloyd' in newer scikit-learn
# releases -- confirm against the installed version.
col_dic = dict(enumerate(('blue', 'green', 'orange', 'gray')))
kmeans = KMeans(
    n_clusters=2,
    random_state=0,
    n_init=20,
    algorithm='full',
    copy_x=True,
    verbose=0,
)
assignments_km = kmeans.fit_predict(ins[['bmi', 'age', 'children']])
assign_color_km = list(map(col_dic.__getitem__, assignments_km))
c = assign_color_km
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(9, 4.5), tight_layout=True)
ax1.scatter(ins['bmi'], ins['charges'], color=c)
ax2.scatter(ins['age'], ins['charges'], color=c)


As is obvious from the two plots above, the clustering model is not working properly. For the bmi-charges plot I was expecting two clusters, corresponding to the smoker and non-smoker categories. This is obviously not what I got.  
For the age-charges plot, it is obvious that we need k = 3 clusters instead of 2, and also a different clustering model than K-means; below I show only two of the clustering models that I tried.
Still, I didn't get what I was expecting. I tried many other clustering models, but I still couldn't get a clustering of the three separated horizontal lines.
Any help or advice regarding clustering will be appreciated.

In [None]:
# Complete-linkage agglomerative clustering with manhattan distance, three
# clusters, on bmi, age and children; colours the age-charges scatter by cluster id.
# NOTE(review): newer scikit-learn releases name the `affinity` keyword `metric`
# -- confirm against the installed version.
agc_2 = AgglomerativeClustering(linkage='complete', affinity='manhattan', n_clusters=3)
assignments_ag2 = agc_2.fit_predict(ins[['bmi', 'age', 'children']])
assign_color_ag2 = list(map(col_dic.__getitem__, assignments_ag2))
c1 = assign_color_ag2
plt.scatter(ins['age'], ins['charges'], color=c1)

In [None]:
# Gaussian mixture clustering on bmi, age and children, three components.
# Three things were wrong before: the feature frame itself was passed as
# n_components (which must be an integer), the model was never fitted, and the
# colours were taken from the agglomerative assignments, so the plot just
# repeated the previous one.
# random_state pins the initialisation so the plot is reproducible on re-run.
mix = mixture.GaussianMixture(n_components = 3, random_state = 0)
mix_features = ins[['bmi', 'age', 'children']]
assignments_mix = mix.fit(mix_features).predict(mix_features)
assign_color_mix = [col_dic[x] for x in assignments_mix]
c=assign_color_mix
plt.scatter(ins['age'], ins['charges'], color=c)


 # To be continued... Maybe!